From 8d00a6c8f6b08e7167bc03bf955cdc7e47c5132e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Jul 2008 08:39:57 +0200
Subject: genirq: remove last NO_IDLE_HZ leftovers

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8ccb462ea42c..f3047df2d23c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -197,10 +197,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
-#ifndef handle_dynamic_tick
-# define handle_dynamic_tick(a)		do { } while (0)
-#endif
-
 #ifdef CONFIG_SMP
 
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
-- 
cgit v1.2.3


From feb2f55db45919aa80731f8877b60cab454b7b94 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 1 Aug 2008 11:53:29 +0300
Subject: [MTD] [OneNAND] Add defines for HF and sync write

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/onenand_regs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d1b310c92eb4..0c6bbe28f38c 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -152,6 +152,8 @@
 #define ONENAND_SYS_CFG1_INT		(1 << 6)
 #define ONENAND_SYS_CFG1_IOBE		(1 << 5)
 #define ONENAND_SYS_CFG1_RDY_CONF	(1 << 4)
+#define ONENAND_SYS_CFG1_HF		(1 << 2)
+#define ONENAND_SYS_CFG1_SYNC_WRITE	(1 << 1)
 
 /*
  * Controller Status Register F240h (R)
-- 
cgit v1.2.3


From 2e489e077a6ad118c4f247faedf330117b107cce Mon Sep 17 00:00:00 2001
From: Alexey Korolev <akorolev@infradead.org>
Date: Tue, 5 Aug 2008 16:39:42 +0100
Subject: [MTD] [NOR] Add qry_mode_on()/qry_mode_off() to deal with odd chips

There are some CFI chips which require non-standard procedures to get
into QRY mode. One way to support them is to try different modes until
QRY is read. This patch introduces two new functions, qry_mode_on() and
qry_mode_off(). qry_mode_on() tries different command sequences in order
to switch the chip into QRY mode.

So if we encounter one more "odd" chip, we can just add a few lines to
qry_mode_on(). Using these functions also removes unnecessary code
duplication in the probe procedure.

Currently there are two "odd" cases:
1. Some old Intel chips which require 0xFF before 0x98
2. ST M29DW chips, which require 0x98 to be sent at 0x555 (according to
CFI it should be 0x55)

This patch is partially based on the patch from Uwe
(see the "[PATCH 2/4] [RFC][MTD] cfi_probe: remove Intel chip workaround"
thread)

Signed-off-by: Alexey Korolev <akorolev@infradead.org>
Signed-off-by: Alexander Belyakov <abelyako@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_probe.c | 52 +++++-------------------------------
 drivers/mtd/chips/cfi_util.c  | 62 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/mtd/cfi.h       |  9 ++++++-
 3 files changed, 73 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_probe.c b/drivers/mtd/chips/cfi_probe.c
index c418e92e1d92..e706be2ad0cb 100644
--- a/drivers/mtd/chips/cfi_probe.c
+++ b/drivers/mtd/chips/cfi_probe.c
@@ -44,17 +44,14 @@ do { \
 
 #define xip_enable(base, map, cfi) \
 do { \
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \
+	qry_mode_off(base, map, cfi); \
 	xip_allowed(base, map); \
 } while (0)
 
 #define xip_disable_qry(base, map, cfi) \
 do { \
 	xip_disable(); \
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); \
+	qry_mode_on(base, map, cfi); \
 } while (0)
 
 #else
@@ -70,32 +67,6 @@ do { \
    in: interleave,type,mode
    ret: table index, <0 for error
  */
-static int __xipram qry_present(struct map_info *map, __u32 base,
-				struct cfi_private *cfi)
-{
-	int osf = cfi->interleave * cfi->device_type;	// scale factor
-	map_word val[3];
-	map_word qry[3];
-
-	qry[0] = cfi_build_cmd('Q', map, cfi);
-	qry[1] = cfi_build_cmd('R', map, cfi);
-	qry[2] = cfi_build_cmd('Y', map, cfi);
-
-	val[0] = map_read(map, base + osf*0x10);
-	val[1] = map_read(map, base + osf*0x11);
-	val[2] = map_read(map, base + osf*0x12);
-
-	if (!map_word_equal(map, qry[0], val[0]))
-		return 0;
-
-	if (!map_word_equal(map, qry[1], val[1]))
-		return 0;
-
-	if (!map_word_equal(map, qry[2], val[2]))
-		return 0;
-
-	return 1; 	// "QRY" found
-}
 
 static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 				   unsigned long *chip_map, struct cfi_private *cfi)
@@ -116,11 +87,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	}
 
 	xip_disable();
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-
-	if (!qry_present(map,base,cfi)) {
+	if (!qry_mode_on(base, map, cfi)) {
 		xip_enable(base, map, cfi);
 		return 0;
 	}
@@ -144,8 +111,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 		if (qry_present(map, start, cfi)) {
 			/* Eep. This chip also had the QRY marker.
 			 * Is it an alias for the new one? */
-			cfi_send_gen_cmd(0xF0, 0, start, map, cfi, cfi->device_type, NULL);
-			cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL);
+			qry_mode_off(start, map, cfi);
 
 			/* If the QRY marker goes away, it's an alias */
 			if (!qry_present(map, start, cfi)) {
@@ -158,8 +124,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 			 * unfortunate. Stick the new chip in read mode
 			 * too and if it's the same, assume it's an alias. */
 			/* FIXME: Use other modes to do a proper check */
-			cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-			cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL);
+			qry_mode_off(base, map, cfi);
 
 			if (qry_present(map, base, cfi)) {
 				xip_allowed(base, map);
@@ -176,8 +141,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	cfi->numchips++;
 
 	/* Put it back into Read Mode */
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	printk(KERN_INFO "%s: Found %d x%d devices at 0x%x in %d-bit bank\n",
@@ -237,9 +201,7 @@ static int __xipram cfi_chip_setup(struct map_info *map,
 			  cfi_read_query(map, base + 0xf * ofs_factor);
 
 	/* Put it back into Read Mode */
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	/* ... even if it's an Intel chip */
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	/* Do any necessary byteswapping */
diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
index 0ee457018016..8d7553670526 100644
--- a/drivers/mtd/chips/cfi_util.c
+++ b/drivers/mtd/chips/cfi_util.c
@@ -24,6 +24,62 @@
 #include <linux/mtd/cfi.h>
 #include <linux/mtd/compatmac.h>
 
+int __xipram qry_present(struct map_info *map, __u32 base,
+				struct cfi_private *cfi)
+{
+	int osf = cfi->interleave * cfi->device_type;	/* scale factor */
+	map_word val[3];
+	map_word qry[3];
+
+	qry[0] = cfi_build_cmd('Q', map, cfi);
+	qry[1] = cfi_build_cmd('R', map, cfi);
+	qry[2] = cfi_build_cmd('Y', map, cfi);
+
+	val[0] = map_read(map, base + osf*0x10);
+	val[1] = map_read(map, base + osf*0x11);
+	val[2] = map_read(map, base + osf*0x12);
+
+	if (!map_word_equal(map, qry[0], val[0]))
+		return 0;
+
+	if (!map_word_equal(map, qry[1], val[1]))
+		return 0;
+
+	if (!map_word_equal(map, qry[2], val[2]))
+		return 0;
+
+	return 1; 	/* "QRY" found */
+}
+
+int __xipram qry_mode_on(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi)
+{
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
+	if (qry_present(map, base, cfi))
+		return 1;
+	/* QRY not found probably we deal with some odd CFI chips */
+	/* Some revisions of some old Intel chips? */
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
+	if (qry_present(map, base, cfi))
+		return 1;
+	/* ST M29DW chips */
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x555, base, map, cfi, cfi->device_type, NULL);
+	if (qry_present(map, base, cfi))
+		return 1;
+	/* QRY not found */
+	return 0;
+}
+void __xipram qry_mode_off(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi)
+{
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+}
+
 struct cfi_extquery *
 __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* name)
 {
@@ -48,8 +104,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 #endif
 
 	/* Switch it into Query Mode */
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-
+	qry_mode_on(base, map, cfi);
 	/* Read in the Extended Query Table */
 	for (i=0; i<size; i++) {
 		((unsigned char *)extp)[i] =
@@ -57,8 +112,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 	}
 
 	/* Make sure it returns to read mode */
-	cfi_send_gen_cmd(0xf0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xff, 0, base, map, cfi, cfi->device_type, NULL);
+	qry_mode_off(base, map, cfi);
 
 #ifdef CONFIG_MTD_XIP
 	(void) map_read(map, base);
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d6fb115f5a07..3058917d7b92 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -12,6 +12,7 @@
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi_endian.h>
+#include <linux/mtd/xip.h>
 
 #ifdef CONFIG_MTD_CFI_I1
 #define cfi_interleave(cfi) 1
@@ -430,7 +431,6 @@ static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t
 {
 	map_word val;
 	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, cfi_interleave(cfi), type);
-
 	val = cfi_build_cmd(cmd, map, cfi);
 
 	if (prev_val)
@@ -483,6 +483,13 @@ static inline void cfi_udelay(int us)
 	}
 }
 
+int __xipram qry_present(struct map_info *map, __u32 base,
+				struct cfi_private *cfi);
+int __xipram qry_mode_on(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi);
+void __xipram qry_mode_off(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi);
+
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
 struct cfi_fixup {
-- 
cgit v1.2.3


From e93cafe45fd74935e0aca2b79e533f0e3ed9640f Mon Sep 17 00:00:00 2001
From: Anders Grafström <grfstrm@users.sourceforge.net>
Date: Tue, 5 Aug 2008 18:37:41 +0200
Subject: [MTD] [NOR] cfi_cmdset_0001: Timeouts for erase, write and unlock
 operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Timeouts are currently given by the typical operation time times 8.
It works in the general well-behaved case but not when an erase block is
failing. For erase operations, it seems that a failing erase block will
keep the device state machine in erasing state until the vendor
specified maximum timeout period has passed. By this time the driver
would have long since timed out, left erasing state and attempted
further operations which all fail. This patch implements timeouts using
values from the CFI Query structure when available.
The patch also sets a longer timeout for locking operations. The current
value used for locking/unlocking given by 1000000/HZ microseconds is too
short for devices like J3 and J5 Strataflash which have a typical clear
lock-bits time of 0.5 seconds.

Signed-off-by: Anders Grafström <grfstrm@users.sourceforge.net>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_cmdset_0001.c | 52 +++++++++++++++++++++++++++----------
 include/linux/mtd/flashchip.h       |  4 +++
 2 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index d49cbe2738a8..5157e3cb4b9e 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -478,6 +478,28 @@ struct mtd_info *cfi_cmdset_0001(struct map_info *map, int primary)
 		else
 			cfi->chips[i].erase_time = 2000000;
 
+		if (cfi->cfiq->WordWriteTimeoutTyp &&
+		    cfi->cfiq->WordWriteTimeoutMax)
+			cfi->chips[i].word_write_time_max =
+				1<<(cfi->cfiq->WordWriteTimeoutTyp +
+				    cfi->cfiq->WordWriteTimeoutMax);
+		else
+			cfi->chips[i].word_write_time_max = 50000 * 8;
+
+		if (cfi->cfiq->BufWriteTimeoutTyp &&
+		    cfi->cfiq->BufWriteTimeoutMax)
+			cfi->chips[i].buffer_write_time_max =
+				1<<(cfi->cfiq->BufWriteTimeoutTyp +
+				    cfi->cfiq->BufWriteTimeoutMax);
+
+		if (cfi->cfiq->BlockEraseTimeoutTyp &&
+		    cfi->cfiq->BlockEraseTimeoutMax)
+			cfi->chips[i].erase_time_max =
+				1000<<(cfi->cfiq->BlockEraseTimeoutTyp +
+				       cfi->cfiq->BlockEraseTimeoutMax);
+		else
+			cfi->chips[i].erase_time_max = 2000000 * 8;
+
 		cfi->chips[i].ref_point_counter = 0;
 		init_waitqueue_head(&(cfi->chips[i].wq));
 	}
@@ -1012,7 +1034,7 @@ static void __xipram xip_enable(struct map_info *map, struct flchip *chip,
 
 static int __xipram xip_wait_for_operation(
 		struct map_info *map, struct flchip *chip,
-		unsigned long adr, unsigned int chip_op_time )
+		unsigned long adr, unsigned int chip_op_time_max)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	struct cfi_pri_intelext *cfip = cfi->cmdset_priv;
@@ -1021,7 +1043,7 @@ static int __xipram xip_wait_for_operation(
 	flstate_t oldstate, newstate;
 
        	start = xip_currtime();
-	usec = chip_op_time * 8;
+	usec = chip_op_time_max;
 	if (usec == 0)
 		usec = 500000;
 	done = 0;
@@ -1131,8 +1153,8 @@ static int __xipram xip_wait_for_operation(
 #define XIP_INVAL_CACHED_RANGE(map, from, size)  \
 	INVALIDATE_CACHED_RANGE(map, from, size)
 
-#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec) \
-	xip_wait_for_operation(map, chip, cmd_adr, usec)
+#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec, usec_max) \
+	xip_wait_for_operation(map, chip, cmd_adr, usec_max)
 
 #else
 
@@ -1144,7 +1166,7 @@ static int __xipram xip_wait_for_operation(
 static int inval_cache_and_wait_for_operation(
 		struct map_info *map, struct flchip *chip,
 		unsigned long cmd_adr, unsigned long inval_adr, int inval_len,
-		unsigned int chip_op_time)
+		unsigned int chip_op_time, unsigned int chip_op_time_max)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	map_word status, status_OK = CMD(0x80);
@@ -1156,8 +1178,7 @@ static int inval_cache_and_wait_for_operation(
 		INVALIDATE_CACHED_RANGE(map, inval_adr, inval_len);
 	spin_lock(chip->mutex);
 
-	/* set our timeout to 8 times the expected delay */
-	timeo = chip_op_time * 8;
+	timeo = chip_op_time_max;
 	if (!timeo)
 		timeo = 500000;
 	reset_timeo = timeo;
@@ -1217,8 +1238,8 @@ static int inval_cache_and_wait_for_operation(
 
 #endif
 
-#define WAIT_TIMEOUT(map, chip, adr, udelay) \
-	INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay);
+#define WAIT_TIMEOUT(map, chip, adr, udelay, udelay_max) \
+	INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay, udelay_max);
 
 
 static int do_point_onechip (struct map_info *map, struct flchip *chip, loff_t adr, size_t len)
@@ -1452,7 +1473,8 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, adr,
 				   adr, map_bankwidth(map),
-				   chip->word_write_time);
+				   chip->word_write_time,
+				   chip->word_write_time_max);
 	if (ret) {
 		xip_enable(map, chip, adr);
 		printk(KERN_ERR "%s: word write error (status timeout)\n", map->name);
@@ -1623,7 +1645,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 
 	chip->state = FL_WRITING_TO_BUFFER;
 	map_write(map, write_cmd, cmd_adr);
-	ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0);
+	ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0, 0);
 	if (ret) {
 		/* Argh. Not ready for write to buffer */
 		map_word Xstatus = map_read(map, cmd_adr);
@@ -1692,7 +1714,8 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, cmd_adr,
 				   initial_adr, initial_len,
-				   chip->buffer_write_time);
+				   chip->buffer_write_time,
+				   chip->buffer_write_time_max);
 	if (ret) {
 		map_write(map, CMD(0x70), cmd_adr);
 		chip->state = FL_STATUS;
@@ -1827,7 +1850,8 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, adr,
 				   adr, len,
-				   chip->erase_time);
+				   chip->erase_time,
+				   chip->erase_time_max);
 	if (ret) {
 		map_write(map, CMD(0x70), adr);
 		chip->state = FL_STATUS;
@@ -2006,7 +2030,7 @@ static int __xipram do_xxlock_oneblock(struct map_info *map, struct flchip *chip
 	 */
 	udelay = (!extp || !(extp->FeatureSupport & (1 << 5))) ? 1000000/HZ : 0;
 
-	ret = WAIT_TIMEOUT(map, chip, adr, udelay);
+	ret = WAIT_TIMEOUT(map, chip, adr, udelay, udelay * 100);
 	if (ret) {
 		map_write(map, CMD(0x70), adr);
 		chip->state = FL_STATUS;
diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index 08dd131301c1..d4f38c5fd44e 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -73,6 +73,10 @@ struct flchip {
 	int buffer_write_time;
 	int erase_time;
 
+	int word_write_time_max;
+	int buffer_write_time_max;
+	int erase_time_max;
+
 	void *priv;
 };
 
-- 
cgit v1.2.3


From c314dfdc358847eef0fc07ec8682e1acc8cadd00 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 7 Aug 2008 11:55:07 +0100
Subject: [MTD] [NOR] Rename and export new cfi_qry_*() functions

They need to be exported, so let's give them less generic-sounding names
while we're at it.

Original export patch, along with the suggestion about the nomenclature,
from Stephen Rothwell.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_probe.c | 20 ++++++++++----------
 drivers/mtd/chips/cfi_util.c  | 26 +++++++++++++++-----------
 include/linux/mtd/cfi.h       | 12 ++++++------
 3 files changed, 31 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_probe.c b/drivers/mtd/chips/cfi_probe.c
index e706be2ad0cb..e63e6749429a 100644
--- a/drivers/mtd/chips/cfi_probe.c
+++ b/drivers/mtd/chips/cfi_probe.c
@@ -44,14 +44,14 @@ do { \
 
 #define xip_enable(base, map, cfi) \
 do { \
-	qry_mode_off(base, map, cfi); \
+	cfi_qry_mode_off(base, map, cfi);		\
 	xip_allowed(base, map); \
 } while (0)
 
 #define xip_disable_qry(base, map, cfi) \
 do { \
 	xip_disable(); \
-	qry_mode_on(base, map, cfi); \
+	cfi_qry_mode_on(base, map, cfi); \
 } while (0)
 
 #else
@@ -87,7 +87,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	}
 
 	xip_disable();
-	if (!qry_mode_on(base, map, cfi)) {
+	if (!cfi_qry_mode_on(base, map, cfi)) {
 		xip_enable(base, map, cfi);
 		return 0;
 	}
@@ -108,13 +108,13 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
  		start = i << cfi->chipshift;
 		/* This chip should be in read mode if it's one
 		   we've already touched. */
-		if (qry_present(map, start, cfi)) {
+		if (cfi_qry_present(map, start, cfi)) {
 			/* Eep. This chip also had the QRY marker.
 			 * Is it an alias for the new one? */
-			qry_mode_off(start, map, cfi);
+			cfi_qry_mode_off(start, map, cfi);
 
 			/* If the QRY marker goes away, it's an alias */
-			if (!qry_present(map, start, cfi)) {
+			if (!cfi_qry_present(map, start, cfi)) {
 				xip_allowed(base, map);
 				printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n",
 				       map->name, base, start);
@@ -124,9 +124,9 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 			 * unfortunate. Stick the new chip in read mode
 			 * too and if it's the same, assume it's an alias. */
 			/* FIXME: Use other modes to do a proper check */
-			qry_mode_off(base, map, cfi);
+			cfi_qry_mode_off(base, map, cfi);
 
-			if (qry_present(map, base, cfi)) {
+			if (cfi_qry_present(map, base, cfi)) {
 				xip_allowed(base, map);
 				printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n",
 				       map->name, base, start);
@@ -141,7 +141,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	cfi->numchips++;
 
 	/* Put it back into Read Mode */
-	qry_mode_off(base, map, cfi);
+	cfi_qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	printk(KERN_INFO "%s: Found %d x%d devices at 0x%x in %d-bit bank\n",
@@ -201,7 +201,7 @@ static int __xipram cfi_chip_setup(struct map_info *map,
 			  cfi_read_query(map, base + 0xf * ofs_factor);
 
 	/* Put it back into Read Mode */
-	qry_mode_off(base, map, cfi);
+	cfi_qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	/* Do any necessary byteswapping */
diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
index 8d7553670526..34d40e25d312 100644
--- a/drivers/mtd/chips/cfi_util.c
+++ b/drivers/mtd/chips/cfi_util.c
@@ -24,8 +24,8 @@
 #include <linux/mtd/cfi.h>
 #include <linux/mtd/compatmac.h>
 
-int __xipram qry_present(struct map_info *map, __u32 base,
-				struct cfi_private *cfi)
+int __xipram cfi_qry_present(struct map_info *map, __u32 base,
+			     struct cfi_private *cfi)
 {
 	int osf = cfi->interleave * cfi->device_type;	/* scale factor */
 	map_word val[3];
@@ -50,35 +50,39 @@ int __xipram qry_present(struct map_info *map, __u32 base,
 
 	return 1; 	/* "QRY" found */
 }
+EXPORT_SYMBOL_GPL(cfi_qry_present);
 
-int __xipram qry_mode_on(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi)
+int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map,
+			     struct cfi_private *cfi)
 {
 	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-	if (qry_present(map, base, cfi))
+	if (cfi_qry_present(map, base, cfi))
 		return 1;
 	/* QRY not found probably we deal with some odd CFI chips */
 	/* Some revisions of some old Intel chips? */
 	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-	if (qry_present(map, base, cfi))
+	if (cfi_qry_present(map, base, cfi))
 		return 1;
 	/* ST M29DW chips */
 	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x98, 0x555, base, map, cfi, cfi->device_type, NULL);
-	if (qry_present(map, base, cfi))
+	if (cfi_qry_present(map, base, cfi))
 		return 1;
 	/* QRY not found */
 	return 0;
 }
-void __xipram qry_mode_off(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi)
+EXPORT_SYMBOL_GPL(cfi_qry_mode_on);
+
+void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map,
+			       struct cfi_private *cfi)
 {
 	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
 }
+EXPORT_SYMBOL_GPL(cfi_qry_mode_off);
 
 struct cfi_extquery *
 __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* name)
@@ -104,7 +108,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 #endif
 
 	/* Switch it into Query Mode */
-	qry_mode_on(base, map, cfi);
+	cfi_qry_mode_on(base, map, cfi);
 	/* Read in the Extended Query Table */
 	for (i=0; i<size; i++) {
 		((unsigned char *)extp)[i] =
@@ -112,7 +116,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 	}
 
 	/* Make sure it returns to read mode */
-	qry_mode_off(base, map, cfi);
+	cfi_qry_mode_off(base, map, cfi);
 
 #ifdef CONFIG_MTD_XIP
 	(void) map_read(map, base);
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 3058917d7b92..ee5124ec319e 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -483,12 +483,12 @@ static inline void cfi_udelay(int us)
 	}
 }
 
-int __xipram qry_present(struct map_info *map, __u32 base,
-				struct cfi_private *cfi);
-int __xipram qry_mode_on(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi);
-void __xipram qry_mode_off(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi);
+int __xipram cfi_qry_present(struct map_info *map, __u32 base,
+			     struct cfi_private *cfi);
+int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map,
+			     struct cfi_private *cfi);
+void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map,
+			       struct cfi_private *cfi);
 
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
-- 
cgit v1.2.3


From bb0eb217c980d50c45f3e793b4dcc70ab9ee820d Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 12 Aug 2008 12:40:50 +0300
Subject: [MTD] Define and use MTD_FAIL_ADDR_UNKNOWN instead of 0xffffffff

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/mtdconcat.c            | 4 ++--
 drivers/mtd/mtdpart.c              | 4 ++--
 drivers/mtd/nand/nand_base.c       | 2 +-
 drivers/mtd/onenand/onenand_base.c | 2 +-
 fs/jffs2/erase.c                   | 4 ++--
 include/linux/mtd/mtd.h            | 4 +++-
 6 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 2972a5edb73d..789842d0e6f2 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -444,7 +444,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr)
 			return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* make a local copy of instr to avoid modifying the caller's struct */
 	erase = kmalloc(sizeof (struct erase_info), GFP_KERNEL);
@@ -493,7 +493,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr)
 			/* sanity check: should never happen since
 			 * block alignment has been checked above */
 			BUG_ON(err == -EINVAL);
-			if (erase->fail_addr != 0xffffffff)
+			if (erase->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 				instr->fail_addr = erase->fail_addr + offset;
 			break;
 		}
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index edb90b58a9b1..8e77e36e75ee 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -214,7 +214,7 @@ static int part_erase(struct mtd_info *mtd, struct erase_info *instr)
 	instr->addr += part->offset;
 	ret = part->master->erase(part->master, instr);
 	if (ret) {
-		if (instr->fail_addr != 0xffffffff)
+		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
 	}
@@ -226,7 +226,7 @@ void mtd_erase_callback(struct erase_info *instr)
 	if (instr->mtd->erase == part_erase) {
 		struct mtd_part *part = PART(instr->mtd);
 
-		if (instr->fail_addr != 0xffffffff)
+		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
 	}
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index d1129bae6c27..582280560c89 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2042,7 +2042,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
 		return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* Grab the lock and see if the device is available */
 	nand_get_device(chip, mtd, FL_ERASING);
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 926cf3a4135d..90ed319f26e6 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1794,7 +1794,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 		return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* Grab the lock and see if the device is available */
 	onenand_get_device(mtd, FL_ERASING);
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dddb2a6c9e2c..259461b910af 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	instr->len = c->sector_size;
 	instr->callback = jffs2_erase_callback;
 	instr->priv = (unsigned long)(&instr[1]);
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
 	/* For NAND, if the failure did not occur at the device level for a
 	   specific physical page, don't bother updating the bad block table. */
-	if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) {
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
 		/* We had a device-level failure to erase.  Let's see if we've
 		   failed too many times. */
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 922636548558..eae26bb6430a 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -25,8 +25,10 @@
 #define MTD_ERASE_DONE          0x08
 #define MTD_ERASE_FAILED        0x10
 
+#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff
+
 /* If the erase fails, fail_addr might indicate exactly which block failed.  If
-   fail_addr = 0xffffffff, the failure was not at the device level or was not
+   fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not
    specific to any particular block. */
 struct erase_info {
 	struct mtd_info *mtd;
-- 
cgit v1.2.3


From 17c1d2be28e485c0c8b09661db39d5bf2605069d Mon Sep 17 00:00:00 2001
From: Alexey Korolev <akorolev@infradead.org>
Date: Wed, 20 Aug 2008 22:32:08 +0100
Subject: [MTD] [NAND] Fix missing kernel-doc

[Reported by Randy Dunlap]

Signed-off-by: Alexey Korolev <akorolev@infradead.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/nand_base.c | 6 +++---
 drivers/mtd/nand/nand_ecc.c  | 6 +++---
 include/linux/mtd/nand.h     | 1 +
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 582280560c89..d303db39c48d 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -801,9 +801,9 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
  * nand_read_subpage - [REPLACABLE] software ecc based sub-page read function
  * @mtd:	mtd info structure
  * @chip:	nand chip info structure
- * @dataofs	offset of requested data within the page
- * @readlen	data length
- * @buf:	buffer to store read data
+ * @data_offs:	offset of requested data within the page
+ * @readlen:	data length
+ * @bufpoi:	buffer to store read data
  */
 static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, uint32_t data_offs, uint32_t readlen, uint8_t *bufpoi)
 {
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index d99e569e999f..fd19787c9ce7 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -150,8 +150,8 @@ static const char addressbits[256] = {
 /**
  * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block
  * @mtd:	MTD block structure (unused)
- * @dat:	raw data
- * @ecc_code:	buffer for ECC
+ * @buf:	input buffer with raw data
+ * @code:	output buffer with ECC
  */
 int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf,
 		       unsigned char *code)
@@ -390,7 +390,7 @@ EXPORT_SYMBOL(nand_calculate_ecc);
 /**
  * nand_correct_data - [NAND Interface] Detect and correct bit error(s)
  * @mtd:	MTD block structure (unused)
- * @dat:	raw data read from the chip
+ * @buf:	raw data read from the chip
  * @read_ecc:	ECC from the chip
  * @calc_ecc:	the ECC calculated from raw data
  *
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 81774e5facf4..733d3f3b4eb8 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -248,6 +248,7 @@ struct nand_hw_control {
  * @read_page_raw:	function to read a raw page without ECC
  * @write_page_raw:	function to write a raw page without ECC
  * @read_page:	function to read a page according to the ecc generator requirements
+ * @read_subpage:	function to read parts of the page covered by ECC.
  * @write_page:	function to write a page according to the ecc generator requirements
  * @read_oob:	function to read chip OOB data
  * @write_oob:	function to write chip OOB data
-- 
cgit v1.2.3


From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:28 -0700
Subject: clocksource: keep track of original clocksource frequency

The clocksource frequency is represented by
clocksource->mult/2^(clocksource->shift).  Currently, when NTP makes
adjustments to the clock frequency, they are made directly to the mult
value.

This has the drawback that once changed, we cannot know what the original
mult value was, or how much adjustment has been applied.

This property causes problems in calculating proper ntp intervals when
switching back and forth between clocksources.

This patch separates the current mult value into a mult and mult_orig
pair.  The mult_orig value stays constant, while the ntp clocksource
adjustments are done only to the mult value.

This allows for correct ntp interval calculation and additionally lays the
groundwork for a new notion of time, what I'm calling the monotonic-raw
time, which is introduced in a following patch.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h | 11 +++++++----
 kernel/time/clocksource.c   |  3 +++
 kernel/time/jiffies.c       |  1 +
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 55e434feec99..f0a7fb984413 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -45,7 +45,8 @@ struct clocksource;
  * @read:		returns a cycle value
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
- * @mult:		cycle to nanosecond multiplier
+ * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
+ * @mult_orig:		cycle to nanosecond multiplier (unadjusted by NTP)
  * @shift:		cycle to nanosecond divisor (power of two)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
@@ -63,6 +64,7 @@ struct clocksource {
 	cycle_t (*read)(void);
 	cycle_t mask;
 	u32 mult;
+	u32 mult_orig;
 	u32 shift;
 	unsigned long flags;
 	cycle_t (*vread)(void);
@@ -201,16 +203,17 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 {
 	u64 tmp;
 
-	/* XXX - All of this could use a whole lot of optimization */
+	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = length_nsec;
 	tmp <<= c->shift;
-	tmp += c->mult/2;
-	do_div(tmp, c->mult);
+	tmp += c->mult_orig/2;
+	do_div(tmp, c->mult_orig);
 
 	c->cycle_interval = (cycle_t)tmp;
 	if (c->cycle_interval == 0)
 		c->cycle_interval = 1;
 
+	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
 }
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4acf993b..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
 	unsigned long flags;
 	int ret;
 
+	/* save mult_orig on registration */
+	c->mult_orig = c->mult;
+
 	spin_lock_irqsave(&clocksource_lock, flags);
 	ret = clocksource_enqueue(c);
 	if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
 	.read		= jiffies_read,
 	.mask		= 0xffffffff, /*32bits*/
 	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+	.mult_orig	= NSEC_PER_JIFFY << JIFFIES_SHIFT,
 	.shift		= JIFFIES_SHIFT,
 };
 
-- 
cgit v1.2.3


From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:30 -0700
Subject: clocksource: introduce CLOCK_MONOTONIC_RAW

In talking with Josip Loncaric about his work on clock synchronization (see
btime.sf.net), he mentioned that for really close synchronization, it is
useful to have access to "hardware time", that is a notion of time that is
not in any way adjusted by the clock slewing done to keep close time sync.

Part of the issue is if we are using the kernel's ntp adjusted
representation of time in order to measure how we should correct time, we
can run into what Paul McKenney aptly described as "Painting a road using
the lines we're painting as the guide".

I had been thinking of a similar problem, and was trying to come up with a
way to give users access to a purely hardware based time representation
that avoided users having to know the underlying frequency and mask values
needed to deal with the wide variety of possible underlying hardware
counters.

My solution is to introduce CLOCK_MONOTONIC_RAW.  This exposes a
nanosecond based time value, that increments starting at bootup and has no
frequency adjustments made to it whatsoever.

The time is accessed from userspace via the posix_clock_gettime() syscall,
passing CLOCK_MONOTONIC_RAW as the clock_id.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h |  3 +++
 include/linux/time.h        |  2 ++
 kernel/posix-timers.c       | 15 +++++++++++++++
 kernel/time/timekeeping.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f0a7fb984413..f88d32f8ff7c 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -79,6 +79,7 @@ struct clocksource {
 	/* timekeeping specific data, ignore */
 	cycle_t cycle_interval;
 	u64	xtime_interval;
+	u32	raw_interval;
 	/*
 	 * Second part is written at each timer interrupt
 	 * Keep it in a different cache line to dirty no
@@ -87,6 +88,7 @@ struct clocksource {
 	cycle_t cycle_last ____cacheline_aligned_in_smp;
 	u64 xtime_nsec;
 	s64 error;
+	struct timespec raw_time;
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 	/* Watchdog related data, used by the framework */
@@ -215,6 +217,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 
 	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
+	c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
 }
 
 
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82e..205f974b9ebf 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -117,6 +117,7 @@ extern int do_setitimer(int which, struct itimerval *value,
 extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
+extern void getrawmonotonic(struct timespec *ts);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
 
@@ -214,6 +215,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC			1
 #define CLOCK_PROCESS_CPUTIME_ID	2
 #define CLOCK_THREAD_CPUTIME_ID		3
+#define CLOCK_MONOTONIC_RAW		4
 
 /*
  * The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..d3c66b53dff6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -222,6 +222,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+	getrawmonotonic(tp);
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
 		.clock_get = posix_ktime_get_ts,
 		.clock_set = do_posix_clock_nosettime,
 	};
+	struct k_clock clock_monotonic_raw = {
+		.clock_getres = hrtimer_get_res,
+		.clock_get = posix_get_monotonic_raw,
+		.clock_set = do_posix_clock_nosettime,
+	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 83d3555a6998..5099c95b8aa2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -75,6 +75,9 @@ static void clocksource_forward_now(void)
 
 	nsec = cyc2ns(clock, cycle_delta);
 	timespec_add_ns(&xtime, nsec);
+
+	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+	clock->raw_time.tv_nsec += nsec;
 }
 
 /**
@@ -183,6 +186,8 @@ static void change_clocksource(void)
 
 	clocksource_forward_now();
 
+	new->raw_time = clock->raw_time;
+
 	clock = new;
 	clock->cycle_last = 0;
 	clock->cycle_last = clocksource_read(new);
@@ -204,6 +209,39 @@ static inline void clocksource_forward_now(void) { }
 static inline void change_clocksource(void) { }
 #endif
 
+/**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+	unsigned long seq;
+	s64 nsecs;
+	cycle_t cycle_now, cycle_delta;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		/* read clocksource: */
+		cycle_now = clocksource_read(clock);
+
+		/* calculate the delta since the last update_wall_time: */
+		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+		/* convert to nanoseconds: */
+		nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+
+		*ts = clock->raw_time;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
 /**
  * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
  */
@@ -466,6 +504,12 @@ void update_wall_time(void)
 			second_overflow();
 		}
 
+		clock->raw_time.tv_nsec += clock->raw_interval;
+		if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+			clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+			clock->raw_time.tv_sec++;
+		}
+
 		/* accumulate error between NTP and clock interval */
 		clock->error += tick_length;
 		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
-- 
cgit v1.2.3


From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Wed, 20 Aug 2008 16:46:08 -0700
Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup

Thanks to the review by Michael Kerrisk a bug in the recent
ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was
inadvertently set by it.  This fixes this by making the adjtime code
more separate from the ntp_adjtime code (both of which really want to
be separate syscalls).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/timex.h |  9 +++++-
 kernel/time/ntp.c     | 76 +++++++++++++++++++++++++++------------------------
 2 files changed, 48 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index fc6035d29d56..c00bcdd3ae42 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -141,8 +141,15 @@ struct timex {
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
+
+#ifdef __KERNEL__
+#define ADJ_ADJTIME		0x8000	/* switch between adjtime/adjtimex modes */
+#define ADJ_OFFSET_SINGLESHOT	0x0001	/* old-fashioned adjtime */
+#define ADJ_OFFSET_READONLY	0x2000	/* read-only adjtime */
+#else
 #define ADJ_OFFSET_SINGLESHOT	0x8001	/* old-fashioned adjtime */
-#define ADJ_OFFSET_SS_READ	0xa001  /* read-only adjtime */
+#define ADJ_OFFSET_SS_READ	0xa001	/* read-only adjtime */
+#endif
 
 /* xntp 3.4 compatibility names */
 #define MOD_OFFSET	ADJ_OFFSET
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..c6921aa1a42a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { }
 int do_adjtimex(struct timex *txc)
 {
 	struct timespec ts;
-	long save_adjust, sec;
 	int result;
 
-	/* In order to modify anything, you gotta be super-user! */
-	if (txc->modes && !capable(CAP_SYS_TIME))
-		return -EPERM;
-
-	/* Now we validate the data before disabling interrupts */
-
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
+	/* Validate the data before disabling interrupts */
+	if (txc->modes & ADJ_ADJTIME) {
 		/* singleshot must not be used with any other mode bits */
-		if (txc->modes & ~ADJ_OFFSET_SS_READ)
+		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
 			return -EINVAL;
+		if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+		    !capable(CAP_SYS_TIME))
+			return -EPERM;
+	} else {
+		/* In order to modify anything, you gotta be super-user! */
+		 if (txc->modes && !capable(CAP_SYS_TIME))
+			return -EPERM;
+
+		/* if the quartz is off by more than 10% something is VERY wrong! */
+		if (txc->modes & ADJ_TICK &&
+		    (txc->tick <  900000/USER_HZ ||
+		     txc->tick > 1100000/USER_HZ))
+				return -EINVAL;
+
+		if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+			hrtimer_cancel(&leap_timer);
 	}
 
-	/* if the quartz is off by more than 10% something is VERY wrong ! */
-	if (txc->modes & ADJ_TICK)
-		if (txc->tick <  900000/USER_HZ ||
-		    txc->tick > 1100000/USER_HZ)
-			return -EINVAL;
-
-	if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
-		hrtimer_cancel(&leap_timer);
 	getnstimeofday(&ts);
 
 	write_seqlock_irq(&xtime_lock);
 
-	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_adjust;
-
 	/* If there are input parameters, then process them */
+	if (txc->modes & ADJ_ADJTIME) {
+		long save_adjust = time_adjust;
+
+		if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+			/* adjtime() is independent from ntp_adjtime() */
+			time_adjust = txc->offset;
+			ntp_update_frequency();
+		}
+		txc->offset = save_adjust;
+		goto adj_done;
+	}
 	if (txc->modes) {
+		long sec;
+
 		if (txc->modes & ADJ_STATUS) {
 			if ((time_status & STA_PLL) &&
 			    !(txc->status & STA_PLL)) {
@@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc)
 		if (txc->modes & ADJ_TAI && txc->constant > 0)
 			time_tai = txc->constant;
 
-		if (txc->modes & ADJ_OFFSET) {
-			if (txc->modes == ADJ_OFFSET_SINGLESHOT)
-				/* adjtime() is independent from ntp_adjtime() */
-				time_adjust = txc->offset;
-			else
-				ntp_update_offset(txc->offset);
-		}
+		if (txc->modes & ADJ_OFFSET)
+			ntp_update_offset(txc->offset);
 		if (txc->modes & ADJ_TICK)
 			tick_usec = txc->tick;
 
@@ -389,19 +396,16 @@ int do_adjtimex(struct timex *txc)
 			ntp_update_frequency();
 	}
 
+	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+				  NTP_SCALE_SHIFT);
+	if (!(time_status & STA_NANO))
+		txc->offset /= NSEC_PER_USEC;
+
+adj_done:
 	result = time_state;	/* mostly `TIME_OK' */
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
-	if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
-	    (txc->modes == ADJ_OFFSET_SS_READ))
-		txc->offset = save_adjust;
-	else {
-		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
-					  NTP_SCALE_SHIFT);
-		if (!(time_status & STA_NANO))
-			txc->offset /= NSEC_PER_USEC;
-	}
 	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
 					 (s64)PPM_SCALE_INV,
 					 NTP_SCALE_SHIFT);
-- 
cgit v1.2.3


From 942ed161944b3476639916cf544e6975b29c985a Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@srcf.ucam.org>
Date: Tue, 26 Aug 2008 21:09:59 +0100
Subject: power_supply: Add function to return system-wide power state

Certain drivers benefit from knowing whether the system is on ac or
battery, for instance when determining which backlight registers to
read. This adds a simple call to determine whether there's an online
power supply other than any batteries.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 drivers/power/power_supply_core.c | 25 +++++++++++++++++++++++++
 include/linux/power_supply.h      |  6 ++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index cb1ccb472921..f44f5b608f6a 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -87,6 +87,30 @@ int power_supply_am_i_supplied(struct power_supply *psy)
 	return error;
 }
 
+static int __power_supply_is_system_supplied(struct device *dev, void *data)
+{
+	union power_supply_propval ret = {0,};
+	struct power_supply *psy = dev_get_drvdata(dev);
+
+	if (psy->type != POWER_SUPPLY_TYPE_BATTERY) {
+		if (psy->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &ret))
+			return 0;
+		if (ret.intval)
+			return ret.intval;
+	}
+	return 0;
+}
+
+int power_supply_is_system_supplied(void)
+{
+	int error;
+
+	error = class_for_each_device(power_supply_class, NULL, NULL,
+				      __power_supply_is_system_supplied);
+
+	return error;
+}
+
 int power_supply_register(struct device *parent, struct power_supply *psy)
 {
 	int rc = 0;
@@ -148,6 +172,7 @@ static void __exit power_supply_class_exit(void)
 
 EXPORT_SYMBOL_GPL(power_supply_changed);
 EXPORT_SYMBOL_GPL(power_supply_am_i_supplied);
+EXPORT_SYMBOL_GPL(power_supply_is_system_supplied);
 EXPORT_SYMBOL_GPL(power_supply_register);
 EXPORT_SYMBOL_GPL(power_supply_unregister);
 
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index ea96ead1d39d..f9348cba6dc1 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -165,6 +165,12 @@ struct power_supply_info {
 extern void power_supply_changed(struct power_supply *psy);
 extern int power_supply_am_i_supplied(struct power_supply *psy);
 
+#if defined(CONFIG_POWER_SUPPLY) || defined(CONFIG_POWER_SUPPLY_MODULE)
+extern int power_supply_is_system_supplied(void);
+#else
+static inline int power_supply_is_system_supplied(void) { return -ENOSYS; }
+#endif
+
 extern int power_supply_register(struct device *parent,
 				 struct power_supply *psy);
 extern void power_supply_unregister(struct power_supply *psy);
-- 
cgit v1.2.3


From 7bb67439bf6bd3782f07f1d7be1e63406453d5de Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 31 Aug 2008 08:05:58 -0700
Subject: select: Introduce a hrtimeout function

This patch adds a schedule_hrtimeout() function, to be used by select() and
poll() in a later patch. This function works similarly to schedule_timeout()
in most ways, but takes a ktime_t expiry time rather than jiffies.

With a lot of contributions/fixes from Thomas

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  2 ++
 kernel/hrtimer.c        | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cbb..becd17db1a1a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -346,6 +346,8 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
 
+extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
+
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
 extern void hrtimer_run_pending(void);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..782137dc755f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1678,3 +1678,68 @@ void __init hrtimers_init(void)
 #endif
 }
 
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires:	timeout value (ktime_t)
+ * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+			       const enum hrtimer_mode mode)
+{
+	struct hrtimer_sleeper t;
+
+	/*
+	 * Optimize when a zero timeout value is given. It does not
+	 * matter whether this is an absolute or a relative time.
+	 */
+	if (expires && !expires->tv64) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	/*
+	 * A NULL parameter means "inifinte"
+	 */
+	if (!expires) {
+		schedule();
+		__set_current_state(TASK_RUNNING);
+		return -EINTR;
+	}
+
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+	t.timer.expires = *expires;
+
+	hrtimer_init_sleeper(&t, current);
+
+	hrtimer_start(&t.timer, t.timer.expires, mode);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_current_state(TASK_RUNNING);
+
+	return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
-- 
cgit v1.2.3


From df0cc0539b4127bd02f64de2c335b4af1fdb3845 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:09:53 -0700
Subject: select: add a timespec_add_safe() function

For the select() rework, it's important to be able to add timespec
structures in an overflow-safe manner.

This patch adds a timespec_add_safe() function for this which is similar in
operation to ktime_add_safe(), but works on a struct timespec.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/time.h |  4 ++++
 kernel/time.c        | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82e..726976478480 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,6 +38,8 @@ struct timezone {
 #define NSEC_PER_SEC	1000000000L
 #define FSEC_PER_SEC	1000000000000000L
 
+#define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
+
 static inline int timespec_equal(const struct timespec *a,
                                  const struct timespec *b)
 {
@@ -72,6 +74,8 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int min, const unsigned int sec);
 
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+extern struct timespec timespec_add_safe(const struct timespec lhs,
+					 const struct timespec rhs);
 
 /*
  * sub = lhs - rhs, in normalized form
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fac..d63a4336fad6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
 #endif
 
 EXPORT_SYMBOL(jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+				  const struct timespec rhs)
+{
+	struct timespec res;
+
+	set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+				lhs.tv_nsec + rhs.tv_nsec);
+
+	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+		res.tv_sec = TIME_T_MAX;
+
+	return res;
+}
-- 
cgit v1.2.3


From b773ad40aca5bd755ba886620842f16e8fef6d75 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:16:57 -0700
Subject: select: add poll_select_set_timeout() and
 poll_select_copy_remaining() helpers

This patch adds 2 helpers that will be used for the hrtimer based select/poll:

poll_select_set_timeout() is a helper that takes a timeout (as a second, nanosecond
pair) and turns that into a "struct timespec" that represents the absolute end time.
This is a common operation in the many select() and poll() variants and needs
various common sanity checks.

poll_select_copy_remaining() is a helper that takes care of copying the remaining
time to userspace, as select(), pselect() and ppoll() do. This function comes in
both a natural and a compat implementation (due to data structure differences).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/compat.c          | 51 +++++++++++++++++++++++++++++++++++
 fs/select.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/poll.h |  2 ++
 3 files changed, 128 insertions(+)

(limited to 'include/linux')

diff --git a/fs/compat.c b/fs/compat.c
index 075d0509970d..424767c954a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1436,6 +1436,57 @@ out_ret:
 
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
diff --git a/fs/select.c b/fs/select.c
index da0e88201c3a..1180a6207789 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -130,6 +130,81 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to:		pointer to timespec variable for the final timeout
+ * @sec:	seconds (from user space)
+ * @nsec:	nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&ts))
+		return -EINVAL;
+
+	/* Optimize for the zero timeout value here */
+	if (!sec && !nsec) {
+		to->tv_sec = to->tv_nsec = 0;
+	} else {
+		ktime_get_ts(to);
+		*to = timespec_add_safe(*to, ts);
+	}
+	return 0;
+}
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec rts;
+	struct timeval rtv;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&rts);
+	rts = timespec_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
+
+	if (timeval) {
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+
+	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+		return ret;
+
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+
+
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
diff --git a/include/linux/poll.h b/include/linux/poll.h
index ef453828877a..f65de5128a9e 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -120,6 +120,8 @@ extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			   fd_set __user *exp, s64 *timeout);
 
+extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+
 #endif /* KERNEL */
 
 #endif /* _LINUX_POLL_H */
-- 
cgit v1.2.3


From be5dad20a55e054a35dac7f6f5f184dc72b379b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:19:15 -0700
Subject: select: add a poll specific struct to the restart_block union

With hrtimer poll/select, the signal restart data is no longer a single
long representing a jiffies count; it becomes a second/nanosecond pair
that also needs to encode whether there was a timeout at all.

This patch adds a struct to the restart_block union for this purpose.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/thread_info.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 38a56477f27a..e6b820f8b56b 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -38,6 +38,14 @@ struct restart_block {
 #endif
 			u64 expires;
 		} nanosleep;
+		/* For poll */
+		struct {
+			struct pollfd __user *ufds;
+			int nfds;
+			int has_timeout;
+			unsigned long tv_sec;
+			unsigned long tv_nsec;
+		} poll;
 	};
 };
 
-- 
cgit v1.2.3


From 8ff3e8e85fa6c312051134b3953e397feb639f51 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 31 Aug 2008 08:26:40 -0700
Subject: select: switch select() and poll() over to hrtimers

With lots of help, input and cleanups from Thomas Gleixner

This patch switches select() and poll() over to hrtimers.

The core of the patch is replacing the "s64 timeout" with a
"struct timespec end_time" in all the plumbing.

But most of the diffstat comes from using the just introduced helpers:
	poll_select_set_timeout
	poll_select_copy_remaining
	timespec_add_safe
which make manipulating the timespec easier and less error-prone.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/compat.c          | 136 ++++----------------------
 fs/select.c          | 263 +++++++++++++++++----------------------------------
 include/linux/poll.h |   6 +-
 3 files changed, 111 insertions(+), 294 deletions(-)

(limited to 'include/linux')

diff --git a/fs/compat.c b/fs/compat.c
index 424767c954a0..133ed7f5d681 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1568,7 +1568,8 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int compat_core_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -1615,7 +1616,7 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -1641,7 +1642,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
 	int ret;
 
@@ -1649,43 +1650,14 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct compat_timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -1698,15 +1670,16 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1721,51 +1694,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	do {
-		if (tsp) {
-			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
-				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-				timeout += ts.tv_sec * (unsigned long)HZ;
-				ts.tv_sec = 0;
-				ts.tv_nsec = 0;
-			} else {
-				ts.tv_sec -= MAX_SELECT_SECONDS;
-				timeout = MAX_SELECT_SECONDS * HZ;
-			}
-		}
-
-		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
-
-	if (tsp) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-
-		rts.tv_sec = timeout / HZ;
-		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
-		if (rts.tv_nsec >= NSEC_PER_SEC) {
-			rts.tv_sec++;
-			rts.tv_nsec -= NSEC_PER_SEC;
-		}
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -1810,18 +1740,16 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct compat_timespec ts;
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* We assume that ts.tv_sec is always lower than
-		   the number of seconds that can be expressed in
-		   an s64. Otherwise the compiler bitches at us */
-		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-		timeout += ts.tv_sec * HZ;
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -1835,7 +1763,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -1853,31 +1781,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-					1000;
-		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/fs/select.c b/fs/select.c
index 1180a6207789..f6dceb56793f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -24,6 +24,7 @@
 #include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
+#include <linux/hrtimer.h>
 
 #include <asm/uaccess.h>
 
@@ -203,8 +204,6 @@ sticky:
 	return ret;
 }
 
-
-
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
@@ -257,11 +256,12 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, s64 *timeout)
+int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
+	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
 	poll_table *wait;
-	int retval, i;
+	int retval, i, timed_out = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -273,12 +273,14 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!*timeout)
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		wait = NULL;
+		timed_out = 1;
+	}
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -334,27 +336,25 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 			cond_resched();
 		}
 		wait = NULL;
-		if (retval || !*timeout || signal_pending(current))
+		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
 			retval = table.error;
 			break;
 		}
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
-			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -375,7 +375,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout)
+			   fd_set __user *exp, struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -426,7 +426,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -452,7 +452,7 @@ out_nofds:
 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			fd_set __user *exp, struct timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct timeval tv;
 	int ret;
 
@@ -460,43 +460,14 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -506,25 +477,17 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		fd_set __user *exp, struct timespec __user *tsp,
 		const sigset_t __user *sigmask, size_t sigsetsize)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
 	}
 
 	if (sigmask) {
@@ -538,32 +501,8 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tsp) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -649,18 +588,20 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-		   struct poll_wqueues *wait, s64 *timeout)
+		   struct poll_wqueues *wait, struct timespec *end_time)
 {
-	int count = 0;
 	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0;
 
 	/* Optimise the no-wait case */
-	if (!(*timeout))
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		pt = NULL;
+		timed_out = 1;
+	}
 
 	for (;;) {
 		struct poll_list *walk;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
@@ -692,27 +633,21 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			if (signal_pending(current))
 				count = -EINTR;
 		}
-		if (count || !*timeout)
+		if (count || timed_out)
 			break;
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
-			/*
-			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
-			 * a loop
-			 */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
 
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
@@ -721,7 +656,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
 			sizeof(struct pollfd))
 
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+		struct timespec *end_time)
 {
 	struct poll_wqueues table;
  	int err = -EFAULT, fdcount, len, size;
@@ -761,7 +697,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	}
 
 	poll_initwait(&table);
-	fdcount = do_poll(nfds, head, &table, timeout);
+	fdcount = do_poll(nfds, head, &table, end_time);
 	poll_freewait(&table);
 
 	for (walk = head; walk; walk = walk->next) {
@@ -787,16 +723,21 @@ out_fds:
 
 static long do_restart_poll(struct restart_block *restart_block)
 {
-	struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
-	int nfds = restart_block->arg1;
-	s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2;
+	struct pollfd __user *ufds = restart_block->poll.ufds;
+	int nfds = restart_block->poll.nfds;
+	struct timespec *to = NULL, end_time;
 	int ret;
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	if (restart_block->poll.has_timeout) {
+		end_time.tv_sec = restart_block->poll.tv_sec;
+		end_time.tv_nsec = restart_block->poll.tv_nsec;
+		to = &end_time;
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		restart_block->fn = do_restart_poll;
-		restart_block->arg2 = timeout & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout >> 32;
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -805,31 +746,32 @@ static long do_restart_poll(struct restart_block *restart_block)
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
-	if (timeout_msecs > 0) {
-#if HZ > 1000
-		/* We can only overflow if HZ > 1000 */
-		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
-			timeout_jiffies = -1;
-		else
-#endif
-			timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1;
-	} else {
-		/* Infinite (< 0) or no (0) timeout */
-		timeout_jiffies = timeout_msecs;
+	if (timeout_msecs >= 0) {
+		to = &end_time;
+		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
+
 		restart_block = &current_thread_info()->restart_block;
 		restart_block->fn = do_restart_poll;
-		restart_block->arg0 = (unsigned long)ufds;
-		restart_block->arg1 = nfds;
-		restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout_jiffies >> 32;
+		restart_block->poll.ufds = ufds;
+		restart_block->poll.nfds = nfds;
+
+		if (timeout_msecs >= 0) {
+			restart_block->poll.tv_sec = end_time.tv_sec;
+			restart_block->poll.tv_nsec = end_time.tv_nsec;
+			restart_block->poll.has_timeout = 1;
+		} else
+			restart_block->poll.has_timeout = 0;
+
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -841,21 +783,16 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	s64 timeout = -1;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -869,7 +806,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -887,31 +824,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-		sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f65de5128a9e..badd98ab06f6 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -114,11 +114,11 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
 
-extern int do_select(int n, fd_set_bits *fds, s64 *timeout);
+extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
 extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
-		       s64 *timeout);
+		       struct timespec *end_time);
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout);
+			   fd_set __user *exp, struct timespec *end_time);
 
 extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
 
-- 
cgit v1.2.3


From 63ca243b271f5b44e0b1057003cf498b6d0fadf7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 14:35:02 -0700
Subject: hrtimer: add abstraction functions for accessing the "expires" member

In order to be able to turn hrtimers into range based, we need to provide
accessor functions for getting to the "expires" ktime_t member of the
struct hrtimer.

This patch adds a set of accessors for this purpose:
* hrtimer_set_expires
* hrtimer_set_expires_tv64
* hrtimer_add_expires
* hrtimer_add_expires_ns
* hrtimer_get_expires
* hrtimer_get_expires_tv64
* hrtimer_get_expires_ns
* hrtimer_expires_remaining
* hrtimer_start_expires

No users of these new accessors are added yet; these follow in later patches.
Hopefully this patch can even go into 2.6.27-rc so that the conversions will
not have a bottleneck in -next

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index becd17db1a1a..9900e998ea8f 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -217,6 +217,45 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 	return timer->base->cpu_base->hres_active;
 }
 
+static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+{
+	timer->expires = time;
+}
+static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+{
+	timer->expires.tv64 = tv64;
+}
+
+static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+{
+	timer->expires = ktime_add_safe(timer->expires, time);
+}
+
+static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+{
+	timer->expires = ktime_add_ns(timer->expires, ns);
+}
+
+static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+{
+	return timer->expires;
+}
+
+static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+{
+	return timer->expires.tv64;
+}
+
+static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
+{
+	return ktime_to_ns(timer->expires);
+}
+
+static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
+{
+    return ktime_sub(timer->expires, timer->base->get_time());
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -287,6 +326,12 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
+static inline int hrtimer_start_expires(struct hrtimer *timer,
+						enum hrtimer_mode mode)
+{
+	return hrtimer_start(timer, hrtimer_get_expires(timer), mode);
+}
+
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
 	return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
-- 
cgit v1.2.3


From 799b64de256ea68fbb5db63bb55f61c305870643 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:27:58 -0700
Subject: hrtimer: rename the "expires" struct member to avoid accidental usage

To catch code that still touches the "expires" member directly, rename it
to have the compiler complain rather than get nasty, hard to explain,
runtime behavior

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 9900e998ea8f..485a634fd6e2 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -111,7 +111,7 @@ enum hrtimer_cb_mode {
  */
 struct hrtimer {
 	struct rb_node			node;
-	ktime_t				expires;
+	ktime_t				_expires;
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
@@ -219,41 +219,41 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
-	timer->expires = time;
+	timer->_expires = time;
 }
 static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 {
-	timer->expires.tv64 = tv64;
+	timer->_expires.tv64 = tv64;
 }
 
 static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 {
-	timer->expires = ktime_add_safe(timer->expires, time);
+	timer->_expires = ktime_add_safe(timer->_expires, time);
 }
 
 static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
 {
-	timer->expires = ktime_add_ns(timer->expires, ns);
+	timer->_expires = ktime_add_ns(timer->_expires, ns);
 }
 
 static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 {
-	return timer->expires;
+	return timer->_expires;
 }
 
 static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 {
-	return timer->expires.tv64;
+	return timer->_expires.tv64;
 }
 
 static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 {
-	return ktime_to_ns(timer->expires);
+	return ktime_to_ns(timer->_expires);
 }
 
 static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 {
-    return ktime_sub(timer->expires, timer->base->get_time());
+    return ktime_sub(timer->_expires, timer->base->get_time());
 }
 
 /*
@@ -334,7 +334,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+	return hrtimer_start(timer, timer->_expires, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
-- 
cgit v1.2.3


From 654c8e0b1c623b156c5b92f28d914ab38c9c2c90 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:47:08 -0700
Subject: hrtimer: turn hrtimers into range timers

this patch turns hrtimers into range timers; they have 2 expire points
1) the soft expire point
2) the hard expire point

the kernel will do its regular best effort attempt to get the timer run
at the hard expire point. However, if some other timer fires after the soft
expire point, the kernel now has the freedom to fire this timer at this point,
and thus grouping the events and preventing a power-expensive wakeup in the
future.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 31 ++++++++++++++++++++++++++-
 kernel/hrtimer.c        | 56 +++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 82 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 485a634fd6e2..28259c336679 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -112,6 +112,7 @@ enum hrtimer_cb_mode {
 struct hrtimer {
 	struct rb_node			node;
 	ktime_t				_expires;
+	ktime_t				_softexpires;
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
@@ -220,20 +221,37 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = time;
+	timer->_softexpires = time;
 }
+
+static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+{
+	timer->_softexpires = time;
+	timer->_expires = ktime_add_safe(time, delta);
+}
+
+static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+{
+	timer->_softexpires = time;
+	timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+}
+
 static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 {
 	timer->_expires.tv64 = tv64;
+	timer->_softexpires.tv64 = tv64;
 }
 
 static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = ktime_add_safe(timer->_expires, time);
+	timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
 }
 
 static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
 {
 	timer->_expires = ktime_add_ns(timer->_expires, ns);
+	timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
 }
 
 static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
@@ -241,10 +259,19 @@ static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 	return timer->_expires;
 }
 
+static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
+{
+	return timer->_softexpires;
+}
+
 static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 {
 	return timer->_expires.tv64;
 }
+static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
+{
+	return timer->_softexpires.tv64;
+}
 
 static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 {
@@ -334,7 +361,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 
 static inline int hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start(timer, timer->_expires, HRTIMER_MODE_ABS);
+	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
@@ -391,6 +418,8 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
 
+extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+						const enum hrtimer_mode mode);
 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 
 /* Soft interrupt function to run the hrtimer queues: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae307feec74c..01483004183d 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1309,7 +1309,20 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 			timer = rb_entry(node, struct hrtimer, node);
 
-			if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) {
+			/*
+			 * The immediate goal for using the softexpires is
+			 * minimizing wakeups, not running timers at the
+			 * earliest interrupt after their soft expiration.
+			 * This allows us to avoid using a Priority Search
+			 * Tree, which can answer a stabbing querry for
+			 * overlapping intervals and instead use the simple
+			 * BST we already have.
+			 * We don't add extra wakeups by delaying timers that
+			 * are right-of a not yet expired timer, because that
+			 * timer will have to trigger a wakeup anyway.
+			 */
+
+			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
 				ktime_t expires;
 
 				expires = ktime_sub(hrtimer_get_expires(timer),
@@ -1681,14 +1694,20 @@ void __init hrtimers_init(void)
 }
 
 /**
- * schedule_hrtimeout - sleep until timeout
+ * schedule_hrtimeout_range - sleep until timeout
  * @expires:	timeout value (ktime_t)
+ * @delta:	slack in expires timeout (ktime_t)
  * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
  *
  * Make the current task sleep until the given expiry time has
  * elapsed. The routine will return immediately unless
  * the current task state has been set (see set_current_state()).
  *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
  * You can set the task state as follows -
  *
  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
@@ -1702,7 +1721,7 @@ void __init hrtimers_init(void)
  *
  * Returns 0 when the timer has expired otherwise -EINTR
  */
-int __sched schedule_hrtimeout(ktime_t *expires,
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
 			       const enum hrtimer_mode mode)
 {
 	struct hrtimer_sleeper t;
@@ -1726,7 +1745,7 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 	}
 
 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
-	hrtimer_set_expires(&t.timer, *expires);
+	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
 
 	hrtimer_init_sleeper(&t, current);
 
@@ -1744,4 +1763,33 @@ int __sched schedule_hrtimeout(ktime_t *expires,
 
 	return !t.task ? 0 : -EINTR;
 }
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires:	timeout value (ktime_t)
+ * @mode:	timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+			       const enum hrtimer_mode mode)
+{
+	return schedule_hrtimeout_range(expires, 0, mode);
+}
 EXPORT_SYMBOL_GPL(schedule_hrtimeout);
-- 
cgit v1.2.3


From 6976675d94042fbd446231d1bd8b7de71a980ada Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:52:40 -0700
Subject: hrtimer: create a "timer_slack" field in the task struct

We want to be able to control the default "rounding" that is used by
select() and poll() and friends. This is a per process property
(so that we can have a "nice" like program to start certain programs with
a looser or stricter rounding) that can be set/get via a prctl().

For this purpose, a field called "timer_slack_ns" is added to the task
struct. In addition, a field called "default_timer_slack_ns" is added
so that tasks can easily switch temporarily to a more/less accurate slack
and then back to the default.

The default value of the slack is set to 50 usec; this is significantly less
than 2.6.27's average select() and poll() timing error but still allows
the kernel to group timers somewhat to preserve power behavior. Applications
and admins can override this via the prctl().

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/init_task.h |  1 +
 include/linux/prctl.h     |  7 +++++++
 include/linux/sched.h     |  6 ++++++
 kernel/fork.c             |  2 ++
 kernel/sys.c              | 10 ++++++++++
 5 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 021d8e720c79..23fd8909b9e5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,6 +170,7 @@ extern struct group_info init_groups;
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
 	.pi_lock	= __SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
 	.pids = {							\
 		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 5ad79198d6f9..48d887e3c6e7 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -78,4 +78,11 @@
 #define PR_GET_SECUREBITS 27
 #define PR_SET_SECUREBITS 28
 
+/*
+ * Get/set the timerslack as used by poll/select/nanosleep
+ * A value of 0 means "use default"
+ */
+#define PR_SET_TIMERSLACK 29
+#define PR_GET_TIMERSLACK 30
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad15..dcc03fd5a7f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1301,6 +1301,12 @@ struct task_struct {
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
 #endif
+	/*
+	 * time slack values; these are used to round up poll() and
+	 * select() etc timeout values. These are in nanoseconds.
+	 */
+	unsigned long timer_slack_ns;
+	unsigned long default_timer_slack_ns;
 };
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..4308d75f0fa5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -987,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
 
+	p->default_timer_slack_ns = current->timer_slack_ns;
+
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 	p->last_switch_count = 0;
 	p->last_switch_timestamp = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..1b96401a0576 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1727,6 +1727,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_GET_TIMERSLACK:
+			error = current->timer_slack_ns;
+			break;
+		case PR_SET_TIMERSLACK:
+			if (arg2 <= 0)
+				current->timer_slack_ns =
+					current->default_timer_slack_ns;
+			else
+				current->timer_slack_ns = arg2;
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3


From 584fb4a76413ec9215741e075e0dfb69173b213f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Sep 2008 08:32:57 -0700
Subject: hrtimer: fix build bug found by Ingo

in some randconfig configurations, hrtimers are used even though
the hrtimer config is off; and it broke the build due to some of
the new functions being on the wrong side of the ifdef.

This patch moves the functions to the other side of the ifdef, fixing
the build bug.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 28259c336679..c407b33ef844 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -198,13 +198,6 @@ struct hrtimer_cpu_base {
 #endif
 };
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-struct clock_event_device;
-
-extern void clock_was_set(void);
-extern void hres_timers_resume(void);
-extern void hrtimer_interrupt(struct clock_event_device *dev);
-
 /*
  * In high resolution mode the time reference must be read accurate
  */
@@ -283,6 +276,13 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
     return ktime_sub(timer->_expires, timer->base->get_time());
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+struct clock_event_device;
+
+extern void clock_was_set(void);
+extern void hres_timers_resume(void);
+extern void hrtimer_interrupt(struct clock_event_device *dev);
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.2.3


From 2ec02270c00f94b08fddfb68c37510a9fb47ac7c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Sep 2008 09:36:56 -0700
Subject: hrtimer: another build fix

More randconfig testing

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c407b33ef844..4c1a834b9849 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -198,19 +198,6 @@ struct hrtimer_cpu_base {
 #endif
 };
 
-/*
- * In high resolution mode the time reference must be read accurate
- */
-static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
-{
-	return timer->base->get_time();
-}
-
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return timer->base->cpu_base->hres_active;
-}
-
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->_expires = time;
@@ -283,6 +270,19 @@ extern void clock_was_set(void);
 extern void hres_timers_resume(void);
 extern void hrtimer_interrupt(struct clock_event_device *dev);
 
+/*
+ * In high resolution mode the time reference must be read accurate
+ */
+static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
+{
+	return timer->base->get_time();
+}
+
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return timer->base->cpu_base->hres_active;
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.2.3


From da8f2e170ea94cc20f8ebbc8ee8d127edb8f12f1 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 10:47:46 -0700
Subject: hrtimer: add a hrtimer_start_range() function

this patch adds a _range version of hrtimer_start() so that range timers
can be created; the hrtimer_start() function is just a wrapper around this.

In addition, hrtimer_start_expires() will now preserve existing ranges.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h |  9 ++++++++-
 kernel/hrtimer.c        | 26 +++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 4c1a834b9849..1c0473e8ecb4 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -350,13 +350,20 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 /* Basic timer operations: */
 extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
+extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			unsigned long range_ns, const enum hrtimer_mode mode);
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
 static inline int hrtimer_start_expires(struct hrtimer *timer,
 						enum hrtimer_mode mode)
 {
-	return hrtimer_start(timer, hrtimer_get_expires(timer), mode);
+	unsigned long delta;
+	ktime_t soft, hard;
+	soft = hrtimer_get_softexpires(timer);
+	hard = hrtimer_get_expires(timer);
+	delta = ktime_to_ns(ktime_sub(hard, soft));
+	return hrtimer_start_range_ns(timer, hrtimer_get_expires(timer), delta, mode);
 }
 
 static inline int hrtimer_restart(struct hrtimer *timer)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01483004183d..a0222097c57e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -945,9 +945,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 }
 
 /**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an relative timer on the current CPU
  * @timer:	the timer to be added
  * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
  * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
  *
  * Returns:
@@ -955,7 +956,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  *  1 when the timer was active
  */
 int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+			const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -983,7 +985,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 #endif
 	}
 
-	hrtimer_set_expires(timer, tim);
+	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
 	timer_stats_hrtimer_set_start_info(timer);
 
@@ -1016,8 +1018,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an relative timer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
+
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
-- 
cgit v1.2.3


From 4ce105d30e08fb8a1783c55a0e48aa3fa200c455 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 15:31:39 -0700
Subject: hrtimer: incorporate feedback from Peter Zijlstra

(based on  lkml review)
* use rt_task()
* task_nice() has a sign

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c             | 5 ++---
 include/linux/hrtimer.h | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/select.c b/fs/select.c
index 5e61b43d0766..fdd8584e536d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -46,7 +46,7 @@ static unsigned long __estimate_accuracy(struct timespec *tv)
 	unsigned long slack;
 	int divfactor = 1000;
 
-	if (task_nice(current))
+	if (task_nice(current) > 0)
 		divfactor = divfactor / 5;
 
 	slack = tv->tv_nsec / divfactor;
@@ -66,8 +66,7 @@ static unsigned long estimate_accuracy(struct timespec *tv)
 	 * Realtime tasks get a slack of 0 for obvious reasons.
 	 */
 
-	if (current->policy == SCHED_FIFO ||
-		current->policy == SCHED_RR)
+	if (rt_task(current))
 		return 0;
 
 	ktime_get_ts(&now);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1c0473e8ecb4..95db11f62ff2 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -363,7 +363,7 @@ static inline int hrtimer_start_expires(struct hrtimer *timer,
 	soft = hrtimer_get_softexpires(timer);
 	hard = hrtimer_get_expires(timer);
 	delta = ktime_to_ns(ktime_sub(hard, soft));
-	return hrtimer_start_range_ns(timer, hrtimer_get_expires(timer), delta, mode);
+	return hrtimer_start_range_ns(timer, soft, delta, mode);
 }
 
 static inline int hrtimer_restart(struct hrtimer *timer)
-- 
cgit v1.2.3


From 2e94d1f71f7e4404d997e6fb4f1618aa147d76f9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 10 Sep 2008 16:06:00 -0700
Subject: hrtimer: peek at the timer queue just before going idle

As part of going idle, we already look at the time of the next timer event to determine
which C-state to select etc.

This patch adds functionality that causes the timers that are past their
soft expire time, to fire at this time, before we calculate the next wakeup
time. This functionality will thus avoid wakeups by running timers before
going idle rather than specially waking up for it.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 drivers/cpuidle/cpuidle.c |  7 +++++++
 include/linux/hrtimer.h   |  5 +++++
 kernel/hrtimer.c          | 30 ++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 5ce07b517c58..2e3148499368 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
 #include <linux/ktime.h>
+#include <linux/hrtimer.h>
 
 #include "cpuidle.h"
 
@@ -60,6 +61,12 @@ static void cpuidle_idle_call(void)
 		return;
 	}
 
+	/*
+	 * run any timers that can be run now, at this point
+	 * before calculating the idle duration etc.
+	 */
+	hrtimer_peek_ahead_timers();
+
 	/* ask the governor for the next state */
 	next_state = cpuidle_curr_governor->select(dev);
 	if (need_resched())
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 95db11f62ff2..d93b1e1dc169 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -326,6 +326,11 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
 
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern void hrtimer_peek_ahead_timers(void);
+
+
 /* Exported timer functions: */
 
 /* Initialize timers: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9a4c90185566..eb2cf984959f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1381,6 +1381,36 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		raise_softirq(HRTIMER_SOFTIRQ);
 }
 
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+	unsigned long flags;
+	struct tick_device *td;
+	struct clock_event_device *dev;
+
+	if (hrtimer_hres_active())
+		return;
+
+	local_irq_save(flags);
+	td = &__get_cpu_var(tick_cpu_device);
+	if (!td)
+		goto out;
+	dev = td->evtdev;
+	if (!dev)
+		goto out;
+	hrtimer_interrupt(dev);
+out:
+	local_irq_restore(flags);
+}
+
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
 	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
-- 
cgit v1.2.3


From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang

Overview

This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling.  It was put together
with the help of Roland McGrath, the owner and original writer of this code.

The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads.  It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.

This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."

Code Changes

This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine.  (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.)  To do this, at each tick we now update fields in
signal_struct as well as task_struct.  The run_posix_cpu_timers() function
uses those fields to make its decisions.

We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels.  For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:

struct thread_group_cputime {
	struct task_cputime totals;
};

struct thread_group_cputime {
	struct task_cputime *totals;
};

We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers).  The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends.  In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention).  For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu().  The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().

We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel.  The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields.  The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures.  The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated.  The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is NULL it just uses the appropriate fields of task_struct)
and, if it is non-NULL, sums the per-cpu values for each online CPU.  Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.

Non-SMP operation is trivial and will not be mentioned further.

The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().

All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.

Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away.  All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline.  When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.

Performance

The fix appears not to add significant overhead to existing operations.  It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below).  Overall it's a wash except in those
two cases.

I've since done somewhat more involved testing on a dual-core Opteron system.

Case 1: With no itimer running, for a test with 100,000 threads, the fixed
	kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
	all of which was spent in the system.  There were twice as many
	voluntary context switches with the fix as without it.

Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
	an unmodified kernel can handle), the fixed kernel ran the test in
	eight percent of the time (5.8 seconds as opposed to 70 seconds) and
	had better tick accuracy (.012 seconds per tick as opposed to .023
	seconds per tick).

Case 3: A 4000-thread test with an initial timer tick of .01 second and an
	interval of 10,000 seconds (i.e. a timer that ticks only once) had
	very nearly the same performance in both cases:  6.3 seconds elapsed
	for the fixed kernel versus 5.5 seconds for the unfixed kernel.

With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds).  The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.

Since the fix affected the rlimit code, I also tested soft and hard CPU limits.

Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
	running), the modified kernel was very slightly favored in that while
	it killed the process in 19.997 seconds of CPU time (5.002 seconds of
	wall time), only .003 seconds of that was system time, the rest was
	user time.  The unmodified kernel killed the process in 20.001 seconds
	of CPU (5.014 seconds of wall time) of which .016 seconds was system
	time.  Really, though, the results were too close to call.  The results
	were essentially the same with no itimer running.

Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
	(where the hard limit would never be reached) and an itimer running,
	the modified kernel exhibited worse tick accuracy than the unmodified
	kernel: .050 seconds/tick versus .028 seconds/tick.  Otherwise,
	performance was almost indistinguishable.  With no itimer running this
	test exhibited virtually identical behavior and times in both cases.

In times past I did some limited performance testing.  Those results are below.

On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s.  On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds.  Performance with eight, four and one
thread were comparable.  Interestingly, the timer ticks with the fix seemed
more accurate:  The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick.  Both cases were configured for an interval of
0.01 seconds.  Again, the other tests were comparable.  Each thread in this
test computed the primes up to 25,000,000.

I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix.  In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable).  System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s).  It received 147651 ticks for 0.015 seconds per tick, still quite
accurate.  There is obviously no comparable test without the fix.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/binfmt_elf.c              |  19 +-
 fs/proc/array.c              |   8 +-
 include/linux/posix-timers.h |   2 +
 include/linux/sched.h        | 257 +++++++++++++++++++++--
 include/linux/time.h         |   3 +
 kernel/compat.c              |  53 ++---
 kernel/exit.c                |  19 +-
 kernel/fork.c                |  88 ++++----
 kernel/itimer.c              |  33 +--
 kernel/posix-cpu-timers.c    | 471 +++++++++++++++++++++++--------------------
 kernel/sched.c               |  53 ++++-
 kernel/sched_fair.c          |   1 +
 kernel/sched_rt.c            |   4 +-
 kernel/signal.c              |   8 +-
 kernel/sys.c                 |  75 +++----
 security/selinux/hooks.c     |   9 +-
 16 files changed, 677 insertions(+), 426 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 655ed8d30a86..a8635f637038 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1333,20 +1333,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 71c9be59c9c2..933953c4e407 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -395,20 +395,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
+			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime = cputime_add(utime, task_utime(t));
-				stime = cputime_add(stime, task_stime(t));
 				gtime = cputime_add(gtime, task_gtime(t));
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime = cputime_add(utime, sig->utime);
-			stime = cputime_add(stime, sig->stime);
+			thread_group_cputime(task, &cputime);
+			utime = cputime.utime;
+			stime = cputime.stime;
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f30ade..f9d8e9e94e9b 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
+void update_rlimit_cpu(unsigned long rlim_new);
+
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad15..26d7a5f2d0ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -425,6 +425,45 @@ struct pacct_struct {
 	unsigned long		ac_minflt, ac_majflt;
 };
 
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime:		time spent in user mode, in &cputime_t units
+ * @stime:		time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ *
+ * Groups the three CPU time counts tracked for threads and thread groups
+ * so they can be passed around and updated together.  The same layout is
+ * reused for the cputime_expires members of task_struct and signal_struct,
+ * where the fields hold expiration times instead of accumulated times.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Aliases for the fields above when the struct caches expiration times. */
+#define prof_exp	stime
+#define virt_exp	utime
+#define sched_exp	sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - CPU time totals of a whole thread group
+ * @totals:		the accumulated counts; an embedded structure on
+ *			uniprocessor kernels, a per-cpu allocation on SMP.
+ *
+ * This is the flavour of task_cputime, above, that backs the thread
+ * group CPU clock calculations.
+ */
+struct thread_group_cputime {
+#ifdef CONFIG_SMP
+	/* Per-CPU array; NULL until thread_group_cputime_alloc_smp(). */
+	struct task_cputime *totals;
+#else
+	/* Single embedded copy, always present. */
+	struct task_cputime totals;
+#endif
+};
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -470,6 +509,17 @@ struct signal_struct {
 	cputime_t it_prof_expires, it_virt_expires;
 	cputime_t it_prof_incr, it_virt_incr;
 
+	/*
+	 * Thread group totals for process CPU clocks.
+	 * See thread_group_cputime(), et al, for details.
+	 */
+	struct thread_group_cputime cputime;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
 	/* job control IDs */
 
 	/*
@@ -500,7 +550,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t utime, stime, cutime, cstime;
+	cputime_t cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -508,14 +558,6 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
-	/*
-	 * Cumulative ns of scheduled CPU time for dead threads in the
-	 * group, not including a zombie group leader.  (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
@@ -527,8 +569,6 @@ struct signal_struct {
 	 */
 	struct rlimit rlim[RLIM_NLIMITS];
 
-	struct list_head cpu_timers[3];
-
 	/* keep the process-shared keyrings here so that they do the right
 	 * thing in threads created with CLONE_THREAD */
 #ifdef CONFIG_KEYS
@@ -1134,8 +1174,7 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
-  	cputime_t it_prof_expires, it_virt_expires;
-	unsigned long long it_sched_expires;
+	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
 
 /* process credentials */
@@ -1585,6 +1624,7 @@ extern unsigned long long cpu_clock(int cpu);
 
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
@@ -2081,6 +2121,197 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Thread group CPU time accounting.
+ */
+#ifdef CONFIG_SMP
+
+extern int thread_group_cputime_alloc_smp(struct task_struct *);
+extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						    struct task_struct *new)
+{
+	/* Allocate the per-CPU totals only if the group has none yet. */
+	return curr->signal->cputime.totals ?
+		0 : thread_group_cputime_alloc_smp(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+	free_percpu(sig->cputime.totals);
+}
+
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ * @tsk:	Any task in the thread group to be sampled.
+ * @times:	Receives the summed utime, stime and sum_exec_runtime.
+ *
+ * SMP wrapper that simply forwards to thread_group_cputime_smp().
+ */
+static inline void thread_group_cputime(struct task_struct *tsk,
+					 struct task_cputime *times)
+{
+	thread_group_cputime_smp(tsk, times);
+}
+
+/**
+ * thread_group_cputime_account_user - Charge user time to a thread group.
+ *
+ * @tgtimes:	Thread group totals to update.
+ * @cputime:	Amount of user time to add, in cputime_t units.
+ *
+ * Does nothing unless per-CPU totals have been allocated for the group;
+ * otherwise adds @cputime to the utime slot belonging to the CPU this
+ * code is running on.
+ */
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	struct task_cputime *slot;
+
+	if (!tgtimes->totals)
+		return;
+	slot = per_cpu_ptr(tgtimes->totals, get_cpu());
+	slot->utime = cputime_add(slot->utime, cputime);
+	put_cpu_no_resched();
+}
+
+/**
+ * thread_group_cputime_account_system - Charge system time to a thread group.
+ *
+ * @tgtimes:	Thread group totals to update.
+ * @cputime:	Amount of kernel-mode time to add, in cputime_t units.
+ *
+ * Does nothing unless per-CPU totals have been allocated for the group;
+ * otherwise adds @cputime to the stime slot belonging to the CPU this
+ * code is running on.
+ */
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	struct task_cputime *slot;
+
+	if (!tgtimes->totals)
+		return;
+	slot = per_cpu_ptr(tgtimes->totals, get_cpu());
+	slot->stime = cputime_add(slot->stime, cputime);
+	put_cpu_no_resched();
+}
+
+/**
+ * thread_group_cputime_account_exec_runtime - Charge execution time to a
+ *						thread group.
+ *
+ * @tgtimes:	Thread group totals to update.
+ * @ns:		Amount of run time to add, in nanoseconds.
+ *
+ * Does nothing unless per-CPU totals have been allocated for the group;
+ * otherwise adds @ns to the sum_exec_runtime slot belonging to the CPU
+ * this code is running on.
+ */
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	struct task_cputime *slot;
+
+	if (!tgtimes->totals)
+		return;
+	slot = per_cpu_ptr(tgtimes->totals, get_cpu());
+	slot->sum_exec_runtime += ns;
+	put_cpu_no_resched();
+}
+
+#else /* CONFIG_SMP */
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	/* UP: the totals live inside signal_struct, so just zero them. */
+	sig->cputime.totals.sum_exec_runtime = 0;
+	sig->cputime.totals.stime = sig->cputime.totals.utime = cputime_zero;
+}
+
+static inline int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig) { }
+
+/* UP: the totals are embedded in signal_struct; nothing to allocate. */
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						     struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void thread_group_cputime(struct task_struct *tsk,
+					 struct task_cputime *cputime)
+{
+	*cputime = tsk->signal->cputime.totals;
+}
+
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes, cputime_t cputime)
+{
+	/* UP: totals is embedded in the struct, not a per-CPU pointer. */
+	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
+}
+
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes, cputime_t cputime)
+{
+	/* UP: totals is embedded in the struct, not a per-CPU pointer. */
+	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
+}
+
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes, unsigned long long ns)
+{
+	/* UP: totals is embedded in the struct, not a per-CPU pointer. */
+	tgtimes->totals.sum_exec_runtime += ns;
+}
+
+#endif /* CONFIG_SMP */
+
+static inline void account_group_user_time(struct task_struct *tsk,
+					    cputime_t cputime)
+{
+	struct signal_struct *sig = tsk->signal;	/* NULL-checked below */
+
+	if (unlikely(!sig))
+		return;
+	thread_group_cputime_account_user(&sig->cputime, cputime);
+}
+
+static inline void account_group_system_time(struct task_struct *tsk,
+					      cputime_t cputime)
+{
+	struct signal_struct *sig = tsk->signal;	/* NULL-checked below */
+
+	if (unlikely(!sig))
+		return;
+	thread_group_cputime_account_system(&sig->cputime, cputime);
+}
+
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					       unsigned long long ns)
+{
+	struct signal_struct *sig = tsk->signal;	/* NULL-checked below */
+
+	if (unlikely(!sig))
+		return;
+	thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+}
+
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82e..1b70b3c293e9 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -125,6 +125,9 @@ extern int timekeeping_valid_for_hres(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
 
+struct tms;
+extern void do_sys_times(struct tms *);
+
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:		pointer to the timespec variable to be converted
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..72650e39b3e6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,6 +23,7 @@
 #include <linux/timex.h>
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
+#include <linux/times.h>
 
 #include <asm/uaccess.h>
 
@@ -150,49 +151,23 @@ asmlinkage long compat_sys_setitimer(int which,
 	return 0;
 }
 
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
 asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 {
-	/*
-	 *	In the SMP world we might just be unlucky and have one of
-	 *	the times increment as we use it. Since the value is an
-	 *	atomically safe type this is just fine. Conceptually its
-	 *	as if the syscall took an instant longer to occur.
-	 */
 	if (tbuf) {
+		struct tms tms;
 		struct compat_tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
-		cputime_t utime, stime, cutime, cstime;
-
-		read_lock(&tasklist_lock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		/*
-		 * While we have tasklist_lock read-locked, no dying thread
-		 * can be updating current->signal->[us]time.  Instead,
-		 * we got their counts included in the live thread loop.
-		 * However, another thread can come in right now and
-		 * do a wait call that updates current->signal->c[us]time.
-		 * To make sure we always see that pair updated atomically,
-		 * we take the siglock around fetching them.
-		 */
-		spin_lock_irq(&tsk->sighand->siglock);
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
-
-		tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
-		tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
-		tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
-		tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
+
+		do_sys_times(&tms);
+		/* Convert our struct tms to the compat version. */
+		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
 		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
 			return -EFAULT;
 	}
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..40036ac04271 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
 
@@ -1294,6 +1291,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (likely(!traced)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
+		struct task_cputime cputime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1309,20 +1307,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * need to protect the access to p->parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_cputime() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
 		spin_lock_irq(&p->parent->sighand->siglock);
 		psig = p->parent->signal;
 		sig = p->signal;
+		thread_group_cputime(p, &cputime);
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(cputime.utime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(cputime.stime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..a8ac2efb8e30 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
+
+/*
+ * Set up the POSIX CPU timer state of a freshly allocated signal_struct:
+ * group CPU time totals, itimer settings, cached expirations and lists.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+	int i;
+
+	/* Per-clock lists of process-wide timers start out empty. */
+	for (i = 0; i < 3; i++)
+		INIT_LIST_HEAD(&sig->cpu_timers[i]);
+
+	/* No ITIMER_PROF/ITIMER_VIRTUAL armed: zero expiry and interval. */
+	sig->it_prof_expires = sig->it_prof_incr = cputime_zero;
+	sig->it_virt_expires = sig->it_virt_incr = cputime_zero;
+
+	/* Nothing cached as the earliest expiration yet. */
+	sig->cputime_expires.sched_exp = 0;
+	sig->cputime_expires.virt_exp = cputime_zero;
+	sig->cputime_expires.prof_exp = cputime_zero;
+
+	/* Group-wide CPU time totals. */
+	thread_group_cputime_init(sig);
+}
+
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct signal_struct *sig;
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&current->signal->count);
-		atomic_inc(&current->signal->live);
-		return 0;
+		ret = thread_group_cputime_clone_thread(current, tsk);
+		if (likely(!ret)) {
+			atomic_inc(&current->signal->count);
+			atomic_inc(&current->signal->live);
+		}
+		return ret;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
@@ -795,15 +824,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
 
-	sig->it_virt_expires = cputime_zero;
-	sig->it_virt_incr = cputime_zero;
-	sig->it_prof_expires = cputime_zero;
-	sig->it_prof_incr = cputime_zero;
-
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = NULL;
 
-	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
@@ -820,14 +844,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
-	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-		/*
-		 * New sole thread in the process gets an expiry time
-		 * of the whole CPU time limit.
-		 */
-		tsk->it_prof_expires =
-			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
-	}
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
@@ -837,6 +855,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 void __cleanup_signal(struct signal_struct *sig)
 {
+	thread_group_cputime_free(sig);
 	exit_thread_group_keys(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
@@ -885,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+/* Start a new task with empty CPU timer lists and no cached expirations. */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+	int i;
+
+	for (i = 0; i < 3; i++)
+		INIT_LIST_HEAD(&tsk->cpu_timers[i]);
+
+	tsk->cputime_expires.sched_exp = 0;
+	tsk->cputime_expires.virt_exp = cputime_zero;
+	tsk->cputime_expires.prof_exp = cputime_zero;
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -995,12 +1027,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
-	p->it_virt_expires = cputime_zero;
-	p->it_prof_expires = cputime_zero;
-	p->it_sched_expires = 0;
-	INIT_LIST_HEAD(&p->cpu_timers[0]);
-	INIT_LIST_HEAD(&p->cpu_timers[1]);
-	INIT_LIST_HEAD(&p->cpu_timers[2]);
+	posix_cpu_timers_init(p);
 
 	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1201,21 +1228,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD) {
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
-
-		if (!cputime_eq(current->signal->it_virt_expires,
-				cputime_zero) ||
-		    !cputime_eq(current->signal->it_prof_expires,
-				cputime_zero) ||
-		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
-		    !list_empty(&current->signal->cpu_timers[0]) ||
-		    !list_empty(&current->signal->cpu_timers[1]) ||
-		    !list_empty(&current->signal->cpu_timers[2])) {
-			/*
-			 * Have child wake up on its first tick to check
-			 * for process CPU timers.
-			 */
-			p->it_prof_expires = jiffies_to_cputime(1);
-		}
 	}
 
 	if (likely(p->pid)) {
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
 		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_virt_expires;
 		cinterval = tsk->signal->it_virt_incr;
 		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_struct *t = tsk;
-			cputime_t utime = tsk->signal->utime;
-			do {
-				utime = cputime_add(utime, t->utime);
-				t = next_thread(t);
-			} while (t != tsk);
+			struct task_cputime cputime;
+			cputime_t utime;
+
+			thread_group_cputime(tsk, &cputime);
+			utime = cputime.utime;
 			if (cputime_le(cval, utime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
 			} else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
 			}
 		}
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		cputime_to_timeval(cval, &value->it_value);
 		cputime_to_timeval(cinterval, &value->it_interval);
 		break;
 	case ITIMER_PROF:
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_prof_expires;
 		cinterval = tsk->signal->it_prof_incr;
 		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_struct *t = tsk;
-			cputime_t ptime = cputime_add(tsk->signal->utime,
-						      tsk->signal->stime);
-			do {
-				ptime = cputime_add(ptime,
-						    cputime_add(t->utime,
-								t->stime));
-				t = next_thread(t);
-			} while (t != tsk);
+			struct task_cputime times;
+			cputime_t ptime;
+
+			thread_group_cputime(tsk, &times);
+			ptime = cputime_add(times.utime, times.stime);
 			if (cputime_le(cval, ptime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
 			} else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
 			}
 		}
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		cputime_to_timeval(cval, &value->it_value);
 		cputime_to_timeval(cinterval, &value->it_interval);
 		break;
@@ -185,7 +176,6 @@ again:
 	case ITIMER_VIRTUAL:
 		nval = timeval_to_cputime(&value->it_value);
 		ninterval = timeval_to_cputime(&value->it_interval);
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_virt_expires;
 		cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
 		tsk->signal->it_virt_expires = nval;
 		tsk->signal->it_virt_incr = ninterval;
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		if (ovalue) {
 			cputime_to_timeval(cval, &ovalue->it_value);
 			cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
 	case ITIMER_PROF:
 		nval = timeval_to_cputime(&value->it_value);
 		ninterval = timeval_to_cputime(&value->it_interval);
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_prof_expires;
 		cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
 		tsk->signal->it_prof_expires = nval;
 		tsk->signal->it_prof_incr = ninterval;
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		if (ovalue) {
 			cputime_to_timeval(cval, &ovalue->it_value);
 			cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..dba1c334c3e8 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,6 +8,99 @@
 #include <linux/math64.h>
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_SMP
+/*
+ * Allocate the per-CPU thread_group_cputime totals (SMP only) and seed them
+ * with the times @tsk has accumulated so far.  Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when a thread joins a thread group.
+ * Assumes interrupts are enabled.  Returns 0, or -ENOMEM if allocation fails.
+ */
+int thread_group_cputime_alloc_smp(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct task_cputime *cputime;
+
+	/*
+	 * Unlocked fast path: totals already exist.  Otherwise allocate
+	 * before taking any lock, then re-check under siglock in case
+	 * another thread installed its totals first (ours is then freed).
+	 */
+	if (sig->cputime.totals)
+		return 0;
+	cputime = alloc_percpu(struct task_cputime);
+	if (cputime == NULL)
+		return -ENOMEM;
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (sig->cputime.totals) {
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		free_percpu(cputime);
+		return 0;
+	}
+	sig->cputime.totals = cputime;
+	cputime = per_cpu_ptr(sig->cputime.totals, get_cpu());
+	cputime->utime = tsk->utime;
+	cputime->stime = tsk->stime;
+	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
+	put_cpu_no_resched();
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+/**
+ * thread_group_cputime_smp - Sum the thread group time fields across all CPUs.
+ *
+ * @tsk:	The task we use to identify the thread group.
+ * @times:	task_cputime structure in which we return the summed fields.
+ *
+ * Adds up the per-CPU totals of every possible CPU.  If the group has no
+ * per-CPU totals (or @tsk has no signal_struct), @tsk's own times are used.
+ */
+void thread_group_cputime_smp(struct task_struct *tsk,
+			      struct task_cputime *times)
+{
+	struct signal_struct *sig;
+	int i;
+	struct task_cputime *tot;
+
+	sig = tsk->signal;
+	if (unlikely(!sig) || !sig->cputime.totals) {
+		times->utime = tsk->utime;
+		times->stime = tsk->stime;
+		times->sum_exec_runtime = tsk->se.sum_exec_runtime;
+		return;
+	}
+	times->stime = times->utime = cputime_zero;
+	times->sum_exec_runtime = 0;
+	for_each_possible_cpu(i) {
+		/* Stick to the signal_struct that was NULL-checked above. */
+		tot = per_cpu_ptr(sig->cputime.totals, i);
+		times->utime = cputime_add(times->utime, tot->utime);
+		times->stime = cputime_add(times->stime, tot->stime);
+		times->sum_exec_runtime += tot->sum_exec_runtime;
+	}
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Arm the process CPU timer after RLIMIT_CPU is set to @rlim_new seconds.
+ */
+void update_rlimit_cpu(unsigned long rlim_new)
+{
+	cputime_t cputime = secs_to_cputime(rlim_new);
+
+	/* Only needed when no ITIMER_PROF expiry is due before the limit. */
+	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+	    cputime_gt(current->signal->it_prof_expires, cputime)) {
+		spin_lock_irq(&current->sighand->siglock);
+		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
 static int check_clock(const clockid_t which_clock)
 {
 	int error = 0;
@@ -158,10 +251,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 {
 	return p->utime;
 }
-static inline unsigned long long sched_ns(struct task_struct *p)
-{
-	return task_sched_runtime(p);
-}
 
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
@@ -211,7 +300,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = sched_ns(p);
+		cpu->sched = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -226,31 +315,20 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 					 struct task_struct *p,
 					 union cpu_time_count *cpu)
 {
-	struct task_struct *t = p;
- 	switch (clock_idx) {
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+	switch (clock_idx) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
-		do {
-			cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
-			t = next_thread(t);
-		} while (t != p);
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = p->signal->utime;
-		do {
-			cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
-			t = next_thread(t);
-		} while (t != p);
+		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->signal->sum_sched_runtime;
-		/* Add in each other live thread.  */
-		while ((t = next_thread(t)) != p) {
-			cpu->sched += t->se.sum_exec_runtime;
-		}
-		cpu->sched += sched_ns(p);
+		cpu->sched = thread_group_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -471,80 +549,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime_add(tsk->utime, tsk->signal->utime),
-		       cputime_add(tsk->stime, tsk->signal->stime),
-		     tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
-}
-
-
-/*
- * Set the expiry times of all the threads in the process so one of them
- * will go off before the process cumulative expiry total is reached.
- */
-static void process_timer_rebalance(struct task_struct *p,
-				    unsigned int clock_idx,
-				    union cpu_time_count expires,
-				    union cpu_time_count val)
-{
-	cputime_t ticks, left;
-	unsigned long long ns, nsleft;
- 	struct task_struct *t = p;
-	unsigned int nthreads = atomic_read(&p->signal->live);
-
-	if (!nthreads)
-		return;
+	struct task_cputime cputime;
 
-	switch (clock_idx) {
-	default:
-		BUG();
-		break;
-	case CPUCLOCK_PROF:
-		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-				       nthreads);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ticks = cputime_add(prof_ticks(t), left);
-				if (cputime_eq(t->it_prof_expires,
-					       cputime_zero) ||
-				    cputime_gt(t->it_prof_expires, ticks)) {
-					t->it_prof_expires = ticks;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	case CPUCLOCK_VIRT:
-		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-				       nthreads);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ticks = cputime_add(virt_ticks(t), left);
-				if (cputime_eq(t->it_virt_expires,
-					       cputime_zero) ||
-				    cputime_gt(t->it_virt_expires, ticks)) {
-					t->it_virt_expires = ticks;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	case CPUCLOCK_SCHED:
-		nsleft = expires.sched - val.sched;
-		do_div(nsleft, nthreads);
-		nsleft = max_t(unsigned long long, nsleft, 1);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ns = t->se.sum_exec_runtime + nsleft;
-				if (t->it_sched_expires == 0 ||
-				    t->it_sched_expires > ns) {
-					t->it_sched_expires = ns;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	}
+	thread_group_cputime(tsk, &cputime);
+	cleanup_timers(tsk->signal->cpu_timers,
+		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +617,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 			default:
 				BUG();
 			case CPUCLOCK_PROF:
-				if (cputime_eq(p->it_prof_expires,
+				if (cputime_eq(p->cputime_expires.prof_exp,
 					       cputime_zero) ||
-				    cputime_gt(p->it_prof_expires,
+				    cputime_gt(p->cputime_expires.prof_exp,
 					       nt->expires.cpu))
-					p->it_prof_expires = nt->expires.cpu;
+					p->cputime_expires.prof_exp =
+						nt->expires.cpu;
 				break;
 			case CPUCLOCK_VIRT:
-				if (cputime_eq(p->it_virt_expires,
+				if (cputime_eq(p->cputime_expires.virt_exp,
 					       cputime_zero) ||
-				    cputime_gt(p->it_virt_expires,
+				    cputime_gt(p->cputime_expires.virt_exp,
 					       nt->expires.cpu))
-					p->it_virt_expires = nt->expires.cpu;
+					p->cputime_expires.virt_exp =
+						nt->expires.cpu;
 				break;
 			case CPUCLOCK_SCHED:
-				if (p->it_sched_expires == 0 ||
-				    p->it_sched_expires > nt->expires.sched)
-					p->it_sched_expires = nt->expires.sched;
+				if (p->cputime_expires.sched_exp == 0 ||
+				    p->cputime_expires.sched_exp >
+							nt->expires.sched)
+					p->cputime_expires.sched_exp =
+						nt->expires.sched;
 				break;
 			}
 		} else {
 			/*
-			 * For a process timer, we must balance
-			 * all the live threads' expirations.
+			 * For a process timer, set the cached expiration time.
 			 */
 			switch (CPUCLOCK_WHICH(timer->it_clock)) {
 			default:
@@ -641,7 +653,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 				    cputime_lt(p->signal->it_virt_expires,
 					       timer->it.cpu.expires.cpu))
 					break;
-				goto rebalance;
+				p->signal->cputime_expires.virt_exp =
+					timer->it.cpu.expires.cpu;
+				break;
 			case CPUCLOCK_PROF:
 				if (!cputime_eq(p->signal->it_prof_expires,
 						cputime_zero) &&
@@ -652,13 +666,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 				if (i != RLIM_INFINITY &&
 				    i <= cputime_to_secs(timer->it.cpu.expires.cpu))
 					break;
-				goto rebalance;
+				p->signal->cputime_expires.prof_exp =
+					timer->it.cpu.expires.cpu;
+				break;
 			case CPUCLOCK_SCHED:
-			rebalance:
-				process_timer_rebalance(
-					timer->it.cpu.task,
-					CPUCLOCK_WHICH(timer->it_clock),
-					timer->it.cpu.expires, now);
+				p->signal->cputime_expires.sched_exp =
+					timer->it.cpu.expires.sched;
 				break;
 			}
 		}
@@ -969,13 +982,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	struct signal_struct *const sig = tsk->signal;
 
 	maxfire = 20;
-	tsk->it_prof_expires = cputime_zero;
+	tsk->cputime_expires.prof_exp = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
-			tsk->it_prof_expires = t->expires.cpu;
+			tsk->cputime_expires.prof_exp = t->expires.cpu;
 			break;
 		}
 		t->firing = 1;
@@ -984,13 +997,13 @@ static void check_thread_timers(struct task_struct *tsk,
 
 	++timers;
 	maxfire = 20;
-	tsk->it_virt_expires = cputime_zero;
+	tsk->cputime_expires.virt_exp = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
-			tsk->it_virt_expires = t->expires.cpu;
+			tsk->cputime_expires.virt_exp = t->expires.cpu;
 			break;
 		}
 		t->firing = 1;
@@ -999,13 +1012,13 @@ static void check_thread_timers(struct task_struct *tsk,
 
 	++timers;
 	maxfire = 20;
-	tsk->it_sched_expires = 0;
+	tsk->cputime_expires.sched_exp = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-			tsk->it_sched_expires = t->expires.sched;
+			tsk->cputime_expires.sched_exp = t->expires.sched;
 			break;
 		}
 		t->firing = 1;
@@ -1055,10 +1068,10 @@ static void check_process_timers(struct task_struct *tsk,
 {
 	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
-	cputime_t utime, stime, ptime, virt_expires, prof_expires;
+	cputime_t utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
-	struct task_struct *t;
 	struct list_head *timers = sig->cpu_timers;
+	struct task_cputime cputime;
 
 	/*
 	 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1087,10 @@ static void check_process_timers(struct task_struct *tsk,
 	/*
 	 * Collect the current process totals.
 	 */
-	utime = sig->utime;
-	stime = sig->stime;
-	sum_sched_runtime = sig->sum_sched_runtime;
-	t = tsk;
-	do {
-		utime = cputime_add(utime, t->utime);
-		stime = cputime_add(stime, t->stime);
-		sum_sched_runtime += t->se.sum_exec_runtime;
-		t = next_thread(t);
-	} while (t != tsk);
-	ptime = cputime_add(utime, stime);
-
+	thread_group_cputime(tsk, &cputime);
+	utime = cputime.utime;
+	ptime = cputime_add(utime, cputime.stime);
+	sum_sched_runtime = cputime.sum_exec_runtime;
 	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
@@ -1193,60 +1198,18 @@ static void check_process_timers(struct task_struct *tsk,
 		}
 	}
 
-	if (!cputime_eq(prof_expires, cputime_zero) ||
-	    !cputime_eq(virt_expires, cputime_zero) ||
-	    sched_expires != 0) {
-		/*
-		 * Rebalance the threads' expiry times for the remaining
-		 * process CPU timers.
-		 */
-
-		cputime_t prof_left, virt_left, ticks;
-		unsigned long long sched_left, sched;
-		const unsigned int nthreads = atomic_read(&sig->live);
-
-		if (!nthreads)
-			return;
-
-		prof_left = cputime_sub(prof_expires, utime);
-		prof_left = cputime_sub(prof_left, stime);
-		prof_left = cputime_div_non_zero(prof_left, nthreads);
-		virt_left = cputime_sub(virt_expires, utime);
-		virt_left = cputime_div_non_zero(virt_left, nthreads);
-		if (sched_expires) {
-			sched_left = sched_expires - sum_sched_runtime;
-			do_div(sched_left, nthreads);
-			sched_left = max_t(unsigned long long, sched_left, 1);
-		} else {
-			sched_left = 0;
-		}
-		t = tsk;
-		do {
-			if (unlikely(t->flags & PF_EXITING))
-				continue;
-
-			ticks = cputime_add(cputime_add(t->utime, t->stime),
-					    prof_left);
-			if (!cputime_eq(prof_expires, cputime_zero) &&
-			    (cputime_eq(t->it_prof_expires, cputime_zero) ||
-			     cputime_gt(t->it_prof_expires, ticks))) {
-				t->it_prof_expires = ticks;
-			}
-
-			ticks = cputime_add(t->utime, virt_left);
-			if (!cputime_eq(virt_expires, cputime_zero) &&
-			    (cputime_eq(t->it_virt_expires, cputime_zero) ||
-			     cputime_gt(t->it_virt_expires, ticks))) {
-				t->it_virt_expires = ticks;
-			}
-
-			sched = t->se.sum_exec_runtime + sched_left;
-			if (sched_expires && (t->it_sched_expires == 0 ||
-					      t->it_sched_expires > sched)) {
-				t->it_sched_expires = sched;
-			}
-		} while ((t = next_thread(t)) != tsk);
-	}
+	if (!cputime_eq(prof_expires, cputime_zero) &&
+	    (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
+	     cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
+		sig->cputime_expires.prof_exp = prof_expires;
+	if (!cputime_eq(virt_expires, cputime_zero) &&
+	    (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
+	     cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
+		sig->cputime_expires.virt_exp = virt_expires;
+	if (sched_expires != 0 &&
+	    (sig->cputime_expires.sched_exp == 0 ||
+	     sig->cputime_expires.sched_exp > sched_expires))
+		sig->cputime_expires.sched_exp = sched_expires;
 }
 
 /*
@@ -1314,6 +1277,78 @@ out:
 	++timer->it_requeue_pending;
 }
 
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime:	The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero.  Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+	if (cputime_eq(cputime->utime, cputime_zero) &&
+	    cputime_eq(cputime->stime, cputime_zero) &&
+	    cputime->sum_exec_runtime == 0)
+		return 1;
+	return 0;
+}
+
+/**
+ * task_cputime_expired - Compare two task_cputime entities.
+ *
+ * @sample:	The task_cputime structure to be checked for expiration.
+ * @expires:	Expiration times, against which @sample will be checked.
+ *
+ * Checks @sample against @expires to see if any field of @sample has expired.
+ * Returns true if any field of the former is greater than or equal to the
+ * corresponding field of the latter if the latter field is set, else false.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+					const struct task_cputime *expires)
+{
+	if (!cputime_eq(expires->utime, cputime_zero) &&
+	    cputime_ge(sample->utime, expires->utime))
+		return 1;
+	if (!cputime_eq(expires->stime, cputime_zero) &&
+	    cputime_ge(cputime_add(sample->utime, sample->stime),
+		       expires->stime))
+		return 1;
+	if (expires->sum_exec_runtime != 0 &&
+	    sample->sum_exec_runtime >= expires->sum_exec_runtime)
+		return 1;
+	return 0;
+}
+
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk:	The task (thread) being checked.
+ * @sig:	The signal pointer for that task.
+ *
+ * If there are no timers set return false.  Otherwise snapshot the task and
+ * thread group timers, then compare them with the corresponding expiration
+ * times.  Returns true if a timer has expired, else returns false.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk,
+					struct signal_struct *sig)
+{
+	struct task_cputime task_sample = {
+		.utime = tsk->utime,
+		.stime = tsk->stime,
+		.sum_exec_runtime = tsk->se.sum_exec_runtime
+	};
+	struct task_cputime group_sample;
+
+	if (task_cputime_zero(&tsk->cputime_expires) &&
+	    task_cputime_zero(&sig->cputime_expires))
+		return 0;
+	if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+		return 1;
+	thread_group_cputime(tsk, &group_sample);
+	return task_cputime_expired(&group_sample, &sig->cputime_expires);
+}
+
 /*
  * This is called from the timer interrupt handler.  The irq handler has
  * already updated our counts.  We need to check if any timers fire now.
@@ -1323,30 +1358,29 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
+	struct signal_struct *sig;
+	struct sighand_struct *sighand;
+	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
-#define UNEXPIRED(clock) \
-		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
-		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
-
-	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
-	    (tsk->it_sched_expires == 0 ||
-	     tsk->se.sum_exec_runtime < tsk->it_sched_expires))
-		return;
-
-#undef	UNEXPIRED
-
+	/* Pick up tsk->signal and make sure it's valid. */
+	sig = tsk->signal;
 	/*
-	 * Double-check with locks held.
+	 * The fast path checks that there are no expired thread or thread
+	 * group timers.  If that's so, just return.  Also check that
+	 * tsk->signal is non-NULL; this probably can't happen but cover the
+	 * possibility anyway.
 	 */
-	read_lock(&tasklist_lock);
-	if (likely(tsk->signal != NULL)) {
-		spin_lock(&tsk->sighand->siglock);
-
+	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) {
+		return;
+	}
+	sighand = lock_task_sighand(tsk, &flags);
+	if (likely(sighand)) {
 		/*
-		 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-		 * all the timers that are firing, and put them on the firing list.
+		 * Here we take off tsk->signal->cpu_timers[N] and
+		 * tsk->cpu_timers[N] all the timers that are firing, and
+		 * put them on the firing list.
 		 */
 		check_thread_timers(tsk, &firing);
 		check_process_timers(tsk, &firing);
@@ -1359,9 +1393,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 		 * that gets the timer lock before we do will give it up and
 		 * spin until we've taken care of that timer below.
 		 */
-		spin_unlock(&tsk->sighand->siglock);
 	}
-	read_unlock(&tasklist_lock);
+	unlock_task_sighand(tsk, &flags);
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1422,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 /*
  * Set one of the process-wide special case CPU timers.
- * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
- * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
- * absolute; non-null for ITIMER_*, where *newval is relative and we update
- * it to be absolute, *oldval is absolute and we update it to be relative.
+ * The tsk->sighand->siglock must be held by the caller.
+ * The *newval argument is relative and we update it to be absolute, *oldval
+ * is absolute and we update it to be relative.
  */
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval)
@@ -1435,13 +1467,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	    cputime_ge(list_first_entry(head,
 				  struct cpu_timer_list, entry)->expires.cpu,
 		       *newval)) {
-		/*
-		 * Rejigger each thread's expiry time so that one will
-		 * notice before we hit the process-cumulative expiry time.
-		 */
-		union cpu_time_count expires = { .sched = 0 };
-		expires.cpu = *newval;
-		process_timer_rebalance(tsk, clock_idx, expires, now);
+		switch (clock_idx) {
+		case CPUCLOCK_PROF:
+			tsk->signal->cputime_expires.prof_exp = *newval;
+			break;
+		case CPUCLOCK_VIRT:
+			tsk->signal->cputime_expires.virt_exp = *newval;
+			break;
+		}
 	}
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..c51b5d276665 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4036,6 +4036,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
+ */
+static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	if (task_current(rq, p)) {
+		u64 delta_exec;
+
+		update_rq_clock(rq);
+		delta_exec = rq->clock - p->se.exec_start;
+		if ((s64)delta_exec > 0)
+			return delta_exec;
+	}
+	return 0;
+}
+
 /*
  * Return p->sum_exec_runtime plus any more ns on the sched_clock
  * that have not yet been banked in case the task is currently running.
@@ -4043,17 +4062,31 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long flags;
-	u64 ns, delta_exec;
+	u64 ns;
 	struct rq *rq;
 
 	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
-	if (task_current(rq, p)) {
-		update_rq_clock(rq);
-		delta_exec = rq->clock - p->se.exec_start;
-		if ((s64)delta_exec > 0)
-			ns += delta_exec;
-	}
+	ns = p->se.sum_exec_runtime + task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
+
+	return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group plus any more ns on the
+ * sched_clock that have not yet been banked in case the task is currently
+ * running.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	u64 ns;
+	struct rq *rq;
+	struct task_cputime totals;
+
+	rq = task_rq_lock(p, &flags);
+	thread_group_cputime(p, &totals);
+	ns = totals.sum_exec_runtime + task_delta_exec(p, rq);
 	task_rq_unlock(rq, &flags);
 
 	return ns;
@@ -4070,6 +4103,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 	cputime64_t tmp;
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4128,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
 	tmp = cputime_to_cputime64(cputime);
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4164,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	}
 
 	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4206,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..99aa31acc544 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -507,6 +507,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
 	}
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..8375e69af36a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -483,6 +483,8 @@ static void update_curr_rt(struct rq *rq)
 	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 
 	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
@@ -1412,7 +1414,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 		p->rt.timeout++;
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
 		if (p->rt.timeout > next)
-			p->it_sched_expires = p->se.sum_exec_runtime;
+			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
 	}
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..6eea5826d618 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
+	struct task_cputime cputime;
 	int ret = sig;
 
 	BUG_ON(sig == -1);
@@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	info.si_uid = tsk->uid;
 
-	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-						       tsk->signal->utime));
-	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
-						       tsk->signal->stime));
+	thread_group_cputime(tsk, &cputime);
+	info.si_utime = cputime_to_jiffies(cputime.utime);
+	info.si_stime = cputime_to_jiffies(cputime.stime);
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..d046a7a055c2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	return old_fsgid;
 }
 
+void do_sys_times(struct tms *tms)
+{
+	struct task_cputime cputime;
+	cputime_t cutime, cstime;
+
+	spin_lock_irq(&current->sighand->siglock);
+	thread_group_cputime(current, &cputime);
+	cutime = current->signal->cutime;
+	cstime = current->signal->cstime;
+	spin_unlock_irq(&current->sighand->siglock);
+	tms->tms_utime = cputime_to_clock_t(cputime.utime);
+	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_cutime = cputime_to_clock_t(cutime);
+	tms->tms_cstime = cputime_to_clock_t(cstime);
+}
+
 asmlinkage long sys_times(struct tms __user * tbuf)
 {
-	/*
-	 *	In the SMP world we might just be unlucky and have one of
-	 *	the times increment as we use it. Since the value is an
-	 *	atomically safe type this is just fine. Conceptually its
-	 *	as if the syscall took an instant longer to occur.
-	 */
 	if (tbuf) {
 		struct tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
-		cputime_t utime, stime, cutime, cstime;
-
-		spin_lock_irq(&tsk->sighand->siglock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-
-		tmp.tms_utime = cputime_to_clock_t(utime);
-		tmp.tms_stime = cputime_to_clock_t(stime);
-		tmp.tms_cutime = cputime_to_clock_t(cutime);
-		tmp.tms_cstime = cputime_to_clock_t(cstime);
+
+		do_sys_times(&tmp);
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
@@ -1445,7 +1435,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 {
 	struct rlimit new_rlim, *old_rlim;
-	unsigned long it_prof_secs;
 	int retval;
 
 	if (resource >= RLIM_NLIMITS)
@@ -1491,18 +1480,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	if (new_rlim.rlim_cur == RLIM_INFINITY)
 		goto out;
 
-	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
-	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
-		unsigned long rlim_cur = new_rlim.rlim_cur;
-		cputime_t cputime;
-
-		cputime = secs_to_cputime(rlim_cur);
-		read_lock(&tasklist_lock);
-		spin_lock_irq(&current->sighand->siglock);
-		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
-		spin_unlock_irq(&current->sighand->siglock);
-		read_unlock(&tasklist_lock);
-	}
+	update_rlimit_cpu(new_rlim.rlim_cur);
 out:
 	return 0;
 }
@@ -1540,11 +1518,8 @@ out:
  *
  */
 
-static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
-				     cputime_t *utimep, cputime_t *stimep)
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
 {
-	*utimep = cputime_add(*utimep, t->utime);
-	*stimep = cputime_add(*stimep, t->stime);
 	r->ru_nvcsw += t->nvcsw;
 	r->ru_nivcsw += t->nivcsw;
 	r->ru_minflt += t->min_flt;
@@ -1558,12 +1533,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	cputime_t utime, stime;
+	struct task_cputime cputime;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		accumulate_thread_rusage(p, r, &utime, &stime);
+		accumulate_thread_rusage(p, r);
 		goto out;
 	}
 
@@ -1586,8 +1562,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			utime = cputime_add(utime, p->signal->utime);
-			stime = cputime_add(stime, p->signal->stime);
+			thread_group_cputime(p, &cputime);
+			utime = cputime_add(utime, cputime.utime);
+			stime = cputime_add(stime, cputime.stime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
@@ -1596,7 +1573,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_oublock += p->signal->oublock;
 			t = p;
 			do {
-				accumulate_thread_rusage(t, r, &utime, &stime);
+				accumulate_thread_rusage(t, r);
 				t = next_thread(t);
 			} while (t != p);
 			break;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 03fc6a81ae32..69649783c266 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -75,6 +75,7 @@
 #include <linux/string.h>
 #include <linux/selinux.h>
 #include <linux/mutex.h>
+#include <linux/posix-timers.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -2321,13 +2322,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 			initrlim = init_task.signal->rlim+i;
 			rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
 		}
-		if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-			/*
-			 * This will cause RLIMIT_CPU calculations
-			 * to be refigured.
-			 */
-			current->it_prof_expires = jiffies_to_cputime(1);
-		}
+		update_rlimit_cpu(rlim->rlim_cur);
 	}
 
 	/* Wake up the parent if it is waiting so that it can
-- 
cgit v1.2.3


From 0a8eaa4f9b58759595a1bfe13a1295fdc25ba026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:03:52 +0200
Subject: timers: fix itimer/many thread hang, fix #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix the UP build:

In file included from arch/x86/kernel/asm-offsets_32.c:9,
                 from arch/x86/kernel/asm-offsets.c:3:
include/linux/sched.h: In function ‘thread_group_cputime_clone_thread’:
include/linux/sched.h:2272: warning: no return statement in function returning non-void
include/linux/sched.h: In function ‘thread_group_cputime_account_user’:
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_system’:
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_exec_runtime’:
include/linux/sched.h:2298: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
distcc[14501] ERROR: compile arch/x86/kernel/asm-offsets.c on a/30 failed
make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26d7a5f2d0ba..ed355f02d329 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2251,6 +2251,7 @@ static inline void thread_group_cputime_free(struct signal_struct *sig)
 static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
 						     struct task_struct *tsk)
 {
+	return 0;
 }
 
 static inline void thread_group_cputime(struct task_struct *tsk,
@@ -2263,21 +2264,21 @@ static inline void thread_group_cputime_account_user(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
 }
 
 static inline void thread_group_cputime_account_system(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
 }
 
 static inline void thread_group_cputime_account_exec_runtime(
 	struct thread_group_cputime *tgtimes,
 	unsigned long long ns)
 {
-	tgtimes->totals->sum_exec_runtime += ns;
+	tgtimes->totals.sum_exec_runtime += ns;
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:11:46 +0200
Subject: timers: fix itimer/many thread hang, cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     | 2 +-
 kernel/posix-cpu-timers.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ed355f02d329..7ce8d4e53565 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -430,7 +430,7 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * 
+ *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
  * CPU time want to group these counts together and treat all three
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index dba1c334c3e8..9a7ea049fcdc 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -94,7 +94,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
 
 	cputime = secs_to_cputime(rlim_new);
 	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-            cputime_lt(current->signal->it_prof_expires, cputime)) {
+	    cputime_lt(current->signal->it_prof_expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
@@ -1372,9 +1372,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * tsk->signal is non-NULL; this probably can't happen but cover the
 	 * possibility anyway.
 	 */
-	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) {
+	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig))
 		return;
-	}
+
 	sighand = lock_task_sighand(tsk, &flags);
 	if (likely(sighand)) {
 		/*
-- 
cgit v1.2.3


From ccbe329bcd87924baed96474ec0a6725e3957897 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Wed, 17 Sep 2008 16:34:03 +0100
Subject: bitmap: add bitmap_copy_le()

bitmap_copy_le() copies a bitmap, putting the bits into little-endian
order (i.e., each unsigned long word in the bitmap is put into
little-endian order).

The UWB stack used bitmaps to manage Medium Access Slot availability,
and these bitmaps need to be written to the hardware in LE order.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 include/linux/bitmap.h |  1 +
 lib/bitmap.c           | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 89781fd48859..5379913aca52 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -130,6 +130,7 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
+extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
 
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 06fb57c86de0..c2006bfeea41 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1007,3 +1007,25 @@ int bitmap_allocate_region(unsigned long *bitmap, int pos, int order)
 	return 0;
 }
 EXPORT_SYMBOL(bitmap_allocate_region);
+
+/**
+ * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
+ * @dst:   destination buffer
+ * @src:   bitmap to copy
+ * @nbits: number of bits in the bitmap
+ *
+ * Require nbits % BITS_PER_LONG == 0.
+ */
+void bitmap_copy_le(void *dst, const unsigned long *src, int nbits)
+{
+	unsigned long *d = dst;
+	int i;
+
+	for (i = 0; i < nbits/BITS_PER_LONG; i++) {
+		if (BITS_PER_LONG == 64)
+			d[i] = cpu_to_le64(src[i]);
+		else
+			d[i] = cpu_to_le32(src[i]);
+	}
+}
+EXPORT_SYMBOL(bitmap_copy_le);
-- 
cgit v1.2.3


From 34e95e41f1fd751e33a7eb3fa66594903b81f13d Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
Date: Wed, 17 Sep 2008 16:34:05 +0100
Subject: uwb: add the uwb include files

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 include/linux/uwb.h           | 761 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/uwb/debug-cmd.h |  57 ++++
 include/linux/uwb/debug.h     |  82 +++++
 include/linux/uwb/spec.h      | 727 ++++++++++++++++++++++++++++++++++++++++
 include/linux/wlp.h           | 735 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 2362 insertions(+)
 create mode 100644 include/linux/uwb.h
 create mode 100644 include/linux/uwb/debug-cmd.h
 create mode 100644 include/linux/uwb/debug.h
 create mode 100644 include/linux/uwb/spec.h
 create mode 100644 include/linux/wlp.h

(limited to 'include/linux')

diff --git a/include/linux/uwb.h b/include/linux/uwb.h
new file mode 100644
index 000000000000..0cd35937e120
--- /dev/null
+++ b/include/linux/uwb.h
@@ -0,0 +1,761 @@
+/*
+ * Ultra Wide Band
+ * UWB API
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * FIXME: doc: overview of the API, different parts and pointers
+ */
+
+#ifndef __LINUX__UWB_H__
+#define __LINUX__UWB_H__
+
+#include <linux/limits.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/uwb/spec.h>
+
+struct uwb_dev;
+struct uwb_beca_e;
+struct uwb_rc;
+struct uwb_rsv;
+struct uwb_dbg;
+
+/**
+ * struct uwb_dev - a UWB Device
+ * @rc: UWB Radio Controller that discovered the device (kind of its
+ *     parent).
+ * @bce: a beacon cache entry for this device; or NULL if the device
+ *     is a local radio controller.
+ * @mac_addr: the EUI-48 address of this device.
+ * @dev_addr: the current DevAddr used by this device.
+ * @beacon_slot: the slot number the beacon is using.
+ * @streams: bitmap of streams allocated to reservations targeted at
+ *     this device.  For an RC, this is the streams allocated for
+ *     reservations targeted at DevAddrs.
+ *
+ * A UWB device may either be a neighbor or part of a local radio
+ * controller.
+ */
+struct uwb_dev {
+	struct mutex mutex;
+	struct list_head list_node;
+	struct device dev;
+	struct uwb_rc *rc;		/* radio controller */
+	struct uwb_beca_e *bce;		/* Beacon Cache Entry */
+
+	struct uwb_mac_addr mac_addr;
+	struct uwb_dev_addr dev_addr;
+	int beacon_slot;
+	DECLARE_BITMAP(streams, UWB_NUM_STREAMS);
+};
+#define to_uwb_dev(d) container_of(d, struct uwb_dev, dev)
+
+/**
+ * UWB HWA/WHCI Radio Control {Command|Event} Block context IDs
+ *
+ * RC[CE]Bs have a 'context ID' field that matches the command with
+ * the event received to confirm it.
+ *
+ * Maximum number of context IDs
+ */
+enum { UWB_RC_CTX_MAX = 256 };
+
+
+/** Notification chain head for UWB generated events to listeners */
+struct uwb_notifs_chain {
+	struct list_head list;
+	struct mutex mutex;
+};
+
+/**
+ * struct uwb_mas_bm - a bitmap of all MAS in a superframe
+ * @bm: a bitmap of length #UWB_NUM_MAS
+ */
+struct uwb_mas_bm {
+	DECLARE_BITMAP(bm, UWB_NUM_MAS);
+};
+
+/**
+ * uwb_rsv_state - UWB Reservation state.
+ *
+ * NONE - reservation is not active (no DRP IE being transmitted).
+ *
+ * Owner reservation states:
+ *
+ * INITIATED - owner has sent an initial DRP request.
+ * PENDING - target responded with pending Reason Code.
+ * MODIFIED - reservation manager is modifying an established
+ * reservation with a different MAS allocation.
+ * ESTABLISHED - the reservation has been successfully negotiated.
+ *
+ * Target reservation states:
+ *
+ * DENIED - request is denied.
+ * ACCEPTED - request is accepted.
+ * PENDING - PAL has yet to make a decision to whether to accept or
+ * deny.
+ *
+ * FIXME: further target states TBD.
+ */
+enum uwb_rsv_state {
+	UWB_RSV_STATE_NONE,
+	UWB_RSV_STATE_O_INITIATED,
+	UWB_RSV_STATE_O_PENDING,
+	UWB_RSV_STATE_O_MODIFIED,
+	UWB_RSV_STATE_O_ESTABLISHED,
+	UWB_RSV_STATE_T_ACCEPTED,
+	UWB_RSV_STATE_T_DENIED,
+	UWB_RSV_STATE_T_PENDING,
+
+	UWB_RSV_STATE_LAST,
+};
+
+enum uwb_rsv_target_type {
+	UWB_RSV_TARGET_DEV,
+	UWB_RSV_TARGET_DEVADDR,
+};
+
+/**
+ * struct uwb_rsv_target - the target of a reservation.
+ *
+ * Reservations are unicast and targeted at a single device
+ * (UWB_RSV_TARGET_DEV); or (e.g., in the case of WUSB) targeted at a
+ * specific (private) DevAddr (UWB_RSV_TARGET_DEVADDR).
+ */
+struct uwb_rsv_target {
+	enum uwb_rsv_target_type type;
+	union {
+		struct uwb_dev *dev;
+		struct uwb_dev_addr devaddr;
+	};
+};
+
+/*
+ * Number of streams reserved for reservations targeted at DevAddrs.
+ */
+#define UWB_NUM_GLOBAL_STREAMS 1
+
+typedef void (*uwb_rsv_cb_f)(struct uwb_rsv *rsv);
+
+/**
+ * struct uwb_rsv - a DRP reservation
+ *
+ * Data structure management:
+ *
+ * @rc:             the radio controller this reservation is for
+ *                  (as target or owner)
+ * @rc_node:        a list node for the RC
+ * @pal_node:       a list node for the PAL
+ *
+ * Owner and target parameters:
+ *
+ * @owner:          the UWB device owning this reservation
+ * @target:         the target UWB device
+ * @type:           reservation type
+ *
+ * Owner parameters:
+ *
+ * @max_mas:        maximum number of MAS
+ * @min_mas:        minimum number of MAS
+ * @sparsity:       owner selected sparsity
+ * @is_multicast:   true iff multicast
+ *
+ * @callback:       callback function when the reservation completes
+ * @pal_priv:       private data for the PAL making the reservation
+ *
+ * Reservation status:
+ *
+ * @state:          negotiation state
+ * @stream:         stream index allocated for this reservation
+ * @mas:            reserved MAS
+ * @drp_ie:         the DRP IE
+ * @ie_valid:       true iff the DRP IE matches the reservation parameters
+ *
+ * DRP reservations are uniquely identified by the owner, target and
+ * stream index.  However, when using a DevAddr as a target (e.g., for
+ * a WUSB cluster reservation) the responses may be received from
+ * devices with different DevAddrs.  In this case, reservations are
+ * uniquely identified by just the stream index.  A number of stream
+ * indexes (UWB_NUM_GLOBAL_STREAMS) are reserved for this.
+ */
+struct uwb_rsv {
+	struct uwb_rc *rc;
+	struct list_head rc_node;
+	struct list_head pal_node;
+
+	struct uwb_dev *owner;
+	struct uwb_rsv_target target;
+	enum uwb_drp_type type;
+	int max_mas;
+	int min_mas;
+	int sparsity;
+	bool is_multicast;
+
+	uwb_rsv_cb_f callback;
+	void *pal_priv;
+
+	enum uwb_rsv_state state;
+	u8 stream;
+	struct uwb_mas_bm mas;
+	struct uwb_ie_drp *drp_ie;
+	bool ie_valid;
+	struct timer_list timer;
+	bool expired;
+};
+
+static const
+struct uwb_mas_bm uwb_mas_bm_zero = { .bm = { 0 } };
+
+static inline void uwb_mas_bm_copy_le(void *dst, const struct uwb_mas_bm *mas)
+{
+	bitmap_copy_le(dst, mas->bm, UWB_NUM_MAS);
+}
+
+/**
+ * struct uwb_drp_avail - a radio controller's view of MAS usage
+ * @global:   MAS unused by neighbors (excluding reservations targeted
+ *            or owned by the local radio controller) or the beacon period
+ * @local:    MAS unused by local established reservations
+ * @pending:  MAS unused by local pending reservations
+ * @ie:       DRP Availability IE to be included in the beacon
+ * @ie_valid: true iff @ie is valid and does not need to be regenerated from
+ *            @global and @local
+ *
+ * Each radio controller maintains a view of MAS usage or
+ * availability. MAS available for a new reservation are determined
+ * from the intersection of @global, @local, and @pending.
+ *
+ * The radio controller must transmit a DRP Availability IE that's the
+ * intersection of @global and @local.
+ *
+ * A set bit indicates the MAS is unused and available.
+ *
+ * rc->rsvs_mutex should be held before accessing this data structure.
+ *
+ * [ECMA-368] section 17.4.3.
+ */
+struct uwb_drp_avail {
+	DECLARE_BITMAP(global, UWB_NUM_MAS);
+	DECLARE_BITMAP(local, UWB_NUM_MAS);
+	DECLARE_BITMAP(pending, UWB_NUM_MAS);
+	struct uwb_ie_drp_avail ie;
+	bool ie_valid;
+};
+
+
+const char *uwb_rsv_state_str(enum uwb_rsv_state state);
+const char *uwb_rsv_type_str(enum uwb_drp_type type);
+
+struct uwb_rsv *uwb_rsv_create(struct uwb_rc *rc, uwb_rsv_cb_f cb,
+			       void *pal_priv);
+void uwb_rsv_destroy(struct uwb_rsv *rsv);
+
+int uwb_rsv_establish(struct uwb_rsv *rsv);
+int uwb_rsv_modify(struct uwb_rsv *rsv,
+		   int max_mas, int min_mas, int sparsity);
+void uwb_rsv_terminate(struct uwb_rsv *rsv);
+
+void uwb_rsv_accept(struct uwb_rsv *rsv, uwb_rsv_cb_f cb, void *pal_priv);
+
+/**
+ * Radio Control Interface instance
+ *
+ *
+ * Life cycle rules: those of the UWB Device.
+ *
+ * @index:    an index number for this radio controller, as used in the
+ *            device name.
+ * @version:  version of protocol supported by this device
+ * @priv:     Backend implementation; rw with uwb_dev.dev.sem taken.
+ * @cmd:      Backend implementation to execute commands; rw and call
+ *            only  with uwb_dev.dev.sem taken.
+ * @reset:    Hardware reset of radio controller and any PAL controllers.
+ * @filter:   Backend implementation to manipulate data to and from device
+ *            to be compliant to specification assumed by driver (WHCI
+ *            0.95).
+ *
+ *            uwb_dev.dev.mutex is used to execute commands and update
+ *            the corresponding structures; can't use a spinlock
+ *            because rc->cmd() can sleep.
+ * @ies:         This is a dynamically allocated array caching the
+ *               IEs (settable by the host) that the beacon of this
+ *               radio controller is currently sending.
+ *
+ *               In reality, we store here the full command we set to
+ *               the radio controller (which is basically a command
+ *               prefix followed by all the IEs the beacon currently
+ *               contains). This way we don't have to realloc and
+ *               memcpy when setting it.
+ *
+ *               We set this up in uwb_rc_ie_setup(), where we alloc
+ *               this struct, call get_ie() [so we know which IEs are
+ *               currently being sent, if any].
+ *
+ * @ies_capacity:Amount of space (in bytes) allocated in @ies. The
+ *               amount used is given by sizeof(*ies) plus ies->wIELength
+ *               (which is a little endian quantity all the time).
+ * @ies_mutex:   protect the IE cache
+ * @dbg:         information for the debug interface
+ */
+struct uwb_rc {
+	struct uwb_dev uwb_dev;
+	int index;
+	u16 version;
+
+	struct module *owner;
+	void *priv;
+	int (*start)(struct uwb_rc *rc);
+	void (*stop)(struct uwb_rc *rc);
+	int (*cmd)(struct uwb_rc *, const struct uwb_rccb *, size_t);
+	int (*reset)(struct uwb_rc *rc);
+	int (*filter_cmd)(struct uwb_rc *, struct uwb_rccb **, size_t *);
+	int (*filter_event)(struct uwb_rc *, struct uwb_rceb **, const size_t,
+			    size_t *, size_t *);
+
+	spinlock_t neh_lock;		/* protects neh_* and ctx_* */
+	struct list_head neh_list;	/* Open NE handles */
+	unsigned long ctx_bm[UWB_RC_CTX_MAX / 8 / sizeof(unsigned long)];
+	u8 ctx_roll;
+
+	int beaconing;			/* Beaconing state [channel number] */
+	int scanning;
+	enum uwb_scan_type scan_type:3;
+	unsigned ready:1;
+	struct uwb_notifs_chain notifs_chain;
+
+	struct uwb_drp_avail drp_avail;
+	struct list_head reservations;
+	struct mutex rsvs_mutex;
+	struct workqueue_struct *rsv_workq;
+	struct work_struct rsv_update_work;
+
+	struct mutex ies_mutex;
+	struct uwb_rc_cmd_set_ie *ies;
+	size_t ies_capacity;
+
+	spinlock_t pal_lock;
+	struct list_head pals;
+
+	struct uwb_dbg *dbg;
+};
+
+
+/**
+ * struct uwb_pal - a UWB PAL
+ * @new_rsv: called when a peer requests a reservation (may be NULL if
+ *           the PAL cannot accept reservation requests).
+ *
+ * A Protocol Adaptation Layer (PAL) is a user of the WiMedia UWB
+ * radio platform (e.g., WUSB, WLP or Bluetooth UWB AMP).
+ *
+ * The PALs using a radio controller must register themselves to
+ * permit the UWB stack to coordinate usage of the radio between the
+ * various PALs or to allow PALs to respond to certain requests from
+ * peers.
+ *
+ * A struct uwb_pal should be embedded in a containing structure
+ * belonging to the PAL and initialized with uwb_pal_init().  Fields
+ * should be set appropriately by the PAL before registering the PAL
+ * with uwb_pal_register().
+ */
+struct uwb_pal {
+	struct list_head node;
+
+	void (*new_rsv)(struct uwb_rsv *rsv);
+};
+
+void uwb_pal_init(struct uwb_pal *pal);
+int uwb_pal_register(struct uwb_rc *rc, struct uwb_pal *pal);
+void uwb_pal_unregister(struct uwb_rc *rc, struct uwb_pal *pal);
+
+/*
+ * General public API
+ *
+ * This API can be used by UWB device drivers or by those implementing
+ * UWB Radio Controllers
+ */
+struct uwb_dev *uwb_dev_get_by_devaddr(struct uwb_rc *rc,
+				       const struct uwb_dev_addr *devaddr);
+struct uwb_dev *uwb_dev_get_by_rc(struct uwb_dev *, struct uwb_rc *);
+static inline void uwb_dev_get(struct uwb_dev *uwb_dev)
+{
+	get_device(&uwb_dev->dev);
+}
+static inline void uwb_dev_put(struct uwb_dev *uwb_dev)
+{
+	put_device(&uwb_dev->dev);
+}
+struct uwb_dev *uwb_dev_try_get(struct uwb_rc *rc, struct uwb_dev *uwb_dev);
+
+/**
+ * Callback function for 'uwb_{dev,rc}_foreach()'.
+ *
+ * @dev:  Linux device instance
+ *        'uwb_dev = container_of(dev, struct uwb_dev, dev)'
+ * @priv: Data passed by the caller to 'uwb_{dev,rc}_foreach()'.
+ *
+ * @returns: 0 to continue the iterations, any other val to stop
+ *           iterating and return the value to the caller of
+ *           _foreach().
+ */
+typedef int (*uwb_dev_for_each_f)(struct device *dev, void *priv);
+int uwb_dev_for_each(struct uwb_rc *rc, uwb_dev_for_each_f func, void *priv);
+
+struct uwb_rc *uwb_rc_alloc(void);
+struct uwb_rc *uwb_rc_get_by_dev(const struct uwb_dev_addr *);
+struct uwb_rc *uwb_rc_get_by_grandpa(const struct device *);
+void uwb_rc_put(struct uwb_rc *rc);
+
+typedef void (*uwb_rc_cmd_cb_f)(struct uwb_rc *rc, void *arg,
+                                struct uwb_rceb *reply, ssize_t reply_size);
+
+int uwb_rc_cmd_async(struct uwb_rc *rc, const char *cmd_name,
+		     struct uwb_rccb *cmd, size_t cmd_size,
+		     u8 expected_type, u16 expected_event,
+		     uwb_rc_cmd_cb_f cb, void *arg);
+ssize_t uwb_rc_cmd(struct uwb_rc *rc, const char *cmd_name,
+		   struct uwb_rccb *cmd, size_t cmd_size,
+		   struct uwb_rceb *reply, size_t reply_size);
+ssize_t uwb_rc_vcmd(struct uwb_rc *rc, const char *cmd_name,
+		    struct uwb_rccb *cmd, size_t cmd_size,
+		    u8 expected_type, u16 expected_event,
+		    struct uwb_rceb **preply);
+ssize_t uwb_rc_get_ie(struct uwb_rc *, struct uwb_rc_evt_get_ie **);
+int uwb_bg_joined(struct uwb_rc *rc);
+
+size_t __uwb_addr_print(char *, size_t, const unsigned char *, int);
+
+int uwb_rc_dev_addr_set(struct uwb_rc *, const struct uwb_dev_addr *);
+int uwb_rc_dev_addr_get(struct uwb_rc *, struct uwb_dev_addr *);
+int uwb_rc_mac_addr_set(struct uwb_rc *, const struct uwb_mac_addr *);
+int uwb_rc_mac_addr_get(struct uwb_rc *, struct uwb_mac_addr *);
+int __uwb_mac_addr_assigned_check(struct device *, void *);
+int __uwb_dev_addr_assigned_check(struct device *, void *);
+
+/* Print in @buf a pretty repr of @addr */
+static inline size_t uwb_dev_addr_print(char *buf, size_t buf_size,
+					const struct uwb_dev_addr *addr)
+{
+	return __uwb_addr_print(buf, buf_size, addr->data, 0);
+}
+
+/* Print in @buf a pretty repr of @addr */
+static inline size_t uwb_mac_addr_print(char *buf, size_t buf_size,
+					const struct uwb_mac_addr *addr)
+{
+	return __uwb_addr_print(buf, buf_size, addr->data, 1);
+}
+
+/* @returns 0 if device addresses @addr2 and @addr1 are equal */
+static inline int uwb_dev_addr_cmp(const struct uwb_dev_addr *addr1,
+				   const struct uwb_dev_addr *addr2)
+{
+	return memcmp(addr1, addr2, sizeof(*addr1));
+}
+
+/* @returns 0 if MAC addresses @addr2 and @addr1 are equal */
+static inline int uwb_mac_addr_cmp(const struct uwb_mac_addr *addr1,
+				   const struct uwb_mac_addr *addr2)
+{
+	return memcmp(addr1, addr2, sizeof(*addr1));
+}
+
+/* @returns !0 if a MAC @addr is a broadcast address */
+static inline int uwb_mac_addr_bcast(const struct uwb_mac_addr *addr)
+{
+	struct uwb_mac_addr bcast = {
+		.data = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }
+	};
+	return !uwb_mac_addr_cmp(addr, &bcast);
+}
+
+/* @returns !0 if a MAC @addr is all zeroes */
+static inline int uwb_mac_addr_unset(const struct uwb_mac_addr *addr)
+{
+	struct uwb_mac_addr unset = {
+		.data = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
+	};
+	return !uwb_mac_addr_cmp(addr, &unset);
+}
+
+/* @returns !0 if the address is in use. */
+static inline unsigned __uwb_dev_addr_assigned(struct uwb_rc *rc,
+					       struct uwb_dev_addr *addr)
+{
+	return uwb_dev_for_each(rc, __uwb_dev_addr_assigned_check, addr);
+}
+
+/*
+ * UWB Radio Controller API
+ *
+ * This API is used (in addition to the general API) to implement UWB
+ * Radio Controllers.
+ */
+void uwb_rc_init(struct uwb_rc *);
+int uwb_rc_add(struct uwb_rc *, struct device *dev, void *rc_priv);
+void uwb_rc_rm(struct uwb_rc *);
+void uwb_rc_neh_grok(struct uwb_rc *, void *, size_t);
+void uwb_rc_neh_error(struct uwb_rc *, int);
+void uwb_rc_reset_all(struct uwb_rc *rc);
+
+/**
+ * uwb_rsv_is_owner - is the owner of this reservation the RC?
+ * @rsv: the reservation
+ */
+static inline bool uwb_rsv_is_owner(struct uwb_rsv *rsv)
+{
+	return rsv->owner == &rsv->rc->uwb_dev;
+}
+
+/**
+ * Events generated by UWB that can be passed to any listeners
+ *
+ * Higher layers can register callback functions with the radio
+ * controller using uwb_notifs_register(). The radio controller
+ * maintains a list of all registered handlers and will notify all
+ * nodes when an event occurs.
+ */
+enum uwb_notifs {
+	UWB_NOTIF_BG_JOIN = 0,	/* radio controller joined a beacon group */
+	UWB_NOTIF_BG_LEAVE = 1,	/* radio controller left a beacon group */
+	UWB_NOTIF_ONAIR,
+	UWB_NOTIF_OFFAIR,
+};
+
+/* Callback function registered with UWB */
+struct uwb_notifs_handler {
+	struct list_head list_node;
+	void (*cb)(void *, struct uwb_dev *, enum uwb_notifs);
+	void *data;
+};
+
+int uwb_notifs_register(struct uwb_rc *, struct uwb_notifs_handler *);
+int uwb_notifs_deregister(struct uwb_rc *, struct uwb_notifs_handler *);
+
+
+/**
+ * UWB radio controller Event Size Entry (for creating entry tables)
+ *
+ * WUSB and WHCI define events and notifications, and they might have
+ * fixed or variable size.
+ *
+ * Each event/notification has a size which is not necessarily known
+ * in advance based on the event code. As well, vendor specific
+ * events/notifications will have a size impossible to determine
+ * unless we know about the device's specific details.
+ *
+ * It was way too smart of the spec writers not to think that it would
+ * be impossible for a generic driver to skip over vendor specific
+ * events/notifications if there are no LENGTH fields in the HEADER of
+ * each message...the transaction size cannot be counted on as the
+ * spec does not forbid to pack more than one event in a single
+ * transaction.
+ *
+ * Thus, we guess sizes with tables (or for events, when you know the
+ * size ahead of time you can use uwb_rc_neh_extra_size*()). We
+ * register tables with the known events and their sizes, and then we
+ * traverse those tables. For those with variable length, we provide a
+ * way to lookup the size inside the event/notification's
+ * payload. This allows device-specific event size tables to be
+ * registered.
+ *
+ * @size:   Size of the payload
+ *
+ * @offset: if != 0, at offset @offset-1 starts a field with a length
+ *          that has to be added to @size. The format of the field is
+ *          given by @type.
+ *
+ * @type:   Type and length of the offset field. Most common is LE 16
+ *          bits (that's why that is zero); others are there mostly to
+ *          cover for bugs and weirdos.
+ */
+struct uwb_est_entry {
+	size_t size;
+	unsigned offset;
+	enum { UWB_EST_16 = 0, UWB_EST_8 = 1 } type;
+};
+
+int uwb_est_register(u8 type, u8 code_high, u16 vendor, u16 product,
+		     const struct uwb_est_entry *, size_t entries);
+int uwb_est_unregister(u8 type, u8 code_high, u16 vendor, u16 product,
+		       const struct uwb_est_entry *, size_t entries);
+ssize_t uwb_est_find_size(struct uwb_rc *rc, const struct uwb_rceb *rceb,
+			  size_t len);
+
+/* -- Misc */
+
+enum {
+	EDC_MAX_ERRORS = 10,
+	EDC_ERROR_TIMEFRAME = HZ,
+};
+
+/* error density counter */
+struct edc {
+	unsigned long timestart;
+	u16 errorcount;
+};
+
+static inline
+void edc_init(struct edc *edc)
+{
+	edc->timestart = jiffies;
+}
+
+/* Called when an error occurred.
+ * This is a way to determine if the number of acceptable errors per time
+ * period has been exceeded. It is not accurate as there are cases in which
+ * this scheme will not work, for example if there are periodic occurrences
+ * of errors that straddle updates to the start time. This scheme is
+ * sufficient for our usage.
+ *
+ * @returns 1 if maximum acceptable errors per timeframe has been exceeded.
+ */
+static inline int edc_inc(struct edc *err_hist, u16 max_err, u16 timeframe)
+{
+	unsigned long now;
+
+	now = jiffies;
+	if (now - err_hist->timestart > timeframe) {
+		err_hist->errorcount = 1;
+		err_hist->timestart = now;
+	} else if (++err_hist->errorcount > max_err) {
+			err_hist->errorcount = 0;
+			err_hist->timestart = now;
+			return 1;
+	}
+	return 0;
+}
+
+
+/* Information Element handling */
+
+/* For representing the state of writing to a buffer when iterating */
+struct uwb_buf_ctx {
+	char *buf;
+	size_t bytes, size;
+};
+
+typedef int (*uwb_ie_f)(struct uwb_dev *, const struct uwb_ie_hdr *,
+			size_t, void *);
+struct uwb_ie_hdr *uwb_ie_next(void **ptr, size_t *len);
+ssize_t uwb_ie_for_each(struct uwb_dev *uwb_dev, uwb_ie_f fn, void *data,
+			const void *buf, size_t size);
+int uwb_ie_dump_hex(struct uwb_dev *, const struct uwb_ie_hdr *,
+		    size_t, void *);
+int uwb_rc_set_ie(struct uwb_rc *, struct uwb_rc_cmd_set_ie *);
+struct uwb_ie_hdr *uwb_ie_next(void **ptr, size_t *len);
+
+
+/*
+ * Transmission statistics
+ *
+ * UWB uses LQI and RSSI (one byte values) for reporting radio signal
+ * strength and line quality indication. We do quick and dirty
+ * averages of those. They are signed values, btw.
+ *
+ * For 8 bit quantities, we keep the min, the max, an accumulator
+ * (@sigma) and a # of samples. When @samples gets to 255, we compute
+ * the average (@sigma / @samples), place it in @sigma and reset
+ * @samples to 1 (so we use it as the first sample).
+ *
+ * Now, statistically speaking, probably I am kicking the kidneys of
+ * some books I have in my shelves collecting dust, but I just want to
+ * get an approx, not the Nobel.
+ *
+ * LOCKING: there is no locking per se, but we try to keep a lockless
+ * schema. Only _add_sample() modifies the values--as long as you
+ * have other locking on top that makes sure that no two calls of
+ * _add_sample() happen at the same time, then we are fine. Now, for
+ * resetting the values we just set @samples to 0 and that makes the
+ * next _add_sample() to start with defaults. Reading the values in
+ * _show() currently can race, so you need to make sure the calls are
+ * under the same lock that protects calls to _add_sample(). FIXME:
+ * currently unlocked (It is not ultraprecise but does the trick. Bite
+ * me).
+ */
+struct stats {
+	s8 min, max;
+	s16 sigma;
+	atomic_t samples;
+};
+
+static inline
+void stats_init(struct stats *stats)
+{
+	atomic_set(&stats->samples, 0);
+	wmb();
+}
+
+static inline
+void stats_add_sample(struct stats *stats, s8 sample)
+{
+	s8 min, max;
+	s16 sigma;
+	unsigned samples = atomic_read(&stats->samples);
+	if (samples == 0) {	/* reset pending: start from the s8 extremes */
+		min = 127;
+		max = -128;
+		sigma = 0;
+	} else {
+		min = stats->min;
+		max = stats->max;
+		sigma = stats->sigma;
+	}
+
+	if (sample < min)	/* compute new values */
+		min = sample;
+	if (sample > max)	/* no else: the first sample must set both */
+		max = sample;
+	sigma += sample;
+
+	stats->min = min;	/* commit */
+	stats->max = max;
+	stats->sigma = sigma;
+	if (atomic_add_return(1, &stats->samples) > 255) {
+		/* 256 samples accumulated: fold them into one average sample */
+		stats->sigma = sigma / 256;
+		atomic_set(&stats->samples, 1);
+	}
+}
+
+static inline ssize_t stats_show(struct stats *stats, char *buf)
+{
+	int min, max, avg;
+	int samples = atomic_read(&stats->samples);
+	if (samples == 0)
+		min = max = avg = 0;
+	else {
+		min = stats->min;
+		max = stats->max;
+		avg = stats->sigma / samples;
+	}
+	return scnprintf(buf, PAGE_SIZE, "%d %d %d\n", min, max, avg);
+}
+
+static inline ssize_t stats_store(struct stats *stats, const char *buf,
+				  size_t size)
+{
+	stats_init(stats);
+	return size;
+}
+
+#endif /* #ifndef __LINUX__UWB_H__ */
diff --git a/include/linux/uwb/debug-cmd.h b/include/linux/uwb/debug-cmd.h
new file mode 100644
index 000000000000..1141f41bab5c
--- /dev/null
+++ b/include/linux/uwb/debug-cmd.h
@@ -0,0 +1,57 @@
+/*
+ * Ultra Wide Band
+ * Debug interface commands
+ *
+ * Copyright (C) 2008 Cambridge Silicon Radio Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __LINUX__UWB__DEBUG_CMD_H__
+#define __LINUX__UWB__DEBUG_CMD_H__
+
+#include <linux/types.h>
+
+/*
+ * Debug interface commands
+ *
+ * UWB_DBG_CMD_RSV_ESTABLISH: Establish a new unicast reservation.
+ *
+ * UWB_DBG_CMD_RSV_TERMINATE: Terminate the Nth reservation.
+ */
+
+enum uwb_dbg_cmd_type {
+	UWB_DBG_CMD_RSV_ESTABLISH = 1,
+	UWB_DBG_CMD_RSV_TERMINATE = 2,
+};
+
+struct uwb_dbg_cmd_rsv_establish {
+	__u8  target[6];
+	__u8  type;
+	__u16 max_mas;
+	__u16 min_mas;
+	__u8  sparsity;
+};
+
+struct uwb_dbg_cmd_rsv_terminate {
+	int index;
+};
+
+struct uwb_dbg_cmd {
+	__u32 type;
+	union {
+		struct uwb_dbg_cmd_rsv_establish rsv_establish;
+		struct uwb_dbg_cmd_rsv_terminate rsv_terminate;
+	};
+};
+
+#endif /* #ifndef __LINUX__UWB__DEBUG_CMD_H__ */
diff --git a/include/linux/uwb/debug.h b/include/linux/uwb/debug.h
new file mode 100644
index 000000000000..a86a73fe303f
--- /dev/null
+++ b/include/linux/uwb/debug.h
@@ -0,0 +1,82 @@
+/*
+ * Ultra Wide Band
+ * Debug Support
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * FIXME: doc
+ * Invoke like:
+ *
+ * #define D_LOCAL 4
+ * #include <linux/uwb/debug.h>
+ *
+ * At the end of your include files.
+ */
+#include <linux/types.h>
+
+struct device;
+extern void dump_bytes(struct device *dev, const void *_buf, size_t rsize);
+
+/* Master debug switch; !0 enables, 0 disables */
+#define D_MASTER (!0)
+
+/* Local (per-file) debug switch; #define before #including */
+#ifndef D_LOCAL
+#define D_LOCAL 0
+#endif
+
+#undef __d_printf
+#undef d_fnstart
+#undef d_fnend
+#undef d_printf
+#undef d_dump
+
+#define __d_printf(l, _tag, _dev, f, a...)				\
+do {									\
+	struct device *__dev = (_dev);					\
+	if (D_MASTER && D_LOCAL >= (l)) {				\
+		char __head[64] = "";					\
+		if (__dev != NULL) {					\
+			if ((unsigned long)__dev < 4096)		\
+				printk(KERN_ERR "E: Corrupt dev %p\n",	\
+					__dev);				\
+			else						\
+				snprintf(__head, sizeof(__head),	\
+					 "%s %s: ",			\
+					 dev_driver_string(__dev),	\
+					 __dev->bus_id);		\
+		}							\
+		printk(KERN_ERR "%s%s" _tag ": " f, __head,		\
+			__func__, ## a);				\
+	}								\
+} while (0 && (_dev))
+
+#define d_fnstart(l, _dev, f, a...)	\
+	__d_printf(l, " FNSTART", _dev, f, ## a)
+#define d_fnend(l, _dev, f, a...)	\
+	__d_printf(l, " FNEND", _dev, f, ## a)
+#define d_printf(l, _dev, f, a...)	\
+	__d_printf(l, "", _dev, f, ## a)
+#define d_dump(l, _dev, ptr, size)		\
+do {						\
+	struct device *__dev = (_dev);		\
+	if (D_MASTER && D_LOCAL >= (l))		\
+		dump_bytes(__dev, ptr, size);	\
+} while (0 && (_dev))
+#define d_test(l) (D_MASTER && D_LOCAL >= (l))
diff --git a/include/linux/uwb/spec.h b/include/linux/uwb/spec.h
new file mode 100644
index 000000000000..198c15f8e251
--- /dev/null
+++ b/include/linux/uwb/spec.h
@@ -0,0 +1,727 @@
+/*
+ * Ultra Wide Band
+ * UWB Standard definitions
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * All these definitions are based on the ECMA-368 standard.
+ *
+ * Note that all definitions are Little Endian on the wire, and we
+ * convert them to host order before operating on the bitfields (which,
+ * yes, we use extensively).
+ */
+
+#ifndef __LINUX__UWB_SPEC_H__
+#define __LINUX__UWB_SPEC_H__
+
+#include <linux/types.h>
+#include <linux/bitmap.h>
+
+#define i1480_FW 0x00000303
+/* #define i1480_FW 0x00000302 */
+
+/**
+ * Number of Medium Access Slots in a superframe.
+ *
+ * UWB divides time in SuperFrames, each one divided in 256 pieces, or
+ * Medium Access Slots. See MBOA MAC[5.4.5] for details. The MAS is the
+ * basic bandwidth allocation unit in UWB.
+ */
+enum { UWB_NUM_MAS = 256 };
+
+/**
+ * Number of Zones in superframe.
+ *
+ * UWB divides the superframe into zones with numbering starting from BPST.
+ * See MBOA MAC[16.8.6]
+ */
+enum { UWB_NUM_ZONES = 16 };
+
+/*
+ * Number of MAS in a zone.
+ */
+#define UWB_MAS_PER_ZONE (UWB_NUM_MAS / UWB_NUM_ZONES)
+
+/*
+ * Number of streams per DRP reservation between a pair of devices.
+ *
+ * [ECMA-368] section 16.8.6.
+ */
+enum { UWB_NUM_STREAMS = 8 };
+
+/*
+ * mMasLength
+ *
+ * The length of a MAS in microseconds.
+ *
+ * [ECMA-368] section 17.16.
+ */
+enum { UWB_MAS_LENGTH_US = 256 };
+
+/*
+ * mBeaconSlotLength
+ *
+ * The length of the beacon slot in microseconds.
+ *
+ * [ECMA-368] section 17.16
+ */
+enum { UWB_BEACON_SLOT_LENGTH_US = 85 };
+
+/*
+ * mMaxLostBeacons
+ *
+ * The number of beacons missing in consecutive superframes before a
+ * device can be considered as unreachable.
+ *
+ * [ECMA-368] section 17.16
+ */
+enum { UWB_MAX_LOST_BEACONS = 3 };
+
+/*
+ * Length of a superframe in microseconds.
+ */
+#define UWB_SUPERFRAME_LENGTH_US (UWB_MAS_LENGTH_US * UWB_NUM_MAS)
+
+/**
+ * UWB MAC address
+ *
+ * It is *imperative* that this struct is exactly 6 packed bytes (as
+ * it is also used to define headers sent down and up the wire/radio).
+ */
+struct uwb_mac_addr {
+	u8 data[6];
+} __attribute__((packed));
+
+
+/**
+ * UWB device address
+ *
+ * It is *imperative* that this struct is exactly 2 packed bytes (as
+ * it is also used to define headers sent down and up the wire/radio).
+ */
+struct uwb_dev_addr {
+	u8 data[2];
+} __attribute__((packed));
+
+
+/**
+ * Types of UWB addresses
+ *
+ * Order matters (by size).
+ */
+enum uwb_addr_type {
+	UWB_ADDR_DEV = 0,
+	UWB_ADDR_MAC = 1,
+};
+
+
+/** Size of a char buffer for printing a MAC/device address */
+enum { UWB_ADDR_STRSIZE = 32 };
+
+
+/** UWB WiMedia protocol IDs. */
+enum uwb_prid {
+	UWB_PRID_WLP_RESERVED   = 0x0000,
+	UWB_PRID_WLP		= 0x0001,
+	UWB_PRID_WUSB_BOT	= 0x0010,
+	UWB_PRID_WUSB		= 0x0010,	/* same numeric value as UWB_PRID_WUSB_BOT */
+	UWB_PRID_WUSB_TOP	= 0x001F,
+};
+
+
+/** PHY Rate (MBOA MAC[7.8.12, Table 61]) */
+enum uwb_phy_rate {
+	UWB_PHY_RATE_53 = 0,
+	UWB_PHY_RATE_80,
+	UWB_PHY_RATE_106,
+	UWB_PHY_RATE_160,
+	UWB_PHY_RATE_200,
+	UWB_PHY_RATE_320,
+	UWB_PHY_RATE_400,
+	UWB_PHY_RATE_480,
+	UWB_PHY_RATE_INVALID
+};
+
+
+/**
+ * Different ways to scan (MBOA MAC[6.2.2, Table 8], WUSB[Table 8-78])
+ */
+enum uwb_scan_type {
+	UWB_SCAN_ONLY = 0,
+	UWB_SCAN_OUTSIDE_BP,
+	UWB_SCAN_WHILE_INACTIVE,
+	UWB_SCAN_DISABLED,
+	UWB_SCAN_ONLY_STARTTIME,
+	UWB_SCAN_TOP
+};
+
+
+/** ACK Policy types (MBOA MAC[7.2.1.3]) */
+enum uwb_ack_pol {
+	UWB_ACK_NO = 0,
+	UWB_ACK_INM = 1,
+	UWB_ACK_B = 2,
+	UWB_ACK_B_REQ = 3,
+};
+
+
+/** DRP reservation types ([ECMA-368] table 106) */
+enum uwb_drp_type {
+	UWB_DRP_TYPE_ALIEN_BP = 0,
+	UWB_DRP_TYPE_HARD,
+	UWB_DRP_TYPE_SOFT,
+	UWB_DRP_TYPE_PRIVATE,
+	UWB_DRP_TYPE_PCA,
+};
+
+
+/** DRP Reason Codes ([ECMA-368] table 107) */
+enum uwb_drp_reason {
+	UWB_DRP_REASON_ACCEPTED = 0,
+	UWB_DRP_REASON_CONFLICT,
+	UWB_DRP_REASON_PENDING,
+	UWB_DRP_REASON_DENIED,
+	UWB_DRP_REASON_MODIFIED,
+};
+
+/**
+ *  DRP Notification Reason Codes (WHCI 0.95 [3.1.4.9])
+ */
+enum uwb_drp_notif_reason {
+	UWB_DRP_NOTIF_DRP_IE_RCVD = 0,
+	UWB_DRP_NOTIF_CONFLICT,
+	UWB_DRP_NOTIF_TERMINATE,
+};
+
+
+/** Allocation of MAS slots in a DRP request MBOA MAC[7.8.7] */
+struct uwb_drp_alloc {
+	__le16 zone_bm;
+	__le16 mas_bm;
+} __attribute__((packed));
+
+
+/** General MAC Header format (ECMA-368[16.2]) */
+struct uwb_mac_frame_hdr {
+	__le16 Frame_Control;
+	struct uwb_dev_addr DestAddr;
+	struct uwb_dev_addr SrcAddr;
+	__le16 Sequence_Control;
+	__le16 Access_Information;
+} __attribute__((packed));
+
+
+/**
+ * uwb_beacon_frame - a beacon frame including MAC headers
+ *
+ * [ECMA] section 16.3.
+ */
+struct uwb_beacon_frame {
+	struct uwb_mac_frame_hdr hdr;
+	struct uwb_mac_addr Device_Identifier;	/* may be a NULL EUI-48 */
+	u8 Beacon_Slot_Number;
+	u8 Device_Control;
+	u8 IEData[];
+} __attribute__((packed));
+
+
+/** Information Element codes (MBOA MAC[T54]) */
+enum uwb_ie {
+	UWB_PCA_AVAILABILITY = 2,
+	UWB_IE_DRP_AVAILABILITY = 8,
+	UWB_IE_DRP = 9,
+	UWB_BP_SWITCH_IE = 11,
+	UWB_MAC_CAPABILITIES_IE = 12,
+	UWB_PHY_CAPABILITIES_IE = 13,
+	UWB_APP_SPEC_PROBE_IE = 15,
+	UWB_IDENTIFICATION_IE = 19,
+	UWB_MASTER_KEY_ID_IE = 20,
+	UWB_IE_WLP = 250, /* WiMedia Logical Link Control Protocol WLP 0.99 */
+	UWB_APP_SPEC_IE = 255,
+};
+
+
+/**
+ * Header common to all Information Elements (IEs)
+ */
+struct uwb_ie_hdr {
+	u8 element_id;	/* enum uwb_ie */
+	u8 length;
+} __attribute__((packed));
+
+
+/** Dynamic Reservation Protocol IE (MBOA MAC[7.8.6]) */
+struct uwb_ie_drp {
+	struct uwb_ie_hdr	hdr;
+	__le16                  drp_control;	/* LE bit fields; use the uwb_ie_drp_*() accessors */
+	struct uwb_dev_addr	dev_addr;
+	struct uwb_drp_alloc	allocs[];	/* flexible array member */
+} __attribute__((packed));
+
+static inline int uwb_ie_drp_type(struct uwb_ie_drp *ie)
+{
+	return le16_to_cpu(ie->drp_control) & 0x7;	/* DRP Control bits 2:0 */
+}
+
+static inline int uwb_ie_drp_stream_index(struct uwb_ie_drp *ie)
+{
+	return (le16_to_cpu(ie->drp_control) & (0x7 << 3)) >> 3;	/* bits 5:3 */
+}
+
+static inline int uwb_ie_drp_reason_code(struct uwb_ie_drp *ie)
+{
+	return (le16_to_cpu(ie->drp_control) & (0x7 << 6)) >> 6;	/* bits 8:6 */
+}
+
+static inline int uwb_ie_drp_status(struct uwb_ie_drp *ie)
+{
+	return !!(le16_to_cpu(ie->drp_control) & (0x1 << 9));	/* bit 9 */
+}
+
+static inline int uwb_ie_drp_owner(struct uwb_ie_drp *ie)
+{
+	return !!(le16_to_cpu(ie->drp_control) & (0x1 << 10));	/* bit 10 */
+}
+
+static inline int uwb_ie_drp_tiebreaker(struct uwb_ie_drp *ie)
+{
+	return !!(le16_to_cpu(ie->drp_control) & (0x1 << 11));	/* bit 11 */
+}
+
+static inline int uwb_ie_drp_unsafe(struct uwb_ie_drp *ie)
+{
+	return !!(le16_to_cpu(ie->drp_control) & (0x1 << 12));	/* bit 12 */
+}
+
+static inline void uwb_ie_drp_set_type(struct uwb_ie_drp *ie, enum uwb_drp_type type)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x7 << 0);	/* drop bits 2:0 */
+	ctl |= type << 0;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+static inline void uwb_ie_drp_set_stream_index(struct uwb_ie_drp *ie, int stream_index)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x7 << 3);	/* drop bits 5:3 */
+	ctl |= stream_index << 3;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+static inline void uwb_ie_drp_set_reason_code(struct uwb_ie_drp *ie,
+				       enum uwb_drp_reason reason_code)
+{
+	/* Update bits 8:6 of the CPU-order value; the field itself is __le16. */
+	u16 drp_control = le16_to_cpu(ie->drp_control) & ~(0x7 << 6);
+	ie->drp_control = cpu_to_le16((u16)(drp_control | (reason_code << 6)));
+}
+
+static inline void uwb_ie_drp_set_status(struct uwb_ie_drp *ie, int status)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x1 << 9);	/* drop bit 9 */
+	ctl |= status << 9;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+static inline void uwb_ie_drp_set_owner(struct uwb_ie_drp *ie, int owner)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x1 << 10);	/* drop bit 10 */
+	ctl |= owner << 10;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+static inline void uwb_ie_drp_set_tiebreaker(struct uwb_ie_drp *ie, int tiebreaker)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x1 << 11);	/* drop bit 11 */
+	ctl |= tiebreaker << 11;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+static inline void uwb_ie_drp_set_unsafe(struct uwb_ie_drp *ie, int unsafe)
+{
+	u16 ctl = le16_to_cpu(ie->drp_control) & ~(0x1 << 12);	/* drop bit 12 */
+	ctl |= unsafe << 12;
+	ie->drp_control = cpu_to_le16(ctl);
+}
+
+/** DRP Availability IE (MBOA MAC[7.8.7]) */
+struct uwb_ie_drp_avail {
+	struct uwb_ie_hdr	hdr;
+	DECLARE_BITMAP(bmp, UWB_NUM_MAS);
+} __attribute__((packed));
+
+/**
+ * The Vendor ID is set to an OUI that indicates the vendor of the device.
+ * ECMA-368 [16.8.10]
+ */
+struct uwb_vendor_id {
+	u8 data[3];
+} __attribute__((packed));
+
+/**
+ * The device type ID
+ * FIXME: clarify what this means
+ * ECMA-368 [16.8.10]
+ */
+struct uwb_device_type_id {
+	u8 data[3];
+} __attribute__((packed));
+
+
+/**
+ * UWB device information types
+ * ECMA-368 [16.8.10]
+ */
+enum uwb_dev_info_type {
+	UWB_DEV_INFO_VENDOR_ID = 0,
+	UWB_DEV_INFO_VENDOR_TYPE,
+	UWB_DEV_INFO_NAME,
+};
+
+/**
+ * UWB device information found in Identification IE
+ * ECMA-368 [16.8.10]
+ */
+struct uwb_dev_info {
+	u8 type;	/* enum uwb_dev_info_type */
+	u8 length;
+	u8 data[];
+} __attribute__((packed));
+
+/**
+ * UWB Identification IE
+ * ECMA-368 [16.8.10]
+ */
+struct uwb_identification_ie {
+	struct uwb_ie_hdr hdr;
+	struct uwb_dev_info info[];
+} __attribute__((packed));
+
+/*
+ * UWB Radio Controller
+ *
+ * These definitions are common to the Radio Control layers as
+ * exported by the WUSB1.0 HWA and WHCI interfaces.
+ */
+
+/** Radio Control Command Block (WUSB1.0[Table 8-65] and WHCI 0.95) */
+struct uwb_rccb {
+	u8 bCommandType;		/* enum hwa_cet */
+	__le16 wCommand;		/* Command code */
+	u8 bCommandContext;		/* Context ID */
+} __attribute__((packed));
+
+
+/** Radio Control Event Block (WUSB[table 8-66], WHCI 0.95) */
+struct uwb_rceb {
+	u8 bEventType;			/* enum hwa_cet */
+	__le16 wEvent;			/* Event code */
+	u8 bEventContext;		/* Context ID */
+} __attribute__((packed));
+
+
+enum {
+	UWB_RC_CET_GENERAL = 0,		/* General Command/Event type */
+	UWB_RC_CET_EX_TYPE_1 = 1,	/* Extended Type 1 Command/Event type */
+};
+
+/* Commands to the radio controller */
+enum uwb_rc_cmd {
+	UWB_RC_CMD_CHANNEL_CHANGE = 16,
+	UWB_RC_CMD_DEV_ADDR_MGMT = 17,	/* Device Address Management */
+	UWB_RC_CMD_GET_IE = 18,		/* GET Information Elements */
+	UWB_RC_CMD_RESET = 19,
+	UWB_RC_CMD_SCAN = 20,		/* Scan management  */
+	UWB_RC_CMD_SET_BEACON_FILTER = 21,
+	UWB_RC_CMD_SET_DRP_IE = 22,	/* Dynamic Reservation Protocol IEs */
+	UWB_RC_CMD_SET_IE = 23,		/* Information Element management */
+	UWB_RC_CMD_SET_NOTIFICATION_FILTER = 24,
+	UWB_RC_CMD_SET_TX_POWER = 25,
+	UWB_RC_CMD_SLEEP = 26,
+	UWB_RC_CMD_START_BEACON = 27,
+	UWB_RC_CMD_STOP_BEACON = 28,
+	UWB_RC_CMD_BP_MERGE = 29,
+	UWB_RC_CMD_SEND_COMMAND_FRAME = 30,
+	UWB_RC_CMD_SET_ASIE_NOTIF = 31,
+};
+
+/* Notifications from the radio controller */
+enum uwb_rc_evt {
+	UWB_RC_EVT_IE_RCV = 0,
+	UWB_RC_EVT_BEACON = 1,
+	UWB_RC_EVT_BEACON_SIZE = 2,
+	UWB_RC_EVT_BPOIE_CHANGE = 3,
+	UWB_RC_EVT_BP_SLOT_CHANGE = 4,
+	UWB_RC_EVT_BP_SWITCH_IE_RCV = 5,
+	UWB_RC_EVT_DEV_ADDR_CONFLICT = 6,
+	UWB_RC_EVT_DRP_AVAIL = 7,
+	UWB_RC_EVT_DRP = 8,
+	UWB_RC_EVT_BP_SWITCH_STATUS = 9,
+	UWB_RC_EVT_CMD_FRAME_RCV = 10,
+	UWB_RC_EVT_CHANNEL_CHANGE_IE_RCV = 11,
+	/* Events (command responses) use the same code as the command */
+	UWB_RC_EVT_UNKNOWN_CMD_RCV = 65535,
+};
+
+enum uwb_rc_extended_type_1_cmd {
+	UWB_RC_SET_DAA_ENERGY_MASK = 32,
+	UWB_RC_SET_NOTIFICATION_FILTER_EX = 33,
+};
+
+enum uwb_rc_extended_type_1_evt {
+	UWB_RC_DAA_ENERGY_DETECTED = 0,
+};
+
+/* Radio Control Result Code. [WHCI] table 3-3. */
+enum {
+	UWB_RC_RES_SUCCESS = 0,
+	UWB_RC_RES_FAIL,
+	UWB_RC_RES_FAIL_HARDWARE,
+	UWB_RC_RES_FAIL_NO_SLOTS,
+	UWB_RC_RES_FAIL_BEACON_TOO_LARGE,
+	UWB_RC_RES_FAIL_INVALID_PARAMETER,
+	UWB_RC_RES_FAIL_UNSUPPORTED_PWR_LEVEL,
+	UWB_RC_RES_FAIL_INVALID_IE_DATA,
+	UWB_RC_RES_FAIL_BEACON_SIZE_EXCEEDED,
+	UWB_RC_RES_FAIL_CANCELLED,
+	UWB_RC_RES_FAIL_INVALID_STATE,
+	UWB_RC_RES_FAIL_INVALID_SIZE,
+	UWB_RC_RES_FAIL_ACK_NOT_RECEIVED,
+	UWB_RC_RES_FAIL_NO_MORE_ASIE_NOTIF,
+	UWB_RC_RES_FAIL_TIME_OUT = 255,
+};
+
+/* Confirm event. [WHCI] section 3.1.3.1 etc. */
+struct uwb_rc_evt_confirm {
+	struct uwb_rceb rceb;
+	u8 bResultCode;
+} __attribute__((packed));
+
+/* Device Address Management event. [WHCI] section 3.1.3.2. */
+struct uwb_rc_evt_dev_addr_mgmt {
+	struct uwb_rceb rceb;
+	u8 baAddr[6];
+	u8 bResultCode;
+} __attribute__((packed));
+
+
+/* Get IE Event. [WHCI] section 3.1.3.3. */
+struct uwb_rc_evt_get_ie {
+	struct uwb_rceb rceb;
+	__le16 wIELength;
+	u8 IEData[];
+} __attribute__((packed));
+
+/* Set DRP IE Event. [WHCI] section 3.1.3.7. */
+struct uwb_rc_evt_set_drp_ie {
+	struct uwb_rceb rceb;
+	__le16 wRemainingSpace;
+	u8 bResultCode;
+} __attribute__((packed));
+
+/* Set IE Event. [WHCI] section 3.1.3.8. */
+struct uwb_rc_evt_set_ie {
+	struct uwb_rceb rceb;
+	__le16 RemainingSpace;
+	u8 bResultCode;
+} __attribute__((packed));
+
+/* Scan command. [WHCI] 3.1.3.5. */
+struct uwb_rc_cmd_scan {
+	struct uwb_rccb rccb;
+	u8 bChannelNumber;
+	u8 bScanState;
+	__le16 wStartTime;
+} __attribute__((packed));
+
+/* Set DRP IE command. [WHCI] section 3.1.3.7. */
+struct uwb_rc_cmd_set_drp_ie {
+	struct uwb_rccb rccb;
+	__le16 wIELength;
+	struct uwb_ie_drp IEData[];
+} __attribute__((packed));
+
+/* Set IE command. [WHCI] section 3.1.3.8. */
+struct uwb_rc_cmd_set_ie {
+	struct uwb_rccb rccb;
+	__le16 wIELength;
+	u8 IEData[];
+} __attribute__((packed));
+
+/* Set DAA Energy Mask event. [WHCI 0.96] section 3.1.3.17. */
+struct uwb_rc_evt_set_daa_energy_mask {
+	struct uwb_rceb rceb;
+	__le16 wLength;
+	u8 result;
+} __attribute__((packed));
+
+/* Set Notification Filter Extended event. [WHCI 0.96] section 3.1.3.18. */
+struct uwb_rc_evt_set_notification_filter_ex {
+	struct uwb_rceb rceb;
+	__le16 wLength;
+	u8 result;
+} __attribute__((packed));
+
+/* IE Received notification. [WHCI] section 3.1.4.1. */
+struct uwb_rc_evt_ie_rcv {
+	struct uwb_rceb rceb;
+	struct uwb_dev_addr SrcAddr;
+	__le16 wIELength;
+	u8 IEData[];
+} __attribute__((packed));
+
+/* Type of the received beacon. [WHCI] section 3.1.4.2. */
+enum uwb_rc_beacon_type {
+	UWB_RC_BEACON_TYPE_SCAN = 0,
+	UWB_RC_BEACON_TYPE_NEIGHBOR,
+	UWB_RC_BEACON_TYPE_OL_ALIEN,
+	UWB_RC_BEACON_TYPE_NOL_ALIEN,
+};
+
+/* Beacon received notification. [WHCI] 3.1.4.2. */
+struct uwb_rc_evt_beacon {
+	struct uwb_rceb rceb;
+	u8	bChannelNumber;
+	u8	bBeaconType;
+	__le16	wBPSTOffset;
+	u8	bLQI;
+	u8	bRSSI;
+	__le16	wBeaconInfoLength;
+	u8	BeaconInfo[];
+} __attribute__((packed));
+
+
+/* Beacon Size Change notification. [WHCI] section 3.1.4.3 */
+struct uwb_rc_evt_beacon_size {
+	struct uwb_rceb rceb;
+	__le16 wNewBeaconSize;
+} __attribute__((packed));
+
+
+/* BPOIE Change notification. [WHCI] section 3.1.4.4. */
+struct uwb_rc_evt_bpoie_change {
+	struct uwb_rceb rceb;
+	__le16 wBPOIELength;
+	u8 BPOIE[];
+} __attribute__((packed));
+
+
+/* Beacon Slot Change notification. [WHCI] section 3.1.4.5. */
+struct uwb_rc_evt_bp_slot_change {
+	struct uwb_rceb rceb;
+	u8 slot_info;	/* bit 7: no-slot flag; bits 6:0: slot number */
+} __attribute__((packed));
+
+static inline int uwb_rc_evt_bp_slot_change_slot_num(
+	const struct uwb_rc_evt_bp_slot_change *evt)
+{
+	return evt->slot_info & ~0x80;	/* keep bits 6:0 of the u8 */
+}
+
+static inline int uwb_rc_evt_bp_slot_change_no_slot(
+	const struct uwb_rc_evt_bp_slot_change *evt)
+{
+	return (evt->slot_info >> 7) & 0x1;	/* bit 7 */
+}
+
+/* BP Switch IE Received notification. [WHCI] section 3.1.4.6. */
+struct uwb_rc_evt_bp_switch_ie_rcv {
+	struct uwb_rceb rceb;
+	struct uwb_dev_addr wSrcAddr;
+	__le16 wIELength;
+	u8 IEData[];
+} __attribute__((packed));
+
+/* DevAddr Conflict notification. [WHCI] section 3.1.4.7. */
+struct uwb_rc_evt_dev_addr_conflict {
+	struct uwb_rceb rceb;
+} __attribute__((packed));
+
+/* DRP notification. [WHCI] section 3.1.4.9. */
+struct uwb_rc_evt_drp {
+	struct uwb_rceb           rceb;
+	struct uwb_dev_addr       src_addr;
+	u8                        reason;	/* low 4 bits: see uwb_rc_evt_drp_reason() */
+	u8                        beacon_slot_number;
+	__le16                    ie_length;
+	u8                        ie_data[];	/* flexible array member */
+} __attribute__((packed));
+
+static inline enum uwb_drp_notif_reason uwb_rc_evt_drp_reason(struct uwb_rc_evt_drp *evt)
+{
+	return (enum uwb_drp_notif_reason)(evt->reason & 0x0f);	/* low nibble */
+}
+
+
+/* DRP Availability Change notification. [WHCI] section 3.1.4.8. */
+struct uwb_rc_evt_drp_avail {
+	struct uwb_rceb rceb;
+	DECLARE_BITMAP(bmp, UWB_NUM_MAS);
+} __attribute__((packed));
+
+/* BP switch status notification. [WHCI] section 3.1.4.10. */
+struct uwb_rc_evt_bp_switch_status {
+	struct uwb_rceb rceb;
+	u8 status;
+	u8 slot_offset;
+	__le16 bpst_offset;
+	u8 move_countdown;
+} __attribute__((packed));
+
+/* Command Frame Received notification. [WHCI] section 3.1.4.11. */
+struct uwb_rc_evt_cmd_frame_rcv {
+	struct uwb_rceb rceb;
+	__le16 receive_time;
+	struct uwb_dev_addr wSrcAddr;
+	struct uwb_dev_addr wDstAddr;
+	__le16 control;
+	__le16 reserved;
+	__le16 dataLength;
+	u8 data[];
+} __attribute__((packed));
+
+/* Channel Change IE Received notification. [WHCI] section 3.1.4.12. */
+struct uwb_rc_evt_channel_change_ie_rcv {
+	struct uwb_rceb rceb;
+	struct uwb_dev_addr wSrcAddr;
+	__le16 wIELength;
+	u8 IEData[];
+} __attribute__((packed));
+
+/* DAA Energy Detected notification. [WHCI 0.96] section 3.1.4.14. */
+struct uwb_rc_evt_daa_energy_detected {
+	struct uwb_rceb rceb;
+	__le16 wLength;
+	u8 bandID;
+	u8 reserved;
+	u8 toneBmp[16];
+} __attribute__((packed));
+
+
+/**
+ * Radio Control Interface Class Descriptor
+ *
+ *  WUSB 1.0 [8.6.1.2]
+ */
+struct uwb_rc_control_intf_class_desc {
+	u8 bLength;
+	u8 bDescriptorType;
+	__le16 bcdRCIVersion;
+} __attribute__((packed));
+
+#endif /* #ifndef __LINUX__UWB_SPEC_H__ */
diff --git a/include/linux/wlp.h b/include/linux/wlp.h
new file mode 100644
index 000000000000..033545e145c7
--- /dev/null
+++ b/include/linux/wlp.h
@@ -0,0 +1,735 @@
+/*
+ * WiMedia Logical Link Control Protocol (WLP)
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Reinette Chatre <reinette.chatre@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * FIXME: docs
+ *
+ * - Does not (yet) include support for WLP control frames
+ *   WLP Draft 0.99 [6.5].
+ *
+ *   A visual representation of the data structures.
+ *
+ *                              wssidB      wssidB
+ *                               ^           ^
+ *                               |           |
+ *                              wssidA      wssidA
+ *   wlp interface {             ^           ^
+ *       ...                     |           |
+ *       ...               ...  wssid      wssid ...
+ *       wlp --- ...             |           |
+ *   };          neighbors --> neighbA --> neighbB
+ *               ...
+ *               wss
+ *               ...
+ *               eda cache  --> neighborA --> neighborB --> neighborC ...
+ */
+
+#ifndef __LINUX__WLP_H_
+#define __LINUX__WLP_H_
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/uwb.h>
+
+/**
+ * WLP Protocol ID
+ * WLP Draft 0.99 [6.2]
+ *
+ * The MUX header for all WLP frames
+ */
+#define WLP_PROTOCOL_ID 0x0100
+
+/**
+ * WLP Version
+ * WLP version placed in the association frames (WLP 0.99 [6.6])
+ */
+#define WLP_VERSION 0x10
+
+/**
+ * Bytes needed to print UUID as string
+ */
+#define WLP_WSS_UUID_STRSIZE 48
+
+/**
+ * Bytes needed to print nonce as string
+ */
+#define WLP_WSS_NONCE_STRSIZE 48
+
+
+/**
+ * Size used for WLP name size
+ *
+ * The WSS name is set to 65 bytes, 1 byte larger than the maximum
+ * allowed by the WLP spec. This is to have a null terminated string
+ * for display to the user. A maximum of 64 bytes will still be used
+ * when placing the WSS name field in association frames.
+ */
+#define WLP_WSS_NAME_SIZE 65
+
+/**
+ * Number of bytes added by WLP to data frame
+ *
+ * A data frame transmitted from a host will be placed in a Standard or
+ * Abbreviated WLP frame. These have an extra 4 bytes of header (struct
+ * wlp_frame_std_abbrv_hdr).
+ * When the stack sends this data frame for transmission it needs to ensure
+ * there is enough headroom for this header.
+ */
+#define WLP_DATA_HLEN 4
+
+/**
+ * State of device regarding WLP Service Set
+ *
+ * WLP_WSS_STATE_NONE: the host does not participate in any WSS
+ * WLP_WSS_STATE_PART_ENROLLED: used as part of the enrollment sequence
+ *                            ("Partial Enroll"). This state is used to
+ *                            indicate the first part of enrollment that is
+ *                            unsecure. If the WSS is unsecure then the
+ *                            state will promptly go to WLP_WSS_STATE_ENROLLED;
+ *                            if the WSS is secure then the enrollment
+ *                            procedure takes a few more steps before we are
+ *                            enrolled.
+ * WLP_WSS_STATE_ENROLLED: the host is enrolled in a WSS
+ * WLP_WSS_STATE_ACTIVE: WSS is activated
+ * WLP_WSS_STATE_CONNECTED: host is connected to neighbor in WSS
+ *
+ */
+enum wlp_wss_state {
+	WLP_WSS_STATE_NONE = 0,
+	WLP_WSS_STATE_PART_ENROLLED,
+	WLP_WSS_STATE_ENROLLED,
+	WLP_WSS_STATE_ACTIVE,
+	WLP_WSS_STATE_CONNECTED,
+};
+
+/**
+ * WSS Secure status
+ * WLP 0.99 Table 6
+ *
+ * Set to one if the WSS is secure, zero if it is not secure
+ */
+enum wlp_wss_sec_status {
+	WLP_WSS_UNSECURE = 0,
+	WLP_WSS_SECURE,
+};
+
+/**
+ * WLP frame type
+ * WLP Draft 0.99 [6.2 Table 1]
+ */
+enum wlp_frame_type {
+	WLP_FRAME_STANDARD = 0,
+	WLP_FRAME_ABBREVIATED,
+	WLP_FRAME_CONTROL,
+	WLP_FRAME_ASSOCIATION,
+};
+
+/**
+ * WLP Association Message Type
+ * WLP Draft 0.99 [6.6.1.2 Table 8]
+ */
+enum wlp_assoc_type {
+	WLP_ASSOC_D1 = 2,
+	WLP_ASSOC_D2 = 3,
+	WLP_ASSOC_M1 = 4,
+	WLP_ASSOC_M2 = 5,
+	WLP_ASSOC_M3 = 7,
+	WLP_ASSOC_M4 = 8,
+	WLP_ASSOC_M5 = 9,
+	WLP_ASSOC_M6 = 10,
+	WLP_ASSOC_M7 = 11,
+	WLP_ASSOC_M8 = 12,
+	WLP_ASSOC_F0 = 14,
+	WLP_ASSOC_E1 = 32,
+	WLP_ASSOC_E2 = 33,
+	WLP_ASSOC_C1 = 34,
+	WLP_ASSOC_C2 = 35,
+	WLP_ASSOC_C3 = 36,
+	WLP_ASSOC_C4 = 37,
+};
+
+/**
+ * WLP Attribute Type
+ * WLP Draft 0.99 [6.6.1 Table 6]
+ */
+enum wlp_attr_type {
+	WLP_ATTR_AUTH		= 0x1005, /* Authenticator */
+	WLP_ATTR_DEV_NAME 	= 0x1011, /* Device Name */
+	WLP_ATTR_DEV_PWD_ID 	= 0x1012, /* Device Password ID */
+	WLP_ATTR_E_HASH1	= 0x1014, /* E-Hash1 */
+	WLP_ATTR_E_HASH2	= 0x1015, /* E-Hash2 */
+	WLP_ATTR_E_SNONCE1	= 0x1016, /* E-SNonce1 */
+	WLP_ATTR_E_SNONCE2	= 0x1017, /* E-SNonce2 */
+	WLP_ATTR_ENCR_SET	= 0x1018, /* Encrypted Settings */
+	WLP_ATTR_ENRL_NONCE	= 0x101A, /* Enrollee Nonce */
+	WLP_ATTR_KEYWRAP_AUTH	= 0x101E, /* Key Wrap Authenticator */
+	WLP_ATTR_MANUF		= 0x1021, /* Manufacturer */
+	WLP_ATTR_MSG_TYPE	= 0x1022, /* Message Type */
+	WLP_ATTR_MODEL_NAME	= 0x1023, /* Model Name */
+	WLP_ATTR_MODEL_NR	= 0x1024, /* Model Number */
+	WLP_ATTR_PUB_KEY	= 0x1032, /* Public Key */
+	WLP_ATTR_REG_NONCE	= 0x1039, /* Registrar Nonce */
+	WLP_ATTR_R_HASH1	= 0x103D, /* R-Hash1 */
+	WLP_ATTR_R_HASH2	= 0x103E, /* R-Hash2 */
+	WLP_ATTR_R_SNONCE1	= 0x103F, /* R-SNonce1 */
+	WLP_ATTR_R_SNONCE2	= 0x1040, /* R-SNonce2 */
+	WLP_ATTR_SERIAL		= 0x1042, /* Serial number */
+	WLP_ATTR_UUID_E		= 0x1047, /* UUID-E */
+	WLP_ATTR_UUID_R		= 0x1048, /* UUID-R */
+	WLP_ATTR_PRI_DEV_TYPE	= 0x1054, /* Primary Device Type */
+	WLP_ATTR_SEC_DEV_TYPE	= 0x1055, /* Secondary Device Type */
+	WLP_ATTR_PORT_DEV	= 0x1056, /* Portable Device */
+	WLP_ATTR_APP_EXT	= 0x1058, /* Application Extension */
+	WLP_ATTR_WLP_VER	= 0x2000, /* WLP Version */
+	WLP_ATTR_WSSID		= 0x2001, /* WSSID */
+	WLP_ATTR_WSS_NAME	= 0x2002, /* WSS Name */
+	WLP_ATTR_WSS_SEC_STAT	= 0x2003, /* WSS Secure Status */
+	WLP_ATTR_WSS_BCAST	= 0x2004, /* WSS Broadcast Address */
+	WLP_ATTR_WSS_M_KEY	= 0x2005, /* WSS Master Key */
+	WLP_ATTR_ACC_ENRL	= 0x2006, /* Accepting Enrollment */
+	WLP_ATTR_WSS_INFO	= 0x2007, /* WSS Information */
+	WLP_ATTR_WSS_SEL_MTHD	= 0x2008, /* WSS Selection Method */
+	WLP_ATTR_ASSC_MTHD_LIST	= 0x2009, /* Association Methods List */
+	WLP_ATTR_SEL_ASSC_MTHD	= 0x200A, /* Selected Association Method */
+	WLP_ATTR_ENRL_HASH_COMM	= 0x200B, /* Enrollee Hash Commitment */
+	WLP_ATTR_WSS_TAG	= 0x200C, /* WSS Tag */
+	WLP_ATTR_WSS_VIRT	= 0x200D, /* WSS Virtual EUI-48 */
+	WLP_ATTR_WLP_ASSC_ERR	= 0x200E, /* WLP Association Error */
+	WLP_ATTR_VNDR_EXT	= 0x200F, /* Vendor Extension */
+};
+
+/**
+ * WLP Category ID of primary/secondary device
+ * WLP Draft 0.99 [6.6.1.8 Table 12]
+ */
+enum wlp_dev_category_id {
+	WLP_DEV_CAT_COMPUTER = 1,
+	WLP_DEV_CAT_INPUT,
+	WLP_DEV_CAT_PRINT_SCAN_FAX_COPIER,
+	WLP_DEV_CAT_CAMERA,
+	WLP_DEV_CAT_STORAGE,
+	WLP_DEV_CAT_INFRASTRUCTURE,
+	WLP_DEV_CAT_DISPLAY,
+	WLP_DEV_CAT_MULTIM,
+	WLP_DEV_CAT_GAMING,
+	WLP_DEV_CAT_TELEPHONE,
+	WLP_DEV_CAT_OTHER = 65535,
+};
+
+/**
+ * WLP WSS selection method
+ * WLP Draft 0.99 [6.6.1.6 Table 10]
+ */
+enum wlp_wss_sel_mthd {
+	WLP_WSS_ENRL_SELECT = 1,	/* Enrollee selects */
+	WLP_WSS_REG_SELECT,		/* Registrar selects */
+};
+
+/**
+ * WLP association error values
+ * WLP Draft 0.99 [6.6.1.5 Table 9]
+ */
+enum wlp_assc_error {
+	WLP_ASSOC_ERROR_NONE,
+	WLP_ASSOC_ERROR_AUTH,		/* Authenticator Failure */
+	WLP_ASSOC_ERROR_ROGUE,		/* Rogue activity suspected */
+	WLP_ASSOC_ERROR_BUSY,		/* Device busy */
+	WLP_ASSOC_ERROR_LOCK,		/* Setup Locked */
+	WLP_ASSOC_ERROR_NOT_READY,	/* Registrar not ready */
+	WLP_ASSOC_ERROR_INV,		/* Invalid WSS selection */
+	WLP_ASSOC_ERROR_MSG_TIME,	/* Message timeout */
+	WLP_ASSOC_ERROR_ENR_TIME,	/* Enrollment session timeout */
+	WLP_ASSOC_ERROR_PW,		/* Device password invalid */
+	WLP_ASSOC_ERROR_VER,		/* Unsupported version */
+	WLP_ASSOC_ERROR_INT,		/* Internal error */
+	WLP_ASSOC_ERROR_UNDEF,		/* Undefined error */
+	WLP_ASSOC_ERROR_NUM,		/* Numeric comparison failure */
+	WLP_ASSOC_ERROR_WAIT,		/* Waiting for user input */
+};
+
+/**
+ * WLP Parameters
+ * WLP 0.99 [7.7]
+ */
+enum wlp_parameters {
+	WLP_PER_MSG_TIMEOUT = 15,	/* Seconds to wait for response to
+					   association message. */
+};
+
+/**
+ * WLP IE
+ *
+ * The WLP IE should be included in beacons by all devices.
+ *
+ * The driver can set only a few of the fields in this information element,
+ * most fields are managed by the device itself. When the driver needs to set
+ * a field it will only provide values for the fields of interest, the rest
+ * will be filled with zeroes. The fields of interest are:
+ *
+ * Element ID
+ * Length
+ * Capabilities (only to include WSSID Hash list length)
+ * WSSID Hash List fields
+ *
+ * WLP 0.99 [6.7]
+ *
+ * Only the fields that will be used are detailed in this structure, rest
+ * are not detailed or marked as "notused".
+ */
+struct wlp_ie {
+	struct uwb_ie_hdr hdr;
+	__le16 capabilities;	/* bits 15:12: hash length, see wlp_ie_hash_length() */
+	__le16 cycle_param;
+	__le16 acw_anchor_addr;
+	u8 wssid_hash_list[];	/* flexible array member */
+} __attribute__((packed));
+
+static inline int wlp_ie_hash_length(struct wlp_ie *ie)
+{
+	return (le16_to_cpu(ie->capabilities) & (0xf << 12)) >> 12;	/* bits 15:12 */
+}
+
+static inline void wlp_ie_set_hash_length(struct wlp_ie *ie, int hash_length)
+{
+	u16 cap_bits = le16_to_cpu(ie->capabilities) & ~(0xf << 12);	/* drop bits 15:12 */
+	cap_bits |= hash_length << 12;
+	ie->capabilities = cpu_to_le16(cap_bits);
+}
+
+/**
+ * WLP nonce
+ * WLP Draft 0.99 [6.6.1 Table 6]
+ *
+ * A 128-bit random number often used (E-SNonce1, E-SNonce2, Enrollee
+ * Nonce, Registrar Nonce, R-SNonce1, R-SNonce2). It is passed to HW so
+ * it is packed.
+ */
+struct wlp_nonce {
+	u8 data[16];
+} __attribute__((packed));
+
+/**
+ * WLP UUID
+ * WLP Draft 0.99 [6.6.1 Table 6]
+ *
+ * Universally Unique Identifier (UUID) encoded as an octet string in the
+ * order the octets are shown in string representation in RFC4122. A UUID
+ * is often used (UUID-E, UUID-R, WSSID). It is passed to HW so it is packed.
+ */
+struct wlp_uuid {
+	u8 data[16];
+} __attribute__((packed));
+
+
+/**
+ * Primary and secondary device type attributes
+ * WLP Draft 0.99 [6.6.1.8]
+ */
+struct wlp_dev_type {
+	enum wlp_dev_category_id category:16;
+	u8 OUI[3];
+	u8 OUIsubdiv;
+	__le16 subID;
+} __attribute__((packed));
+
+/**
+ * WLP frame header
+ * WLP Draft 0.99 [6.2]
+ */
+struct wlp_frame_hdr {
+	__le16 mux_hdr;			/* WLP_PROTOCOL_ID */
+	enum wlp_frame_type type:8;
+} __attribute__((packed));
+
+/**
+ * WLP attribute field header
+ * WLP Draft 0.99 [6.6.1]
+ *
+ * Header of each attribute found in an association frame
+ */
+struct wlp_attr_hdr {
+	__le16 type;
+	__le16 length;
+} __attribute__((packed));
+
+/**
+ * Device information commonly used together
+ *
+ * Each of these device information elements has a specified range in which it
+ * should fit (WLP 0.99 [Table 6]). This range provided in the spec does not
+ * include the termination null '\0' character (when used in the
+ * association protocol the attribute fields are accompanied
+ * with a "length" field so the full range from the spec can be used for
+ * the value). We thus allocate an extra byte to be able to store a string
+ * of max length with a terminating '\0'.
+ */
+struct wlp_device_info {
+	char name[33];
+	char model_name[33];
+	char manufacturer[65];
+	char model_nr[33];
+	char serial[33];
+	struct wlp_dev_type prim_dev_type;
+};
+
+/**
+ * Macros for the WLP attributes
+ *
+ * There are quite a few attributes (total is 43). The attribute layout can be
+ * in one of three categories: one value, an array, an enum forced to 8 bits.
+ * These macros help with their definitions.
+ */
+#define wlp_attr(type, name)						\
+struct wlp_attr_##name {						\
+	struct wlp_attr_hdr hdr;					\
+	type name;		/* a single value of @type */		\
+} __attribute__((packed));	/* the ';' is part of the expansion */
+
+#define wlp_attr_array(type, name)					\
+struct wlp_attr_##name {						\
+	struct wlp_attr_hdr hdr;					\
+	type name[];		/* flexible array of @type */		\
+} __attribute__((packed));	/* the ';' is part of the expansion */
+
+/**
+ * WLP association attribute fields
+ * WLP Draft 0.99 [6.6.1 Table 6]
+ *
+ * Attributes appear in same order as the Table in the spec
+ * FIXME Does not define all attributes yet
+ */
+
+/* Device name: Friendly name of sending device */
+wlp_attr_array(u8, dev_name)
+
+/* Enrollee Nonce: Random number generated by enrollee for an enrollment
+ * session */
+wlp_attr(struct wlp_nonce, enonce)
+
+/* Manufacturer name: Name of manufacturer of the sending device */
+wlp_attr_array(u8, manufacturer)
+
+/* WLP Message Type */
+wlp_attr(u8, msg_type)
+
+/* WLP Model name: Model name of sending device */
+wlp_attr_array(u8, model_name)
+
+/* WLP Model number: Model number of sending device */
+wlp_attr_array(u8, model_nr)
+
+/* Registrar Nonce: Random number generated by registrar for an enrollment
+ * session */
+wlp_attr(struct wlp_nonce, rnonce)
+
+/* Serial number of device */
+wlp_attr_array(u8, serial)
+
+/* UUID of enrollee */
+wlp_attr(struct wlp_uuid, uuid_e)
+
+/* UUID of registrar */
+wlp_attr(struct wlp_uuid, uuid_r)
+
+/* WLP Primary device type */
+wlp_attr(struct wlp_dev_type, prim_dev_type)
+
+/* WLP Secondary device type */
+wlp_attr(struct wlp_dev_type, sec_dev_type)
+
+/* WLP protocol version */
+wlp_attr(u8, version)
+
+/* WLP service set identifier */
+wlp_attr(struct wlp_uuid, wssid)
+
+/* WLP WSS name */
+wlp_attr_array(u8, wss_name)
+
+/* WLP WSS Secure Status */
+wlp_attr(u8, wss_sec_status)
+
+/* WSS Broadcast Address */
+wlp_attr(struct uwb_mac_addr, wss_bcast)
+
+/* WLP Accepting Enrollment */
+wlp_attr(u8, accept_enrl)
+
+/**
+ * WSS information attributes
+ * WLP Draft 0.99 [6.6.3 Table 15]
+ */
+struct wlp_wss_info {
+	struct wlp_attr_wssid wssid;
+	struct wlp_attr_wss_name name;
+	struct wlp_attr_accept_enrl accept;
+	struct wlp_attr_wss_sec_status sec_stat;
+	struct wlp_attr_wss_bcast bcast;
+} __attribute__((packed));
+
+/* WLP WSS Information */
+wlp_attr_array(struct wlp_wss_info, wss_info)
+
+/* WLP WSS Selection method */
+wlp_attr(u8, wss_sel_mthd)
+
+/* WLP WSS tag */
+wlp_attr(u8, wss_tag)
+
+/* WSS Virtual Address */
+wlp_attr(struct uwb_mac_addr, wss_virt)
+
+/* WLP association error */
+wlp_attr(u8, wlp_assc_err)
+
+/**
+ * WLP standard and abbreviated frames
+ *
+ * WLP Draft 0.99 [6.3] and [6.4]
+ *
+ * The difference between the WLP standard frame and the WLP
+ * abbreviated frame is that the standard frame includes the src
+ * and dest addresses from the Ethernet header, the abbreviated frame does
+ * not.
+ * The src/dest (as well as the type/length and client data) are already
+ * defined as part of the Ethernet header, we do not do this here.
+ * From this perspective the standard and abbreviated frames appear the
+ * same - they will be treated differently though.
+ *
+ * The size of this header is also captured in WLP_DATA_HLEN to enable
+ * interfaces to prepare their headroom.
+ */
+struct wlp_frame_std_abbrv_hdr {
+	struct wlp_frame_hdr hdr;
+	u8 tag;
+} __attribute__((packed));
+
+/**
+ * WLP association frames
+ *
+ * WLP Draft 0.99 [6.6]
+ */
+struct wlp_frame_assoc {
+	struct wlp_frame_hdr hdr;
+	enum wlp_assoc_type type:8;
+	struct wlp_attr_version version;
+	struct wlp_attr_msg_type msg_type;
+	u8 attr[];
+} __attribute__((packed));
+
+/* Ethernet to dev address mapping */
+struct wlp_eda {
+	spinlock_t lock;
+	struct list_head cache;	/* Eth<->Dev Addr cache */
+};
+
+/**
+ * WSS information temporary storage
+ *
+ * This information is only stored temporarily during discovery. It should
+ * not be stored unless the device is enrolled in the advertised WSS. This
+ * is done mainly because we follow the letter of the spec in this regard.
+ * See WLP 0.99 [7.2.3].
+ * When the device does become enrolled in a WSS the WSS information will
+ * be stored as part of the more comprehensive struct wlp_wss.
+ */
+struct wlp_wss_tmp_info {
+	char name[WLP_WSS_NAME_SIZE];
+	u8 accept_enroll;
+	u8 sec_status;
+	struct uwb_mac_addr bcast;
+};
+
+struct wlp_wssid_e {
+	struct list_head node;
+	struct wlp_uuid wssid;
+	struct wlp_wss_tmp_info *info;
+};
+
+/**
+ * A cache entry of WLP neighborhood
+ *
+ * @node: head of list is wlp->neighbors
+ * @wssid: list of wssids of this neighbor, element is wlp_wssid_e
+ * @info:  temporary storage for information learned during discovery. This
+ *         storage is used together with the wssid_e temporary storage
+ *         during discovery.
+ */
+struct wlp_neighbor_e {
+	struct list_head node;
+	struct wlp_uuid uuid;
+	struct uwb_dev *uwb_dev;
+	struct list_head wssid; /* Elements are wlp_wssid_e */
+	struct wlp_device_info *info;
+};
+
+struct wlp;
+/**
+ * Information for an association session in progress.
+ *
+ * @exp_message: The type of the expected message. Both this message and a
+ *               F0 message (which can be sent in response to any
+ *               association frame) will be accepted as a valid message for
+ *               this session.
+ * @cb:          The function that will be called upon receipt of this
+ *               message.
+ * @cb_priv:     Private data of callback
+ * @data:        Data used in association process (always a sk_buff?)
+ * @neighbor_addr: Address of neighbor with which association session is
+ *               in progress.
+ */
+struct wlp_session {
+	enum wlp_assoc_type exp_message;
+	void (*cb)(struct wlp *);
+	void *cb_priv;
+	void *data;
+	struct uwb_dev_addr neighbor_addr;
+};
+
+/**
+ * WLP Service Set
+ *
+ * @mutex: used to protect entire WSS structure.
+ *
+ * @name: The WSS name is set to 65 bytes, 1 byte larger than the maximum
+ *        allowed by the WLP spec. This is to have a null terminated string
+ *        for display to the user. A maximum of 64 bytes will still be used
+ *        when placing the WSS name field in association frames.
+ *
+ * @accept_enroll: Accepting enrollment: Set to one if registrar is
+ *                 accepting enrollment in WSS, or zero otherwise.
+ *
+ * Global and local information for each WSS in which we are enrolled.
+ * WLP 0.99 Section 7.2.1 and Section 7.2.2
+ */
+struct wlp_wss {
+	struct mutex mutex;
+	struct kobject kobj;
+	/* Global properties. */
+	struct wlp_uuid wssid;
+	u8 hash;
+	char name[WLP_WSS_NAME_SIZE];
+	struct uwb_mac_addr bcast;
+	u8 secure_status:1;
+	u8 master_key[16];
+	/* Local properties. */
+	u8 tag;
+	struct uwb_mac_addr virtual_addr;
+	/* Extra */
+	u8 accept_enroll:1;
+	enum wlp_wss_state state;
+};
+
+/**
+ * WLP main structure
+ * @mutex: protect changes to WLP structure. We only allow changes to the
+ *         uuid, so currently this mutex only protects this field.
+ */
+struct wlp {
+	struct mutex mutex;
+	struct uwb_rc *rc;		/* UWB radio controller */
+	struct uwb_pal pal;
+	struct wlp_eda eda;
+	struct wlp_uuid uuid;
+	struct wlp_session *session;
+	struct wlp_wss wss;
+	struct mutex nbmutex; /* Neighbor mutex protects neighbors list */
+	struct list_head neighbors; /* Elements are wlp_neighbor_e */
+	struct uwb_notifs_handler uwb_notifs_handler;
+	struct wlp_device_info *dev_info;
+	void (*fill_device_info)(struct wlp *wlp, struct wlp_device_info *info);
+	int (*xmit_frame)(struct wlp *, struct sk_buff *,
+			  struct uwb_dev_addr *);
+	void (*stop_queue)(struct wlp *);
+	void (*start_queue)(struct wlp *);
+};
+
+/* sysfs */
+
+
+struct wlp_wss_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct wlp_wss *wss, char *buf);
+	ssize_t (*store)(struct wlp_wss *wss, const char *buf, size_t count);
+};
+
+#define WSS_ATTR(_name, _mode, _show, _store) \
+static struct wlp_wss_attribute wss_attr_##_name = __ATTR(_name, _mode,	\
+							  _show, _store)
+
+extern int wlp_setup(struct wlp *, struct uwb_rc *);
+extern void wlp_remove(struct wlp *);
+extern ssize_t wlp_neighborhood_show(struct wlp *, char *);
+extern int wlp_wss_setup(struct net_device *, struct wlp_wss *);
+extern void wlp_wss_remove(struct wlp_wss *);
+extern ssize_t wlp_wss_activate_show(struct wlp_wss *, char *);
+extern ssize_t wlp_wss_activate_store(struct wlp_wss *, const char *, size_t);
+extern ssize_t wlp_eda_show(struct wlp *, char *);
+extern ssize_t wlp_eda_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_uuid_show(struct wlp *, char *);
+extern ssize_t wlp_uuid_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_name_show(struct wlp *, char *);
+extern ssize_t wlp_dev_name_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_manufacturer_show(struct wlp *, char *);
+extern ssize_t wlp_dev_manufacturer_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_model_name_show(struct wlp *, char *);
+extern ssize_t wlp_dev_model_name_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_model_nr_show(struct wlp *, char *);
+extern ssize_t wlp_dev_model_nr_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_serial_show(struct wlp *, char *);
+extern ssize_t wlp_dev_serial_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_prim_category_show(struct wlp *, char *);
+extern ssize_t wlp_dev_prim_category_store(struct wlp *, const char *,
+					   size_t);
+extern ssize_t wlp_dev_prim_OUI_show(struct wlp *, char *);
+extern ssize_t wlp_dev_prim_OUI_store(struct wlp *, const char *, size_t);
+extern ssize_t wlp_dev_prim_OUI_sub_show(struct wlp *, char *);
+extern ssize_t wlp_dev_prim_OUI_sub_store(struct wlp *, const char *,
+					  size_t);
+extern ssize_t wlp_dev_prim_subcat_show(struct wlp *, char *);
+extern ssize_t wlp_dev_prim_subcat_store(struct wlp *, const char *,
+					 size_t);
+extern int wlp_receive_frame(struct device *, struct wlp *, struct sk_buff *,
+			     struct uwb_dev_addr *);
+extern int wlp_prepare_tx_frame(struct device *, struct wlp *,
+			       struct sk_buff *, struct uwb_dev_addr *);
+void wlp_reset_all(struct wlp *wlp);
+
+/**
+ * Initialize WSS
+ */
+static inline
+void wlp_wss_init(struct wlp_wss *wss)
+{
+	mutex_init(&wss->mutex);
+}
+
+static inline
+void wlp_init(struct wlp *wlp)
+{
+	INIT_LIST_HEAD(&wlp->neighbors);
+	mutex_init(&wlp->mutex);
+	mutex_init(&wlp->nbmutex);
+	wlp_wss_init(&wlp->wss);
+}
+
+
+#endif /* #ifndef __LINUX__WLP_H_ */
-- 
cgit v1.2.3


From da389eac31be24556a71dd59ea6539ae4cba5c15 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Wed, 17 Sep 2008 16:34:12 +0100
Subject: uwb: add the umc bus

The UMC bus is used for the capabilities exposed by a UWB Multi-interface
Controller as described in the WHCI specification.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/Makefile    |   6 ++
 drivers/uwb/umc-bus.c   | 218 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/uwb/umc-dev.c   | 104 +++++++++++++++++++++++
 drivers/uwb/umc-drv.c   |  31 +++++++
 include/linux/uwb/umc.h | 194 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 553 insertions(+)
 create mode 100644 drivers/uwb/umc-bus.c
 create mode 100644 drivers/uwb/umc-dev.c
 create mode 100644 drivers/uwb/umc-drv.c
 create mode 100644 include/linux/uwb/umc.h

(limited to 'include/linux')

diff --git a/drivers/uwb/Makefile b/drivers/uwb/Makefile
index 9a67be5ac5c1..41c9fca5f875 100644
--- a/drivers/uwb/Makefile
+++ b/drivers/uwb/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_UWB)		+= uwb.o
+obj-$(CONFIG_UWB_WHCI)		+= umc.o
 
 uwb-objs :=		\
 	address.o	\
@@ -18,3 +19,8 @@ uwb-objs :=		\
 	scan.o		\
 	uwb-debug.o	\
 	uwbd.o
+
+umc-objs :=		\
+	umc-bus.o	\
+	umc-dev.o	\
+	umc-drv.o
diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c
new file mode 100644
index 000000000000..2d8d62d9f53e
--- /dev/null
+++ b/drivers/uwb/umc-bus.c
@@ -0,0 +1,218 @@
+/*
+ * Bus for UWB Multi-interface Controller capabilities.
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ *
+ * This file is released under the GNU GPL v2.
+ */
+#include <linux/kernel.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/uwb/umc.h>
+#include <linux/pci.h>
+
+static int umc_bus_unbind_helper(struct device *dev, void *data)
+{
+	struct device *parent = data;
+
+	if (dev->parent == parent && dev->driver)
+		device_release_driver(dev);
+	return 0;
+}
+
+/**
+ * umc_controller_reset - reset the whole UMC controller
+ * @umc: the UMC device for the radio controller.
+ *
+ * Drivers will be unbound from all UMC devices belonging to the
+ * controller and then the radio controller will be rebound.  The
+ * radio controller is expected to do a full hardware reset when it is
+ * probed.
+ *
+ * If this is called while a probe() or remove() is in progress it
+ * will return -EAGAIN and not perform the reset.
+ */
+int umc_controller_reset(struct umc_dev *umc)
+{
+	struct device *parent = umc->dev.parent;
+	int ret;
+
+	if (down_trylock(&parent->sem))
+		return -EAGAIN;
+	bus_for_each_dev(&umc_bus_type, NULL, parent, umc_bus_unbind_helper);
+	ret = device_attach(&umc->dev);
+	if (ret == 1)
+		ret = 0;
+	up(&parent->sem);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(umc_controller_reset);
+
+/**
+ * umc_match_pci_id - match a UMC driver to a UMC device's parent PCI device.
+ * @umc_drv: umc driver with match_data pointing to a zero-terminated
+ * table of pci_device_id's.
+ * @umc: umc device whose parent is to be matched.
+ */
+int umc_match_pci_id(struct umc_driver *umc_drv, struct umc_dev *umc)
+{
+	const struct pci_device_id *id_table = umc_drv->match_data;
+	struct pci_dev *pci;
+
+	if (umc->dev.parent->bus != &pci_bus_type)
+		return 0;
+
+	pci = to_pci_dev(umc->dev.parent);
+	return pci_match_id(id_table, pci) != NULL;
+}
+EXPORT_SYMBOL_GPL(umc_match_pci_id);
+
+static int umc_bus_rescan_helper(struct device *dev, void *data)
+{
+	int ret = 0;
+
+	if (!dev->driver)
+		ret = device_attach(dev);
+
+	return ret < 0 ? ret : 0;
+}
+
+static void umc_bus_rescan(void)
+{
+	int err;
+
+	/*
+	 * We can't use bus_rescan_devices() here as it deadlocks when
+	 * it tries to retake the dev->parent semaphore.
+	 */
+	err = bus_for_each_dev(&umc_bus_type, NULL, NULL, umc_bus_rescan_helper);
+	if (err < 0)
+		printk(KERN_WARNING "%s: rescan of bus failed: %d\n",
+		       KBUILD_MODNAME, err);
+}
+
+static int umc_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct umc_dev *umc = to_umc_dev(dev);
+	struct umc_driver *umc_driver = to_umc_driver(drv);
+
+	if (umc->cap_id == umc_driver->cap_id) {
+		if (umc_driver->match)
+			return umc_driver->match(umc_driver, umc);
+		else
+			return 1;
+	}
+	return 0;
+}
+
+static int umc_device_probe(struct device *dev)
+{
+	struct umc_dev *umc;
+	struct umc_driver *umc_driver;
+	int err;
+
+	umc_driver = to_umc_driver(dev->driver);
+	umc = to_umc_dev(dev);
+
+	get_device(dev);
+	err = umc_driver->probe(umc);
+	if (err)
+		put_device(dev);
+	else
+		umc_bus_rescan();
+
+	return err;
+}
+
+static int umc_device_remove(struct device *dev)
+{
+	struct umc_dev *umc;
+	struct umc_driver *umc_driver;
+
+	umc_driver = to_umc_driver(dev->driver);
+	umc = to_umc_dev(dev);
+
+	umc_driver->remove(umc);
+	put_device(dev);
+	return 0;
+}
+
+static int umc_device_suspend(struct device *dev, pm_message_t state)
+{
+	struct umc_dev *umc;
+	struct umc_driver *umc_driver;
+	int err = 0;
+
+	umc = to_umc_dev(dev);
+
+	if (dev->driver) {
+		umc_driver = to_umc_driver(dev->driver);
+		if (umc_driver->suspend)
+			err = umc_driver->suspend(umc, state);
+	}
+	return err;
+}
+
+static int umc_device_resume(struct device *dev)
+{
+	struct umc_dev *umc;
+	struct umc_driver *umc_driver;
+	int err = 0;
+
+	umc = to_umc_dev(dev);
+
+	if (dev->driver) {
+		umc_driver = to_umc_driver(dev->driver);
+		if (umc_driver->resume)
+			err = umc_driver->resume(umc);
+	}
+	return err;
+}
+
+static ssize_t capability_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct umc_dev *umc = to_umc_dev(dev);
+
+	return sprintf(buf, "0x%02x\n", umc->cap_id);
+}
+
+static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct umc_dev *umc = to_umc_dev(dev);
+
+	return sprintf(buf, "0x%04x\n", umc->version);
+}
+
+static struct device_attribute umc_dev_attrs[] = {
+	__ATTR_RO(capability_id),
+	__ATTR_RO(version),
+	__ATTR_NULL,
+};
+
+struct bus_type umc_bus_type = {
+	.name		= "umc",
+	.match		= umc_bus_match,
+	.probe		= umc_device_probe,
+	.remove		= umc_device_remove,
+	.suspend        = umc_device_suspend,
+	.resume         = umc_device_resume,
+	.dev_attrs	= umc_dev_attrs,
+};
+EXPORT_SYMBOL_GPL(umc_bus_type);
+
+static int __init umc_bus_init(void)
+{
+	return bus_register(&umc_bus_type);
+}
+module_init(umc_bus_init);
+
+static void __exit umc_bus_exit(void)
+{
+	bus_unregister(&umc_bus_type);
+}
+module_exit(umc_bus_exit);
+
+MODULE_DESCRIPTION("UWB Multi-interface Controller capability bus");
+MODULE_AUTHOR("Cambridge Silicon Radio Ltd.");
+MODULE_LICENSE("GPL");
diff --git a/drivers/uwb/umc-dev.c b/drivers/uwb/umc-dev.c
new file mode 100644
index 000000000000..aa44e1c1a102
--- /dev/null
+++ b/drivers/uwb/umc-dev.c
@@ -0,0 +1,104 @@
+/*
+ * UWB Multi-interface Controller device management.
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ *
+ * This file is released under the GNU GPL v2.
+ */
+#include <linux/kernel.h>
+#include <linux/uwb/umc.h>
+#define D_LOCAL 0
+#include <linux/uwb/debug.h>
+
+static void umc_device_release(struct device *dev)
+{
+	struct umc_dev *umc = to_umc_dev(dev);
+
+	kfree(umc);
+}
+
+/**
+ * umc_device_create - allocate a child UMC device
+ * @parent: parent of the new UMC device.
+ * @n:      index of the new device.
+ *
+ * The new UMC device will have a bus ID of the parent with '-n'
+ * appended.
+ */
+struct umc_dev *umc_device_create(struct device *parent, int n)
+{
+	struct umc_dev *umc;
+
+	umc = kzalloc(sizeof(struct umc_dev), GFP_KERNEL);
+	if (umc) {
+		snprintf(umc->dev.bus_id, sizeof(umc->dev.bus_id), "%s-%d",
+			 parent->bus_id, n);
+		umc->dev.parent  = parent;
+		umc->dev.bus     = &umc_bus_type;
+		umc->dev.release = umc_device_release;
+
+		umc->dev.dma_mask = parent->dma_mask;
+	}
+	return umc;
+}
+EXPORT_SYMBOL_GPL(umc_device_create);
+
+/**
+ * umc_device_register - register a UMC device
+ * @umc: pointer to the UMC device
+ *
+ * The memory resource for the UMC device is acquired and the device
+ * registered with the system.
+ */
+int umc_device_register(struct umc_dev *umc)
+{
+	int err;
+
+	d_fnstart(3, &umc->dev, "(umc_dev %p)\n", umc);
+
+	err = request_resource(umc->resource.parent, &umc->resource);
+	if (err < 0) {
+		dev_err(&umc->dev, "can't allocate resource range "
+			"%016Lx to %016Lx: %d\n",
+			(unsigned long long)umc->resource.start,
+			(unsigned long long)umc->resource.end,
+			err);
+		goto error_request_resource;
+	}
+
+	err = device_register(&umc->dev);
+	if (err < 0)
+		goto error_device_register;
+	d_fnend(3, &umc->dev, "(umc_dev %p) = 0\n", umc);
+	return 0;
+
+error_device_register:
+	release_resource(&umc->resource);
+error_request_resource:
+	d_fnend(3, &umc->dev, "(umc_dev %p) = %d\n", umc, err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(umc_device_register);
+
+/**
+ * umc_device_unregister - unregister a UMC device
+ * @umc: pointer to the UMC device
+ *
+ * First we unregister the device, make sure the driver can do its
+ * resource release thing and then we try to release any left over
+ * resources. We take a ref to the device, to make sure it doesn't
+ * disappear under our feet.
+ */
+void umc_device_unregister(struct umc_dev *umc)
+{
+	struct device *dev;
+	if (!umc)
+		return;
+	dev = get_device(&umc->dev);
+	d_fnstart(3, dev, "(umc_dev %p)\n", umc);
+	device_unregister(&umc->dev);
+	release_resource(&umc->resource);
+	d_fnend(3, dev, "(umc_dev %p) = void\n", umc);
+	put_device(dev);
+}
+EXPORT_SYMBOL_GPL(umc_device_unregister);
diff --git a/drivers/uwb/umc-drv.c b/drivers/uwb/umc-drv.c
new file mode 100644
index 000000000000..367b5eb85d60
--- /dev/null
+++ b/drivers/uwb/umc-drv.c
@@ -0,0 +1,31 @@
+/*
+ * UWB Multi-interface Controller driver management.
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ *
+ * This file is released under the GNU GPL v2.
+ */
+#include <linux/kernel.h>
+#include <linux/uwb/umc.h>
+
+int __umc_driver_register(struct umc_driver *umc_drv, struct module *module,
+			  const char *mod_name)
+{
+	umc_drv->driver.name     = umc_drv->name;
+	umc_drv->driver.owner    = module;
+	umc_drv->driver.mod_name = mod_name;
+	umc_drv->driver.bus      = &umc_bus_type;
+
+	return driver_register(&umc_drv->driver);
+}
+EXPORT_SYMBOL_GPL(__umc_driver_register);
+
+/**
+ * umc_driver_unregister - unregister a UMC capability driver.
+ * @umc_drv:  pointer to the driver.
+ */
+void umc_driver_unregister(struct umc_driver *umc_drv)
+{
+	driver_unregister(&umc_drv->driver);
+}
+EXPORT_SYMBOL_GPL(umc_driver_unregister);
diff --git a/include/linux/uwb/umc.h b/include/linux/uwb/umc.h
new file mode 100644
index 000000000000..36a39e34f8d7
--- /dev/null
+++ b/include/linux/uwb/umc.h
@@ -0,0 +1,194 @@
+/*
+ * UWB Multi-interface Controller support.
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ *
+ * This file is released under the GPLv2
+ *
+ * UMC (UWB Multi-interface Controller) capabilities (e.g., radio
+ * controller, host controller) are presented as devices on the "umc"
+ * bus.
+ *
+ * The radio controller is not strictly a UMC capability but it's
+ * useful to present it as such.
+ *
+ * References:
+ *
+ *   [WHCI] Wireless Host Controller Interface Specification for
+ *          Certified Wireless Universal Serial Bus, revision 0.95.
+ *
+ * How this works is kind of convoluted but simple. The whci.ko driver
+ * loads when WHCI devices are detected. These WHCI devices expose
+ * many devices in the same PCI function (they couldn't have reused
+ * functions, no), so for each PCI function that exposes these many
+ * devices, whci creates a umc_dev [whci_probe() -> whci_add_cap()]
+ * with umc_device_create() and adds it to the bus with
+ * umc_device_register().
+ *
+ * umc_device_register() calls device_register() which will push the
+ * bus management code to load your UMC driver's something_probe()
+ * that you have registered for that capability code.
+ *
+ * Now when the WHCI device is removed, whci_remove() will go over
+ * each umc_dev assigned to each of the PCI function's capabilities
+ * and through whci_del_cap() call umc_device_unregister() each
+ * created umc_dev. Of course, if you are bound to the device, your
+ * driver's something_remove() will be called.
+ */
+
+#ifndef _LINUX_UWB_UMC_H_
+#define _LINUX_UWB_UMC_H_
+
+#include <linux/device.h>
+#include <linux/pci.h>
+
+/*
+ * UMC capability IDs.
+ *
+ * 0x00 is reserved so use it for the radio controller device.
+ *
+ * [WHCI] table 2-8
+ */
+#define UMC_CAP_ID_WHCI_RC      0x00 /* radio controller */
+#define UMC_CAP_ID_WHCI_WUSB_HC 0x01 /* WUSB host controller */
+
+/**
+ * struct umc_dev - UMC capability device
+ *
+ * @version:  version of the specification this capability conforms to.
+ * @cap_id:   capability ID.
+ * @bar:      PCI Bar (64 bit) where the resource lies
+ * @resource: register space resource.
+ * @irq:      interrupt line.
+ */
+struct umc_dev {
+	u16		version;
+	u8		cap_id;
+	u8		bar;
+	struct resource resource;
+	unsigned	irq;
+	struct device	dev;
+};
+
+#define to_umc_dev(d) container_of(d, struct umc_dev, dev)
+
+/**
+ * struct umc_driver - UMC capability driver
+ * @cap_id: supported capability ID.
+ * @match: driver specific capability matching function.
+ * @match_data: driver specific data for match() (e.g., a
+ * table of pci_device_id's if umc_match_pci_id() is used).
+ */
+struct umc_driver {
+	char *name;
+	u8 cap_id;
+	int (*match)(struct umc_driver *, struct umc_dev *);
+	const void *match_data;
+
+	int  (*probe)(struct umc_dev *);
+	void (*remove)(struct umc_dev *);
+	int  (*suspend)(struct umc_dev *, pm_message_t state);
+	int  (*resume)(struct umc_dev *);
+
+	struct device_driver driver;
+};
+
+#define to_umc_driver(d) container_of(d, struct umc_driver, driver)
+
+extern struct bus_type umc_bus_type;
+
+struct umc_dev *umc_device_create(struct device *parent, int n);
+int __must_check umc_device_register(struct umc_dev *umc);
+void umc_device_unregister(struct umc_dev *umc);
+
+int __must_check __umc_driver_register(struct umc_driver *umc_drv,
+				       struct module *mod,
+				       const char *mod_name);
+
+/**
+ * umc_driver_register - register a UMC capability driver.
+ * @umc_drv:  pointer to the driver.
+ */
+static inline int __must_check umc_driver_register(struct umc_driver *umc_drv)
+{
+	return __umc_driver_register(umc_drv, THIS_MODULE, KBUILD_MODNAME);
+}
+void umc_driver_unregister(struct umc_driver *umc_drv);
+
+/*
+ * Utility function you can use to match (umc_driver->match) against a
+ * null-terminated array of 'struct pci_device_id' in
+ * umc_driver->match_data.
+ */
+int umc_match_pci_id(struct umc_driver *umc_drv, struct umc_dev *umc);
+
+/**
+ * umc_parent_pci_dev - return the UMC's parent PCI device or NULL if none
+ * @umc_dev: UMC device whose parent PCI device we are looking for
+ *
+ * DIRTY!!! DON'T RELY ON THIS
+ *
+ * FIXME: This is as dirty as it gets, but we need some way to check
+ * the correct type of umc_dev->parent (so that for example, we can
+ * cast to pci_dev). Casting to pci_dev is necessary because at some
+ * point we need to request resources from the device. Mapping is
+ * easily overcome (ioremap and stuff are bus agnostic), but hooking
+ * up to some error handlers (such as pci error handlers) might need
+ * this.
+ *
+ * THIS might (probably will) be removed in the future, so don't count
+ * on it.
+ */
+static inline struct pci_dev *umc_parent_pci_dev(struct umc_dev *umc_dev)
+{
+	struct pci_dev *pci_dev = NULL;
+	if (umc_dev->dev.parent->bus == &pci_bus_type)
+		pci_dev = to_pci_dev(umc_dev->dev.parent);
+	return pci_dev;
+}
+
+/**
+ * umc_dev_get() - reference a UMC device.
+ * @umc_dev: Pointer to UMC device.
+ *
+ * NOTE: we are assuming in this whole scheme that the parent device
+ *       is referenced at _probe() time and unreferenced at _remove()
+ *       time by the parent's subsystem.
+ */
+static inline struct umc_dev *umc_dev_get(struct umc_dev *umc_dev)
+{
+	get_device(&umc_dev->dev);
+	return umc_dev;
+}
+
+/**
+ * umc_dev_put() - unreference a UMC device.
+ * @umc_dev: Pointer to UMC device.
+ */
+static inline void umc_dev_put(struct umc_dev *umc_dev)
+{
+	put_device(&umc_dev->dev);
+}
+
+/**
+ * umc_set_drvdata - set UMC device's driver data.
+ * @umc_dev: Pointer to UMC device.
+ * @data:    Data to set.
+ */
+static inline void umc_set_drvdata(struct umc_dev *umc_dev, void *data)
+{
+	dev_set_drvdata(&umc_dev->dev, data);
+}
+
+/**
+ * umc_get_drvdata - recover UMC device's driver data.
+ * @umc_dev: Pointer to UMC device.
+ */
+static inline void *umc_get_drvdata(struct umc_dev *umc_dev)
+{
+	return dev_get_drvdata(&umc_dev->dev);
+}
+
+int umc_controller_reset(struct umc_dev *umc);
+
+#endif /* #ifndef _LINUX_UWB_UMC_H_ */
-- 
cgit v1.2.3


From 8f1b678ab900c2bda1620dfb6e1f1f02604fc3a2 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Wed, 17 Sep 2008 16:34:13 +0100
Subject: uwb: add the driver to enumerate WHCI capabilities

This enumerates the capabilities of a WHCI device, adding a umc device for
each one.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/Makefile     |   2 +-
 drivers/uwb/whci.c       | 269 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/uwb/whci.h | 117 +++++++++++++++++++++
 3 files changed, 387 insertions(+), 1 deletion(-)
 create mode 100644 drivers/uwb/whci.c
 create mode 100644 include/linux/uwb/whci.h

(limited to 'include/linux')

diff --git a/drivers/uwb/Makefile b/drivers/uwb/Makefile
index 41c9fca5f875..b054471af28d 100644
--- a/drivers/uwb/Makefile
+++ b/drivers/uwb/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_UWB)		+= uwb.o
-obj-$(CONFIG_UWB_WHCI)		+= umc.o
+obj-$(CONFIG_UWB_WHCI)		+= umc.o whci.o
 
 uwb-objs :=		\
 	address.o	\
diff --git a/drivers/uwb/whci.c b/drivers/uwb/whci.c
new file mode 100644
index 000000000000..3df2388f908f
--- /dev/null
+++ b/drivers/uwb/whci.c
@@ -0,0 +1,269 @@
+/*
+ * WHCI UWB Multi-interface Controller enumerator.
+ *
+ * Copyright (C) 2007 Cambridge Silicon Radio Ltd.
+ *
+ * This file is released under the GNU GPL v2.
+ */
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/uwb/whci.h>
+#include <linux/uwb/umc.h>
+
+struct whci_card {
+	struct pci_dev *pci;
+	void __iomem *uwbbase;
+	u8 n_caps;
+	struct umc_dev *devs[0];
+};
+
+
+/* Fix faulty HW :( - patch the size/ID fields of known-bad Intel capdata. */
+static
+u64 whci_capdata_quirks(struct whci_card *card, u64 capdata)
+{
+	u64 capdata_orig = capdata;
+	struct pci_dev *pci_dev = card->pci;
+	if (pci_dev->vendor == PCI_VENDOR_ID_INTEL
+	    && (pci_dev->device == 0x0c3b || pci_dev->device == 0x0004)
+	    && pci_dev->class == 0x0d1010) {
+		switch (UWBCAPDATA_TO_CAP_ID(capdata)) {
+			/* WLP capability has 0x100 bytes of aperture */
+		case 0x80:
+			capdata |= 0x40 << 8; break;
+			/* WUSB capability has 0x80 bytes of aperture
+			 * and ID is 1 */
+		case 0x02:
+			capdata &= ~0xffff;
+			capdata |= 0x2001;
+			break;
+		}
+	}
+	if (capdata_orig != capdata)
+		dev_warn(&pci_dev->dev,
+			 "PCI v%04x d%04x c%06x#%02x: "
+			 "corrected capdata from %016Lx to %016Lx\n",
+			 pci_dev->vendor, pci_dev->device, pci_dev->class,
+			 (unsigned)UWBCAPDATA_TO_CAP_ID(capdata),
+			 (unsigned long long)capdata_orig,
+			 (unsigned long long)capdata);
+	return capdata;
+}
+
+
+/**
+ * whci_wait_for - wait for a WHCI register to be set
+ * @tag: name of the awaited condition, used in the timeout error message
+ *
+ * Polls every 10 ms (for roughly @max_ms ms at most) until
+ * '*@reg & @mask == @result'. Returns 0 on match, -ETIMEDOUT on timeout.
+ */
+int whci_wait_for(struct device *dev, u32 __iomem *reg, u32 mask, u32 result,
+	unsigned long max_ms, const char *tag)
+{
+	unsigned t = 0;
+
+	while ((le_readl(reg) & mask) != result) {
+		msleep(10);
+		if (t >= max_ms) {
+			/* Terminate the log line so later messages start fresh. */
+			dev_err(dev, "timed out waiting for %s\n", tag);
+			return -ETIMEDOUT;
+		}
+		t += 10;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(whci_wait_for);
+
+
+/*
+ * NOTE: the capinfo and capdata registers are slightly different
+ *       (size and cap-id fields). So for cap #0, we need to fill
+ *       in. Size comes from the size of the register block
+ *       (statically calculated); cap_id comes from nowhere, we use
+ *       zero, that is reserved, for the radio controller, because
+ *       none was defined at the spec level.
+ */
+static int whci_add_cap(struct whci_card *card, int n)
+{
+	struct umc_dev *umc;
+	u64 capdata;
+	int bar, err;
+
+	umc = umc_device_create(&card->pci->dev, n);
+	if (umc == NULL)
+		return -ENOMEM;
+
+	capdata = le_readq(card->uwbbase + UWBCAPDATA(n));
+
+	bar = UWBCAPDATA_TO_BAR(capdata) << 1;
+
+	capdata = whci_capdata_quirks(card, capdata);
+	/* Capability 0 is the radio controller. Its size is 32
+	 * bytes (WHCI0.95[2.3, T2-9]). */
+	umc->version         = UWBCAPDATA_TO_VERSION(capdata);
+	umc->cap_id          = n == 0 ? 0 : UWBCAPDATA_TO_CAP_ID(capdata);
+	umc->bar	     = bar;
+	umc->resource.start  = pci_resource_start(card->pci, bar)
+		+ UWBCAPDATA_TO_OFFSET(capdata);
+	umc->resource.end    = umc->resource.start
+		+ (n == 0 ? 0x20 : UWBCAPDATA_TO_SIZE(capdata)) - 1;
+	umc->resource.name   = umc->dev.bus_id;
+	umc->resource.flags  = card->pci->resource[bar].flags;
+	umc->resource.parent = &card->pci->resource[bar];
+	umc->irq             = card->pci->irq;
+
+	err = umc_device_register(umc);
+	if (err < 0)
+		goto error;
+	card->devs[n] = umc;
+	return 0;
+
+error:
+	kfree(umc);
+	return err;
+}
+
+static void whci_del_cap(struct whci_card *card, int n)
+{
+	struct umc_dev *umc = card->devs[n];
+
+	if (umc != NULL)
+		umc_device_unregister(umc);
+}
+
+static int whci_n_caps(struct pci_dev *pci)
+{
+	void __iomem *uwbbase;
+	u64 capinfo;
+
+	uwbbase = pci_iomap(pci, 0, 8);
+	if (!uwbbase)
+		return -ENOMEM;
+	capinfo = le_readq(uwbbase + UWBCAPINFO);
+	pci_iounmap(pci, uwbbase);
+
+	return UWBCAPINFO_TO_N_CAPS(capinfo);
+}
+
+static int whci_probe(struct pci_dev *pci, const struct pci_device_id *id)
+{
+	struct whci_card *card;
+	int err, n_caps, n;
+
+	err = pci_enable_device(pci);
+	if (err < 0)
+		goto error;
+	pci_enable_msi(pci);
+	pci_set_master(pci);
+	err = -ENXIO;
+	if (!pci_set_dma_mask(pci, DMA_64BIT_MASK))
+		pci_set_consistent_dma_mask(pci, DMA_64BIT_MASK);
+	else if (!pci_set_dma_mask(pci, DMA_32BIT_MASK))
+		pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK);
+	else
+		goto error_dma;
+
+	err = n_caps = whci_n_caps(pci);
+	if (n_caps < 0)
+		goto error_ncaps;
+	/* devs[] needs n_caps + 1 slots: the loop below fills 0..n_caps. */
+	err = -ENOMEM;
+	card = kzalloc(sizeof(struct whci_card)
+		       + sizeof(struct umc_dev *) * (n_caps + 1),
+		       GFP_KERNEL);
+	if (card == NULL)
+		goto error_kzalloc;
+	card->pci = pci;
+	card->n_caps = n_caps;
+	/* Claim and map the capability table at the start of BAR 0. */
+	err = -EBUSY;
+	if (!request_mem_region(pci_resource_start(pci, 0),
+				UWBCAPDATA_SIZE(card->n_caps),
+				"whci (capability data)"))
+		goto error_request_memregion;
+	err = -ENOMEM;
+	card->uwbbase = pci_iomap(pci, 0, UWBCAPDATA_SIZE(card->n_caps));
+	if (!card->uwbbase)
+		goto error_iomap;
+
+	/* Add each capability; only cap #0 (radio controller) is fatal. */
+	for (n = 0; n <= card->n_caps; n++) {
+		err = whci_add_cap(card, n);
+		if (err < 0 && n == 0) {
+			dev_err(&pci->dev, "cannot bind UWB radio controller:"
+				" %d\n", err);
+			goto error_bind;
+		}
+		if (err < 0)
+			dev_warn(&pci->dev, "warning: cannot bind capability "
+				 "#%u: %d\n", n, err);
+	}
+	pci_set_drvdata(pci, card);
+	return 0;
+
+error_bind:
+	pci_iounmap(pci, card->uwbbase);
+error_iomap:
+	release_mem_region(pci_resource_start(pci, 0), UWBCAPDATA_SIZE(card->n_caps));
+error_request_memregion:
+	kfree(card);
+error_kzalloc:
+error_ncaps:
+error_dma:
+	pci_disable_msi(pci);
+	pci_disable_device(pci);
+error:
+	return err;
+}
+
+static void whci_remove(struct pci_dev *pci)
+{
+	struct whci_card *card = pci_get_drvdata(pci);
+	int n;
+
+	pci_set_drvdata(pci, NULL);
+	/* Unregister each capability in reverse (so the master device
+	 * is unregistered last). */
+	for (n = card->n_caps; n >= 0 ; n--)
+		whci_del_cap(card, n);
+	pci_iounmap(pci, card->uwbbase);
+	release_mem_region(pci_resource_start(pci, 0), UWBCAPDATA_SIZE(card->n_caps));
+	kfree(card);
+	pci_disable_msi(pci);
+	pci_disable_device(pci);
+}
+
+static struct pci_device_id whci_id_table[] = {
+	{ PCI_DEVICE_CLASS(PCI_CLASS_WIRELESS_WHCI, ~0) },
+	{ 0 },
+};
+MODULE_DEVICE_TABLE(pci, whci_id_table);
+
+
+static struct pci_driver whci_driver = {
+	.name     = "whci",
+	.id_table = whci_id_table,
+	.probe    = whci_probe,
+	.remove   = whci_remove,
+};
+
+static int __init whci_init(void)
+{
+	return pci_register_driver(&whci_driver);
+}
+
+static void __exit whci_exit(void)
+{
+	pci_unregister_driver(&whci_driver);
+}
+
+module_init(whci_init);
+module_exit(whci_exit);
+
+MODULE_DESCRIPTION("WHCI UWB Multi-interface Controller enumerator");
+MODULE_AUTHOR("Cambridge Silicon Radio Ltd.");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/uwb/whci.h b/include/linux/uwb/whci.h
new file mode 100644
index 000000000000..915ec23042d4
--- /dev/null
+++ b/include/linux/uwb/whci.h
@@ -0,0 +1,117 @@
+/*
+ * Wireless Host Controller Interface for Ultra-Wide-Band and Wireless USB
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ *
+ * References:
+ *   [WHCI] Wireless Host Controller Interface Specification for
+ *          Certified Wireless Universal Serial Bus, revision 0.95.
+ */
+#ifndef _LINUX_UWB_WHCI_H_
+#define _LINUX_UWB_WHCI_H_
+
+#include <linux/pci.h>
+
+/*
+ * UWB interface capability registers (offsets from UWBBASE)
+ *
+ * [WHCI] section 2.2
+ */
+#define UWBCAPINFO	0x00 /* == UWBCAPDATA(0) */
+#  define UWBCAPINFO_TO_N_CAPS(c)	(((c) >> 0)  & 0xFull)
+#define UWBCAPDATA(n)	(8*(n))
+#  define UWBCAPDATA_TO_VERSION(c)	(((c) >> 32) & 0xFFFFull)
+#  define UWBCAPDATA_TO_OFFSET(c)	(((c) >> 18) & 0x3FFFull)
+#  define UWBCAPDATA_TO_BAR(c)		(((c) >> 16) & 0x3ull)
+#  define UWBCAPDATA_TO_SIZE(c)		((((c) >> 8) & 0xFFull) * sizeof(u32))
+#  define UWBCAPDATA_TO_CAP_ID(c)	(((c) >> 0)  & 0xFFull)
+
+/* Size of the WHCI capability data (including the RC capability) for
+   a device with n capabilities. */
+#define UWBCAPDATA_SIZE(n) (8 + 8*(n))
+
+
+/*
+ * URC registers (offsets from URCBASE)
+ *
+ * [WHCI] section 2.3
+ */
+#define URCCMD		0x00
+#  define URCCMD_RESET		(1 << 31)  /* UMC Hardware reset */
+#  define URCCMD_RS		(1 << 30)  /* Run/Stop */
+#  define URCCMD_EARV		(1 << 29)  /* Event Address Register Valid */
+#  define URCCMD_ACTIVE		(1 << 15)  /* Command is active */
+#  define URCCMD_IWR		(1 << 14)  /* Interrupt When Ready */
+#  define URCCMD_SIZE_MASK	0x00000fff /* Command size mask */
+#define URCSTS		0x04
+#  define URCSTS_EPS		(1 << 17)  /* Event Processing Status */
+#  define URCSTS_HALTED		(1 << 16)  /* RC halted */
+#  define URCSTS_HSE		(1 << 10)  /* Host System Error...fried */
+#  define URCSTS_ER		(1 <<  9)  /* Event Ready */
+#  define URCSTS_RCI		(1 <<  8)  /* Ready for Command Interrupt */
+#  define URCSTS_INT_MASK	0x00000700 /* URC interrupt sources */
+#  define URCSTS_ISI		0x000000ff /* Interrupt Source Identification */
+#define URCINTR		0x08
+#  define URCINTR_EN_ALL	0x000007ff /* Enable all interrupt sources */
+#define URCCMDADDR	0x10
+#define URCEVTADDR	0x18
+#  define URCEVTADDR_OFFSET_MASK 0xfff    /* Event pointer offset mask */
+
+
+/** Write 32 bit @value to little endian register at @addr */
+static inline
+void le_writel(u32 value, void __iomem *addr)
+{
+	iowrite32(value, addr);
+}
+
+
+/** Read from 32 bit little endian register at @addr */
+static inline
+u32 le_readl(void __iomem *addr)
+{
+	return ioread32(addr);
+}
+
+
+/** Write 64 bit @value to little endian register at @addr, low half first */
+static inline
+void le_writeq(u64 value, void __iomem *addr)
+{
+	iowrite32(value, addr);			/* low 32 bits at @addr */
+	iowrite32(value >> 32, addr + 4);	/* high 32 bits at @addr + 4 */
+}
+
+
+/** Read a 64 bit little endian register at @addr as two 32 bit halves */
+static inline
+u64 le_readq(void __iomem *addr)
+{
+	u32 lo = ioread32(addr);	/* low half is read first */
+	u32 hi = ioread32(addr + 4);	/* then the high half */
+
+	return ((u64)hi << 32) | lo;
+}
+
+extern int whci_wait_for(struct device *dev, u32 __iomem *reg,
+			 u32 mask, u32 result,
+			 unsigned long max_ms,  const char *tag);
+
+#endif /* #ifndef _LINUX_UWB_WHCI_H_ */
-- 
cgit v1.2.3


From c7f736484f8ecde4dc1bc8459179c4d65f2ccbe4 Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
Date: Wed, 17 Sep 2008 16:34:22 +0100
Subject: wusb: add the Wireless USB include files.

Common header files derived from the WUSB 1.0 specification.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 include/linux/usb/wusb-wa.h | 271 +++++++++++++++++++++++++++++++
 include/linux/usb/wusb.h    | 376 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 647 insertions(+)
 create mode 100644 include/linux/usb/wusb-wa.h
 create mode 100644 include/linux/usb/wusb.h

(limited to 'include/linux')

diff --git a/include/linux/usb/wusb-wa.h b/include/linux/usb/wusb-wa.h
new file mode 100644
index 000000000000..a102561e7026
--- /dev/null
+++ b/include/linux/usb/wusb-wa.h
@@ -0,0 +1,271 @@
+/*
+ * Wireless USB Wire Adapter constants and structures.
+ *
+ * Copyright (C) 2005-2006 Intel Corporation.
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * FIXME: docs
+ * FIXME: organize properly, group logically
+ *
+ * All the event structures are defined in uwb/spec.h, as they are
+ * common to the WHCI and WUSB radio control interfaces.
+ *
+ * References:
+ *   [WUSB] Wireless Universal Serial Bus Specification, revision 1.0, ch8
+ */
+#ifndef __LINUX_USB_WUSB_WA_H
+#define __LINUX_USB_WUSB_WA_H
+
+/**
+ * Radio Command Request for the Radio Control Interface
+ *
+ * Radio Control Interface command and event codes are the same as
+ * WHCI, and listed in include/linux/uwb.h:UWB_RC_{CMD,EVT}_*
+ */
+enum {
+	WA_EXEC_RC_CMD = 40,	/* Radio Control command Request */
+};
+
+/* Wireless Adapter Requests ([WUSB] table 8-51) */
+enum {
+	WUSB_REQ_ADD_MMC_IE     = 20,
+	WUSB_REQ_REMOVE_MMC_IE  = 21,
+	WUSB_REQ_SET_NUM_DNTS   = 22,
+	WUSB_REQ_SET_CLUSTER_ID = 23,
+	WUSB_REQ_SET_DEV_INFO   = 24,
+	WUSB_REQ_GET_TIME       = 25,
+	WUSB_REQ_SET_STREAM_IDX = 26,
+	WUSB_REQ_SET_WUSB_MAS   = 27,
+};
+
+
+/* Wireless Adapter WUSB Channel Time types ([WUSB] table 8-52) */
+enum {
+	WUSB_TIME_ADJ   = 0,
+	WUSB_TIME_BPST  = 1,
+	WUSB_TIME_WUSB  = 2,
+};
+
+enum {
+	WA_ENABLE = 0x01,
+	WA_RESET = 0x02,
+	RPIPE_PAUSE = 0x1,
+};
+
+/* Responses from Get Status request ([WUSB] section 8.3.1.6) */
+enum {
+	WA_STATUS_ENABLED = 0x01,
+	WA_STATUS_RESETTING = 0x02
+};
+
+enum rpipe_crs {
+	RPIPE_CRS_CTL = 0x01,
+	RPIPE_CRS_ISO = 0x02,
+	RPIPE_CRS_BULK = 0x04,
+	RPIPE_CRS_INTR = 0x08
+};
+
+/**
+ * RPipe descriptor ([WUSB] section 8.5.2.11)
+ *
+ * FIXME: explain rpipes
+ */
+struct usb_rpipe_descriptor {
+	u8 	bLength;
+	u8	bDescriptorType;
+	__le16  wRPipeIndex;
+	__le16	wRequests;
+	__le16	wBlocks;		/* rw if 0 */
+	__le16	wMaxPacketSize;		/* rw? */
+	u8	bHSHubAddress;		/* reserved: 0 */
+	u8	bHSHubPort;		/* ??? FIXME ??? */
+	u8	bSpeed;			/* rw: xfer rate 'enum uwb_phy_rate' */
+	u8	bDeviceAddress;		/* rw: Target device address */
+	u8	bEndpointAddress;	/* rw: Target EP address */
+	u8	bDataSequence;		/* ro: Current Data sequence */
+	__le32	dwCurrentWindow;	/* ro */
+	u8	bMaxDataSequence;	/* ro?: max supported seq */
+	u8	bInterval;		/* rw:  */
+	u8	bOverTheAirInterval;	/* rw:  */
+	u8	bmAttribute;		/* ro?  */
+	u8	bmCharacteristics;	/* ro? enum rpipe_attr, supported xsactions */
+	u8	bmRetryOptions;		/* rw? */
+	__le16	wNumTransactionErrors;	/* rw */
+} __attribute__ ((packed));
+
+/**
+ * Wire Adapter Notification types ([WUSB] sections 8.4.5 & 8.5.4)
+ *
+ * These are the notifications coming on the notification endpoint of
+ * an HWA and a DWA.
+ */
+enum wa_notif_type {
+	DWA_NOTIF_RWAKE = 0x91,
+	DWA_NOTIF_PORTSTATUS = 0x92,
+	WA_NOTIF_TRANSFER = 0x93,
+	HWA_NOTIF_BPST_ADJ = 0x94,
+	HWA_NOTIF_DN = 0x95,
+};
+
+/**
+ * Wire Adapter notification header
+ *
+ * Notifications coming from a wire adapter use a common header
+ * defined in [WUSB] sections 8.4.5 & 8.5.4.
+ */
+struct wa_notif_hdr {
+	u8 bLength;
+	u8 bNotifyType;			/* enum wa_notif_type */
+} __attribute__((packed));
+
+/**
+ * HWA DN Received notification ([WUSB] section 8.5.4.2)
+ *
+ * The DNData is specified in WUSB1.0[7.6]. For each device
+ * notification we received, we just need to dispatch it.
+ *
+ * @dndata:  this is really an array of notifications, but all start
+ *           with the same header.
+ */
+struct hwa_notif_dn {
+	struct wa_notif_hdr hdr;
+	u8 bSourceDeviceAddr;		/* from errata 2005/07 */
+	u8 bmAttributes;
+	struct wusb_dn_hdr dndata[];
+} __attribute__((packed));
+
+/* [WUSB] section 8.3.3 */
+enum wa_xfer_type {
+	WA_XFER_TYPE_CTL = 0x80,
+	WA_XFER_TYPE_BI = 0x81,		/* bulk/interrupt */
+	WA_XFER_TYPE_ISO = 0x82,
+	WA_XFER_RESULT = 0x83,
+	WA_XFER_ABORT = 0x84,
+};
+
+/* [WUSB] section 8.3.3 */
+struct wa_xfer_hdr {
+	u8 bLength;			/* 0x18 */
+	u8 bRequestType;		/* 0x80 WA_REQUEST_TYPE_CTL */
+	__le16 wRPipe;			/* RPipe index */
+	__le32 dwTransferID;		/* Host-assigned ID */
+	__le32 dwTransferLength;	/* Length of data to xfer */
+	u8 bTransferSegment;
+} __attribute__((packed));
+
+struct wa_xfer_ctl {
+	struct wa_xfer_hdr hdr;
+	u8 bmAttribute;
+	__le16 wReserved;
+	struct usb_ctrlrequest baSetupData;
+} __attribute__((packed));
+
+struct wa_xfer_bi {
+	struct wa_xfer_hdr hdr;
+	u8 bReserved;
+	__le16 wReserved;
+} __attribute__((packed));
+
+struct wa_xfer_hwaiso {
+	struct wa_xfer_hdr hdr;
+	u8 bReserved;
+	__le16 wPresentationTime;
+	__le32 dwNumOfPackets;
+	/* FIXME: u8 pktdata[]? */
+} __attribute__((packed));
+
+/* [WUSB] section 8.3.3.5 */
+struct wa_xfer_abort {
+	u8 bLength;
+	u8 bRequestType;
+	__le16 wRPipe;			/* RPipe index */
+	__le32 dwTransferID;		/* Host-assigned ID */
+} __attribute__((packed));
+
+/**
+ * WA Transfer Complete notification ([WUSB] section 8.3.3.3)
+ *
+ */
+struct wa_notif_xfer {
+	struct wa_notif_hdr hdr;
+	u8 bEndpoint;
+	u8 Reserved;
+} __attribute__((packed));
+
+/** Transfer result basic codes [WUSB] table 8-15 */
+enum {
+	WA_XFER_STATUS_SUCCESS,
+	WA_XFER_STATUS_HALTED,
+	WA_XFER_STATUS_DATA_BUFFER_ERROR,
+	WA_XFER_STATUS_BABBLE,
+	WA_XFER_RESERVED,
+	WA_XFER_STATUS_NOT_FOUND,
+	WA_XFER_STATUS_INSUFFICIENT_RESOURCE,
+	WA_XFER_STATUS_TRANSACTION_ERROR,
+	WA_XFER_STATUS_ABORTED,
+	WA_XFER_STATUS_RPIPE_NOT_READY,
+	WA_XFER_INVALID_FORMAT,
+	WA_XFER_UNEXPECTED_SEGMENT_NUMBER,
+	WA_XFER_STATUS_RPIPE_TYPE_MISMATCH,
+};
+
+/** [WUSB] section 8.3.3.4 */
+struct wa_xfer_result {
+	struct wa_notif_hdr hdr;
+	__le32 dwTransferID;
+	__le32 dwTransferLength;
+	u8     bTransferSegment;
+	u8     bTransferStatus;
+	__le32 dwNumOfPackets;
+} __attribute__((packed));
+
+/**
+ * Wire Adapter Class Descriptor ([WUSB] section 8.5.2.7).
+ *
+ * NOTE: 16-bit fields are Little Endian (__le16); use le16_to_cpu().
+ *
+ * @bNumPorts is the original max number of devices that the host can
+ *            connect; we might chop this so the stack can handle
+ *            it. In case you need to access it, use wusbhc->ports_max
+ *            if it is a Wireless USB WA.
+ */
+struct usb_wa_descriptor {
+	u8	bLength;
+	u8	bDescriptorType;
+	__le16	bcdWAVersion;
+	u8	bNumPorts;		/* don't use!! */
+	u8	bmAttributes;		/* Reserved == 0 */
+	__le16	wNumRPipes;
+	__le16	wRPipeMaxBlock;
+	u8	bRPipeBlockSize;
+	u8	bPwrOn2PwrGood;
+	u8	bNumMMCIEs;
+	u8	DeviceRemovable;	/* FIXME: in DWA this is up to 16 bytes */
+} __attribute__((packed));
+
+/**
+ * HWA Device Information Buffer (WUSB1.0[T8.54])
+ */
+struct hwa_dev_info {
+	u8	bmDeviceAvailability[32];       /* FIXME: ignored for now */
+	u8	bDeviceAddress;
+	__le16	wPHYRates;
+	u8	bmDeviceAttribute;
+} __attribute__((packed));
+
+#endif /* #ifndef __LINUX_USB_WUSB_WA_H */
diff --git a/include/linux/usb/wusb.h b/include/linux/usb/wusb.h
new file mode 100644
index 000000000000..5f401b644ed5
--- /dev/null
+++ b/include/linux/usb/wusb.h
@@ -0,0 +1,376 @@
+/*
+ * Wireless USB Standard Definitions
+ * Event Size Tables
+ *
+ * Copyright (C) 2005-2006 Intel Corporation
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * FIXME: docs
+ * FIXME: organize properly, group logically
+ *
+ * All the event structures are defined in uwb/spec.h, as they are
+ * common to the WHCI and WUSB radio control interfaces.
+ */
+
+#ifndef __WUSB_H__
+#define __WUSB_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uwb/spec.h>
+#include <linux/usb/ch9.h>
+#include <linux/param.h>
+
+/**
+ * WUSB Information Element header
+ *
+ * I don't know why, they decided to make it different to the MBOA MAC
+ * IE Header; beats me.
+ */
+struct wuie_hdr {
+	u8 bLength;
+	u8 bIEIdentifier;
+} __attribute__((packed));
+
+enum {
+	WUIE_ID_WCTA = 0x80,
+	WUIE_ID_CONNECTACK,
+	WUIE_ID_HOST_INFO,
+	WUIE_ID_CHANGE_ANNOUNCE,
+	WUIE_ID_DEVICE_DISCONNECT,
+	WUIE_ID_HOST_DISCONNECT,
+	WUIE_ID_KEEP_ALIVE = 0x89,
+	WUIE_ID_ISOCH_DISCARD,
+	WUIE_ID_RESET_DEVICE,
+};
+
+/**
+ * Maximum number of array elements in a WUSB IE.
+ *
+ * WUSB1.0[7.5 before table 7-38] says that WUSB IEs that
+ * are "arrays" have to be limited to 4 elements. So we define it
+ * like that to ease up and submit only the needed size.
+ */
+#define WUIE_ELT_MAX 4
+
+/**
+ * Wrapper for the data that defines a CHID, a CDID or a CK
+ *
+ * WUSB defines that CHIDs, CDIDs and CKs are a 16 byte string of
+ * data. In order to avoid confusion and enforce types, we wrap it.
+ *
+ * Make it packed, as we use it in some hw definitions.
+ */
+struct wusb_ckhdid {
+	u8 data[16];
+} __attribute__((packed));
+
+/* All-zero CHID/CDID/CK value. */
+static const struct wusb_ckhdid wusb_ckhdid_zero = { .data = { 0 } };
+
+#define WUSB_CKHDID_STRSIZE (3 * sizeof(struct wusb_ckhdid) + 1)
+
+/**
+ * WUSB IE: Host Information (WUSB1.0[7.5.2])
+ *
+ * Used to provide information about the host to the Wireless USB
+ * devices in range (CHID can be used as an ASCII string).
+ */
+struct wuie_host_info {
+	struct wuie_hdr hdr;
+	__le16 attributes;
+	struct wusb_ckhdid CHID;
+} __attribute__((packed));
+
+/**
+ * WUSB IE: Connect Ack (WUSB1.0[7.5.1])
+ *
+ * Used to acknowledge device connect requests. See note for
+ * WUIE_ELT_MAX.
+ */
+struct wuie_connect_ack {
+	struct wuie_hdr hdr;
+	struct {
+		struct wusb_ckhdid CDID;
+		u8 bDeviceAddress;	/* 0 means unused */
+		u8 bReserved;
+	} blk[WUIE_ELT_MAX];
+} __attribute__((packed));
+
+/**
+ * WUSB IE Host Information Element, Connect Availability
+ *
+ * WUSB1.0[7.5.2], bmAttributes description
+ */
+enum {
+	WUIE_HI_CAP_RECONNECT = 0,
+	WUIE_HI_CAP_LIMITED,
+	WUIE_HI_CAP_RESERVED,
+	WUIE_HI_CAP_ALL,
+};
+
+/**
+ * WUSB IE: Channel Stop (WUSB1.0[7.5.8])
+ *
+ * Tells devices the host is going to stop sending MMCs and will disappear.
+ */
+struct wuie_channel_stop {
+	struct wuie_hdr hdr;
+	u8 attributes;
+	u8 timestamp[3];
+} __attribute__((packed));
+
+/**
+ * WUSB IE: Keepalive (WUSB1.0[7.5.9])
+ *
+ * Ask device(s) to send keepalives.
+ */
+struct wuie_keep_alive {
+	struct wuie_hdr hdr;
+	u8 bDeviceAddress[WUIE_ELT_MAX];
+} __attribute__((packed));
+
+/**
+ * WUSB IE: Reset device (WUSB1.0[7.5.11])
+ *
+ * Tell device to reset; in all truth, we can fit 4 CDIDs, but we only
+ * use it for one at the time...
+ *
+ * In any case, this request is a wee bit silly: why don't they target
+ * by address??
+ */
+struct wuie_reset {
+	struct wuie_hdr hdr;
+	struct wusb_ckhdid CDID;
+} __attribute__((packed));
+
+/**
+ * WUSB IE: Disconnect device (WUSB1.0[7.5.11])
+ *
+ * Tell device to disconnect; we can fit 4 addresses, but we only use
+ * it for one at the time...
+ */
+struct wuie_disconnect {
+	struct wuie_hdr hdr;
+	u8 bDeviceAddress;
+	u8 padding;
+} __attribute__((packed));
+
+/**
+ * WUSB IE: Host disconnect ([WUSB] section 7.5.5)
+ *
+ * Tells all connected devices to disconnect.
+ */
+struct wuie_host_disconnect {
+	struct wuie_hdr hdr;
+} __attribute__((packed));
+
+/**
+ * WUSB Device Notification header (WUSB1.0[7.6])
+ */
+struct wusb_dn_hdr {
+	u8 bType;
+	u8 notifdata[];
+} __attribute__((packed));
+
+/** Device Notification codes (WUSB1.0[Table 7-54]) */
+enum WUSB_DN {
+	WUSB_DN_CONNECT = 0x01,
+	WUSB_DN_DISCONNECT = 0x02,
+	WUSB_DN_EPRDY = 0x03,
+	WUSB_DN_MASAVAILCHANGED = 0x04,
+	WUSB_DN_RWAKE = 0x05,
+	WUSB_DN_SLEEP = 0x06,
+	WUSB_DN_ALIVE = 0x07,
+};
+
+/** WUSB Device Notification Connect */
+struct wusb_dn_connect {
+	struct wusb_dn_hdr hdr;
+	__le16 attributes;
+	struct wusb_ckhdid CDID;
+} __attribute__((packed));
+
+static inline int wusb_dn_connect_prev_dev_addr(const struct wusb_dn_connect *dn)
+{
+	return le16_to_cpu(dn->attributes) & 0xff;
+}
+
+static inline int wusb_dn_connect_new_connection(const struct wusb_dn_connect *dn)
+{
+	return (le16_to_cpu(dn->attributes) >> 8) & 0x1;
+}
+
+static inline int wusb_dn_connect_beacon_behavior(const struct wusb_dn_connect *dn)
+{
+	return (le16_to_cpu(dn->attributes) >> 9) & 0x03;
+}
+
+/** Device is alive (aka: pong) (WUSB1.0[7.6.7]) */
+struct wusb_dn_alive {
+	struct wusb_dn_hdr hdr;
+} __attribute__((packed));
+
+/** Device is disconnecting (WUSB1.0[7.6.2]) */
+struct wusb_dn_disconnect {
+	struct wusb_dn_hdr hdr;
+} __attribute__((packed));
+
+/* General constants */
+enum {
+	WUSB_TRUST_TIMEOUT_MS = 4000,	/* [WUSB] section 4.15.1 */
+};
+
+static inline size_t ckhdid_printf(char *pr_ckhdid, size_t size,
+				   const struct wusb_ckhdid *ckhdid)
+{
+	return scnprintf(pr_ckhdid, size,
+			 "%02hx %02hx %02hx %02hx %02hx %02hx %02hx %02hx "
+			 "%02hx %02hx %02hx %02hx %02hx %02hx %02hx %02hx",
+			 ckhdid->data[0],  ckhdid->data[1],
+			 ckhdid->data[2],  ckhdid->data[3],
+			 ckhdid->data[4],  ckhdid->data[5],
+			 ckhdid->data[6],  ckhdid->data[7],
+			 ckhdid->data[8],  ckhdid->data[9],
+			 ckhdid->data[10], ckhdid->data[11],
+			 ckhdid->data[12], ckhdid->data[13],
+			 ckhdid->data[14], ckhdid->data[15]);
+}
+
+/*
+ * WUSB Crypto stuff (WUSB1.0[6])
+ */
+
+extern const char *wusb_et_name(u8);
+
+/**
+ * WUSB key index WUSB1.0[7.3.2.4], for usage when setting keys for
+ * the host or the device.
+ */
+static inline u8 wusb_key_index(int index, int type, int originator)
+{
+	return (originator << 6) | (type << 4) | index;
+}
+
+#define WUSB_KEY_INDEX_TYPE_PTK			0 /* for HWA only */
+#define WUSB_KEY_INDEX_TYPE_ASSOC		1
+#define WUSB_KEY_INDEX_TYPE_GTK			2
+#define WUSB_KEY_INDEX_ORIGINATOR_HOST		0
+#define WUSB_KEY_INDEX_ORIGINATOR_DEVICE	1
+
+/* A CCM Nonce, defined in WUSB1.0[6.4.1] */
+struct aes_ccm_nonce {
+	u8 sfn[6];              /* Little Endian */
+	u8 tkid[3];             /* LE */
+	struct uwb_dev_addr dest_addr;
+	struct uwb_dev_addr src_addr;
+} __attribute__((packed));
+
+/* A CCM operation label, defined on WUSB1.0[6.5.x] */
+struct aes_ccm_label {
+	u8 data[14];
+} __attribute__((packed));
+
+/*
+ * Input to the key derivation sequence defined in
+ * WUSB1.0[6.5.1]. Rest of the data is in the CCM Nonce passed to the
+ * PRF function.
+ */
+struct wusb_keydvt_in {
+	u8 hnonce[16];
+	u8 dnonce[16];
+} __attribute__((packed));
+
+/*
+ * Output from the key derivation sequence defined in
+ * WUSB1.0[6.5.1].
+ */
+struct wusb_keydvt_out {
+	u8 kck[16];
+	u8 ptk[16];
+} __attribute__((packed));
+
+/* Pseudo Random Function WUSB1.0[6.5] */
+extern int wusb_crypto_init(void);
+extern void wusb_crypto_exit(void);
+extern ssize_t wusb_prf(void *out, size_t out_size,
+			const u8 key[16], const struct aes_ccm_nonce *_n,
+			const struct aes_ccm_label *a,
+			const void *b, size_t blen, size_t len);
+
+static inline int wusb_prf_64(void *out, size_t out_size, const u8 key[16],
+			      const struct aes_ccm_nonce *n,
+			      const struct aes_ccm_label *a,
+			      const void *b, size_t blen)
+{
+	return wusb_prf(out, out_size, key, n, a, b, blen, 64);
+}
+
+static inline int wusb_prf_128(void *out, size_t out_size, const u8 key[16],
+			       const struct aes_ccm_nonce *n,
+			       const struct aes_ccm_label *a,
+			       const void *b, size_t blen)
+{
+	return wusb_prf(out, out_size, key, n, a, b, blen, 128);
+}
+
+static inline int wusb_prf_256(void *out, size_t out_size, const u8 key[16],
+			       const struct aes_ccm_nonce *n,
+			       const struct aes_ccm_label *a,
+			       const void *b, size_t blen)
+{
+	return wusb_prf(out, out_size, key, n, a, b, blen, 256);
+}
+
+/* Key derivation WUSB1.0[6.5.1] */
+static inline int wusb_key_derive(struct wusb_keydvt_out *keydvt_out,
+				  const u8 key[16],
+				  const struct aes_ccm_nonce *n,
+				  const struct wusb_keydvt_in *keydvt_in)
+{
+	const struct aes_ccm_label a = { .data = "Pair-wise keys" };
+	return wusb_prf_256(keydvt_out, sizeof(*keydvt_out), key, n, &a,
+			    keydvt_in, sizeof(*keydvt_in));
+}
+
+/*
+ * Out-of-band MIC Generation WUSB1.0[6.5.2]
+ *
+ * Compute the MIC over @key, @n and @hs and place it in @mic_out.
+ *
+ * @mic_out:  Where to place the 8 byte MIC tag
+ * @key:      KCK from the derivation process
+ * @n:        CCM nonce, n->sfn == 0, TKID as established in the
+ *            process.
+ * @hs:       Handshake struct for phase 2 of the 4-way.
+ *            hs->bStatus and hs->bReserved are zero.
+ *            hs->bMessageNumber is 2 (WUSB1.0[7.3.2.5.2])
+ *            hs->dest_addr is the device's USB address padded with 0
+ *            hs->src_addr is the host's UWB device address
+ *            hs->MIC is ignored (as we compute that value).
+ */
+static inline int wusb_oob_mic(u8 mic_out[8], const u8 key[16],
+			       const struct aes_ccm_nonce *n,
+			       const struct usb_handshake *hs)
+{
+	const struct aes_ccm_label a = { .data = "out-of-bandMIC" };
+	return wusb_prf_64(mic_out, 8, key, n, &a,
+			   hs, sizeof(*hs) - sizeof(hs->MIC));
+}
+
+#endif /* #ifndef __WUSB_H__ */
-- 
cgit v1.2.3


From b60066c141997ac2e4ef08459b75638ae86ae781 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Wed, 17 Sep 2008 16:34:40 +0100
Subject: uwb: add symlinks in sysfs between radio controllers and PALs

Add a facility for PALs to have symlinks to their radio controller
(and vice-versa) and make WUSB host controllers use this.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/usb/wusbcore/pal.c    |  3 +++
 drivers/usb/wusbcore/wusbhc.c | 16 +++++++++-------
 drivers/uwb/pal.c             | 20 ++++++++++++++++++++
 include/linux/uwb.h           |  6 +++++-
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/wusbcore/pal.c b/drivers/usb/wusbcore/pal.c
index cc126b444734..7cc51e9905cf 100644
--- a/drivers/usb/wusbcore/pal.c
+++ b/drivers/usb/wusbcore/pal.c
@@ -26,6 +26,9 @@ int wusbhc_pal_register(struct wusbhc *wusbhc)
 {
 	uwb_pal_init(&wusbhc->pal);
 
+	wusbhc->pal.name   = "wusbhc";
+	wusbhc->pal.device = wusbhc->usb_hcd.self.controller;
+
 	return uwb_pal_register(wusbhc->uwb_rc, &wusbhc->pal);
 }
 
diff --git a/drivers/usb/wusbcore/wusbhc.c b/drivers/usb/wusbcore/wusbhc.c
index 1149b1e59c86..07c63a31c799 100644
--- a/drivers/usb/wusbcore/wusbhc.c
+++ b/drivers/usb/wusbcore/wusbhc.c
@@ -192,13 +192,8 @@ int wusbhc_create(struct wusbhc *wusbhc)
 	result = wusbhc_sec_create(wusbhc);
 	if (result < 0)
 		goto error_sec_create;
-	result = wusbhc_pal_register(wusbhc);
-	if (result < 0)
-		goto error_pal_register;
 	return 0;
 
-error_pal_register:
-	wusbhc_sec_destroy(wusbhc);
 error_sec_create:
 	wusbhc_rh_destroy(wusbhc);
 error_rh_create:
@@ -235,7 +230,14 @@ int wusbhc_b_create(struct wusbhc *wusbhc)
 		dev_err(dev, "Cannot register WUSBHC attributes: %d\n", result);
 		goto error_create_attr_group;
 	}
-	/* Yep, I plan to add stuff here... */
+
+	result = wusbhc_pal_register(wusbhc);
+	if (result < 0)
+		goto error_pal_register;
+	return 0;
+
+error_pal_register:
+	sysfs_remove_group(wusbhc_kobj(wusbhc), &wusbhc_attr_group);
 error_create_attr_group:
 	return result;
 }
@@ -243,13 +245,13 @@ EXPORT_SYMBOL_GPL(wusbhc_b_create);
 
 void wusbhc_b_destroy(struct wusbhc *wusbhc)
 {
+	wusbhc_pal_unregister(wusbhc);
 	sysfs_remove_group(wusbhc_kobj(wusbhc), &wusbhc_attr_group);
 }
 EXPORT_SYMBOL_GPL(wusbhc_b_destroy);
 
 void wusbhc_destroy(struct wusbhc *wusbhc)
 {
-	wusbhc_pal_unregister(wusbhc);
 	wusbhc_sec_destroy(wusbhc);
 	wusbhc_rh_destroy(wusbhc);
 	wusbhc_devconnect_destroy(wusbhc);
diff --git a/drivers/uwb/pal.c b/drivers/uwb/pal.c
index 5508993a820e..1afb38eacb9a 100644
--- a/drivers/uwb/pal.c
+++ b/drivers/uwb/pal.c
@@ -39,6 +39,21 @@ EXPORT_SYMBOL_GPL(uwb_pal_init);
  */
 int uwb_pal_register(struct uwb_rc *rc, struct uwb_pal *pal)
 {
+	int ret;
+
+	if (pal->device) {
+		ret = sysfs_create_link(&pal->device->kobj,
+					&rc->uwb_dev.dev.kobj, "uwb_rc");
+		if (ret < 0)
+			return ret;
+		ret = sysfs_create_link(&rc->uwb_dev.dev.kobj,
+					&pal->device->kobj, pal->name);
+		if (ret < 0) {
+			sysfs_remove_link(&pal->device->kobj, "uwb_rc");
+			return ret;
+		}
+	}
+
 	spin_lock(&rc->pal_lock);
 	list_add(&pal->node, &rc->pals);
 	spin_unlock(&rc->pal_lock);
@@ -57,6 +72,11 @@ void uwb_pal_unregister(struct uwb_rc *rc, struct uwb_pal *pal)
 	spin_lock(&rc->pal_lock);
 	list_del(&pal->node);
 	spin_unlock(&rc->pal_lock);
+
+	if (pal->device) {
+		sysfs_remove_link(&rc->uwb_dev.dev.kobj, pal->name);
+		sysfs_remove_link(&pal->device->kobj, "uwb_rc");
+	}
 }
 EXPORT_SYMBOL_GPL(uwb_pal_unregister);
 
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 0cd35937e120..f9ccbd9a2ced 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -361,6 +361,9 @@ struct uwb_rc {
 
 /**
  * struct uwb_pal - a UWB PAL
+ * @name:    descriptive name for this PAL (wusbhc, wlp, etc.).
+ * @device:  a device for the PAL.  Used to link the PAL and the radio
+ *           controller in sysfs.
  * @new_rsv: called when a peer requests a reservation (may be NULL if
  *           the PAL cannot accept reservation requests).
  *
@@ -379,7 +382,8 @@ struct uwb_rc {
  */
 struct uwb_pal {
 	struct list_head node;
-
+	const char *name;
+	struct device *device;
 	void (*new_rsv)(struct uwb_rsv *rsv);
 };
 
-- 
cgit v1.2.3


From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:44 +0100
Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time()

Peter Zijlstra noticed this 8 months ago and I just noticed
it again.

hrtimer_clock_base::get_softirq_time() is currently unused
in the entire tree. In fact, looking at the logs, it appears
as if it was never used. Remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 2 --
 kernel/hrtimer.c        | 4 +---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cbb..1b079bd29c35 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -145,7 +145,6 @@ struct hrtimer_sleeper {
  * @first:		pointer to the timer node which expires first
  * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
- * @get_softirq_time:	function to retrieve the current time from the softirq
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  * @reprogram:		function to reprogram the timer event
@@ -157,7 +156,6 @@ struct hrtimer_clock_base {
 	struct rb_node		*first;
 	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
-	ktime_t			(*get_softirq_time)(void);
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 03ea1378c43b..4d761d50c529 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1401,9 +1401,7 @@ void hrtimer_run_queues(void)
 		if (!base->first)
 			continue;
 
-		if (base->get_softirq_time)
-			base->softirq_time = base->get_softirq_time();
-		else if (gettime) {
+		if (gettime) {
 			hrtimer_get_softirq_time(cpu_base);
 			gettime = 0;
 		}
-- 
cgit v1.2.3


From b91c4996df56fcd201f85c392a1de7bc3f6641f5 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:48 +0100
Subject: hrtimer: remove hrtimer_clock_base::reprogram()

hrtimer_clock_base::reprogram() also appears to never
have been used, so remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1b079bd29c35..68b0196d8696 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -147,7 +147,6 @@ struct hrtimer_sleeper {
  * @get_time:		function to retrieve the current time of the clock
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
- * @reprogram:		function to reprogram the timer event
  */
 struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
@@ -159,9 +158,6 @@ struct hrtimer_clock_base {
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
-	int			(*reprogram)(struct hrtimer *t,
-					     struct hrtimer_clock_base *b,
-					     ktime_t n);
 #endif
 };
 
-- 
cgit v1.2.3


From a0ad05c75aa362c91f4d9cd91ff375a739574dd8 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Mon, 1 Sep 2008 14:27:02 +0200
Subject: Introduce FW_BUG, FW_WARN and FW_INFO to consistently tell users about
 BIOS bugs

The idea is to add this to printk after the severity:
printk(KERN_ERR FW_BUG "This is not our fault, BIOS developer: fix it by
simply add ...\n");

If a firmware issue should be hidden because it has been
worked around, but you still want to see something popping up, e.g.
for info only:
printk(KERN_INFO FW_INFO "This is done stupid, we can handle it,
but it should better be avoided in future\n");

or on the Linuxfirmwarekit to tell vendors that they did something
stupid or wrong without bothering the user:
printk(KERN_INFO FW_BUG "This is done stupid, we can handle it,
but it should better be avoided in future\n");

Some use cases:
  - If a user sees a [Firmware Bug] message in the kernel log,
    he should first update the BIOS before wasting time
    debugging and submitting reports about old firmware code to
    mailing lists.

  - The linuxfirmwarekit (http://www.linuxfirmwarekit.org)
    tries to detect firmware bugs. It currently is doing that
    in userspace which results in:
        - Huge test scripts that could be a one liner in the kernel
        - A lot of BIOS bugs are already absorbed by the kernel

What do we need such a stupid linuxfirmwarekit for?

  - Vendors: Can test their BIOSes for Linux compatibility.
    There will come a time when vendors realize that the test utils
    on Linux are more strict and that using them increases the quality
    and stability of their products.

  - Vendors: Can easily fix up their BIOSes and be more Linux
    compatible by:
    dmesg |grep "Firmware Bug"
    and send the result to their BIOS developer colleagues who should
    know what the messages are about and how to fix them, without
    the need of studying kernel code.

  - Distributions: can do a first automated HW/BIOS checks.
    This can then be done without the need of asking kernel developers
    who need to dig down the code and explain the details.
    Certification can/will just be rejected until
    dmesg |grep "Firmware Bug" is empty.

  - Thus this can be used as an instrument to enforce cleaner BIOS
    code. Currently every stupid Windows ACPI bug is
    re-implemented in Linux which is a rather unfortunate situation.
    We already have the power to avoid this in e.g. memory
    or cpu hot-plug ACPI implementations, because Linux certification
    is a must for most vendors in the server area.
    Working towards being able to do that in the laptop area
    (vendors are starting to look at Linux here also and will use this tool)
    is the goal. At least provide them with a tool that makes it as easy
    as possible for these guys (e.g. not needing to browse kernel code).

  - The ordinary Linux user: can go into the next shop and boot the
    firmwarekit on his most preferred machines. He chooses one without
    BIOS bugs. Unsupported HW is ok: he likes to try out the latest projects
    which might support it or likes to dig into it on his own, but he
    hates having to work around broken BIOSes like hell.

I double checked with the firmwarekit.
There they have:
So the mapping generally is (also depending on how likely the BIOS is
to blame, this could sometimes be difficult):
FW_INFO  = INFO
FW_WARN  = WARN
FW_BUG   = FAIL

More info about the linuxfirmwarekit and why this is needed
can be found here:
http://www.linuxfirmwarekit.org

While severity matches with the firmwarekit, it might be tricky
to hide messages from the user.
E.g. we recently found out that on HP BIOSes negative temperatures
are returned, which seem to indicate that the thermal zone is
invalid.
We can work around that gracefully by ignoring the thermal zone
and we do not want to bother the ordinary user with a frightening
message: Firmware Bug: thermal management absolutely broken
but want to hide it from the user.

But in the linuxfirmwarekit this should be shown as a real
show stopper (the temperatures could really be wrong,
broken thermal management is one of the worst things
that can happen and the BIOS guys of the machine must
implement this properly).

It is intended to do that (hide it from the user with
KERN_INFO msg, but still print it as a BIOS bug) by:
printk(KERN_INFO FW_BUG "Negative temperature values detected.
Trying to work around it, BIOS must get fixed\n");
Hope that works out..., no idea how to better hide it
as printk is the only way to easily provide this functionality.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/kernel.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6d..0b19848e380e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -190,6 +190,30 @@ extern int kernel_text_address(unsigned long addr);
 struct pid;
 extern struct pid *session_of_pgrp(struct pid *pgrp);
 
+/*
+ * FW_BUG
+ * Add this to a message where you are sure the firmware is buggy or behaves
+ * really stupid or out of spec. Be aware that the responsible BIOS developer
+ * should be able to fix this issue or at least get a concrete idea of the
+ * problem by reading your message without the need of looking at the kernel
+ * code.
+ * 
+ * Use it for definite and high priority BIOS bugs.
+ *
+ * FW_WARN
+ * Use it for less clear-cut (e.g. could the kernel have messed things up
+ * already?) and medium priority BIOS bugs.
+ *
+ * FW_INFO
+ * Use this one if you want to tell the user or vendor about something
+ * suspicious, but generally harmless related to the firmware.
+ *
+ * Use it for information or very low priority BIOS bugs.
+ */
+#define FW_BUG		"[Firmware Bug]: "
+#define FW_WARN		"[Firmware Warn]: "
+#define FW_INFO		"[Firmware Info]: "
+
 #ifdef CONFIG_PRINTK
 asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
-- 
cgit v1.2.3


From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v2

This is the second resubmission of the posix timer rework patch, posted
a few days ago.

This includes the changes from the previous resubmission, which addressed
Oleg Nesterov's comments, removing the RCU stuff from the patch and
un-inlining the thread_group_cputime() function for SMP.

In addition, per Ingo Molnar it simplifies the UP code, consolidating much
of it with the SMP version and depending on lower-level SMP/UP handling to
take care of the differences.

It also cleans up some UP compile errors, moves the scheduler stats-related
macros into kernel/sched_stats.h, cleans up a merge error in
kernel/fork.c and has a few other minor fixes and cleanups as suggested
by Oleg and Ingo. Thanks for the review, guys.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |   1 +
 include/linux/sched.h       | 183 ++------------------------------------------
 kernel/fork.c               |   5 +-
 kernel/posix-cpu-timers.c   | 153 ++++++++++++++++--------------------
 kernel/sched.c              |  47 ++----------
 kernel/sched_stats.h        | 136 ++++++++++++++++++++++++++++++++
 6 files changed, 214 insertions(+), 311 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a91c9c..cac3750cd65e 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -52,6 +52,7 @@ static inline int kstat_irqs(int irq)
 	return sum;
 }
 
+extern unsigned long long task_delta_exec(struct task_struct *);
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ce8d4e53565..b982fb48c8f0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -454,15 +454,9 @@ struct task_cputime {
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU clock calculations.
  */
-#ifdef CONFIG_SMP
 struct thread_group_cputime {
 	struct task_cputime *totals;
 };
-#else
-struct thread_group_cputime {
-	struct task_cputime totals;
-};
-#endif
 
 /*
  * NOTE! "signal_struct" does not have it's own
@@ -2124,193 +2118,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
-#ifdef CONFIG_SMP
 
-extern int thread_group_cputime_alloc_smp(struct task_struct *);
-extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
 	sig->cputime.totals = NULL;
 }
 
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						    struct task_struct *new)
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 {
 	if (curr->signal->cputime.totals)
 		return 0;
-	return thread_group_cputime_alloc_smp(curr);
+	return thread_group_cputime_alloc(curr);
 }
 
-static inline void thread_group_cputime_free(struct signal_struct *sig)
-{
-	free_percpu(sig->cputime.totals);
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * This is a wrapper for the real routine, thread_group_cputime_smp().  See
- * that routine for details.
- */
-static inline void thread_group_cputime(
-	struct task_struct *tsk,
-	struct task_cputime *times)
-{
-	thread_group_cputime_smp(tsk, times);
-}
-
-/**
- * thread_group_cputime_account_user - Maintain utime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the utime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->utime = cputime_add(times->utime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_system - Maintain stime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the stime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->stime = cputime_add(times->stime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
- *						thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of that structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->sum_exec_runtime += ns;
-		put_cpu_no_resched();
-	}
-}
-
-#else /* CONFIG_SMP */
-
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	sig->cputime.totals.utime = cputime_zero;
-	sig->cputime.totals.stime = cputime_zero;
-	sig->cputime.totals.sum_exec_runtime = 0;
-}
-
-static inline int thread_group_cputime_alloc(struct task_struct *tsk)
-{
-	return 0;
-}
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
-}
-
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						     struct task_struct *tsk)
-{
-	return 0;
-}
-
-static inline void thread_group_cputime(struct task_struct *tsk,
-					 struct task_cputime *cputime)
-{
-	*cputime = tsk->signal->cputime.totals;
-}
-
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
-}
-
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
-}
-
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	tgtimes->totals.sum_exec_runtime += ns;
-}
-
-#endif /* CONFIG_SMP */
-
-static inline void account_group_user_time(struct task_struct *tsk,
-					    cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_user(&sig->cputime, cputime);
-}
-
-static inline void account_group_system_time(struct task_struct *tsk,
-					      cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_system(&sig->cputime, cputime);
-}
-
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					       unsigned long long ns)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+	free_percpu(sig->cputime.totals);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 1181b9aac48e..021ae012cc75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,7 +791,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		ret = thread_group_cputime_clone_thread(current, tsk);
+		ret = thread_group_cputime_clone_thread(current);
 		if (likely(!ret)) {
 			atomic_inc(&current->signal->count);
 			atomic_inc(&current->signal->live);
@@ -834,9 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
 	task_io_accounting_init(&sig->ioac);
-	INIT_LIST_HEAD(&sig->cpu_timers[0]);
-	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-	INIT_LIST_HEAD(&sig->cpu_timers[2]);
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9a7ea049fcdc..153dcb2639c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,50 +7,46 @@
 #include <linux/errno.h>
 #include <linux/math64.h>
 #include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
 
-#ifdef CONFIG_SMP
 /*
- * Allocate the thread_group_cputime structure appropriately for SMP kernels
- * and fill in the current values of the fields.  Called from copy_signal()
- * via thread_group_cputime_clone_thread() when adding a second or subsequent
+ * Allocate the thread_group_cputime structure appropriately and fill in the
+ * current values of the fields.  Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when adding a second or subsequent
  * thread to a thread group.  Assumes interrupts are enabled when called.
  */
-int thread_group_cputime_alloc_smp(struct task_struct *tsk)
+int thread_group_cputime_alloc(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	struct task_cputime *cputime;
 
 	/*
 	 * If we have multiple threads and we don't already have a
-	 * per-CPU task_cputime struct, allocate one and fill it in with
-	 * the times accumulated so far.
+	 * per-CPU task_cputime struct (checked in the caller), allocate
+	 * one and fill it in with the times accumulated so far.  We may
+	 * race with another thread so recheck after we pick up the sighand
+	 * lock.
 	 */
-	if (sig->cputime.totals)
-		return 0;
 	cputime = alloc_percpu(struct task_cputime);
 	if (cputime == NULL)
 		return -ENOMEM;
-	read_lock(&tasklist_lock);
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (sig->cputime.totals) {
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		free_percpu(cputime);
 		return 0;
 	}
 	sig->cputime.totals = cputime;
-	cputime = per_cpu_ptr(sig->cputime.totals, get_cpu());
+	cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
 	cputime->utime = tsk->utime;
 	cputime->stime = tsk->stime;
 	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
-	put_cpu_no_resched();
 	spin_unlock_irq(&tsk->sighand->siglock);
-	read_unlock(&tasklist_lock);
 	return 0;
 }
 
 /**
- * thread_group_cputime_smp - Sum the thread group time fields across all CPUs.
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
  *
  * @tsk:	The task we use to identify the thread group.
  * @times:	task_cputime structure in which we return the summed fields.
@@ -58,7 +54,7 @@ int thread_group_cputime_alloc_smp(struct task_struct *tsk)
  * Walk the list of CPUs to sum the per-CPU time fields in the thread group
  * time structure.
  */
-void thread_group_cputime_smp(
+void thread_group_cputime(
 	struct task_struct *tsk,
 	struct task_cputime *times)
 {
@@ -83,8 +79,6 @@ void thread_group_cputime_smp(
 	}
 }
 
-#endif /* CONFIG_SMP */
-
 /*
  * Called after updating RLIMIT_CPU to set timer expiration if necessary.
  */
@@ -300,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = task_sched_runtime(p);
+		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
@@ -309,16 +303,15 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
- * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
  */
-static int cpu_clock_sample_group_locked(unsigned int clock_idx,
-					 struct task_struct *p,
-					 union cpu_time_count *cpu)
+static int cpu_clock_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
 {
 	struct task_cputime cputime;
 
 	thread_group_cputime(p, &cputime);
-	switch (clock_idx) {
+	switch (which_clock) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
@@ -328,29 +321,12 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
 }
 
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
- */
-static int cpu_clock_sample_group(const clockid_t which_clock,
-				  struct task_struct *p,
-				  union cpu_time_count *cpu)
-{
-	int ret;
-	unsigned long flags;
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
-					    cpu);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	return ret;
-}
-
 
 int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
@@ -1324,29 +1300,37 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
  * fastpath_timer_check - POSIX CPU timers fast path.
  *
  * @tsk:	The task (thread) being checked.
- * @sig:	The signal pointer for that task.
  *
- * If there are no timers set return false.  Otherwise snapshot the task and
- * thread group timers, then compare them with the corresponding expiration
- # times.  Returns true if a timer has expired, else returns false.
+ * Check the task and thread group timers.  If both are zero (there are no
+ * timers set) return false.  Otherwise snapshot the task and thread group
+ * timers and compare them with the corresponding expiration times.  Return
+ * true if a timer has expired, else return false.
  */
-static inline int fastpath_timer_check(struct task_struct *tsk,
-					struct signal_struct *sig)
+static inline int fastpath_timer_check(struct task_struct *tsk)
 {
-	struct task_cputime task_sample = {
-		.utime = tsk->utime,
-		.stime = tsk->stime,
-		.sum_exec_runtime = tsk->se.sum_exec_runtime
-	};
-	struct task_cputime group_sample;
+	struct signal_struct *sig = tsk->signal;
 
-	if (task_cputime_zero(&tsk->cputime_expires) &&
-	    task_cputime_zero(&sig->cputime_expires))
+	if (unlikely(!sig))
 		return 0;
-	if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
-		return 1;
-	thread_group_cputime(tsk, &group_sample);
-	return task_cputime_expired(&group_sample, &sig->cputime_expires);
+
+	if (!task_cputime_zero(&tsk->cputime_expires)) {
+		struct task_cputime task_sample = {
+			.utime = tsk->utime,
+			.stime = tsk->stime,
+			.sum_exec_runtime = tsk->se.sum_exec_runtime
+		};
+
+		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+			return 1;
+	}
+	if (!task_cputime_zero(&sig->cputime_expires)) {
+		struct task_cputime group_sample;
+
+		thread_group_cputime(tsk, &group_sample);
+		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+			return 1;
+	}
+	return 0;
 }
 
 /*
@@ -1358,43 +1342,34 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
-	struct signal_struct *sig;
-	struct sighand_struct *sighand;
-	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
-	/* Pick up tsk->signal and make sure it's valid. */
-	sig = tsk->signal;
 	/*
 	 * The fast path checks that there are no expired thread or thread
-	 * group timers.  If that's so, just return.  Also check that
-	 * tsk->signal is non-NULL; this probably can't happen but cover the
-	 * possibility anyway.
+	 * group timers.  If that's so, just return.
 	 */
-	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig))
+	if (!fastpath_timer_check(tsk))
 		return;
 
-	sighand = lock_task_sighand(tsk, &flags);
-	if (likely(sighand)) {
-		/*
-		 * Here we take off tsk->signal->cpu_timers[N] and
-		 * tsk->cpu_timers[N] all the timers that are firing, and
-		 * put them on the firing list.
-		 */
-		check_thread_timers(tsk, &firing);
-		check_process_timers(tsk, &firing);
+	spin_lock(&tsk->sighand->siglock);
+	/*
+	 * Here we take off tsk->signal->cpu_timers[N] and
+	 * tsk->cpu_timers[N] all the timers that are firing, and
+	 * put them on the firing list.
+	 */
+	check_thread_timers(tsk, &firing);
+	check_process_timers(tsk, &firing);
 
-		/*
-		 * We must release these locks before taking any timer's lock.
-		 * There is a potential race with timer deletion here, as the
-		 * siglock now protects our private firing list.  We have set
-		 * the firing flag in each timer, so that a deletion attempt
-		 * that gets the timer lock before we do will give it up and
-		 * spin until we've taken care of that timer below.
-		 */
-	}
-	unlock_task_sighand(tsk, &flags);
+	/*
+	 * We must release these locks before taking any timer's lock.
+	 * There is a potential race with timer deletion here, as the
+	 * siglock now protects our private firing list.  We have set
+	 * the firing flag in each timer, so that a deletion attempt
+	 * that gets the timer lock before we do will give it up and
+	 * spin until we've taken care of that timer below.
+	 */
+	spin_unlock(&tsk->sighand->siglock);
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
@@ -1433,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+	cpu_clock_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
 		if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched.c b/kernel/sched.c
index c51b5d276665..260c22cc530a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4039,55 +4039,22 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
  */
-static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
+	struct rq *rq;
+	unsigned long flags;
+	u64 ns = 0;
+
+	rq = task_rq_lock(p, &flags);
 	if (task_current(rq, p)) {
 		u64 delta_exec;
 
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
-			return delta_exec;
+			ns = delta_exec;
 	}
-	return 0;
-}
-
-/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	u64 ns;
-	struct rq *rq;
-
-	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime + task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
-
-	return ns;
-}
-
-/*
- * Return sum_exec_runtime for the thread group plus any more ns on the
- * sched_clock that have not yet been banked in case the task is currently
- * running.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	u64 ns;
-	struct rq *rq;
-	struct task_cputime totals;
-
-	rq = task_rq_lock(p, &flags);
-	thread_group_cputime(p, &totals);
-	ns = totals.sum_exec_runtime + task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
 
 	return ns;
 }
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..d6903bd0c7a8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -270,3 +270,139 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #define sched_info_switch(t, next)		do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick.  None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+#ifdef CONFIG_SMP
+
+/**
+ * thread_group_cputime_account_user - Maintain utime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the utime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_system - Maintain stime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the stime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
+ *						thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @ns:		Time value by which to increment the sum_exec_runtime field
+ *		of that structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
+
+#else /* CONFIG_SMP */
+
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+}
+
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+}
+
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	tgtimes->totals->sum_exec_runtime += ns;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * These are the generic time-accounting routines that use the above
+ * functions.  They are the functions actually called by the scheduler.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					    cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_user(&sig->cputime, cputime);
+}
+
+static inline void account_group_system_time(struct task_struct *tsk,
+					      cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_system(&sig->cputime, cputime);
+}
+
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					       unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+}
-- 
cgit v1.2.3


From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 22 Sep 2008 14:42:50 -0700
Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value

With the recent changes ->it_sigev_signo and ->it_sigev_value are only
used in sys_timer_create(), kill them.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: mingo@elte.hu
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/posix-timers.h |  2 --
 kernel/posix-timers.c        | 17 +++++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index f9d8e9e94e9b..a7c721355549 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -45,8 +45,6 @@ struct k_itimer {
 	int it_requeue_pending;		/* waiting to requeue this timer */
 #define REQUEUE_PENDING 1
 	int it_sigev_notify;		/* notify word of sigevent struct */
-	int it_sigev_signo;		/* signo word of sigevent struct */
-	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7be385fe4eca..3eff47b0d8d5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -510,10 +510,6 @@ sys_timer_create(const clockid_t which_clock,
 			error = -EFAULT;
 			goto out;
 		}
-		new_timer->it_sigev_notify = event.sigev_notify;
-		new_timer->it_sigev_signo = event.sigev_signo;
-		new_timer->it_sigev_value = event.sigev_value;
-
 		rcu_read_lock();
 		process = good_sigevent(&event);
 		if (process)
@@ -524,17 +520,18 @@ sys_timer_create(const clockid_t which_clock,
 			goto out;
 		}
 	} else {
-		new_timer->it_sigev_notify = SIGEV_SIGNAL;
-		new_timer->it_sigev_signo = SIGALRM;
-		new_timer->it_sigev_value.sival_int = new_timer->it_id;
+		event.sigev_notify = SIGEV_SIGNAL;
+		event.sigev_signo = SIGALRM;
+		event.sigev_value.sival_int = new_timer->it_id;
 		process = current->group_leader;
 		get_task_struct(process);
 	}
 
-	new_timer->sigq->info.si_code  = SI_TIMER;
+	new_timer->it_sigev_notify     = event.sigev_notify;
+	new_timer->sigq->info.si_signo = event.sigev_signo;
+	new_timer->sigq->info.si_value = event.sigev_value;
 	new_timer->sigq->info.si_tid   = new_timer->it_id;
-	new_timer->sigq->info.si_signo = new_timer->it_sigev_signo;
-	new_timer->sigq->info.si_value = new_timer->it_sigev_value;
+	new_timer->sigq->info.si_code  = SI_TIMER;
 
 	spin_lock_irq(&current->sighand->siglock);
 	new_timer->it_process = process;
-- 
cgit v1.2.3


From 1b02469088ac7a13d7e622b618b7410d0f1ce5ec Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Mon, 22 Sep 2008 14:42:43 -0700
Subject: hrtimer: reorder struct hrtimer to save 8 bytes on 64bit builds

Reorder struct hrtimer to save 8 bytes on 64-bit builds when
CONFIG_TIMER_STATS is selected.  (Also removes 8 bytes from signal_struct.)

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 68b0196d8696..8730b60c9432 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -115,12 +115,12 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
-	enum hrtimer_cb_mode		cb_mode;
 	struct list_head		cb_entry;
+	enum hrtimer_cb_mode		cb_mode;
 #ifdef CONFIG_TIMER_STATS
+	int				start_pid;
 	void				*start_site;
 	char				start_comm[16];
-	int				start_pid;
 #endif
 };
 
-- 
cgit v1.2.3


From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Mon, 22 Sep 2008 14:42:44 -0700
Subject: ntp: improve adjtimex frequency rounding

Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits
(19 is the number of factors of 2 in PPM_SCALE).  The output frequency
can then be calculated back to its input value, as the inverse divide
produces a slightly larger value, which is then correctly rounded by the
final shift.

Reported-by: Martin Ziegler <ziegler@uni-freiburg.de>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timex.h | 2 +-
 kernel/time/ntp.c     | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index c00bcdd3ae42..9007313b5b71 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -82,7 +82,7 @@
  */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
 #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
-#define PPM_SCALE_INV_SHIFT 20
+#define PPM_SCALE_INV_SHIFT 19
 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
 		       PPM_SCALE + 1)
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 450a45cb01c1..ddb0465a6baa 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -406,9 +406,8 @@ adj_done:
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
-	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
-					 (s64)PPM_SCALE_INV,
-					 NTP_SCALE_SHIFT);
+	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+					 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
-- 
cgit v1.2.3


From e416de5e61e1a9b7f987804cbb67230b5f5293c6 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 23 Sep 2008 17:25:10 +0100
Subject: Export the ROM enable/disable helpers

... so that they can be used by MTD map drivers.  Lets us close #9420.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/rom.c   | 6 ++++--
 include/linux/pci.h | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index bd5c0e031398..1f5f6143f35c 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -21,7 +21,7 @@
  * between the ROM and other resources, so enabling it may disable access
  * to MMIO registers or other card memory.
  */
-static int pci_enable_rom(struct pci_dev *pdev)
+int pci_enable_rom(struct pci_dev *pdev)
 {
 	struct resource *res = pdev->resource + PCI_ROM_RESOURCE;
 	struct pci_bus_region region;
@@ -45,7 +45,7 @@ static int pci_enable_rom(struct pci_dev *pdev)
  * Disable ROM decoding on a PCI device by turning off the last bit in the
  * ROM BAR.
  */
-static void pci_disable_rom(struct pci_dev *pdev)
+void pci_disable_rom(struct pci_dev *pdev)
 {
 	u32 rom_addr;
 	pci_read_config_dword(pdev, pdev->rom_base_reg, &rom_addr);
@@ -260,3 +260,5 @@ void pci_cleanup_rom(struct pci_dev *pdev)
 
 EXPORT_SYMBOL(pci_map_rom);
 EXPORT_SYMBOL(pci_unmap_rom);
+EXPORT_SYMBOL_GPL(pci_enable_rom);
+EXPORT_SYMBOL_GPL(pci_disable_rom);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c0e14008a3c2..7a4cee00c1d6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -631,6 +631,8 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 
 /* ROM control related routines */
+int pci_enable_rom(struct pci_dev *pdev);
+void pci_disable_rom(struct pci_dev *pdev);
 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
 size_t pci_get_rom_size(void __iomem *rom, size_t size);
-- 
cgit v1.2.3


From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v3

- fix UP lockup
- another set of UP/SMP cleanups and simplifications

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 -
 kernel/sched.c        |   1 -
 kernel/sched_stats.h  | 126 +++++++++++++++-----------------------------------
 3 files changed, 38 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b982fb48c8f0..23d9d5464544 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2134,7 +2134,6 @@ static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 	return thread_group_cputime_alloc(curr);
 }
 
-
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
 	free_percpu(sig->cputime.totals);
diff --git a/kernel/sched.c b/kernel/sched.c
index 260c22cc530a..29a3152c45db 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,7 +4046,6 @@ unsigned long long task_delta_exec(struct task_struct *p)
 	unsigned long flags;
 	u64 ns = 0;
 
-	rq = task_rq_lock(p, &flags);
 	if (task_current(rq, p)) {
 		u64 delta_exec;
 
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index d6903bd0c7a8..b8c156979cf2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -276,133 +276,83 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
  * on CONFIG_SCHEDSTATS.
  */
 
-#ifdef CONFIG_SMP
-
 /**
- * thread_group_cputime_account_user - Maintain utime for a thread group.
+ * account_group_user_time - Maintain utime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the utime field of that
- *		structure.
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the utime field of the
+ *		thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the utime field there.
  */
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
+static inline void account_group_user_time(struct task_struct *tsk,
+					   cputime_t cputime)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->utime = cputime_add(times->utime, cputime);
 		put_cpu_no_resched();
 	}
 }
 
 /**
- * thread_group_cputime_account_system - Maintain stime for a thread group.
+ * account_group_system_time - Maintain stime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the stime field of that
- *		structure.
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the stime field of the
+ *		thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the stime field there.
  */
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
+static inline void account_group_system_time(struct task_struct *tsk,
+					     cputime_t cputime)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->stime = cputime_add(times->stime, cputime);
 		put_cpu_no_resched();
 	}
 }
 
 /**
- * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
- *						thread group.
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @tsk:	Pointer to task structure.
  * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of that structure.
+ *		of the thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the sum_exec_runtime field there.
  */
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					      unsigned long long ns)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->sum_exec_runtime += ns;
 		put_cpu_no_resched();
 	}
 }
-
-#else /* CONFIG_SMP */
-
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
-}
-
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
-}
-
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	tgtimes->totals->sum_exec_runtime += ns;
-}
-
-#endif /* CONFIG_SMP */
-
-/*
- * These are the generic time-accounting routines that use the above
- * functions.  They are the functions actually called by the scheduler.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-					    cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_user(&sig->cputime, cputime);
-}
-
-static inline void account_group_system_time(struct task_struct *tsk,
-					      cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_system(&sig->cputime, cputime);
-}
-
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					       unsigned long long ns)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
-}
-- 
cgit v1.2.3


From bbfbd8b151fe35c9a1180a7f5254c5d6b8387cc0 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 1 Oct 2008 16:13:54 +0900
Subject: sh: Move the shared INTC code out to drivers/sh/

The INTC code will be re-used across different architectures, so move
this out to drivers/sh/ and include/linux/sh_intc.h respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/hw_irq.h    |  92 +-----
 arch/sh/kernel/cpu/irq/Makefile |   2 -
 arch/sh/kernel/cpu/irq/intc.c   | 712 ---------------------------------------
 drivers/sh/Makefile             |   2 +-
 drivers/sh/intc.c               | 713 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sh_intc.h         |  91 +++++
 6 files changed, 807 insertions(+), 805 deletions(-)
 delete mode 100644 arch/sh/kernel/cpu/irq/intc.c
 create mode 100644 drivers/sh/intc.c
 create mode 100644 include/linux/sh_intc.h

(limited to 'include/linux')

diff --git a/arch/sh/include/asm/hw_irq.h b/arch/sh/include/asm/hw_irq.h
index d557b00111bf..603cdde813d1 100644
--- a/arch/sh/include/asm/hw_irq.h
+++ b/arch/sh/include/asm/hw_irq.h
@@ -2,6 +2,7 @@
 #define __ASM_SH_HW_IRQ_H
 
 #include <linux/init.h>
+#include <linux/sh_intc.h>
 #include <asm/atomic.h>
 
 extern atomic_t irq_err_count;
@@ -23,101 +24,12 @@ struct ipr_desc {
 
 void register_ipr_controller(struct ipr_desc *);
 
-typedef unsigned char intc_enum;
-
-struct intc_vect {
-	intc_enum enum_id;
-	unsigned short vect;
-};
-
-#define INTC_VECT(enum_id, vect) { enum_id, vect }
-#define INTC_IRQ(enum_id, irq) INTC_VECT(enum_id, irq2evt(irq))
-
-struct intc_group {
-	intc_enum enum_id;
-	intc_enum enum_ids[32];
-};
-
-#define INTC_GROUP(enum_id, ids...) { enum_id, { ids } }
-
-struct intc_mask_reg {
-	unsigned long set_reg, clr_reg, reg_width;
-	intc_enum enum_ids[32];
-#ifdef CONFIG_SMP
-	unsigned long smp;
-#endif
-};
-
-struct intc_prio_reg {
-	unsigned long set_reg, clr_reg, reg_width, field_width;
-	intc_enum enum_ids[16];
-#ifdef CONFIG_SMP
-	unsigned long smp;
-#endif
-};
-
-struct intc_sense_reg {
-	unsigned long reg, reg_width, field_width;
-	intc_enum enum_ids[16];
-};
-
-#ifdef CONFIG_SMP
-#define INTC_SMP(stride, nr) .smp = (stride) | ((nr) << 8)
-#else
-#define INTC_SMP(stride, nr)
-#endif
-
-struct intc_desc {
-	struct intc_vect *vectors;
-	unsigned int nr_vectors;
-	struct intc_group *groups;
-	unsigned int nr_groups;
-	struct intc_mask_reg *mask_regs;
-	unsigned int nr_mask_regs;
-	struct intc_prio_reg *prio_regs;
-	unsigned int nr_prio_regs;
-	struct intc_sense_reg *sense_regs;
-	unsigned int nr_sense_regs;
-	char *name;
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-	struct intc_mask_reg *ack_regs;
-	unsigned int nr_ack_regs;
-#endif
-};
-
-#define _INTC_ARRAY(a) a, sizeof(a)/sizeof(*a)
-#define DECLARE_INTC_DESC(symbol, chipname, vectors, groups,		\
-	mask_regs, prio_regs, sense_regs)				\
-struct intc_desc symbol __initdata = {					\
-	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
-	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
-	_INTC_ARRAY(sense_regs),					\
-	chipname,							\
-}
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-#define DECLARE_INTC_DESC_ACK(symbol, chipname, vectors, groups,	\
-	mask_regs, prio_regs, sense_regs, ack_regs)			\
-struct intc_desc symbol __initdata = {					\
-	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
-	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
-	_INTC_ARRAY(sense_regs),					\
-	chipname,							\
-	_INTC_ARRAY(ack_regs),						\
-}
-#endif
-
-void __init register_intc_controller(struct intc_desc *desc);
-int intc_set_priority(unsigned int irq, unsigned int prio);
-
 void __init plat_irq_setup(void);
-#ifdef CONFIG_CPU_SH3
 void __init plat_irq_setup_sh3(void);
-#endif
+void __init plat_irq_setup_pins(int mode);
 
 enum { IRQ_MODE_IRQ, IRQ_MODE_IRQ7654, IRQ_MODE_IRQ3210,
        IRQ_MODE_IRL7654_MASK, IRQ_MODE_IRL3210_MASK,
        IRQ_MODE_IRL7654, IRQ_MODE_IRL3210 };
-void __init plat_irq_setup_pins(int mode);
 
 #endif /* __ASM_SH_HW_IRQ_H */
diff --git a/arch/sh/kernel/cpu/irq/Makefile b/arch/sh/kernel/cpu/irq/Makefile
index 462a8f6dfee2..f0c7025a67d1 100644
--- a/arch/sh/kernel/cpu/irq/Makefile
+++ b/arch/sh/kernel/cpu/irq/Makefile
@@ -1,8 +1,6 @@
 #
 # Makefile for the Linux/SuperH CPU-specifc IRQ handlers.
 #
-obj-y	+= intc.o
-
 obj-$(CONFIG_SUPERH32)			+= imask.o
 obj-$(CONFIG_CPU_SH5)			+= intc-sh5.o
 obj-$(CONFIG_CPU_HAS_IPR_IRQ)		+= ipr.o
diff --git a/arch/sh/kernel/cpu/irq/intc.c b/arch/sh/kernel/cpu/irq/intc.c
deleted file mode 100644
index 138efa4e95db..000000000000
--- a/arch/sh/kernel/cpu/irq/intc.c
+++ /dev/null
@@ -1,712 +0,0 @@
-/*
- * Shared interrupt handling code for IPR and INTC2 types of IRQs.
- *
- * Copyright (C) 2007, 2008 Magnus Damm
- *
- * Based on intc2.c and ipr.c
- *
- * Copyright (C) 1999  Niibe Yutaka & Takeshi Yaegashi
- * Copyright (C) 2000  Kazumoto Kojima
- * Copyright (C) 2001  David J. Mckay (david.mckay@st.com)
- * Copyright (C) 2003  Takashi Kusuda <kusuda-takashi@hitachi-ul.co.jp>
- * Copyright (C) 2005, 2006  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <linux/interrupt.h>
-#include <linux/bootmem.h>
-
-#define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
-	((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
-	 ((addr_e) << 16) | ((addr_d << 24)))
-
-#define _INTC_SHIFT(h) (h & 0x1f)
-#define _INTC_WIDTH(h) ((h >> 5) & 0xf)
-#define _INTC_FN(h) ((h >> 9) & 0xf)
-#define _INTC_MODE(h) ((h >> 13) & 0x7)
-#define _INTC_ADDR_E(h) ((h >> 16) & 0xff)
-#define _INTC_ADDR_D(h) ((h >> 24) & 0xff)
-
-struct intc_handle_int {
-	unsigned int irq;
-	unsigned long handle;
-};
-
-struct intc_desc_int {
-	unsigned long *reg;
-#ifdef CONFIG_SMP
-	unsigned long *smp;
-#endif
-	unsigned int nr_reg;
-	struct intc_handle_int *prio;
-	unsigned int nr_prio;
-	struct intc_handle_int *sense;
-	unsigned int nr_sense;
-	struct irq_chip chip;
-};
-
-#ifdef CONFIG_SMP
-#define IS_SMP(x) x.smp
-#define INTC_REG(d, x, c) (d->reg[(x)] + ((d->smp[(x)] & 0xff) * c))
-#define SMP_NR(d, x) ((d->smp[(x)] >> 8) ? (d->smp[(x)] >> 8) : 1)
-#else
-#define IS_SMP(x) 0
-#define INTC_REG(d, x, c) (d->reg[(x)])
-#define SMP_NR(d, x) 1
-#endif
-
-static unsigned int intc_prio_level[NR_IRQS]; /* for now */
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-static unsigned long ack_handle[NR_IRQS];
-#endif
-
-static inline struct intc_desc_int *get_intc_desc(unsigned int irq)
-{
-	struct irq_chip *chip = get_irq_chip(irq);
-	return (void *)((char *)chip - offsetof(struct intc_desc_int, chip));
-}
-
-static inline unsigned int set_field(unsigned int value,
-				     unsigned int field_value,
-				     unsigned int handle)
-{
-	unsigned int width = _INTC_WIDTH(handle);
-	unsigned int shift = _INTC_SHIFT(handle);
-
-	value &= ~(((1 << width) - 1) << shift);
-	value |= field_value << shift;
-	return value;
-}
-
-static void write_8(unsigned long addr, unsigned long h, unsigned long data)
-{
-	__raw_writeb(set_field(0, data, h), addr);
-}
-
-static void write_16(unsigned long addr, unsigned long h, unsigned long data)
-{
-	__raw_writew(set_field(0, data, h), addr);
-}
-
-static void write_32(unsigned long addr, unsigned long h, unsigned long data)
-{
-	__raw_writel(set_field(0, data, h), addr);
-}
-
-static void modify_8(unsigned long addr, unsigned long h, unsigned long data)
-{
-	unsigned long flags;
-	local_irq_save(flags);
-	__raw_writeb(set_field(__raw_readb(addr), data, h), addr);
-	local_irq_restore(flags);
-}
-
-static void modify_16(unsigned long addr, unsigned long h, unsigned long data)
-{
-	unsigned long flags;
-	local_irq_save(flags);
-	__raw_writew(set_field(__raw_readw(addr), data, h), addr);
-	local_irq_restore(flags);
-}
-
-static void modify_32(unsigned long addr, unsigned long h, unsigned long data)
-{
-	unsigned long flags;
-	local_irq_save(flags);
-	__raw_writel(set_field(__raw_readl(addr), data, h), addr);
-	local_irq_restore(flags);
-}
-
-enum {	REG_FN_ERR = 0, REG_FN_WRITE_BASE = 1, REG_FN_MODIFY_BASE = 5 };
-
-static void (*intc_reg_fns[])(unsigned long addr,
-			      unsigned long h,
-			      unsigned long data) = {
-	[REG_FN_WRITE_BASE + 0] = write_8,
-	[REG_FN_WRITE_BASE + 1] = write_16,
-	[REG_FN_WRITE_BASE + 3] = write_32,
-	[REG_FN_MODIFY_BASE + 0] = modify_8,
-	[REG_FN_MODIFY_BASE + 1] = modify_16,
-	[REG_FN_MODIFY_BASE + 3] = modify_32,
-};
-
-enum {	MODE_ENABLE_REG = 0, /* Bit(s) set -> interrupt enabled */
-	MODE_MASK_REG,       /* Bit(s) set -> interrupt disabled */
-	MODE_DUAL_REG,       /* Two registers, set bit to enable / disable */
-	MODE_PRIO_REG,       /* Priority value written to enable interrupt */
-	MODE_PCLR_REG,       /* Above plus all bits set to disable interrupt */
-};
-
-static void intc_mode_field(unsigned long addr,
-			    unsigned long handle,
-			    void (*fn)(unsigned long,
-				       unsigned long,
-				       unsigned long),
-			    unsigned int irq)
-{
-	fn(addr, handle, ((1 << _INTC_WIDTH(handle)) - 1));
-}
-
-static void intc_mode_zero(unsigned long addr,
-			   unsigned long handle,
-			   void (*fn)(unsigned long,
-				       unsigned long,
-				       unsigned long),
-			   unsigned int irq)
-{
-	fn(addr, handle, 0);
-}
-
-static void intc_mode_prio(unsigned long addr,
-			   unsigned long handle,
-			   void (*fn)(unsigned long,
-				       unsigned long,
-				       unsigned long),
-			   unsigned int irq)
-{
-	fn(addr, handle, intc_prio_level[irq]);
-}
-
-static void (*intc_enable_fns[])(unsigned long addr,
-				 unsigned long handle,
-				 void (*fn)(unsigned long,
-					    unsigned long,
-					    unsigned long),
-				 unsigned int irq) = {
-	[MODE_ENABLE_REG] = intc_mode_field,
-	[MODE_MASK_REG] = intc_mode_zero,
-	[MODE_DUAL_REG] = intc_mode_field,
-	[MODE_PRIO_REG] = intc_mode_prio,
-	[MODE_PCLR_REG] = intc_mode_prio,
-};
-
-static void (*intc_disable_fns[])(unsigned long addr,
-				  unsigned long handle,
-				  void (*fn)(unsigned long,
-					     unsigned long,
-					     unsigned long),
-				  unsigned int irq) = {
-	[MODE_ENABLE_REG] = intc_mode_zero,
-	[MODE_MASK_REG] = intc_mode_field,
-	[MODE_DUAL_REG] = intc_mode_field,
-	[MODE_PRIO_REG] = intc_mode_zero,
-	[MODE_PCLR_REG] = intc_mode_field,
-};
-
-static inline void _intc_enable(unsigned int irq, unsigned long handle)
-{
-	struct intc_desc_int *d = get_intc_desc(irq);
-	unsigned long addr;
-	unsigned int cpu;
-
-	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_E(handle)); cpu++) {
-		addr = INTC_REG(d, _INTC_ADDR_E(handle), cpu);
-		intc_enable_fns[_INTC_MODE(handle)](addr, handle, intc_reg_fns\
-						    [_INTC_FN(handle)], irq);
-	}
-}
-
-static void intc_enable(unsigned int irq)
-{
-	_intc_enable(irq, (unsigned long)get_irq_chip_data(irq));
-}
-
-static void intc_disable(unsigned int irq)
-{
-	struct intc_desc_int *d = get_intc_desc(irq);
-	unsigned long handle = (unsigned long) get_irq_chip_data(irq);
-	unsigned long addr;
-	unsigned int cpu;
-
-	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_D(handle)); cpu++) {
-		addr = INTC_REG(d, _INTC_ADDR_D(handle), cpu);
-		intc_disable_fns[_INTC_MODE(handle)](addr, handle,intc_reg_fns\
-						     [_INTC_FN(handle)], irq);
-	}
-}
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-static void intc_mask_ack(unsigned int irq)
-{
-	struct intc_desc_int *d = get_intc_desc(irq);
-	unsigned long handle = ack_handle[irq];
-	unsigned long addr;
-
-	intc_disable(irq);
-
-	/* read register and write zero only to the assocaited bit */
-
-	if (handle) {
-		addr = INTC_REG(d, _INTC_ADDR_D(handle), 0);
-		switch (_INTC_FN(handle)) {
-		case REG_FN_MODIFY_BASE + 0:	/* 8bit */
-			__raw_readb(addr);
-			__raw_writeb(0xff ^ set_field(0, 1, handle), addr);
-			break;
-		case REG_FN_MODIFY_BASE + 1:	/* 16bit */
-			__raw_readw(addr);
-			__raw_writew(0xffff ^ set_field(0, 1, handle), addr);
-			break;
-		case REG_FN_MODIFY_BASE + 3:	/* 32bit */
-			__raw_readl(addr);
-			__raw_writel(0xffffffff ^ set_field(0, 1, handle), addr);
-			break;
-		default:
-			BUG();
-			break;
-		}
-	}
-}
-#endif
-
-static struct intc_handle_int *intc_find_irq(struct intc_handle_int *hp,
-					     unsigned int nr_hp,
-					     unsigned int irq)
-{
-	int i;
-
-	/* this doesn't scale well, but...
-	 *
-	 * this function should only be used for cerain uncommon
-	 * operations such as intc_set_priority() and intc_set_sense()
-	 * and in those rare cases performance doesn't matter that much.
-	 * keeping the memory footprint low is more important.
-	 *
-	 * one rather simple way to speed this up and still keep the
-	 * memory footprint down is to make sure the array is sorted
-	 * and then perform a bisect to lookup the irq.
-	 */
-
-	for (i = 0; i < nr_hp; i++) {
-		if ((hp + i)->irq != irq)
-			continue;
-
-		return hp + i;
-	}
-
-	return NULL;
-}
-
-int intc_set_priority(unsigned int irq, unsigned int prio)
-{
-	struct intc_desc_int *d = get_intc_desc(irq);
-	struct intc_handle_int *ihp;
-
-	if (!intc_prio_level[irq] || prio <= 1)
-		return -EINVAL;
-
-	ihp = intc_find_irq(d->prio, d->nr_prio, irq);
-	if (ihp) {
-		if (prio >= (1 << _INTC_WIDTH(ihp->handle)))
-			return -EINVAL;
-
-		intc_prio_level[irq] = prio;
-
-		/*
-		 * only set secondary masking method directly
-		 * primary masking method is using intc_prio_level[irq]
-		 * priority level will be set during next enable()
-		 */
-
-		if (_INTC_FN(ihp->handle) != REG_FN_ERR)
-			_intc_enable(irq, ihp->handle);
-	}
-	return 0;
-}
-
-#define VALID(x) (x | 0x80)
-
-static unsigned char intc_irq_sense_table[IRQ_TYPE_SENSE_MASK + 1] = {
-	[IRQ_TYPE_EDGE_FALLING] = VALID(0),
-	[IRQ_TYPE_EDGE_RISING] = VALID(1),
-	[IRQ_TYPE_LEVEL_LOW] = VALID(2),
-	/* SH7706, SH7707 and SH7709 do not support high level triggered */
-#if !defined(CONFIG_CPU_SUBTYPE_SH7706) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7707) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7709)
-	[IRQ_TYPE_LEVEL_HIGH] = VALID(3),
-#endif
-};
-
-static int intc_set_sense(unsigned int irq, unsigned int type)
-{
-	struct intc_desc_int *d = get_intc_desc(irq);
-	unsigned char value = intc_irq_sense_table[type & IRQ_TYPE_SENSE_MASK];
-	struct intc_handle_int *ihp;
-	unsigned long addr;
-
-	if (!value)
-		return -EINVAL;
-
-	ihp = intc_find_irq(d->sense, d->nr_sense, irq);
-	if (ihp) {
-		addr = INTC_REG(d, _INTC_ADDR_E(ihp->handle), 0);
-		intc_reg_fns[_INTC_FN(ihp->handle)](addr, ihp->handle, value);
-	}
-	return 0;
-}
-
-static unsigned int __init intc_get_reg(struct intc_desc_int *d,
-				 unsigned long address)
-{
-	unsigned int k;
-
-	for (k = 0; k < d->nr_reg; k++) {
-		if (d->reg[k] == address)
-			return k;
-	}
-
-	BUG();
-	return 0;
-}
-
-static intc_enum __init intc_grp_id(struct intc_desc *desc,
-				    intc_enum enum_id)
-{
-	struct intc_group *g = desc->groups;
-	unsigned int i, j;
-
-	for (i = 0; g && enum_id && i < desc->nr_groups; i++) {
-		g = desc->groups + i;
-
-		for (j = 0; g->enum_ids[j]; j++) {
-			if (g->enum_ids[j] != enum_id)
-				continue;
-
-			return g->enum_id;
-		}
-	}
-
-	return 0;
-}
-
-static unsigned int __init intc_mask_data(struct intc_desc *desc,
-					  struct intc_desc_int *d,
-					  intc_enum enum_id, int do_grps)
-{
-	struct intc_mask_reg *mr = desc->mask_regs;
-	unsigned int i, j, fn, mode;
-	unsigned long reg_e, reg_d;
-
-	for (i = 0; mr && enum_id && i < desc->nr_mask_regs; i++) {
-		mr = desc->mask_regs + i;
-
-		for (j = 0; j < ARRAY_SIZE(mr->enum_ids); j++) {
-			if (mr->enum_ids[j] != enum_id)
-				continue;
-
-			if (mr->set_reg && mr->clr_reg) {
-				fn = REG_FN_WRITE_BASE;
-				mode = MODE_DUAL_REG;
-				reg_e = mr->clr_reg;
-				reg_d = mr->set_reg;
-			} else {
-				fn = REG_FN_MODIFY_BASE;
-				if (mr->set_reg) {
-					mode = MODE_ENABLE_REG;
-					reg_e = mr->set_reg;
-					reg_d = mr->set_reg;
-				} else {
-					mode = MODE_MASK_REG;
-					reg_e = mr->clr_reg;
-					reg_d = mr->clr_reg;
-				}
-			}
-
-			fn += (mr->reg_width >> 3) - 1;
-			return _INTC_MK(fn, mode,
-					intc_get_reg(d, reg_e),
-					intc_get_reg(d, reg_d),
-					1,
-					(mr->reg_width - 1) - j);
-		}
-	}
-
-	if (do_grps)
-		return intc_mask_data(desc, d, intc_grp_id(desc, enum_id), 0);
-
-	return 0;
-}
-
-static unsigned int __init intc_prio_data(struct intc_desc *desc,
-					  struct intc_desc_int *d,
-					  intc_enum enum_id, int do_grps)
-{
-	struct intc_prio_reg *pr = desc->prio_regs;
-	unsigned int i, j, fn, mode, bit;
-	unsigned long reg_e, reg_d;
-
-	for (i = 0; pr && enum_id && i < desc->nr_prio_regs; i++) {
-		pr = desc->prio_regs + i;
-
-		for (j = 0; j < ARRAY_SIZE(pr->enum_ids); j++) {
-			if (pr->enum_ids[j] != enum_id)
-				continue;
-
-			if (pr->set_reg && pr->clr_reg) {
-				fn = REG_FN_WRITE_BASE;
-				mode = MODE_PCLR_REG;
-				reg_e = pr->set_reg;
-				reg_d = pr->clr_reg;
-			} else {
-				fn = REG_FN_MODIFY_BASE;
-				mode = MODE_PRIO_REG;
-				if (!pr->set_reg)
-					BUG();
-				reg_e = pr->set_reg;
-				reg_d = pr->set_reg;
-			}
-
-			fn += (pr->reg_width >> 3) - 1;
-
-			BUG_ON((j + 1) * pr->field_width > pr->reg_width);
-
-			bit = pr->reg_width - ((j + 1) * pr->field_width);
-
-			return _INTC_MK(fn, mode,
-					intc_get_reg(d, reg_e),
-					intc_get_reg(d, reg_d),
-					pr->field_width, bit);
-		}
-	}
-
-	if (do_grps)
-		return intc_prio_data(desc, d, intc_grp_id(desc, enum_id), 0);
-
-	return 0;
-}
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-static unsigned int __init intc_ack_data(struct intc_desc *desc,
-					  struct intc_desc_int *d,
-					  intc_enum enum_id)
-{
-	struct intc_mask_reg *mr = desc->ack_regs;
-	unsigned int i, j, fn, mode;
-	unsigned long reg_e, reg_d;
-
-	for (i = 0; mr && enum_id && i < desc->nr_ack_regs; i++) {
-		mr = desc->ack_regs + i;
-
-		for (j = 0; j < ARRAY_SIZE(mr->enum_ids); j++) {
-			if (mr->enum_ids[j] != enum_id)
-				continue;
-
-			fn = REG_FN_MODIFY_BASE;
-			mode = MODE_ENABLE_REG;
-			reg_e = mr->set_reg;
-			reg_d = mr->set_reg;
-
-			fn += (mr->reg_width >> 3) - 1;
-			return _INTC_MK(fn, mode,
-					intc_get_reg(d, reg_e),
-					intc_get_reg(d, reg_d),
-					1,
-					(mr->reg_width - 1) - j);
-		}
-	}
-
-	return 0;
-}
-#endif
-
-static unsigned int __init intc_sense_data(struct intc_desc *desc,
-					   struct intc_desc_int *d,
-					   intc_enum enum_id)
-{
-	struct intc_sense_reg *sr = desc->sense_regs;
-	unsigned int i, j, fn, bit;
-
-	for (i = 0; sr && enum_id && i < desc->nr_sense_regs; i++) {
-		sr = desc->sense_regs + i;
-
-		for (j = 0; j < ARRAY_SIZE(sr->enum_ids); j++) {
-			if (sr->enum_ids[j] != enum_id)
-				continue;
-
-			fn = REG_FN_MODIFY_BASE;
-			fn += (sr->reg_width >> 3) - 1;
-
-			BUG_ON((j + 1) * sr->field_width > sr->reg_width);
-
-			bit = sr->reg_width - ((j + 1) * sr->field_width);
-
-			return _INTC_MK(fn, 0, intc_get_reg(d, sr->reg),
-					0, sr->field_width, bit);
-		}
-	}
-
-	return 0;
-}
-
-static void __init intc_register_irq(struct intc_desc *desc,
-				     struct intc_desc_int *d,
-				     intc_enum enum_id,
-				     unsigned int irq)
-{
-	struct intc_handle_int *hp;
-	unsigned int data[2], primary;
-
-	/* Prefer single interrupt source bitmap over other combinations:
-	 * 1. bitmap, single interrupt source
-	 * 2. priority, single interrupt source
-	 * 3. bitmap, multiple interrupt sources (groups)
-	 * 4. priority, multiple interrupt sources (groups)
-	 */
-
-	data[0] = intc_mask_data(desc, d, enum_id, 0);
-	data[1] = intc_prio_data(desc, d, enum_id, 0);
-
-	primary = 0;
-	if (!data[0] && data[1])
-		primary = 1;
-
-	data[0] = data[0] ? data[0] : intc_mask_data(desc, d, enum_id, 1);
-	data[1] = data[1] ? data[1] : intc_prio_data(desc, d, enum_id, 1);
-
-	if (!data[primary])
-		primary ^= 1;
-
-	BUG_ON(!data[primary]); /* must have primary masking method */
-
-	disable_irq_nosync(irq);
-	set_irq_chip_and_handler_name(irq, &d->chip,
-				      handle_level_irq, "level");
-	set_irq_chip_data(irq, (void *)data[primary]);
-
-	/* set priority level
-	 * - this needs to be at least 2 for 5-bit priorities on 7780
-	 */
-	intc_prio_level[irq] = 2;
-
-	/* enable secondary masking method if present */
-	if (data[!primary])
-		_intc_enable(irq, data[!primary]);
-
-	/* add irq to d->prio list if priority is available */
-	if (data[1]) {
-		hp = d->prio + d->nr_prio;
-		hp->irq = irq;
-		hp->handle = data[1];
-
-		if (primary) {
-			/*
-			 * only secondary priority should access registers, so
-			 * set _INTC_FN(h) = REG_FN_ERR for intc_set_priority()
-			 */
-
-			hp->handle &= ~_INTC_MK(0x0f, 0, 0, 0, 0, 0);
-			hp->handle |= _INTC_MK(REG_FN_ERR, 0, 0, 0, 0, 0);
-		}
-		d->nr_prio++;
-	}
-
-	/* add irq to d->sense list if sense is available */
-	data[0] = intc_sense_data(desc, d, enum_id);
-	if (data[0]) {
-		(d->sense + d->nr_sense)->irq = irq;
-		(d->sense + d->nr_sense)->handle = data[0];
-		d->nr_sense++;
-	}
-
-	/* irq should be disabled by default */
-	d->chip.mask(irq);
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-	if (desc->ack_regs)
-		ack_handle[irq] = intc_ack_data(desc, d, enum_id);
-#endif
-}
-
-static unsigned int __init save_reg(struct intc_desc_int *d,
-				    unsigned int cnt,
-				    unsigned long value,
-				    unsigned int smp)
-{
-	if (value) {
-		d->reg[cnt] = value;
-#ifdef CONFIG_SMP
-		d->smp[cnt] = smp;
-#endif
-		return 1;
-	}
-
-	return 0;
-}
-
-
-void __init register_intc_controller(struct intc_desc *desc)
-{
-	unsigned int i, k, smp;
-	struct intc_desc_int *d;
-
-	d = alloc_bootmem(sizeof(*d));
-
-	d->nr_reg = desc->mask_regs ? desc->nr_mask_regs * 2 : 0;
-	d->nr_reg += desc->prio_regs ? desc->nr_prio_regs * 2 : 0;
-	d->nr_reg += desc->sense_regs ? desc->nr_sense_regs : 0;
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-	d->nr_reg += desc->ack_regs ? desc->nr_ack_regs : 0;
-#endif
-	d->reg = alloc_bootmem(d->nr_reg * sizeof(*d->reg));
-#ifdef CONFIG_SMP
-	d->smp = alloc_bootmem(d->nr_reg * sizeof(*d->smp));
-#endif
-	k = 0;
-
-	if (desc->mask_regs) {
-		for (i = 0; i < desc->nr_mask_regs; i++) {
-			smp = IS_SMP(desc->mask_regs[i]);
-			k += save_reg(d, k, desc->mask_regs[i].set_reg, smp);
-			k += save_reg(d, k, desc->mask_regs[i].clr_reg, smp);
-		}
-	}
-
-	if (desc->prio_regs) {
-		d->prio = alloc_bootmem(desc->nr_vectors * sizeof(*d->prio));
-
-		for (i = 0; i < desc->nr_prio_regs; i++) {
-			smp = IS_SMP(desc->prio_regs[i]);
-			k += save_reg(d, k, desc->prio_regs[i].set_reg, smp);
-			k += save_reg(d, k, desc->prio_regs[i].clr_reg, smp);
-		}
-	}
-
-	if (desc->sense_regs) {
-		d->sense = alloc_bootmem(desc->nr_vectors * sizeof(*d->sense));
-
-		for (i = 0; i < desc->nr_sense_regs; i++) {
-			k += save_reg(d, k, desc->sense_regs[i].reg, 0);
-		}
-	}
-
-	d->chip.name = desc->name;
-	d->chip.mask = intc_disable;
-	d->chip.unmask = intc_enable;
-	d->chip.mask_ack = intc_disable;
-	d->chip.set_type = intc_set_sense;
-
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
-	if (desc->ack_regs) {
-		for (i = 0; i < desc->nr_ack_regs; i++)
-			k += save_reg(d, k, desc->ack_regs[i].set_reg, 0);
-
-		d->chip.mask_ack = intc_mask_ack;
-	}
-#endif
-
-	BUG_ON(k > 256); /* _INTC_ADDR_E() and _INTC_ADDR_D() are 8 bits */
-
-	for (i = 0; i < desc->nr_vectors; i++) {
-		struct intc_vect *vect = desc->vectors + i;
-
-		intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect));
-	}
-}
diff --git a/drivers/sh/Makefile b/drivers/sh/Makefile
index a96f4a8cfeb8..6a025cefe6dc 100644
--- a/drivers/sh/Makefile
+++ b/drivers/sh/Makefile
@@ -1,6 +1,6 @@
 #
 # Makefile for the SuperH specific drivers.
 #
-
 obj-$(CONFIG_SUPERHYWAY)	+= superhyway/
 obj-$(CONFIG_MAPLE)		+= maple/
+obj-y				+= intc.o
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
new file mode 100644
index 000000000000..58d24c5a76ce
--- /dev/null
+++ b/drivers/sh/intc.c
@@ -0,0 +1,713 @@
+/*
+ * Shared interrupt handling code for IPR and INTC2 types of IRQs.
+ *
+ * Copyright (C) 2007, 2008 Magnus Damm
+ *
+ * Based on intc2.c and ipr.c
+ *
+ * Copyright (C) 1999  Niibe Yutaka & Takeshi Yaegashi
+ * Copyright (C) 2000  Kazumoto Kojima
+ * Copyright (C) 2001  David J. Mckay (david.mckay@st.com)
+ * Copyright (C) 2003  Takashi Kusuda <kusuda-takashi@hitachi-ul.co.jp>
+ * Copyright (C) 2005, 2006  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/sh_intc.h>
+
+#define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
+	((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
+	 ((addr_e) << 16) | ((addr_d << 24)))
+
+#define _INTC_SHIFT(h) (h & 0x1f)
+#define _INTC_WIDTH(h) ((h >> 5) & 0xf)
+#define _INTC_FN(h) ((h >> 9) & 0xf)
+#define _INTC_MODE(h) ((h >> 13) & 0x7)
+#define _INTC_ADDR_E(h) ((h >> 16) & 0xff)
+#define _INTC_ADDR_D(h) ((h >> 24) & 0xff)
+
+struct intc_handle_int {
+	unsigned int irq;
+	unsigned long handle;
+};
+
+struct intc_desc_int {
+	unsigned long *reg;
+#ifdef CONFIG_SMP
+	unsigned long *smp;
+#endif
+	unsigned int nr_reg;
+	struct intc_handle_int *prio;
+	unsigned int nr_prio;
+	struct intc_handle_int *sense;
+	unsigned int nr_sense;
+	struct irq_chip chip;
+};
+
+#ifdef CONFIG_SMP
+#define IS_SMP(x) x.smp
+#define INTC_REG(d, x, c) (d->reg[(x)] + ((d->smp[(x)] & 0xff) * c))
+#define SMP_NR(d, x) ((d->smp[(x)] >> 8) ? (d->smp[(x)] >> 8) : 1)
+#else
+#define IS_SMP(x) 0
+#define INTC_REG(d, x, c) (d->reg[(x)])
+#define SMP_NR(d, x) 1
+#endif
+
+static unsigned int intc_prio_level[NR_IRQS]; /* for now */
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+static unsigned long ack_handle[NR_IRQS];
+#endif
+
+static inline struct intc_desc_int *get_intc_desc(unsigned int irq)
+{
+	struct irq_chip *chip = get_irq_chip(irq);
+	return (void *)((char *)chip - offsetof(struct intc_desc_int, chip));
+}
+
+static inline unsigned int set_field(unsigned int value,
+				     unsigned int field_value,
+				     unsigned int handle)
+{
+	unsigned int width = _INTC_WIDTH(handle);
+	unsigned int shift = _INTC_SHIFT(handle);
+
+	value &= ~(((1 << width) - 1) << shift);
+	value |= field_value << shift;
+	return value;
+}
+
+static void write_8(unsigned long addr, unsigned long h, unsigned long data)
+{
+	__raw_writeb(set_field(0, data, h), addr);
+}
+
+static void write_16(unsigned long addr, unsigned long h, unsigned long data)
+{
+	__raw_writew(set_field(0, data, h), addr);
+}
+
+static void write_32(unsigned long addr, unsigned long h, unsigned long data)
+{
+	__raw_writel(set_field(0, data, h), addr);
+}
+
+static void modify_8(unsigned long addr, unsigned long h, unsigned long data)
+{
+	unsigned long flags;
+	local_irq_save(flags);
+	__raw_writeb(set_field(__raw_readb(addr), data, h), addr);
+	local_irq_restore(flags);
+}
+
+static void modify_16(unsigned long addr, unsigned long h, unsigned long data)
+{
+	unsigned long flags;
+	local_irq_save(flags);
+	__raw_writew(set_field(__raw_readw(addr), data, h), addr);
+	local_irq_restore(flags);
+}
+
+static void modify_32(unsigned long addr, unsigned long h, unsigned long data)
+{
+	unsigned long flags;
+	local_irq_save(flags);
+	__raw_writel(set_field(__raw_readl(addr), data, h), addr);
+	local_irq_restore(flags);
+}
+
+enum {	REG_FN_ERR = 0, REG_FN_WRITE_BASE = 1, REG_FN_MODIFY_BASE = 5 };
+
+static void (*intc_reg_fns[])(unsigned long addr,
+			      unsigned long h,
+			      unsigned long data) = {
+	[REG_FN_WRITE_BASE + 0] = write_8,
+	[REG_FN_WRITE_BASE + 1] = write_16,
+	[REG_FN_WRITE_BASE + 3] = write_32,
+	[REG_FN_MODIFY_BASE + 0] = modify_8,
+	[REG_FN_MODIFY_BASE + 1] = modify_16,
+	[REG_FN_MODIFY_BASE + 3] = modify_32,
+};
+
+enum {	MODE_ENABLE_REG = 0, /* Bit(s) set -> interrupt enabled */
+	MODE_MASK_REG,       /* Bit(s) set -> interrupt disabled */
+	MODE_DUAL_REG,       /* Two registers, set bit to enable / disable */
+	MODE_PRIO_REG,       /* Priority value written to enable interrupt */
+	MODE_PCLR_REG,       /* Above plus all bits set to disable interrupt */
+};
+
+static void intc_mode_field(unsigned long addr,
+			    unsigned long handle,
+			    void (*fn)(unsigned long,
+				       unsigned long,
+				       unsigned long),
+			    unsigned int irq)
+{
+	fn(addr, handle, ((1 << _INTC_WIDTH(handle)) - 1));
+}
+
+static void intc_mode_zero(unsigned long addr,
+			   unsigned long handle,
+			   void (*fn)(unsigned long,
+				       unsigned long,
+				       unsigned long),
+			   unsigned int irq)
+{
+	fn(addr, handle, 0);
+}
+
+static void intc_mode_prio(unsigned long addr,
+			   unsigned long handle,
+			   void (*fn)(unsigned long,
+				       unsigned long,
+				       unsigned long),
+			   unsigned int irq)
+{
+	fn(addr, handle, intc_prio_level[irq]);
+}
+
+static void (*intc_enable_fns[])(unsigned long addr,
+				 unsigned long handle,
+				 void (*fn)(unsigned long,
+					    unsigned long,
+					    unsigned long),
+				 unsigned int irq) = {
+	[MODE_ENABLE_REG] = intc_mode_field,
+	[MODE_MASK_REG] = intc_mode_zero,
+	[MODE_DUAL_REG] = intc_mode_field,
+	[MODE_PRIO_REG] = intc_mode_prio,
+	[MODE_PCLR_REG] = intc_mode_prio,
+};
+
+static void (*intc_disable_fns[])(unsigned long addr,
+				  unsigned long handle,
+				  void (*fn)(unsigned long,
+					     unsigned long,
+					     unsigned long),
+				  unsigned int irq) = {
+	[MODE_ENABLE_REG] = intc_mode_zero,
+	[MODE_MASK_REG] = intc_mode_field,
+	[MODE_DUAL_REG] = intc_mode_field,
+	[MODE_PRIO_REG] = intc_mode_zero,
+	[MODE_PCLR_REG] = intc_mode_field,
+};
+
+static inline void _intc_enable(unsigned int irq, unsigned long handle)
+{
+	struct intc_desc_int *d = get_intc_desc(irq);
+	unsigned long addr;
+	unsigned int cpu;
+
+	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_E(handle)); cpu++) {
+		addr = INTC_REG(d, _INTC_ADDR_E(handle), cpu);
+		intc_enable_fns[_INTC_MODE(handle)](addr, handle, intc_reg_fns\
+						    [_INTC_FN(handle)], irq);
+	}
+}
+
+static void intc_enable(unsigned int irq)
+{
+	_intc_enable(irq, (unsigned long)get_irq_chip_data(irq));
+}
+
+static void intc_disable(unsigned int irq)
+{
+	struct intc_desc_int *d = get_intc_desc(irq);
+	unsigned long handle = (unsigned long) get_irq_chip_data(irq);
+	unsigned long addr;
+	unsigned int cpu;
+
+	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_D(handle)); cpu++) {
+		addr = INTC_REG(d, _INTC_ADDR_D(handle), cpu);
+		intc_disable_fns[_INTC_MODE(handle)](addr, handle,intc_reg_fns\
+						     [_INTC_FN(handle)], irq);
+	}
+}
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+static void intc_mask_ack(unsigned int irq)
+{
+	struct intc_desc_int *d = get_intc_desc(irq);
+	unsigned long handle = ack_handle[irq];
+	unsigned long addr;
+
+	intc_disable(irq);
+
+	/* read register and write zero only to the associated bit */
+
+	if (handle) {
+		addr = INTC_REG(d, _INTC_ADDR_D(handle), 0);
+		switch (_INTC_FN(handle)) {
+		case REG_FN_MODIFY_BASE + 0:	/* 8bit */
+			__raw_readb(addr);
+			__raw_writeb(0xff ^ set_field(0, 1, handle), addr);
+			break;
+		case REG_FN_MODIFY_BASE + 1:	/* 16bit */
+			__raw_readw(addr);
+			__raw_writew(0xffff ^ set_field(0, 1, handle), addr);
+			break;
+		case REG_FN_MODIFY_BASE + 3:	/* 32bit */
+			__raw_readl(addr);
+			__raw_writel(0xffffffff ^ set_field(0, 1, handle), addr);
+			break;
+		default:
+			BUG();
+			break;
+		}
+	}
+}
+#endif
+
+static struct intc_handle_int *intc_find_irq(struct intc_handle_int *hp,
+					     unsigned int nr_hp,
+					     unsigned int irq)
+{
+	int i;
+
+	/* this doesn't scale well, but...
+	 *
+	 * this function should only be used for certain uncommon
+	 * operations such as intc_set_priority() and intc_set_sense()
+	 * and in those rare cases performance doesn't matter that much.
+	 * keeping the memory footprint low is more important.
+	 *
+	 * one rather simple way to speed this up and still keep the
+	 * memory footprint down is to make sure the array is sorted
+	 * and then perform a bisect to lookup the irq.
+	 */
+
+	for (i = 0; i < nr_hp; i++) {
+		if ((hp + i)->irq != irq)
+			continue;
+
+		return hp + i;
+	}
+
+	return NULL;
+}
+
+int intc_set_priority(unsigned int irq, unsigned int prio)
+{
+	struct intc_desc_int *d = get_intc_desc(irq);
+	struct intc_handle_int *ihp;
+
+	if (!intc_prio_level[irq] || prio <= 1)
+		return -EINVAL;
+
+	ihp = intc_find_irq(d->prio, d->nr_prio, irq);
+	if (ihp) {
+		if (prio >= (1 << _INTC_WIDTH(ihp->handle)))
+			return -EINVAL;
+
+		intc_prio_level[irq] = prio;
+
+		/*
+		 * only set secondary masking method directly
+		 * primary masking method is using intc_prio_level[irq]
+		 * priority level will be set during next enable()
+		 */
+
+		if (_INTC_FN(ihp->handle) != REG_FN_ERR)
+			_intc_enable(irq, ihp->handle);
+	}
+	return 0;
+}
+
+#define VALID(x) (x | 0x80)
+
+static unsigned char intc_irq_sense_table[IRQ_TYPE_SENSE_MASK + 1] = {
+	[IRQ_TYPE_EDGE_FALLING] = VALID(0),
+	[IRQ_TYPE_EDGE_RISING] = VALID(1),
+	[IRQ_TYPE_LEVEL_LOW] = VALID(2),
+	/* SH7706, SH7707 and SH7709 do not support high level triggered */
+#if !defined(CONFIG_CPU_SUBTYPE_SH7706) && \
+    !defined(CONFIG_CPU_SUBTYPE_SH7707) && \
+    !defined(CONFIG_CPU_SUBTYPE_SH7709)
+	[IRQ_TYPE_LEVEL_HIGH] = VALID(3),
+#endif
+};
+
+static int intc_set_sense(unsigned int irq, unsigned int type)
+{
+	struct intc_desc_int *d = get_intc_desc(irq);
+	unsigned char value = intc_irq_sense_table[type & IRQ_TYPE_SENSE_MASK];
+	struct intc_handle_int *ihp;
+	unsigned long addr;
+
+	if (!value)
+		return -EINVAL;
+
+	ihp = intc_find_irq(d->sense, d->nr_sense, irq);
+	if (ihp) {
+		addr = INTC_REG(d, _INTC_ADDR_E(ihp->handle), 0);
+		intc_reg_fns[_INTC_FN(ihp->handle)](addr, ihp->handle, value);
+	}
+	return 0;
+}
+
+static unsigned int __init intc_get_reg(struct intc_desc_int *d,
+				 unsigned long address)
+{
+	unsigned int k;
+
+	for (k = 0; k < d->nr_reg; k++) {
+		if (d->reg[k] == address)
+			return k;
+	}
+
+	BUG();
+	return 0;
+}
+
+static intc_enum __init intc_grp_id(struct intc_desc *desc,
+				    intc_enum enum_id)
+{
+	struct intc_group *g = desc->groups;
+	unsigned int i, j;
+
+	for (i = 0; g && enum_id && i < desc->nr_groups; i++) {
+		g = desc->groups + i;
+
+		for (j = 0; g->enum_ids[j]; j++) {
+			if (g->enum_ids[j] != enum_id)
+				continue;
+
+			return g->enum_id;
+		}
+	}
+
+	return 0;
+}
+
+static unsigned int __init intc_mask_data(struct intc_desc *desc,
+					  struct intc_desc_int *d,
+					  intc_enum enum_id, int do_grps)
+{
+	struct intc_mask_reg *mr = desc->mask_regs;
+	unsigned int i, j, fn, mode;
+	unsigned long reg_e, reg_d;
+
+	for (i = 0; mr && enum_id && i < desc->nr_mask_regs; i++) {
+		mr = desc->mask_regs + i;
+
+		for (j = 0; j < ARRAY_SIZE(mr->enum_ids); j++) {
+			if (mr->enum_ids[j] != enum_id)
+				continue;
+
+			if (mr->set_reg && mr->clr_reg) {
+				fn = REG_FN_WRITE_BASE;
+				mode = MODE_DUAL_REG;
+				reg_e = mr->clr_reg;
+				reg_d = mr->set_reg;
+			} else {
+				fn = REG_FN_MODIFY_BASE;
+				if (mr->set_reg) {
+					mode = MODE_ENABLE_REG;
+					reg_e = mr->set_reg;
+					reg_d = mr->set_reg;
+				} else {
+					mode = MODE_MASK_REG;
+					reg_e = mr->clr_reg;
+					reg_d = mr->clr_reg;
+				}
+			}
+
+			fn += (mr->reg_width >> 3) - 1;
+			return _INTC_MK(fn, mode,
+					intc_get_reg(d, reg_e),
+					intc_get_reg(d, reg_d),
+					1,
+					(mr->reg_width - 1) - j);
+		}
+	}
+
+	if (do_grps)
+		return intc_mask_data(desc, d, intc_grp_id(desc, enum_id), 0);
+
+	return 0;
+}
+
+static unsigned int __init intc_prio_data(struct intc_desc *desc,
+					  struct intc_desc_int *d,
+					  intc_enum enum_id, int do_grps)
+{
+	struct intc_prio_reg *pr = desc->prio_regs;
+	unsigned int i, j, fn, mode, bit;
+	unsigned long reg_e, reg_d;
+
+	for (i = 0; pr && enum_id && i < desc->nr_prio_regs; i++) {
+		pr = desc->prio_regs + i;
+
+		for (j = 0; j < ARRAY_SIZE(pr->enum_ids); j++) {
+			if (pr->enum_ids[j] != enum_id)
+				continue;
+
+			if (pr->set_reg && pr->clr_reg) {
+				fn = REG_FN_WRITE_BASE;
+				mode = MODE_PCLR_REG;
+				reg_e = pr->set_reg;
+				reg_d = pr->clr_reg;
+			} else {
+				fn = REG_FN_MODIFY_BASE;
+				mode = MODE_PRIO_REG;
+				if (!pr->set_reg)
+					BUG();
+				reg_e = pr->set_reg;
+				reg_d = pr->set_reg;
+			}
+
+			fn += (pr->reg_width >> 3) - 1;
+
+			BUG_ON((j + 1) * pr->field_width > pr->reg_width);
+
+			bit = pr->reg_width - ((j + 1) * pr->field_width);
+
+			return _INTC_MK(fn, mode,
+					intc_get_reg(d, reg_e),
+					intc_get_reg(d, reg_d),
+					pr->field_width, bit);
+		}
+	}
+
+	if (do_grps)
+		return intc_prio_data(desc, d, intc_grp_id(desc, enum_id), 0);
+
+	return 0;
+}
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+static unsigned int __init intc_ack_data(struct intc_desc *desc,
+					  struct intc_desc_int *d,
+					  intc_enum enum_id)
+{
+	struct intc_mask_reg *mr = desc->ack_regs;
+	unsigned int i, j, fn, mode;
+	unsigned long reg_e, reg_d;
+
+	for (i = 0; mr && enum_id && i < desc->nr_ack_regs; i++) {
+		mr = desc->ack_regs + i;
+
+		for (j = 0; j < ARRAY_SIZE(mr->enum_ids); j++) {
+			if (mr->enum_ids[j] != enum_id)
+				continue;
+
+			fn = REG_FN_MODIFY_BASE;
+			mode = MODE_ENABLE_REG;
+			reg_e = mr->set_reg;
+			reg_d = mr->set_reg;
+
+			fn += (mr->reg_width >> 3) - 1;
+			return _INTC_MK(fn, mode,
+					intc_get_reg(d, reg_e),
+					intc_get_reg(d, reg_d),
+					1,
+					(mr->reg_width - 1) - j);
+		}
+	}
+
+	return 0;
+}
+#endif
+
+static unsigned int __init intc_sense_data(struct intc_desc *desc,
+					   struct intc_desc_int *d,
+					   intc_enum enum_id)
+{
+	struct intc_sense_reg *sr = desc->sense_regs;
+	unsigned int i, j, fn, bit;
+
+	for (i = 0; sr && enum_id && i < desc->nr_sense_regs; i++) {
+		sr = desc->sense_regs + i;
+
+		for (j = 0; j < ARRAY_SIZE(sr->enum_ids); j++) {
+			if (sr->enum_ids[j] != enum_id)
+				continue;
+
+			fn = REG_FN_MODIFY_BASE;
+			fn += (sr->reg_width >> 3) - 1;
+
+			BUG_ON((j + 1) * sr->field_width > sr->reg_width);
+
+			bit = sr->reg_width - ((j + 1) * sr->field_width);
+
+			return _INTC_MK(fn, 0, intc_get_reg(d, sr->reg),
+					0, sr->field_width, bit);
+		}
+	}
+
+	return 0;
+}
+
+static void __init intc_register_irq(struct intc_desc *desc,
+				     struct intc_desc_int *d,
+				     intc_enum enum_id,
+				     unsigned int irq)
+{
+	struct intc_handle_int *hp;
+	unsigned int data[2], primary;
+
+	/* Prefer single interrupt source bitmap over other combinations:
+	 * 1. bitmap, single interrupt source
+	 * 2. priority, single interrupt source
+	 * 3. bitmap, multiple interrupt sources (groups)
+	 * 4. priority, multiple interrupt sources (groups)
+	 */
+
+	data[0] = intc_mask_data(desc, d, enum_id, 0);
+	data[1] = intc_prio_data(desc, d, enum_id, 0);
+
+	primary = 0;
+	if (!data[0] && data[1])
+		primary = 1;
+
+	data[0] = data[0] ? data[0] : intc_mask_data(desc, d, enum_id, 1);
+	data[1] = data[1] ? data[1] : intc_prio_data(desc, d, enum_id, 1);
+
+	if (!data[primary])
+		primary ^= 1;
+
+	BUG_ON(!data[primary]); /* must have primary masking method */
+
+	disable_irq_nosync(irq);
+	set_irq_chip_and_handler_name(irq, &d->chip,
+				      handle_level_irq, "level");
+	set_irq_chip_data(irq, (void *)data[primary]);
+
+	/* set priority level
+	 * - this needs to be at least 2 for 5-bit priorities on 7780
+	 */
+	intc_prio_level[irq] = 2;
+
+	/* enable secondary masking method if present */
+	if (data[!primary])
+		_intc_enable(irq, data[!primary]);
+
+	/* add irq to d->prio list if priority is available */
+	if (data[1]) {
+		hp = d->prio + d->nr_prio;
+		hp->irq = irq;
+		hp->handle = data[1];
+
+		if (primary) {
+			/*
+			 * only secondary priority should access registers, so
+			 * set _INTC_FN(h) = REG_FN_ERR for intc_set_priority()
+			 */
+
+			hp->handle &= ~_INTC_MK(0x0f, 0, 0, 0, 0, 0);
+			hp->handle |= _INTC_MK(REG_FN_ERR, 0, 0, 0, 0, 0);
+		}
+		d->nr_prio++;
+	}
+
+	/* add irq to d->sense list if sense is available */
+	data[0] = intc_sense_data(desc, d, enum_id);
+	if (data[0]) {
+		(d->sense + d->nr_sense)->irq = irq;
+		(d->sense + d->nr_sense)->handle = data[0];
+		d->nr_sense++;
+	}
+
+	/* irq should be disabled by default */
+	d->chip.mask(irq);
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+	if (desc->ack_regs)
+		ack_handle[irq] = intc_ack_data(desc, d, enum_id);
+#endif
+}
+
+static unsigned int __init save_reg(struct intc_desc_int *d,
+				    unsigned int cnt,
+				    unsigned long value,
+				    unsigned int smp)
+{
+	if (value) {
+		d->reg[cnt] = value;
+#ifdef CONFIG_SMP
+		d->smp[cnt] = smp;
+#endif
+		return 1;
+	}
+
+	return 0;
+}
+
+
+void __init register_intc_controller(struct intc_desc *desc)
+{
+	unsigned int i, k, smp;
+	struct intc_desc_int *d;
+
+	d = alloc_bootmem(sizeof(*d));
+
+	d->nr_reg = desc->mask_regs ? desc->nr_mask_regs * 2 : 0;
+	d->nr_reg += desc->prio_regs ? desc->nr_prio_regs * 2 : 0;
+	d->nr_reg += desc->sense_regs ? desc->nr_sense_regs : 0;
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+	d->nr_reg += desc->ack_regs ? desc->nr_ack_regs : 0;
+#endif
+	d->reg = alloc_bootmem(d->nr_reg * sizeof(*d->reg));
+#ifdef CONFIG_SMP
+	d->smp = alloc_bootmem(d->nr_reg * sizeof(*d->smp));
+#endif
+	k = 0;
+
+	if (desc->mask_regs) {
+		for (i = 0; i < desc->nr_mask_regs; i++) {
+			smp = IS_SMP(desc->mask_regs[i]);
+			k += save_reg(d, k, desc->mask_regs[i].set_reg, smp);
+			k += save_reg(d, k, desc->mask_regs[i].clr_reg, smp);
+		}
+	}
+
+	if (desc->prio_regs) {
+		d->prio = alloc_bootmem(desc->nr_vectors * sizeof(*d->prio));
+
+		for (i = 0; i < desc->nr_prio_regs; i++) {
+			smp = IS_SMP(desc->prio_regs[i]);
+			k += save_reg(d, k, desc->prio_regs[i].set_reg, smp);
+			k += save_reg(d, k, desc->prio_regs[i].clr_reg, smp);
+		}
+	}
+
+	if (desc->sense_regs) {
+		d->sense = alloc_bootmem(desc->nr_vectors * sizeof(*d->sense));
+
+		for (i = 0; i < desc->nr_sense_regs; i++) {
+			k += save_reg(d, k, desc->sense_regs[i].reg, 0);
+		}
+	}
+
+	d->chip.name = desc->name;
+	d->chip.mask = intc_disable;
+	d->chip.unmask = intc_enable;
+	d->chip.mask_ack = intc_disable;
+	d->chip.set_type = intc_set_sense;
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+	if (desc->ack_regs) {
+		for (i = 0; i < desc->nr_ack_regs; i++)
+			k += save_reg(d, k, desc->ack_regs[i].set_reg, 0);
+
+		d->chip.mask_ack = intc_mask_ack;
+	}
+#endif
+
+	BUG_ON(k > 256); /* _INTC_ADDR_E() and _INTC_ADDR_D() are 8 bits */
+
+	for (i = 0; i < desc->nr_vectors; i++) {
+		struct intc_vect *vect = desc->vectors + i;
+
+		intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect));
+	}
+}
diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h
new file mode 100644
index 000000000000..68e212ff9dde
--- /dev/null
+++ b/include/linux/sh_intc.h
@@ -0,0 +1,91 @@
+#ifndef __SH_INTC_H
+#define __SH_INTC_H
+
+typedef unsigned char intc_enum;
+
+struct intc_vect {
+	intc_enum enum_id;
+	unsigned short vect;
+};
+
+#define INTC_VECT(enum_id, vect) { enum_id, vect }
+#define INTC_IRQ(enum_id, irq) INTC_VECT(enum_id, irq2evt(irq))
+
+struct intc_group {
+	intc_enum enum_id;
+	intc_enum enum_ids[32];
+};
+
+#define INTC_GROUP(enum_id, ids...) { enum_id, { ids } }
+
+struct intc_mask_reg {
+	unsigned long set_reg, clr_reg, reg_width;
+	intc_enum enum_ids[32];
+#ifdef CONFIG_SMP
+	unsigned long smp;
+#endif
+};
+
+struct intc_prio_reg {
+	unsigned long set_reg, clr_reg, reg_width, field_width;
+	intc_enum enum_ids[16];
+#ifdef CONFIG_SMP
+	unsigned long smp;
+#endif
+};
+
+struct intc_sense_reg {
+	unsigned long reg, reg_width, field_width;
+	intc_enum enum_ids[16];
+};
+
+#ifdef CONFIG_SMP
+#define INTC_SMP(stride, nr) .smp = (stride) | ((nr) << 8)
+#else
+#define INTC_SMP(stride, nr)
+#endif
+
+struct intc_desc {
+	struct intc_vect *vectors;
+	unsigned int nr_vectors;
+	struct intc_group *groups;
+	unsigned int nr_groups;
+	struct intc_mask_reg *mask_regs;
+	unsigned int nr_mask_regs;
+	struct intc_prio_reg *prio_regs;
+	unsigned int nr_prio_regs;
+	struct intc_sense_reg *sense_regs;
+	unsigned int nr_sense_regs;
+	char *name;
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+	struct intc_mask_reg *ack_regs;
+	unsigned int nr_ack_regs;
+#endif
+};
+
+#define _INTC_ARRAY(a) a, sizeof(a)/sizeof(*a)
+#define DECLARE_INTC_DESC(symbol, chipname, vectors, groups,		\
+	mask_regs, prio_regs, sense_regs)				\
+struct intc_desc symbol __initdata = {					\
+	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
+	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
+	_INTC_ARRAY(sense_regs),					\
+	chipname,							\
+}
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+#define DECLARE_INTC_DESC_ACK(symbol, chipname, vectors, groups,	\
+	mask_regs, prio_regs, sense_regs, ack_regs)			\
+struct intc_desc symbol __initdata = {					\
+	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
+	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
+	_INTC_ARRAY(sense_regs),					\
+	chipname,							\
+	_INTC_ARRAY(ack_regs),						\
+}
+#endif
+
+void __init register_intc_controller(struct intc_desc *desc);
+int intc_set_priority(unsigned int irq, unsigned int prio);
+
+#endif /* __SH_INTC_H */
-- 
cgit v1.2.3


From 2075eb8d95612cadde91ef5be82691d97a2ea6c5 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 7 Oct 2008 10:57:54 -0700
Subject: rangetimer: fix x86 build failure for the !HRTIMERS case

the timer peek function was on the wrong side of an ifdef,
breaking for the !HRTIMERs case. Just provide an empty inline
for that case since it doesn't make sense in that scenario.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/hrtimer.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d93b1e1dc169..508ce20b8f9c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -283,6 +283,8 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 	return timer->base->cpu_base->hres_active;
 }
 
+extern void hrtimer_peek_ahead_timers(void);
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -305,6 +307,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
  * is expired in the next softirq when the clock was advanced.
  */
 static inline void clock_was_set(void) { }
+static inline void hrtimer_peek_ahead_timers(void) { }
 
 static inline void hres_timers_resume(void) { }
 
@@ -328,7 +331,6 @@ extern ktime_t ktime_get_real(void);
 
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
-extern void hrtimer_peek_ahead_timers(void);
 
 
 /* Exported timer functions: */
-- 
cgit v1.2.3


From a3cdcbfa8fb1fccfe48d359da86e99546610c562 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Fri, 10 Oct 2008 12:01:37 -0700
Subject: mlx4_core: Add QP range reservation support

To allow allocating an aligned range of consecutive QP numbers, add an
interface to reserve an aligned range of QP numbers and have the QP
allocation function always take a QP number.

This will be used for RSS support in the mlx4_en Ethernet driver and
also potentially by IPoIB RSS support.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/qp.c | 21 ++++++++++--
 drivers/net/mlx4/alloc.c        | 74 ++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4.h         |  2 ++
 drivers/net/mlx4/qp.c           | 45 +++++++++++++++++--------
 include/linux/mlx4/device.h     |  5 ++-
 5 files changed, 129 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index baa01deb2436..39167a797f99 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -451,6 +451,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
 {
+	int qpn;
 	int err;
 
 	mutex_init(&qp->mutex);
@@ -545,9 +546,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 		}
 	}
 
-	err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+	if (sqpn) {
+		qpn = sqpn;
+	} else {
+		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+		if (err)
+			goto err_wrid;
+	}
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
 	if (err)
-		goto err_wrid;
+		goto err_qpn;
 
 	/*
 	 * Hardware wants QPN written in big-endian order (after
@@ -560,6 +569,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 
 	return 0;
 
+err_qpn:
+	if (!sqpn)
+		mlx4_qp_release_range(dev->dev, qpn, 1);
+
 err_wrid:
 	if (pd->uobject) {
 		if (!init_attr->srq)
@@ -655,6 +668,10 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	mlx4_ib_unlock_cqs(send_cq, recv_cq);
 
 	mlx4_qp_free(dev->dev, &qp->mqp);
+
+	if (!is_sqp(dev, qp))
+		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 	if (is_user) {
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index 096bca54bcf7..e6c0d5bb5dcb 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -65,10 +65,82 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
 {
+	mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align)
+{
+	unsigned long end, i;
+
+again:
+	start = ALIGN(start, align);
+
+	while ((start < nbits) && test_bit(start, bitmap))
+		start += align;
+
+	if (start >= nbits)
+		return -1;
+
+	end = start+len;
+	if (end > nbits)
+		return -1;
+
+	for (i = start + 1; i < end; i++) {
+		if (test_bit(i, bitmap)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+{
+	u32 obj, i;
+
+	if (likely(cnt == 1 && align == 1))
+		return mlx4_bitmap_alloc(bitmap);
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0,
+					 bitmap->max,
+					 cnt, align);
+	}
+
+	if (obj < bitmap->max) {
+		for (i = 0; i < cnt; i++)
+			set_bit(obj + i, bitmap->table);
+		if (obj == bitmap->last) {
+			bitmap->last = (obj + cnt);
+			if (bitmap->last >= bitmap->max)
+				bitmap->last = 0;
+		}
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
+{
+	u32 i;
+
 	obj &= bitmap->max - 1;
 
 	spin_lock(&bitmap->lock);
-	clear_bit(obj, bitmap->table);
+	for (i = 0; i < cnt; i++)
+		clear_bit(obj + i, bitmap->table);
 	bitmap->last = min(bitmap->last, obj);
 	bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
 	spin_unlock(&bitmap->lock);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 5337e3ac3e78..b55ddab73f66 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -288,6 +288,8 @@ static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
 
 u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
 int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved);
 void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
 
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
index c49a86044bf7..98e0c40ba368 100644
--- a/drivers/net/mlx4/qp.c
+++ b/drivers/net/mlx4/qp.c
@@ -147,19 +147,42 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 
-int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp)
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int qpn;
+	/* Reserve cnt QP numbers from the QP bitmap; the first goes to *base. */
+	qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+	if (qpn == -1)
+		return -ENOMEM;
+
+	*base = qpn;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	if (base_qpn < dev->caps.sqp_start + 8)
+		return;
+	/* Only ranges based at or above sqp_start + 8 go back to the bitmap. */
+	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
 	int err;
 
-	if (sqpn)
-		qp->qpn = sqpn;
-	else {
-		qp->qpn = mlx4_bitmap_alloc(&qp_table->bitmap);
-		if (qp->qpn == -1)
-			return -ENOMEM;
-	}
+	if (!qpn)
+		return -EINVAL;
+
+	qp->qpn = qpn;
 
 	err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn);
 	if (err)
@@ -208,9 +231,6 @@ err_put_qp:
 	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
 
 err_out:
-	if (!sqpn)
-		mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
-
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
@@ -239,9 +259,6 @@ void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
 	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
 	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
 	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
-
-	if (qp->qpn >= dev->caps.sqp_start + 8)
-		mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_free);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b2f944468313..d21e879f3c90 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -400,7 +400,10 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  int collapsed);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
 
-int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp);
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp);
 void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
 
 int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
-- 
cgit v1.2.3


From c865d2f6eb160c15b74245b4891c8e945d67d96c Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Tue, 19 Aug 2008 16:53:26 -0600
Subject: PNP: convert the last few pnp_info() uses to printk()

There are only a few remaining uses of pnp_info(), so I just
converted them to printk and removed the pnp_err(), pnp_info(),
pnp_warn(), and pnp_dbg() wrappers.

I also removed a couple debug messages that don't seem useful any
more ("driver registered", "driver unregistered", "driver attached").

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/pnp/driver.c       |  4 ----
 drivers/pnp/pnpacpi/core.c |  6 +++---
 include/linux/pnp.h        | 10 ----------
 3 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index d3f869ee1d92..54673a4f5901 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -114,7 +114,6 @@ static int pnp_device_probe(struct device *dev)
 	} else
 		goto fail;
 
-	dev_dbg(dev, "driver attached\n");
 	return error;
 
 fail:
@@ -210,8 +209,6 @@ struct bus_type pnp_bus_type = {
 
 int pnp_register_driver(struct pnp_driver *drv)
 {
-	pnp_dbg("the driver '%s' has been registered", drv->name);
-
 	drv->driver.name = drv->name;
 	drv->driver.bus = &pnp_bus_type;
 
@@ -221,7 +218,6 @@ int pnp_register_driver(struct pnp_driver *drv)
 void pnp_unregister_driver(struct pnp_driver *drv)
 {
 	driver_unregister(&drv->driver);
-	pnp_dbg("the driver '%s' has been unregistered", drv->name);
 }
 
 /**
diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c
index c1b9ea34977b..67c651bcaf71 100644
--- a/drivers/pnp/pnpacpi/core.c
+++ b/drivers/pnp/pnpacpi/core.c
@@ -255,14 +255,14 @@ int pnpacpi_disabled __initdata;
 static int __init pnpacpi_init(void)
 {
 	if (acpi_disabled || pnpacpi_disabled) {
-		pnp_info("PnP ACPI: disabled");
+		printk(KERN_INFO "pnp: PnP ACPI: disabled\n");
 		return 0;
 	}
-	pnp_info("PnP ACPI init");
+	printk(KERN_INFO "pnp: PnP ACPI init\n");
 	pnp_register_protocol(&pnpacpi_protocol);
 	register_acpi_bus_type(&acpi_pnp_bus);
 	acpi_get_devices(NULL, pnpacpi_add_device_handler, NULL, NULL);
-	pnp_info("PnP ACPI: found %d devices", num);
+	printk(KERN_INFO "pnp: PnP ACPI: found %d devices\n", num);
 	unregister_acpi_bus_type(&acpi_pnp_bus);
 	pnp_platform_devices = 1;
 	return 0;
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index be764e514e35..05daecec16c5 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -483,14 +483,4 @@ static inline void pnp_unregister_driver(struct pnp_driver *drv) { }
 
 #endif /* CONFIG_PNP */
 
-#define pnp_err(format, arg...) printk(KERN_ERR "pnp: " format "\n" , ## arg)
-#define pnp_info(format, arg...) printk(KERN_INFO "pnp: " format "\n" , ## arg)
-#define pnp_warn(format, arg...) printk(KERN_WARNING "pnp: " format "\n" , ## arg)
-
-#ifdef CONFIG_PNP_DEBUG
-#define pnp_dbg(format, arg...) printk(KERN_DEBUG "pnp: " format "\n" , ## arg)
-#else
-#define pnp_dbg(format, arg...) do {} while (0)
-#endif
-
 #endif /* _LINUX_PNP_H */
-- 
cgit v1.2.3


From fa89b6089b5f4c7a5244b642caaca3e72b06ebe4 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 24 Sep 2008 19:04:32 -0700
Subject: ACPI: remove unused have_arch_parse_srat

This was a workaround for 32bit numa SRAT processing, and we removed those
workarounds, making 32 bit more like 64 bit.  HAVE_ARCH_PARSE_SRAT is no
longer defined anywhere.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/acpi.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 702f79dad16a..fd6a452b0ceb 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -94,18 +94,10 @@ int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 
 /* the following four functions are architecture-dependent */
-#ifdef CONFIG_HAVE_ARCH_PARSE_SRAT
-#define NR_NODE_MEMBLKS MAX_NUMNODES
-#define acpi_numa_slit_init(slit) do {} while (0)
-#define acpi_numa_processor_affinity_init(pa) do {} while (0)
-#define acpi_numa_memory_affinity_init(ma) do {} while (0)
-#define acpi_numa_arch_fixup() do {} while (0)
-#else
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
 void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
 void acpi_numa_arch_fixup(void);
-#endif
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
-- 
cgit v1.2.3


From 69fd3a8d098faf41a04930afa83757c0555ee360 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sun, 12 Oct 2008 16:18:36 +0200
Subject: [MTD] remove unused mtd parameter in of_mtd_parse_partitions()

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/maps/physmap_of.c    | 3 +--
 drivers/mtd/nand/fsl_elbc_nand.c | 3 +--
 drivers/mtd/ofpart.c             | 1 -
 include/linux/mtd/partitions.h   | 1 -
 4 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index 49acd4171893..5fcfec034a94 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -230,8 +230,7 @@ static int __devinit of_flash_probe(struct of_device *dev,
 
 #ifdef CONFIG_MTD_OF_PARTS
 	if (err == 0) {
-		err = of_mtd_parse_partitions(&dev->dev, info->mtd,
-		                              dp, &info->parts);
+		err = of_mtd_parse_partitions(&dev->dev, dp, &info->parts);
 		if (err < 0)
 			return err;
 	}
diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index 98ad3cefcaf4..4aa5bd6158da 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -918,8 +918,7 @@ static int __devinit fsl_elbc_chip_probe(struct fsl_elbc_ctrl *ctrl,
 
 #ifdef CONFIG_MTD_OF_PARTS
 	if (ret == 0) {
-		ret = of_mtd_parse_partitions(priv->dev, &priv->mtd,
-		                              node, &parts);
+		ret = of_mtd_parse_partitions(priv->dev, node, &parts);
 		if (ret < 0)
 			goto err;
 	}
diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c
index 4f80c2fd89af..9e45b3f39c0e 100644
--- a/drivers/mtd/ofpart.c
+++ b/drivers/mtd/ofpart.c
@@ -20,7 +20,6 @@
 #include <linux/mtd/partitions.h>
 
 int __devinit of_mtd_parse_partitions(struct device *dev,
-                                      struct mtd_info *mtd,
                                       struct device_node *node,
                                       struct mtd_partition **pparts)
 {
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 5014f7a9f5df..c92b4d439609 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -73,7 +73,6 @@ struct device;
 struct device_node;
 
 int __devinit of_mtd_parse_partitions(struct device *dev,
-                                      struct mtd_info *mtd,
                                       struct device_node *node,
                                       struct mtd_partition **pparts);
 
-- 
cgit v1.2.3


From 97e1c18e8d17bd87e1e383b2e9d9fc740332c8e2 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 18 Jul 2008 12:16:16 -0400
Subject: tracing: Kernel Tracepoints

Implementation of kernel tracepoints. Inspired by the Linux Kernel
Markers. Allows complete type verification by declaring both the tracing
statement inline functions and the probe registration/unregistration
static inline functions within the same macro "DEFINE_TRACE". No format
string is required. See the tracepoint Documentation and Samples patches
for usage examples.

Taken from the documentation patch :

"A tracepoint placed in code provides a hook to call a function (probe)
that you can provide at runtime. A tracepoint can be "on" (a probe is
connected to it) or "off" (no probe is attached). When a tracepoint is
"off" it has no effect, except for adding a tiny time penalty (checking
a condition for a branch) and space penalty (adding a few bytes for the
function call at the end of the instrumented function and adds a data
structure in a separate section).  When a tracepoint is "on", the
function you provide is called each time the tracepoint is executed, in
the execution context of the caller. When the function provided ends its
execution, it returns to the caller (continuing from the tracepoint
site).

You can put tracepoints at important locations in the code. They are
lightweight hooks that can pass an arbitrary number of parameters, whose
prototypes are described in a tracepoint declaration placed in a header
file."

Addition and removal of tracepoints is synchronized by RCU using the
scheduler (and preempt_disable) as guarantees to find a quiescent state
(this is really RCU "classic"). The update side uses rcu_barrier_sched()
with call_rcu_sched() and the read/execute side uses
"preempt_disable()/preempt_enable()".

We make sure the previous array containing probes, which has been
scheduled for deletion by the rcu callback, is indeed freed before we
proceed to the next update. It therefore limits the rate of modification
of a single tracepoint to one update per RCU period. The objective here
is to permit fast batch add/removal of probes on _different_
tracepoints.

Changelog :
- Use #name ":" #proto as string to identify the tracepoint in the
  tracepoint table. This makes sure no type mismatch can happen when a
  probe of the wrong type is connected to a tracepoint declared with
  the same name in a different header.
- Add tracepoint_entry_free_old.
- Change __TO_TRACE to get rid of the 'i' iterator.

Masami Hiramatsu <mhiramat@redhat.com> :
Tested on x86-64.

Performance impact of a tracepoint : same as markers, except that it
adds about 70 bytes of instructions in an unlikely branch of each
instrumented function (the for loop, the stack setup and the function
call). It currently adds a memory read, a test and a conditional branch
at the instrumentation site (in the hot path). Immediate values will
eventually change this into a load immediate, test and branch, which
removes the memory read which will make the i-cache impact smaller
(changing the memory read for a load immediate removes 3-4 bytes per
site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it
also saves the d-cache hit).

About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in the
scheduler code) was added.

Quoting Hideo Aoki about Markers :

I evaluated overhead of kernel marker using linux-2.6-sched-fixes git
tree, which includes several markers for LTTng, using an ia64 server.

While the immediate trace mark feature isn't implemented on ia64, there
is no major performance regression. So, I think that we don't have any
issues to propose merging marker point patches into Linus's tree from
the viewpoint of performance impact.

I prepared two kernels to evaluate. The first one was compiled without
CONFIG_MARKERS. The second one was compiled with CONFIG_MARKERS enabled.

I downloaded the original hackbench from the following URL:
http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c

I ran hackbench 5 times in each condition and calculated the average and
difference between the kernels.

    The parameter of hackbench: every 50 from 50 to 800
    The number of CPUs of the server: 2, 4, and 8

Below are the results. As you can see, no major performance regression
was found in any case. Even if the number of processes increases, the
differences between the marker-enabled kernel and the marker-disabled
kernel don't increase. Moreover, if the number of CPUs increases, the
differences don't increase either.

Curiously, the marker-enabled kernel is better than the marker-disabled
kernel in more than half of the cases, although I guess it comes from the
difference of memory access pattern.

* 2 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      4.811   |       4.872  |  +0.061  |  +1.27  |
      100 |      9.854   |      10.309  |  +0.454  |  +4.61  |
      150 |     15.602   |      15.040  |  -0.562  |  -3.6   |
      200 |     20.489   |      20.380  |  -0.109  |  -0.53  |
      250 |     25.798   |      25.652  |  -0.146  |  -0.56  |
      300 |     31.260   |      30.797  |  -0.463  |  -1.48  |
      350 |     36.121   |      35.770  |  -0.351  |  -0.97  |
      400 |     42.288   |      42.102  |  -0.186  |  -0.44  |
      450 |     47.778   |      47.253  |  -0.526  |  -1.1   |
      500 |     51.953   |      52.278  |  +0.325  |  +0.63  |
      550 |     58.401   |      57.700  |  -0.701  |  -1.2   |
      600 |     63.334   |      63.222  |  -0.112  |  -0.18  |
      650 |     68.816   |      68.511  |  -0.306  |  -0.44  |
      700 |     74.667   |      74.088  |  -0.579  |  -0.78  |
      750 |     78.612   |      79.582  |  +0.970  |  +1.23  |
      800 |     85.431   |      85.263  |  -0.168  |  -0.2   |
--------------------------------------------------------------

* 4 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      2.586   |       2.584  |  -0.003  |  -0.1   |
      100 |      5.254   |       5.283  |  +0.030  |  +0.56  |
      150 |      8.012   |       8.074  |  +0.061  |  +0.76  |
      200 |     11.172   |      11.000  |  -0.172  |  -1.54  |
      250 |     13.917   |      14.036  |  +0.119  |  +0.86  |
      300 |     16.905   |      16.543  |  -0.362  |  -2.14  |
      350 |     19.901   |      20.036  |  +0.135  |  +0.68  |
      400 |     22.908   |      23.094  |  +0.186  |  +0.81  |
      450 |     26.273   |      26.101  |  -0.172  |  -0.66  |
      500 |     29.554   |      29.092  |  -0.461  |  -1.56  |
      550 |     32.377   |      32.274  |  -0.103  |  -0.32  |
      600 |     35.855   |      35.322  |  -0.533  |  -1.49  |
      650 |     39.192   |      38.388  |  -0.804  |  -2.05  |
      700 |     41.744   |      41.719  |  -0.025  |  -0.06  |
      750 |     45.016   |      44.496  |  -0.520  |  -1.16  |
      800 |     48.212   |      47.603  |  -0.609  |  -1.26  |
--------------------------------------------------------------

* 8 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      2.094   |       2.072  |  -0.022  |  -1.07  |
      100 |      4.162   |       4.273  |  +0.111  |  +2.66  |
      150 |      6.485   |       6.540  |  +0.055  |  +0.84  |
      200 |      8.556   |       8.478  |  -0.078  |  -0.91  |
      250 |     10.458   |      10.258  |  -0.200  |  -1.91  |
      300 |     12.425   |      12.750  |  +0.325  |  +2.62  |
      350 |     14.807   |      14.839  |  +0.032  |  +0.22  |
      400 |     16.801   |      16.959  |  +0.158  |  +0.94  |
      450 |     19.478   |      19.009  |  -0.470  |  -2.41  |
      500 |     21.296   |      21.504  |  +0.208  |  +0.98  |
      550 |     23.842   |      23.979  |  +0.137  |  +0.57  |
      600 |     26.309   |      26.111  |  -0.198  |  -0.75  |
      650 |     28.705   |      28.446  |  -0.259  |  -0.9   |
      700 |     31.233   |      31.394  |  +0.161  |  +0.52  |
      750 |     34.064   |      33.720  |  -0.344  |  -1.01  |
      800 |     36.320   |      36.114  |  -0.206  |  -0.57  |
--------------------------------------------------------------

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h |   6 +-
 include/linux/module.h            |  17 ++
 include/linux/tracepoint.h        | 127 ++++++++++
 init/Kconfig                      |   7 +
 kernel/Makefile                   |   1 +
 kernel/module.c                   |  66 +++++-
 kernel/tracepoint.c               | 476 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 698 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/tracepoint.h
 create mode 100644 kernel/tracepoint.c

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7440a0dceddb..3d8e472a09c8 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -52,7 +52,10 @@
 	. = ALIGN(8);							\
 	VMLINUX_SYMBOL(__start___markers) = .;				\
 	*(__markers)							\
-	VMLINUX_SYMBOL(__stop___markers) = .;
+	VMLINUX_SYMBOL(__stop___markers) = .;				\
+	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
+	*(__tracepoints)						\
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
@@ -61,6 +64,7 @@
 		*(.rodata) *(.rodata.*)					\
 		*(__vermagic)		/* Kernel version magic */	\
 		*(__markers_strings)	/* Markers: strings */		\
+		*(__tracepoints_strings)/* Tracepoints: strings */	\
 	}								\
 									\
 	.rodata1          : AT(ADDR(.rodata1) - LOAD_OFFSET) {		\
diff --git a/include/linux/module.h b/include/linux/module.h
index 68e09557c951..8b6113503863 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/moduleparam.h>
 #include <linux/marker.h>
+#include <linux/tracepoint.h>
 #include <asm/local.h>
 
 #include <asm/module.h>
@@ -331,6 +332,10 @@ struct module
 	struct marker *markers;
 	unsigned int num_markers;
 #endif
+#ifdef CONFIG_TRACEPOINTS
+	struct tracepoint *tracepoints;
+	unsigned int num_tracepoints;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
@@ -454,6 +459,9 @@ extern void print_modules(void);
 
 extern void module_update_markers(void);
 
+extern void module_update_tracepoints(void);
+extern int module_get_iter_tracepoints(struct tracepoint_iter *iter);
+
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
@@ -558,6 +566,15 @@ static inline void module_update_markers(void)
 {
 }
 
+static inline void module_update_tracepoints(void)
+{	/* !CONFIG_MODULES stub: there are no modules to update */
+}
+
+static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{	/* !CONFIG_MODULES stub: always reports "not found" (0) */
+	return 0;
+}
+
 #endif /* CONFIG_MODULES */
 
 struct device_driver;
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
new file mode 100644
index 000000000000..e623a6fca5c3
--- /dev/null
+++ b/include/linux/tracepoint.h
@@ -0,0 +1,127 @@
+#ifndef _LINUX_TRACEPOINT_H
+#define _LINUX_TRACEPOINT_H
+
+/*
+ * Kernel Tracepoint API.
+ *
+ * See Documentation/tracepoint.txt.
+ *
+ * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * Heavily inspired by the Linux Kernel Markers.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+
+struct module;
+struct tracepoint;
+
+struct tracepoint {
+	const char *name;		/* "name:proto" identifier string */
+	int state;			/* Nonzero: trace_##name() calls probes */
+	void **funcs;			/* NULL-terminated probe array */
+} __attribute__((aligned(8)));
+
+
+#define TPPROTO(args...)	args
+#define TPARGS(args...)		args
+
+#ifdef CONFIG_TRACEPOINTS
+
+/*
+ * Call every probe in the NULL-terminated tp->funcs array with args, under
+ * rcu_read_lock_sched(). A non-NULL array always holds at least one probe.
+ */
+#define __DO_TRACE(tp, proto, args)					\
+	do {								\
+		void **it_func;						\
+									\
+		rcu_read_lock_sched();					\
+		it_func = rcu_dereference((tp)->funcs);			\
+		if (it_func) {						\
+			do {						\
+				((void(*)(proto))(*it_func))(args);	\
+			} while (*(++it_func));				\
+		}							\
+		rcu_read_unlock_sched();				\
+	} while (0)
+
+/*
+ * Emits trace_##name() and {un,}register_trace_##name(), which return the
+ * tracepoint_probe_{un,}register() result. The structure is aligned like the
+ * __tracepoints section start so that no padding precedes the first entry.
+ */
+#define DEFINE_TRACE(name, proto, args)					\
+	static inline void trace_##name(proto)				\
+	{								\
+		static const char __tpstrtab_##name[]			\
+		__attribute__((section("__tracepoints_strings")))	\
+		= #name ":" #proto;					\
+		static struct tracepoint __tracepoint_##name		\
+		__attribute__((section("__tracepoints"), aligned(8))) =	\
+		{ __tpstrtab_##name, 0, NULL };				\
+		if (unlikely(__tracepoint_##name.state))		\
+			__DO_TRACE(&__tracepoint_##name,		\
+				TPPROTO(proto), TPARGS(args));		\
+	}								\
+	static inline int register_trace_##name(void (*probe)(proto))	\
+	{								\
+		return tracepoint_probe_register(#name ":" #proto,	\
+			(void *)probe);					\
+	}								\
+	static inline int unregister_trace_##name(void (*probe)(proto)) \
+	{								\
+		return tracepoint_probe_unregister(#name ":" #proto,	\
+			(void *)probe);					\
+	}
+
+extern void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end);
+
+#else /* !CONFIG_TRACEPOINTS */
+#define DEFINE_TRACE(name, proto, args)			\
+	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
+	{ }								\
+	static inline void trace_##name(proto)				\
+	{ }								\
+	static inline int register_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}								\
+	static inline int unregister_trace_##name(void (*probe)(proto)) \
+	{ return -ENOSYS; }
+
+static inline void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end)
+{ }	/* !CONFIG_TRACEPOINTS stub: nothing to update */
+#endif /* CONFIG_TRACEPOINTS */
+
+/*
+ * Connect a probe to a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_register(const char *name, void *probe);
+
+/*
+ * Disconnect a probe from a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_unregister(const char *name, void *probe);
+
+struct tracepoint_iter {
+	struct module *module;	/* set by module_get_iter_tracepoints() */
+	struct tracepoint *tracepoint;	/* tracepoint_get_iter_range() cursor */
+};
+
+extern void tracepoint_iter_start(struct tracepoint_iter *iter);
+extern void tracepoint_iter_next(struct tracepoint_iter *iter);
+extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
+extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
+extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+	struct tracepoint *begin, struct tracepoint *end);
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index c11da38837e5..70082678a914 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -771,6 +771,13 @@ config PROFILING
 	  Say Y here to enable the extended profiling support mechanisms used
 	  by profilers such as OProfile.
 
+config TRACEPOINTS
+	bool "Activate tracepoints"
+	default y
+	help
+	  Place an empty function call at each tracepoint site. Can be
+	  dynamically changed for a probe function.
+
 config MARKERS
 	bool "Activate markers"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..8f9ce7ec21b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FTRACE) += trace/
diff --git a/kernel/module.c b/kernel/module.c
index 9db11911e04b..661d73db786e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,6 +46,7 @@
 #include <asm/cacheflush.h>
 #include <linux/license.h>
 #include <asm/sections.h>
+#include <linux/tracepoint.h>
 
 #if 0
 #define DEBUGP printk
@@ -1831,6 +1832,8 @@ static noinline struct module *load_module(void __user *umod,
 #endif
 	unsigned int markersindex;
 	unsigned int markersstringsindex;
+	unsigned int tracepointsindex;
+	unsigned int tracepointsstringsindex;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2117,6 +2120,9 @@ static noinline struct module *load_module(void __user *umod,
 	markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
  	markersstringsindex = find_sec(hdr, sechdrs, secstrings,
 					"__markers_strings");
+	tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints");
+	tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
+					"__tracepoints_strings");
 
 	/* Now do relocations. */
 	for (i = 1; i < hdr->e_shnum; i++) {
@@ -2144,6 +2150,12 @@ static noinline struct module *load_module(void __user *umod,
 	mod->num_markers =
 		sechdrs[markersindex].sh_size / sizeof(*mod->markers);
 #endif
+#ifdef CONFIG_TRACEPOINTS
+	mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr;
+	mod->num_tracepoints =
+		sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints);
+#endif
+
 
         /* Find duplicate symbols */
 	err = verify_export_symbols(mod);
@@ -2162,11 +2174,16 @@ static noinline struct module *load_module(void __user *umod,
 
 	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
 
+	if (!mod->taints) {
 #ifdef CONFIG_MARKERS
-	if (!mod->taints)
 		marker_update_probe_range(mod->markers,
 			mod->markers + mod->num_markers);
 #endif
+#ifdef CONFIG_TRACEPOINTS
+		tracepoint_update_probe_range(mod->tracepoints,
+			mod->tracepoints + mod->num_tracepoints);
+#endif
+	}
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 		goto cleanup;
@@ -2717,3 +2734,50 @@ void module_update_markers(void)
 	mutex_unlock(&module_mutex);
 }
 #endif
+
+#ifdef CONFIG_TRACEPOINTS
+void module_update_tracepoints(void)
+{
+	struct module *mod;
+	/* Hand each untainted module's tracepoint range to the updater. */
+	mutex_lock(&module_mutex);
+	list_for_each_entry(mod, &modules, list)
+		if (!mod->taints)
+			tracepoint_update_probe_range(mod->tracepoints,
+				mod->tracepoints + mod->num_tracepoints);
+	mutex_unlock(&module_mutex);
+}
+
+/*
+ * Walk untainted modules at or above iter->module until
+ * tracepoint_get_iter_range() reports a hit; returns that result, else 0.
+ */
+int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{
+	struct module *iter_mod;
+	int found = 0;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(iter_mod, &modules, list) {
+		if (!iter_mod->taints) {
+			/*
+			 * Sorted module list: lower address skips, higher resets
+			 */
+			if (iter_mod < iter->module)
+				continue;
+			else if (iter_mod > iter->module)
+				iter->tracepoint = NULL;
+			found = tracepoint_get_iter_range(&iter->tracepoint,
+				iter_mod->tracepoints,
+				iter_mod->tracepoints
+					+ iter_mod->num_tracepoints);
+			if (found) {
+				iter->module = iter_mod;
+				break;
+			}
+		}
+	}
+	mutex_unlock(&module_mutex);
+	return found;
+}
+#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 000000000000..42e86ddbd2a0
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,476 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/tracepoint.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+extern struct tracepoint __start___tracepoints[];
+extern struct tracepoint __stop___tracepoints[];
+
+/* Set to 1 to enable tracepoint debug output */
+static const int tracepoint_debug;
+
+/*
+ * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
+ * builtin and module tracepoints and the hash table.
+ */
+static DEFINE_MUTEX(tracepoints_mutex);
+
+/*
+ * Tracepoint hash table, containing the active tracepoints.
+ * Protected by tracepoints_mutex.
+ */
+#define TRACEPOINT_HASH_BITS 6
+#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+
+/*
+ * Note about RCU :
+ * It is used to delay the free of multiple probes array until a quiescent
+ * state is reached.
+ * Tracepoint entries modifications are protected by the tracepoints_mutex.
+ */
+struct tracepoint_entry {
+	struct hlist_node hlist;
+	void **funcs;
+	int refcount;	/* Number of times armed. 0 if disarmed. */
+	struct rcu_head rcu;
+	void *oldptr;
+	unsigned char rcu_pending:1;
+	char name[0];
+};
+
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+
+static void free_old_closure(struct rcu_head *head)
+{
+	struct tracepoint_entry *entry = container_of(head,
+		struct tracepoint_entry, rcu);
+	kfree(entry->oldptr);
+	/* Make sure we free the data before setting the pending flag to 0 */
+	smp_wmb();
+	entry->rcu_pending = 0;
+}
+
+static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+{
+	if (!old)
+		return;
+	entry->oldptr = old;
+	entry->rcu_pending = 1;
+	/* write rcu_pending before calling the RCU callback */
+	smp_wmb();
+#ifdef CONFIG_PREEMPT_RCU
+	synchronize_sched();	/* Until we have the call_rcu_sched() */
+#endif
+	call_rcu(&entry->rcu, free_old_closure);
+}
+
+static void debug_print_probes(struct tracepoint_entry *entry)
+{
+	int i;
+
+	if (!tracepoint_debug)
+		return;
+
+	for (i = 0; entry->funcs[i]; i++)
+		printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+}
+
+static void *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+{
+	int nr_probes = 0;
+	void **old, **new;
+
+	WARN_ON(!probe);
+
+	debug_print_probes(entry);
+	old = entry->funcs;
+	if (old) {
+		/* (N -> N+1), (N != 0, 1) probes */
+		for (nr_probes = 0; old[nr_probes]; nr_probes++)
+			if (old[nr_probes] == probe)
+				return ERR_PTR(-EEXIST);
+	}
+	/* + 2 : one for new probe, one for NULL func */
+	new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (old)
+		memcpy(new, old, nr_probes * sizeof(void *));
+	new[nr_probes] = probe;
+	entry->refcount = nr_probes + 1;
+	entry->funcs = new;
+	debug_print_probes(entry);
+	return old;
+}
+
+static void *
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+{
+	int nr_probes = 0, nr_del = 0, i;
+	void **old, **new;
+
+	old = entry->funcs;
+
+	debug_print_probes(entry);
+	/* (N -> M), (N > 1, M >= 0) probes */
+	for (nr_probes = 0; old[nr_probes]; nr_probes++) {
+		if ((!probe || old[nr_probes] == probe))
+			nr_del++;
+	}
+
+	if (nr_probes - nr_del == 0) {
+		/* N -> 0, (N > 1) */
+		entry->funcs = NULL;
+		entry->refcount = 0;
+		debug_print_probes(entry);
+		return old;
+	} else {
+		int j = 0;
+		/* N -> M, (N > 1, M > 0) */
+		/* + 1 for NULL */
+		new = kzalloc((nr_probes - nr_del + 1)
+			* sizeof(void *), GFP_KERNEL);
+		if (new == NULL)
+			return ERR_PTR(-ENOMEM);
+		for (i = 0; old[i]; i++)
+			if ((probe && old[i] != probe))
+				new[j++] = old[i];
+		entry->refcount = nr_probes - nr_del;
+		entry->funcs = new;
+	}
+	debug_print_probes(entry);
+	return old;
+}
+
+/*
+ * Get tracepoint if the tracepoint is present in the tracepoint hash table.
+ * Must be called with tracepoints_mutex held.
+ * Returns NULL if not present.
+ */
+static struct tracepoint_entry *get_tracepoint(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct tracepoint_entry *e;
+	u32 hash = jhash(name, strlen(name), 0);
+
+	head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name))
+			return e;
+	}
+	return NULL;
+}
+
+/*
+ * Add the tracepoint to the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ */
+static struct tracepoint_entry *add_tracepoint(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct tracepoint_entry *e;
+	size_t name_len = strlen(name) + 1;
+	u32 hash = jhash(name, name_len-1, 0);
+
+	head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name)) {
+			printk(KERN_NOTICE
+				"tracepoint %s busy\n", name);
+			return ERR_PTR(-EEXIST);	/* Already there */
+		}
+	}
+	/*
+	 * Using kmalloc here to allocate a variable length element. Could
+	 * cause some memory fragmentation if overused.
+	 */
+	e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+	memcpy(&e->name[0], name, name_len);
+	e->funcs = NULL;
+	e->refcount = 0;
+	e->rcu_pending = 0;
+	hlist_add_head(&e->hlist, head);
+	return e;
+}
+
+/*
+ * Remove the tracepoint from the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ */
+static int remove_tracepoint(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct tracepoint_entry *e;
+	int found = 0;
+	size_t len = strlen(name) + 1;
+	u32 hash = jhash(name, len-1, 0);
+
+	head = &tracepoint_table[hash & ((1 << TRACEPOINT_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name)) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		return -ENOENT;
+	if (e->refcount)
+		return -EBUSY;
+	hlist_del(&e->hlist);
+	/* Make sure the call_rcu has been executed */
+	if (e->rcu_pending)
+		rcu_barrier();
+	kfree(e);
+	return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one tracepoint.
+ */
+static void set_tracepoint(struct tracepoint_entry **entry,
+	struct tracepoint *elem, int active)
+{
+	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+
+	/*
+	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
+	 * probe callbacks array is consistent before setting a pointer to it.
+	 * This array is referenced by __DO_TRACE from
+	 * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
+	 * is used.
+	 */
+	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
+	elem->state = active;
+}
+
+/*
+ * Disable a tracepoint and its probe callback.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is
+ * ensured by preempt_disable around the call site.
+ */
+static void disable_tracepoint(struct tracepoint *elem)
+{
+	elem->state = 0;
+}
+
+/**
+ * tracepoint_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of tracepoints.
+ */
+void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end)
+{
+	struct tracepoint *iter;
+	struct tracepoint_entry *mark_entry;
+
+	mutex_lock(&tracepoints_mutex);
+	for (iter = begin; iter < end; iter++) {
+		mark_entry = get_tracepoint(iter->name);
+		if (mark_entry) {
+			set_tracepoint(&mark_entry, iter,
+					!!mark_entry->refcount);
+		} else {
+			disable_tracepoint(iter);
+		}
+	}
+	mutex_unlock(&tracepoints_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ */
+static void tracepoint_update_probes(void)
+{
+	/* Core kernel tracepoints */
+	tracepoint_update_probe_range(__start___tracepoints,
+		__stop___tracepoints);
+	/* tracepoints in modules. */
+	module_update_tracepoints();
+}
+
+/**
+ * tracepoint_probe_register -  Connect a probe to a tracepoint
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int tracepoint_probe_register(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	int ret = 0;
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry) {
+		entry = add_tracepoint(name);
+		if (IS_ERR(entry)) {
+			ret = PTR_ERR(entry);
+			goto end;
+		}
+	}
+	/*
+	 * If we detect that a call_rcu is pending for this tracepoint,
+	 * make sure it's executed now.
+	 */
+	if (entry->rcu_pending)
+		rcu_barrier();
+	old = tracepoint_entry_add_probe(entry, probe);
+	if (IS_ERR(old)) {
+		ret = PTR_ERR(old);
+		goto end;
+	}
+	mutex_unlock(&tracepoints_mutex);
+	tracepoint_update_probes();		/* may update entry */
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	WARN_ON(!entry);
+	tracepoint_entry_free_old(entry, old);
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+
+/**
+ * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt disabled section
+ * has finished.
+ */
+int tracepoint_probe_unregister(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+	int ret = -ENOENT;
+
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry)
+		goto end;
+	if (entry->rcu_pending)
+		rcu_barrier();
+	old = tracepoint_entry_remove_probe(entry, probe);
+	mutex_unlock(&tracepoints_mutex);
+	tracepoint_update_probes();		/* may update entry */
+	mutex_lock(&tracepoints_mutex);
+	entry = get_tracepoint(name);
+	if (!entry)
+		goto end;
+	tracepoint_entry_free_old(entry, old);
+	remove_tracepoint(name);	/* Ignore busy error message */
+	ret = 0;
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+/**
+ * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
+ * @tracepoint: current tracepoint (in), next tracepoint (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next tracepoint has been found (1) or not (0).
+ * Will return the first tracepoint in the range if the input tracepoint is
+ * NULL.
+ */
+int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+	struct tracepoint *begin, struct tracepoint *end)
+{
+	if (!*tracepoint && begin != end) {
+		*tracepoint = begin;
+		return 1;
+	}
+	if (*tracepoint >= begin && *tracepoint < end)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
+
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+	int found = 0;
+
+	/* Core kernel tracepoints */
+	if (!iter->module) {
+		found = tracepoint_get_iter_range(&iter->tracepoint,
+				__start___tracepoints, __stop___tracepoints);
+		if (found)
+			goto end;
+	}
+	/* tracepoints in modules. */
+	found = module_get_iter_tracepoints(iter);
+end:
+	if (!found)
+		tracepoint_iter_reset(iter);
+}
+
+void tracepoint_iter_start(struct tracepoint_iter *iter)
+{
+	tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_start);
+
+void tracepoint_iter_next(struct tracepoint_iter *iter)
+{
+	iter->tracepoint++;
+	/*
+	 * iter->tracepoint may be invalid because we blindly incremented it.
+	 * Make sure it is valid by marshalling on the tracepoints, getting the
+	 * tracepoints from following modules if necessary.
+	 */
+	tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+
+void tracepoint_iter_stop(struct tracepoint_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+
+void tracepoint_iter_reset(struct tracepoint_iter *iter)
+{
+	iter->module = NULL;
+	iter->tracepoint = NULL;
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
-- 
cgit v1.2.3


From 36dcd67ae994fece615b7c700958d215e884b9ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 29 Jul 2008 12:00:59 +0200
Subject: ftrace: ignore functions that cannot be kprobe-ed

kprobes already has an extensive list of annotations for functions
that should not be instrumented. Add notrace annotations to these
functions as well.

This is particularly useful for functions called by the NMI path.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kprobes.h | 5 +++--
 kernel/notifier.c       | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0be7795655fa..497b1d1f7a05 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -29,6 +29,7 @@
  *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
  */
+#include <linux/linkage.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -47,7 +48,7 @@
 #define KPROBE_HIT_SSDONE	0x00000008
 
 /* Attach to insert probes on any functions which should be ignored*/
-#define __kprobes	__attribute__((__section__(".kprobes.text")))
+#define __kprobes	__attribute__((__section__(".kprobes.text"))) notrace
 
 struct kprobe;
 struct pt_regs;
@@ -256,7 +257,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
 #else /* CONFIG_KPROBES */
 
-#define __kprobes	/**/
+#define __kprobes	notrace
 struct jprobe;
 struct kretprobe;
 
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11584ef..4282c0a40a57 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
 	       struct pt_regs *regs, long err, int trap, int sig)
 {
 	struct die_args args = {
-- 
cgit v1.2.3


From 68bf21aa15c85d2e9b623dcda2b1ed8893275fa1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:08 -0400
Subject: ftrace: mcount call site on boot nops core

This is the infrastructure for converting the mcount call sites
recorded by the __mcount_loc section into nops on boot. It also allows
for using these sites to enable tracing as normal. When the __mcount_loc
section is used, the "ftraced" kernel thread is disabled.

This uses the current infrastructure to record the mcount call sites
as well as convert them to nops. The mcount function is kept as a stub
on boot up and not converted to the ftrace_record_ip function. We use the
ftrace_record_ip to only record from the table.

This patch does not handle modules. That comes with a later patch.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/ftrace.h |  10 ++++
 include/linux/ftrace.h   |   6 ++
 init/main.c              |   3 +
 kernel/trace/ftrace.c    | 148 +++++++++++++++++++++++++++++++++--------------
 4 files changed, 124 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h
index be0e004ad148..1bb6f9bbe1ab 100644
--- a/include/asm-x86/ftrace.h
+++ b/include/asm-x86/ftrace.h
@@ -7,6 +7,16 @@
 
 #ifndef __ASSEMBLY__
 extern void mcount(void);
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	/*
+	 * call mcount is "e8 <4 byte offset>"
+	 * The addr points to the 4 byte offset and the caller of this
+	 * function wants the pointer to e8. Simply subtract one.
+	 */
+	return addr - 1;
+}
 #endif
 
 #endif /* CONFIG_FTRACE */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index bb384068272e..d4d6ab453b78 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -162,4 +162,10 @@ static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 #endif
 
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+extern void ftrace_init(void);
+#else
+static inline void ftrace_init(void) { }
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/init/main.c b/init/main.c
index 3820323c4c84..ded1fae965ab 100644
--- a/init/main.c
+++ b/init/main.c
@@ -60,6 +60,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
+#include <linux/ftrace.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -687,6 +688,8 @@ asmlinkage void __init start_kernel(void)
 
 	acpi_early_init(); /* before LAPIC and SMP init */
 
+	ftrace_init();
+
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6e3af31b403..df96d5990c04 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -792,47 +792,7 @@ static int ftrace_update_code(void)
 	return 1;
 }
 
-static int ftraced(void *ignore)
-{
-	unsigned long usecs;
-
-	while (!kthread_should_stop()) {
-
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		/* check once a second */
-		schedule_timeout(HZ);
-
-		if (unlikely(ftrace_disabled))
-			continue;
-
-		mutex_lock(&ftrace_sysctl_lock);
-		mutex_lock(&ftraced_lock);
-		if (!ftraced_suspend && !ftraced_stop &&
-		    ftrace_update_code()) {
-			usecs = nsecs_to_usecs(ftrace_update_time);
-			if (ftrace_update_tot_cnt > 100000) {
-				ftrace_update_tot_cnt = 0;
-				pr_info("hm, dftrace overflow: %lu change%s"
-					" (%lu total) in %lu usec%s\n",
-					ftrace_update_cnt,
-					ftrace_update_cnt != 1 ? "s" : "",
-					ftrace_update_tot_cnt,
-					usecs, usecs != 1 ? "s" : "");
-				ftrace_disabled = 1;
-				WARN_ON_ONCE(1);
-			}
-		}
-		mutex_unlock(&ftraced_lock);
-		mutex_unlock(&ftrace_sysctl_lock);
-
-		ftrace_shutdown_replenish();
-	}
-	__set_current_state(TASK_RUNNING);
-	return 0;
-}
-
-static int __init ftrace_dyn_table_alloc(void)
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
 {
 	struct ftrace_page *pg;
 	int cnt;
@@ -859,7 +819,9 @@ static int __init ftrace_dyn_table_alloc(void)
 
 	pg = ftrace_pages = ftrace_pages_start;
 
-	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+	cnt = num_to_init / ENTRIES_PER_PAGE;
+	pr_info("ftrace: allocating %ld hash entries in %d pages\n",
+		num_to_init, cnt);
 
 	for (i = 0; i < cnt; i++) {
 		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -1556,6 +1518,104 @@ static __init int ftrace_init_debugfs(void)
 
 fs_initcall(ftrace_init_debugfs);
 
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+static int ftrace_convert_nops(unsigned long *start,
+			       unsigned long *end)
+{
+	unsigned long *p;
+	unsigned long addr;
+	unsigned long flags;
+
+	p = start;
+	while (p < end) {
+		addr = ftrace_call_adjust(*p++);
+		ftrace_record_ip(addr);
+		ftrace_shutdown_replenish();
+	}
+
+	/* p is ignored */
+	local_irq_save(flags);
+	__ftrace_update_code(p);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+extern unsigned long __start_mcount_loc[];
+extern unsigned long __stop_mcount_loc[];
+
+void __init ftrace_init(void)
+{
+	unsigned long count, addr, flags;
+	int ret;
+
+	/* Keep the ftrace pointer to the stub */
+	addr = (unsigned long)ftrace_stub;
+
+	local_irq_save(flags);
+	ftrace_dyn_arch_init(&addr);
+	local_irq_restore(flags);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr)
+		goto failed;
+
+	count = __stop_mcount_loc - __start_mcount_loc;
+
+	ret = ftrace_dyn_table_alloc(count);
+	if (ret)
+		goto failed;
+
+	last_ftrace_enabled = ftrace_enabled = 1;
+
+	ret = ftrace_convert_nops(__start_mcount_loc,
+				  __stop_mcount_loc);
+
+	return;
+ failed:
+	ftrace_disabled = 1;
+}
+#else /* CONFIG_FTRACE_MCOUNT_RECORD */
+static int ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	while (!kthread_should_stop()) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		if (unlikely(ftrace_disabled))
+			continue;
+
+		mutex_lock(&ftrace_sysctl_lock);
+		mutex_lock(&ftraced_lock);
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					" (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				ftrace_disabled = 1;
+				WARN_ON_ONCE(1);
+			}
+		}
+		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
+
+		ftrace_shutdown_replenish();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
 static int __init ftrace_dynamic_init(void)
 {
 	struct task_struct *p;
@@ -1572,7 +1632,7 @@ static int __init ftrace_dynamic_init(void)
 		goto failed;
 	}
 
-	ret = ftrace_dyn_table_alloc();
+	ret = ftrace_dyn_table_alloc(NR_TO_INIT);
 	if (ret)
 		goto failed;
 
@@ -1593,6 +1653,8 @@ static int __init ftrace_dynamic_init(void)
 }
 
 core_initcall(ftrace_dynamic_init);
+#endif /* CONFIG_FTRACE_MCOUNT_RECORD */
+
 #else
 # define ftrace_startup()		do { } while (0)
 # define ftrace_shutdown()		do { } while (0)
-- 
cgit v1.2.3


From 90d595fe5ca4b685465c068907e6e554760abea8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:09 -0400
Subject: ftrace: enable mcount recording for modules

This patch enables the loading of the __mcount_loc section of modules and
changing all the callers of mcount into nops.

The modification is done before the init_module function is called, so
again, we do not need to use kstop_machine to make these changes.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  3 +++
 kernel/module.c        | 11 +++++++++++
 kernel/trace/ftrace.c  |  5 +++++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index d4d6ab453b78..4936489f9ed8 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -164,8 +164,11 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
+extern void ftrace_init_module(unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
+static inline void
+ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/module.c b/kernel/module.c
index 661d73db786e..d753fd9d83ec 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
 #include <linux/license.h>
 #include <asm/sections.h>
 #include <linux/tracepoint.h>
+#include <linux/ftrace.h>
 
 #if 0
 #define DEBUGP printk
@@ -1834,6 +1835,7 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int markersstringsindex;
 	unsigned int tracepointsindex;
 	unsigned int tracepointsstringsindex;
+	unsigned int mcountindex;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2124,6 +2126,9 @@ static noinline struct module *load_module(void __user *umod,
 	tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
 					"__tracepoints_strings");
 
+	mcountindex = find_sec(hdr, sechdrs, secstrings,
+			       "__mcount_loc");
+
 	/* Now do relocations. */
 	for (i = 1; i < hdr->e_shnum; i++) {
 		const char *strtab = (char *)sechdrs[strindex].sh_addr;
@@ -2184,6 +2189,12 @@ static noinline struct module *load_module(void __user *umod,
 			mod->tracepoints + mod->num_tracepoints);
 #endif
 	}
+
+	if (mcountindex) {
+		void *mseg = (void *)sechdrs[mcountindex].sh_addr;
+		ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
+	}
+
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 		goto cleanup;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index df96d5990c04..ea45bb1c0fd6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1541,6 +1541,11 @@ static int ftrace_convert_nops(unsigned long *start,
 	return 0;
 }
 
+void ftrace_init_module(unsigned long *start, unsigned long *end)
+{
+	ftrace_convert_nops(start, end);
+}
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
-- 
cgit v1.2.3


From 29e71abf56cebc5c5a4e184a6eb4360cc58554ad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:10 -0400
Subject: ftrace: rebuild everything on change to FTRACE_MCOUNT_RECORD

When enabling or disabling CONFIG_FTRACE_MCOUNT_RECORD, we want a full
kernel compile to handle the adding of the __mcount_loc sections.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 75d81f157d2e..ecce4a4ccd5f 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -486,4 +486,9 @@ struct sysinfo {
 #define NUMA_BUILD 0
 #endif
 
+/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+#endif
+
 #endif
-- 
cgit v1.2.3


From 28614889bcb2558a47d02d52394b7fd9795a9547 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 22:47:18 -0400
Subject: ftrace: move notrace to compiler.h

The notrace define belongs in compiler.h so that it can be used in
init.h

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler.h | 2 ++
 include/linux/linkage.h  | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8322141ee480..98115d9d04da 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -44,6 +44,8 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # error Sorry, your compiler is too old/not recognized.
 #endif
 
+#define notrace __attribute__((no_instrument_function))
+
 /* Intel compiler defines __GNUC__. So we will overwrite implementations
  * coming from above header files here
  */
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 56ba37394656..9fd1f859021b 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/linkage.h>
 
-#define notrace __attribute__((no_instrument_function))
-
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
-- 
cgit v1.2.3


From fed1939c64d2288938fdc1c367d49082da65e195 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 22:47:19 -0400
Subject: ftrace: remove old pointers to mcount

When a mcount pointer is recorded into a table, it is used to add or
remove calls to mcount (replacing them with nops). If the code is removed
via removing a module, the pointers still exist.  At modifying the code
a check is always made to make sure the code being replaced is the code
expected. In-other-words, the code being replaced is compared to what
it is expected to be before being replaced.

There is a very small chance that the code being replaced just happens
to look like code that calls mcount (very small since the call to mcount
is relative). To remove this chance, this patch adds ftrace_release to
allow module unloading to remove the pointers to mcount within the module.

Another change for init calls is made to not trace calls marked with
__init. The tracing can not be started until after init is done anyway.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  2 ++
 include/linux/init.h   |  2 +-
 kernel/module.c        | 12 ++++++++----
 kernel/trace/ftrace.c  | 32 ++++++++++++++++++++++++++++++--
 4 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4936489f9ed8..6b232a2460c0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -165,10 +165,12 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_release(void *start, unsigned long size);
 #else
 static inline void ftrace_init(void) { }
 static inline void
 ftrace_init_module(unsigned long *start, unsigned long *end) { }
+static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/init.h b/include/linux/init.h
index 93538b696e3d..27f61f6b3cb9 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -40,7 +40,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold
+#define __init		__section(.init.text) __cold notrace
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
diff --git a/kernel/module.c b/kernel/module.c
index d753fd9d83ec..7576c2d9462f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1431,6 +1431,9 @@ static void free_module(struct module *mod)
 	/* Module unload stuff */
 	module_unload_free(mod);
 
+	/* release any pointers to mcount in this module */
+	ftrace_release(mod->module_core, mod->core_size);
+
 	/* This may be NULL, but that's OK */
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
@@ -1839,6 +1842,7 @@ static noinline struct module *load_module(void __user *umod,
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+	void *mseg;
 	struct exception_table_entry *extable;
 	mm_segment_t old_fs;
 
@@ -2190,10 +2194,9 @@ static noinline struct module *load_module(void __user *umod,
 #endif
 	}
 
-	if (mcountindex) {
-		void *mseg = (void *)sechdrs[mcountindex].sh_addr;
-		ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
-	}
+	/* sechdrs[0].sh_size is always zero */
+	mseg = (void *)sechdrs[mcountindex].sh_addr;
+	ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
 
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
@@ -2264,6 +2267,7 @@ static noinline struct module *load_module(void __user *umod,
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
+	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
 	module_free(mod, mod->module_init);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8affb6d00ec1..eadd0eaea9b6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -294,13 +294,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node)
 
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
-	/* no locking, only called from kstop_machine */
-
 	rec->ip = (unsigned long)ftrace_free_records;
 	ftrace_free_records = rec;
 	rec->flags |= FTRACE_FL_FREE;
 }
 
+void ftrace_release(void *start, unsigned long size)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long s = (unsigned long)start;
+	unsigned long e = s + size;
+	int i;
+
+	if (!start)
+		return;
+
+	/* No interrupt should call this */
+	spin_lock(&ftrace_lock);
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			if ((rec->ip >= s) && (rec->ip < e))
+				ftrace_free_rec(rec);
+		}
+	}
+	spin_unlock(&ftrace_lock);
+
+}
+
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
@@ -1527,7 +1551,9 @@ static int ftrace_convert_nops(unsigned long *start,
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
+		spin_lock(&ftrace_lock);
 		ftrace_record_ip(addr);
+		spin_unlock(&ftrace_lock);
 		ftrace_shutdown_replenish();
 	}
 
@@ -1541,6 +1567,8 @@ static int ftrace_convert_nops(unsigned long *start,
 
 void ftrace_init_module(unsigned long *start, unsigned long *end)
 {
+	if (start == end)
+		return;
 	ftrace_convert_nops(start, end);
 }
 
-- 
cgit v1.2.3


From dd0e545f061f90099a3dcc13aa77e29c6295cf23 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 1 Aug 2008 12:26:41 -0400
Subject: ftrace: printk formatting infrastructure

This patch adds a feature that can help kernel developers debug their
code using ftrace.

  int ftrace_printk(const char *fmt, ...);

This records into the ftrace buffer using printf formatting. The entry
size in the buffers is still a fixed length. A new type has been added
that allows for more entries to be used for a single recording.

The start of the print is still the same as the other entries.

It returns the number of characters written to the ftrace buffer.

For example:

Having a module with the following code:

static int __init ftrace_print_test(void)
{
        ftrace_printk("jiffies are %ld\n", jiffies);
        return 0;
}

Gives me:

  insmod-5441  3...1 7569us : ftrace_print_test: jiffies are 4296626666

for the latency_trace file and:

          insmod-5441  [03]  1959.370498: ftrace_print_test jiffies are 4296626666

for the trace file.

Note: Only the infrastructure should go into the kernel. It is to help
facilitate debugging for other kernel developers. Calls to ftrace_printk
are not intended to be left in the kernel, and should be frowned upon just
like scattering printks around in the code.

But having this easily at your fingertips helps the debugging go faster
and bugs be solved quicker.

Maybe later on, we can hook this with markers and have their printf format
be sucked into ftrace output.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h        |  10 ++
 kernel/trace/trace.c          | 273 +++++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace.h          |  11 ++
 kernel/trace/trace_selftest.c |  11 +-
 4 files changed, 272 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6b232a2460c0..f53b975e32fa 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -157,9 +157,18 @@ static inline void __ftrace_enabled_restore(int enabled)
 #ifdef CONFIG_TRACING
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x)
+extern int
+__ftrace_printk(unsigned long ip, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+static inline int
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
+{
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
@@ -173,4 +182,5 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 76dfe6d2466c..a917bea82715 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -197,12 +197,14 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
  *  NEED_RESCED - reschedule is requested
  *  HARDIRQ	- inside an interrupt handler
  *  SOFTIRQ	- inside a softirq handler
+ *  CONT	- multiple entries hold the trace item
  */
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
 	TRACE_FLAG_NEED_RESCHED		= 0x02,
 	TRACE_FLAG_HARDIRQ		= 0x04,
 	TRACE_FLAG_SOFTIRQ		= 0x08,
+	TRACE_FLAG_CONT			= 0x10,
 };
 
 /*
@@ -1074,6 +1076,7 @@ enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 };
 
+/* Return the current entry.  */
 static struct trace_entry *
 trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 		struct trace_iterator *iter, int cpu)
@@ -1104,8 +1107,58 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 	return &array[iter->next_page_idx[cpu]];
 }
 
+/* Increment the index counter of an iterator by one */
+static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
+{
+	iter->idx++;
+	iter->next_idx[cpu]++;
+	iter->next_page_idx[cpu]++;
+
+	if (iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE) {
+		struct trace_array_cpu *data = iter->tr->data[cpu];
+
+		iter->next_page_idx[cpu] = 0;
+		iter->next_page[cpu] =
+			trace_next_list(data, iter->next_page[cpu]);
+	}
+}
+
 static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+trace_entry_next(struct trace_array *tr, struct trace_array_cpu *data,
+		 struct trace_iterator *iter, int cpu)
+{
+	struct list_head *next_page;
+	struct trace_entry *ent;
+	int idx, next_idx, next_page_idx;
+
+	ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+
+	if (likely(!ent || ent->type != TRACE_CONT))
+		return ent;
+
+	/* save the iterator details */
+	idx		= iter->idx;
+	next_idx	= iter->next_idx[cpu];
+	next_page_idx	= iter->next_page_idx[cpu];
+	next_page	= iter->next_page[cpu];
+
+	/* find a real entry */
+	do {
+		trace_iterator_increment(iter, cpu);
+		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+	} while (ent && ent->type != TRACE_CONT);
+
+	/* reset the iterator */
+	iter->idx			= idx;
+	iter->next_idx[cpu]		= next_idx;
+	iter->next_page_idx[cpu]	= next_page_idx;
+	iter->next_page[cpu]		= next_page;
+
+	return ent;
+}
+
+static struct trace_entry *
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu, int inc)
 {
 	struct trace_array *tr = iter->tr;
 	struct trace_entry *ent, *next = NULL;
@@ -1115,7 +1168,23 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	for_each_tracing_cpu(cpu) {
 		if (!head_page(tr->data[cpu]))
 			continue;
+
 		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+
+		if (ent && ent->type == TRACE_CONT) {
+			struct trace_array_cpu *data = tr->data[cpu];
+
+			if (!inc)
+				ent = trace_entry_next(tr, data, iter, cpu);
+			else {
+				while (ent && ent->type == TRACE_CONT) {
+					trace_iterator_increment(iter, cpu);
+					ent = trace_entry_idx(tr, tr->data[cpu],
+							      iter, cpu);
+				}
+			}
+		}
+
 		/*
 		 * Pick the entry with the smallest timestamp:
 		 */
@@ -1131,25 +1200,39 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	return next;
 }
 
-static void trace_iterator_increment(struct trace_iterator *iter)
+/* Find the next real entry, without updating the iterator itself */
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 {
-	iter->idx++;
-	iter->next_idx[iter->cpu]++;
-	iter->next_page_idx[iter->cpu]++;
+	return __find_next_entry(iter, ent_cpu, 0);
+}
+
+/* Find the next real entry, and increment the iterator to the next entry */
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
 
-	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
-		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+	next = __find_next_entry(iter, &next_cpu, 1);
 
-		iter->next_page_idx[iter->cpu] = 0;
-		iter->next_page[iter->cpu] =
-			trace_next_list(data, iter->next_page[iter->cpu]);
-	}
+	iter->prev_ent = iter->ent;
+	iter->prev_cpu = iter->cpu;
+
+	iter->ent = next;
+	iter->cpu = next_cpu;
+
+	if (next)
+		trace_iterator_increment(iter, iter->cpu);
+
+	return next ? iter : NULL;
 }
 
 static void trace_consume(struct trace_iterator *iter)
 {
 	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+	struct trace_entry *ent;
 
+ again:
 	data->trace_tail_idx++;
 	if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
 		data->trace_tail = trace_next_page(data, data->trace_tail);
@@ -1160,25 +1243,11 @@ static void trace_consume(struct trace_iterator *iter)
 	if (data->trace_head == data->trace_tail &&
 	    data->trace_head_idx == data->trace_tail_idx)
 		data->trace_idx = 0;
-}
-
-static void *find_next_entry_inc(struct trace_iterator *iter)
-{
-	struct trace_entry *next;
-	int next_cpu = -1;
-
-	next = find_next_entry(iter, &next_cpu);
-
-	iter->prev_ent = iter->ent;
-	iter->prev_cpu = iter->cpu;
-
-	iter->ent = next;
-	iter->cpu = next_cpu;
-
-	if (next)
-		trace_iterator_increment(iter);
 
-	return next ? iter : NULL;
+	ent = trace_entry_idx(iter->tr, iter->tr->data[iter->cpu],
+			      iter, iter->cpu);
+	if (ent && ent->type == TRACE_CONT)
+		goto again;
 }
 
 static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1473,6 +1542,26 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
+static void
+trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_array_cpu *data = tr->data[iter->cpu];
+	struct trace_entry *ent;
+
+	ent = trace_entry_idx(tr, data, iter, iter->cpu);
+	if (!ent || ent->type != TRACE_CONT) {
+		trace_seq_putc(s, '\n');
+		return;
+	}
+
+	do {
+		trace_seq_printf(s, "%s", ent->cont.buf);
+		trace_iterator_increment(iter, iter->cpu);
+		ent = trace_entry_idx(tr, data, iter, iter->cpu);
+	} while (ent && ent->type == TRACE_CONT);
+}
+
 static int
 print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
@@ -1491,6 +1580,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 
 	if (!next_entry)
 		next_entry = entry;
+
+	if (entry->type == TRACE_CONT)
+		return 1;
+
 	rel_usecs = ns2usecs(next_entry->field.t - entry->field.t);
 	abs_usecs = ns2usecs(entry->field.t - iter->tr->time_start);
 
@@ -1550,6 +1643,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		}
 		trace_seq_puts(s, "\n");
 		break;
+	case TRACE_PRINT:
+		seq_print_ip_sym(s, field->print.ip, sym_flags);
+		trace_seq_printf(s, ": %s", field->print.buf);
+		if (field->flags && TRACE_FLAG_CONT)
+			trace_seq_print_cont(s, iter);
+		break;
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1571,6 +1670,10 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	int i;
 
 	entry = iter->ent;
+
+	if (entry->type == TRACE_CONT)
+		return 1;
+
 	field = &entry->field;
 
 	comm = trace_find_cmdline(iter->ent->field.pid);
@@ -1653,6 +1756,12 @@ static int print_trace_fmt(struct trace_iterator *iter)
 		if (!ret)
 			return 0;
 		break;
+	case TRACE_PRINT:
+		seq_print_ip_sym(s, field->print.ip, sym_flags);
+		trace_seq_printf(s, ": %s", field->print.buf);
+		if (field->flags && TRACE_FLAG_CONT)
+			trace_seq_print_cont(s, iter);
+		break;
 	}
 	return 1;
 }
@@ -1666,6 +1775,10 @@ static int print_raw_fmt(struct trace_iterator *iter)
 	int S, T;
 
 	entry = iter->ent;
+
+	if (entry->type == TRACE_CONT)
+		return 1;
+
 	field = &entry->field;
 
 	ret = trace_seq_printf(s, "%d %d %llu ",
@@ -1708,6 +1821,12 @@ static int print_raw_fmt(struct trace_iterator *iter)
 		if (!ret)
 			return 0;
 		break;
+	case TRACE_PRINT:
+		trace_seq_printf(s, "# %lx %s",
+				 field->print.ip, field->print.buf);
+		if (field->flags && TRACE_FLAG_CONT)
+			trace_seq_print_cont(s, iter);
+		break;
 	}
 	return 1;
 }
@@ -1733,6 +1852,10 @@ static int print_hex_fmt(struct trace_iterator *iter)
 	int S, T;
 
 	entry = iter->ent;
+
+	if (entry->type == TRACE_CONT)
+		return 1;
+
 	field = &entry->field;
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->pid);
@@ -1778,6 +1901,10 @@ static int print_bin_fmt(struct trace_iterator *iter)
 	struct trace_field *field;
 
 	entry = iter->ent;
+
+	if (entry->type == TRACE_CONT)
+		return 1;
+
 	field = &entry->field;
 
 	SEQ_PUT_FIELD_RET(s, field->pid);
@@ -2943,6 +3070,94 @@ static __init void tracer_init_debugfs(void)
 #endif
 }
 
+#define TRACE_BUF_SIZE 1024
+#define TRACE_PRINT_BUF_SIZE \
+	(sizeof(struct trace_field) - offsetof(struct trace_field, print.buf))
+#define TRACE_CONT_BUF_SIZE sizeof(struct trace_field)
+
+/**
+ * ftrace_printk - printf formatting in the ftrace buffer
+ * @fmt - the printf format for printing.
+ *
+ * Note: __ftrace_printk is an internal function for ftrace_printk and
+ *       the @ip is passed in via the ftrace_printk macro.
+ *
+ * This function allows a kernel developer to debug fast path sections
+ * that printk is not appropriate for. By scattering in various
+ * printk like tracing in the code, a developer can quickly see
+ * where problems are occurring.
+ *
+ * This is intended as a debugging tool for the developer only.
+ * Please reframe from leaving ftrace_printks scattered around in
+ * your code.
+ */
+int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+{
+	struct trace_array *tr = &global_trace;
+	static DEFINE_SPINLOCK(trace_buf_lock);
+	static char trace_buf[TRACE_BUF_SIZE];
+	struct trace_array_cpu *data;
+	struct trace_entry *entry;
+	unsigned long flags;
+	long disabled;
+	va_list ap;
+	int cpu, len = 0, write, written = 0;
+
+	if (likely(!ftrace_function_enabled))
+		return 0;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (unlikely(disabled != 1 || !ftrace_function_enabled))
+		goto out;
+
+	spin_lock(&trace_buf_lock);
+	va_start(ap, fmt);
+	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, ap);
+	va_end(ap);
+
+	len = min(len, TRACE_BUF_SIZE-1);
+	trace_buf[len] = 0;
+
+	__raw_spin_lock(&data->lock);
+	entry				= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type			= TRACE_PRINT;
+	entry->field.print.ip		= ip;
+
+	write = min(len, (int)(TRACE_PRINT_BUF_SIZE-1));
+
+	memcpy(&entry->field.print.buf, trace_buf, write);
+	entry->field.print.buf[write] = 0;
+	written = write;
+
+	if (written != len)
+		entry->field.flags |= TRACE_FLAG_CONT;
+
+	while (written != len) {
+		entry = tracing_get_trace_entry(tr, data);
+
+		entry->type = TRACE_CONT;
+		write = min(len - written, (int)(TRACE_CONT_BUF_SIZE-1));
+		memcpy(&entry->cont.buf, trace_buf+written, write);
+		entry->cont.buf[write] = 0;
+		written += write;
+	}
+	__raw_spin_unlock(&data->lock);
+
+	spin_unlock(&trace_buf_lock);
+
+ out:
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(__ftrace_printk);
+
 static int trace_alloc_page(void)
 {
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6ddd6a6556cf..50b6d7a6f01a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,7 +13,9 @@ enum trace_type {
 	TRACE_FN,
 	TRACE_CTX,
 	TRACE_WAKE,
+	TRACE_CONT,
 	TRACE_STACK,
+	TRACE_PRINT,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -60,6 +62,14 @@ struct stack_entry {
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 
+/*
+ * ftrace_printk entry:
+ */
+struct print_entry {
+	unsigned long		ip;
+	char			buf[];
+};
+
 /*
  * The trace field - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
@@ -77,6 +87,7 @@ struct trace_field {
 		struct ctx_switch_entry		ctx;
 		struct special_entry		special;
 		struct stack_entry		stack;
+		struct print_entry		print;
 		struct mmiotrace_rw		mmiorw;
 		struct mmiotrace_map		mmiomap;
 	};
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0911b7e073bf..630715bbd572 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,7 +9,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_FN:
 	case TRACE_CTX:
 	case TRACE_WAKE:
+	case TRACE_CONT:
 	case TRACE_STACK:
+	case TRACE_PRINT:
 	case TRACE_SPECIAL:
 		return 1;
 	}
@@ -120,11 +122,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 					   struct trace_array *tr,
 					   int (*func)(void))
 {
-	unsigned long count;
-	int ret;
 	int save_ftrace_enabled = ftrace_enabled;
 	int save_tracer_enabled = tracer_enabled;
+	unsigned long count;
 	char *func_name;
+	int ret;
 
 	/* The ftrace test PASSED */
 	printk(KERN_CONT "PASSED\n");
@@ -157,6 +159,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	/* enable tracing */
 	tr->ctrl = 1;
 	trace->init(tr);
+
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 
@@ -212,10 +215,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 int
 trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 {
-	unsigned long count;
-	int ret;
 	int save_ftrace_enabled = ftrace_enabled;
 	int save_tracer_enabled = tracer_enabled;
+	unsigned long count;
+	int ret;
 
 	/* make sure msleep has been recorded */
 	msleep(1);
-- 
cgit v1.2.3


From 2f2c99dba2398ef7d9c21f7c793180a50e68b1f0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 1 Aug 2008 16:45:49 -0400
Subject: ftrace: ftrace_printk doc moved

Based on Randy Dunlap's suggestion, the ftrace_printk kernel-doc belongs
with the ftrace_printk macro that should be used, not with the
__ftrace_printk internal function.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 19 ++++++++++++++++++-
 kernel/trace/trace.c   | 16 ----------------
 2 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f53b975e32fa..018af16bce5c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -157,7 +157,24 @@ static inline void __ftrace_enabled_restore(int enabled)
 #ifdef CONFIG_TRACING
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
-# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x)
+
+/**
+ * ftrace_printk - printf formatting in the ftrace buffer
+ * @fmt: the printf format for printing
+ *
+ * Note: __ftrace_printk is an internal function for ftrace_printk and
+ *       the @ip is passed in via the ftrace_printk macro.
+ *
+ * This function allows a kernel developer to debug fast path sections
+ * that printk is not appropriate for. By scattering in various
+ * printk like tracing in the code, a developer can quickly see
+ * where problems are occurring.
+ *
+ * This is intended as a debugging tool for the developer only.
+ * Please refrain from leaving ftrace_printks scattered around in
+ * your code.
+ */
+# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt)
 extern int
 __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a917bea82715..2597e7e49c35 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3075,22 +3075,6 @@ static __init void tracer_init_debugfs(void)
 	(sizeof(struct trace_field) - offsetof(struct trace_field, print.buf))
 #define TRACE_CONT_BUF_SIZE sizeof(struct trace_field)
 
-/**
- * ftrace_printk - printf formatting in the ftrace buffer
- * @fmt - the printf format for printing.
- *
- * Note: __ftrace_printk is an internal function for ftrace_printk and
- *       the @ip is passed in via the ftrace_printk macro.
- *
- * This function allows a kernel developer to debug fast path sections
- * that printk is not appropriate for. By scattering in various
- * printk like tracing in the code, a developer can quickly see
- * where problems are occurring.
- *
- * This is intended as a debugging tool for the developer only.
- * Please reframe from leaving ftrace_printks scattered around in
- * your code.
- */
 int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 {
 	struct trace_array *tr = &global_trace;
-- 
cgit v1.2.3


From 3f5a54e371ca20b119b73704f6c01b71295c1714 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 30 Jul 2008 22:36:46 -0400
Subject: ftrace: dump out ftrace buffers to console on panic

At OLS I had a lot of interest to be able to have the ftrace buffers
dumped on panic.  Usually one would expect to use kexec and examine
the buffers after a new kernel is loaded. But sometimes the resources
do not permit kdump and kexec, so having an option to still see the
sequence of events up to the crash is very advantageous.

This patch adds the option to have the ftrace buffers dumped to the
console in the latency_trace format on a panic. When the option is set,
the default entries per CPU buffer are lowered to 16384, since the writing
to the serial (if that is the console) may take an awful long time
otherwise.

[
 Changes since -v1:
  Got alpine to send correctly (as well as spell check working).
  Removed config option.
  Moved the static variables into ftrace_dump itself.
  Gave printk a log level.
]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |   2 +
 kernel/trace/trace.c   | 175 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 176 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 018af16bce5c..f7fb92045bf0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -178,6 +178,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 extern int
 __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
+extern void ftrace_dump(void);
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -186,6 +187,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
 {
 	return 0;
 }
+static inline void ftrace_dump(void) { }
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2597e7e49c35..97513c8ecd67 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -14,6 +14,7 @@
 #include <linux/utsrelease.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
+#include <linux/notifier.h>
 #include <linux/debugfs.h>
 #include <linux/pagemap.h>
 #include <linux/hardirq.h>
@@ -22,6 +23,7 @@
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/kdebug.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
@@ -103,8 +105,15 @@ int				ftrace_function_enabled;
  * trace_nr_entries is the number of entries that is allocated
  * for a buffer. Note, the number of entries is always rounded
  * to ENTRIES_PER_PAGE.
+ *
+ * This number is purposely set to a low number of 16384.
+ * If the dump on oops happens, it will be much appreciated
+ * to not have to wait for all that output. Anyway this can be
+ * boot time and run time configurable.
  */
-static unsigned long		trace_nr_entries = 65536UL;
+#define TRACE_ENTRIES_DEFAULT	16384UL
+
+static unsigned long		trace_nr_entries = TRACE_ENTRIES_DEFAULT;
 
 /* trace_types holds a link list of available tracers. */
 static struct tracer		*trace_types __read_mostly;
@@ -3142,6 +3151,165 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(__ftrace_printk);
 
+static int trace_panic_handler(struct notifier_block *this,
+			       unsigned long event, void *unused)
+{
+	ftrace_dump();
+	return NOTIFY_OK;
+}
+
+static struct notifier_block trace_panic_notifier = {
+	.notifier_call  = trace_panic_handler,
+	.next           = NULL,
+	.priority       = 150   /* priority: INT_MAX >= x >= 0 */
+};
+
+static int trace_die_handler(struct notifier_block *self,
+			     unsigned long val,
+			     void *data)
+{
+	switch (val) {
+	case DIE_OOPS:
+		ftrace_dump();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block trace_die_notifier = {
+	.notifier_call = trace_die_handler,
+	.priority = 200
+};
+
+/*
+ * printk is set to max of 1024, we really don't need it that big.
+ * Nothing should be printing 1000 characters anyway.
+ */
+#define TRACE_MAX_PRINT		1000
+
+/*
+ * Define here KERN_TRACE so that we have one place to modify
+ * it if we decide to change what log level the ftrace dump
+ * should be at.
+ */
+#define KERN_TRACE		KERN_INFO
+
+static void
+trace_printk_seq(struct trace_seq *s)
+{
+	/* Probably should print a warning here. */
+	if (s->len >= 1000)
+		s->len = 1000;
+
+	/* should be zero ended, but we are paranoid. */
+	s->buffer[s->len] = 0;
+
+	printk(KERN_TRACE "%s", s->buffer);
+
+	trace_seq_reset(s);
+}
+
+
+void ftrace_dump(void)
+{
+	static DEFINE_SPINLOCK(ftrace_dump_lock);
+	/* use static because iter can be a bit big for the stack */
+	static struct trace_iterator iter;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	static int dump_ran;
+	unsigned long flags;
+	int cnt = 0;
+	int cpu;
+
+	/* only one dump */
+	spin_lock_irqsave(&ftrace_dump_lock, flags);
+	if (dump_ran)
+		goto out;
+
+	dump_ran = 1;
+
+	/* No turning back! */
+	ftrace_kill_atomic();
+
+	printk(KERN_TRACE "Dumping ftrace buffer:\n");
+
+	iter.tr = &global_trace;
+	iter.trace = current_trace;
+
+	/*
+	 * We need to stop all tracing on all CPUS to read the
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all what we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+
+	for_each_tracing_cpu(cpu) {
+		data = iter.tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		cpu_set(cpu, mask);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter.tr->data[cpu];
+		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter.last_overrun[cpu])
+			iter.overrun[cpu] +=
+				data->overrun - iter.last_overrun[cpu];
+		iter.last_overrun[cpu] = data->overrun;
+	}
+
+	while (!trace_empty(&iter)) {
+
+		if (!cnt)
+			printk(KERN_TRACE "---------------------------------\n");
+
+		cnt++;
+
+		/* reset all but tr, trace, and overruns */
+		memset(&iter.seq, 0,
+		       sizeof(struct trace_iterator) -
+		       offsetof(struct trace_iterator, seq));
+		iter.iter_flags |= TRACE_FILE_LAT_FMT;
+		iter.pos = -1;
+
+		if (find_next_entry_inc(&iter) != NULL) {
+			print_trace_line(&iter);
+			trace_consume(&iter);
+		}
+
+		trace_printk_seq(&iter.seq);
+	}
+
+	if (!cnt)
+		printk(KERN_TRACE "   (ftrace buffer empty)\n");
+	else
+		printk(KERN_TRACE "---------------------------------\n");
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter.tr->data[cpu];
+		__raw_spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter.tr->data[cpu];
+		atomic_dec(&data->disabled);
+	}
+
+
+ out:
+	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+}
+
 static int trace_alloc_page(void)
 {
 	struct trace_array_cpu *data;
@@ -3338,6 +3506,11 @@ __init static int tracer_alloc_buffers(void)
 	global_trace.ctrl = tracer_enabled;
 	tracing_disabled = 0;
 
+	atomic_notifier_chain_register(&panic_notifier_list,
+				       &trace_panic_notifier);
+
+	register_die_notifier(&trace_die_notifier);
+
 	return 0;
 
  free_buffers:
-- 
cgit v1.2.3


From 7b928c23fa3e9fa37d1d4ba52ba963f41ee5aae0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 Aug 2008 17:48:02 +0200
Subject: ftrace: build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

 In file included from init/main.c:65:
 include/linux/ftrace.h:166: error: expected ‘,' or ‘;' before ‘{' token
 make[1]: *** [init/main.o] Error 1
 make: *** [init/main.o] Error 2

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7fb92045bf0..ce929cb55435 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -183,7 +183,10 @@ extern void ftrace_dump(void);
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
+
+static inline int
+ftrace_printk(const char *fmt, ...)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From c5131ad6c3cbe8f6674993e29a76cecf8deb4384 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 Aug 2008 18:22:09 +0200
Subject: ftrace: ftrace_kill_atomic() build fix

fix:

 kernel/built-in.o: In function `ftrace_dump':
 (.text+0x2e2ea): undefined reference to `ftrace_kill_atomic'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ce929cb55435..36c439927ff1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -36,6 +36,7 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define register_ftrace_function(ops) do { } while (0)
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
+static inline void ftrace_kill_atomic(void) { }
 #endif /* CONFIG_FTRACE */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-- 
cgit v1.2.3


From 3700273586ee6a58b95dd07d9f8a02db4a9b476f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 18 Aug 2008 16:24:56 +0800
Subject: ftrace: fix incorrect comment style of __ftrace_enabled_save()

This patch fixes incorrect comment style of __ftrace_enabled_save().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 36c439927ff1..8b4cf38c80d2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -99,9 +99,11 @@ static inline void tracer_disable(void)
 #endif
 }
 
-/* Ftrace disable/restore without lock. Some synchronization mechanism
+/*
+ * Ftrace disable/restore without lock. Some synchronization mechanism
  * must be used to prevent ftrace_enabled to be changed between
- * disable/restore. */
+ * disable/restore.
+ */
 static inline int __ftrace_enabled_save(void)
 {
 #ifdef CONFIG_FTRACE
-- 
cgit v1.2.3


From c0719e5a4b1ccc04180b7a7b71095c9fb7131919 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 6 Sep 2008 01:06:03 -0400
Subject: ftrace: use ftrace_release for all dynamic ftrace functions

ftrace_release is necessary for all uses of dynamic ftrace and not just
the archs that have CONFIG_FTRACE_MCOUNT_RECORD defined.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8b4cf38c80d2..5de9903645d5 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -77,8 +77,10 @@ extern void mcount_call(void);
 
 extern int skip_trace(unsigned long ip);
 
-void ftrace_disable_daemon(void);
-void ftrace_enable_daemon(void);
+extern void ftrace_release(void *start, unsigned long size);
+
+extern void ftrace_disable_daemon(void);
+extern void ftrace_enable_daemon(void);
 
 #else
 # define skip_trace(ip)				({ 0; })
@@ -86,6 +88,7 @@ void ftrace_enable_daemon(void);
 # define ftrace_set_filter(buf, len, reset)	do { } while (0)
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
+static inline void ftrace_release(void *start, unsigned long size) { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
@@ -199,12 +202,10 @@ static inline void ftrace_dump(void) { }
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 extern void ftrace_init_module(unsigned long *start, unsigned long *end);
-extern void ftrace_release(void *start, unsigned long size);
 #else
 static inline void ftrace_init(void) { }
 static inline void
 ftrace_init_module(unsigned long *start, unsigned long *end) { }
-static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
 
-- 
cgit v1.2.3


From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 16 Sep 2008 22:00:34 +0300
Subject: x86 mmiotrace: implement mmiotrace_printk()

Offer mmiotrace users a function to inject markers from inside the kernel.
This depends on the trace_vprintk() patch.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/mmio-mod.c         | 19 ++++++++++++++++++-
 arch/x86/mm/testmmiotrace.c    |  4 ++++
 include/linux/mmiotrace.h      | 17 +++++++++++++++--
 kernel/trace/trace_mmiotrace.c |  5 +++++
 4 files changed, 42 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 754bd1eaf4f6..5e2e2e72ee80 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,7 +75,7 @@ static LIST_HEAD(trace_list);		/* struct remap_trace */
  *   and trace_lock.
  * - Routines depending on is_enabled() must take trace_lock.
  * - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
  * - pre/post callbacks assume the effect of is_enabled() being true.
  */
 
@@ -379,6 +379,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
 		iounmap_trace_core(addr);
 }
 
+int mmiotrace_printk(const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+	unsigned long flags;
+	va_start(args, fmt);
+
+	spin_lock_irqsave(&trace_lock, flags);
+	if (is_enabled())
+		ret = mmio_trace_printk(fmt, args);
+	spin_unlock_irqrestore(&trace_lock, flags);
+
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
 static void clear_trace_list(void)
 {
 	struct remap_trace *trace;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423ef..ab50a8d7402c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
  */
 #include <linux/module.h>
 #include <linux/io.h>
+#include <linux/mmiotrace.h>
 
 #define MODULE_NAME "testmmiotrace"
 
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
 static void do_write_test(void __iomem *p)
 {
 	unsigned int i;
+	mmiotrace_printk("Write test.\n");
 	for (i = 0; i < 256; i++)
 		iowrite8(i, p + i);
 	for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
 static void do_read_test(void __iomem *p)
 {
 	unsigned int i;
+	mmiotrace_printk("Read test.\n");
 	for (i = 0; i < 256; i++)
 		ioread8(p + i);
 	for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
 		pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
 		return;
 	}
+	mmiotrace_printk("ioremap returned %p.\n", p);
 	do_write_test(p);
 	do_read_test(p);
 	iounmap(p);
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 61d19e1b7a0b..60cc3bf5c538 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -34,11 +34,15 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p);
 /* Called from page fault handler. */
 extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
 
-/* Called from ioremap.c */
 #ifdef CONFIG_MMIOTRACE
+/* Called from ioremap.c */
 extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
 							void __iomem *addr);
 extern void mmiotrace_iounmap(volatile void __iomem *addr);
+
+/* For anyone to insert markers. Remember trailing newline. */
+extern int mmiotrace_printk(const char *fmt, ...)
+				__attribute__ ((format (printf, 1, 2)));
 #else
 static inline void mmiotrace_ioremap(resource_size_t offset,
 					unsigned long size, void __iomem *addr)
@@ -48,7 +52,15 @@ static inline void mmiotrace_ioremap(resource_size_t offset,
 static inline void mmiotrace_iounmap(volatile void __iomem *addr)
 {
 }
-#endif /* CONFIG_MMIOTRACE_HOOKS */
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+				__attribute__ ((format (printf, 1, 0)));
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+{
+	return 0;
+}
+#endif /* CONFIG_MMIOTRACE */
 
 enum mm_io_opcode {
 	MMIO_READ = 0x1,     /* struct mmiotrace_rw */
@@ -81,5 +93,6 @@ extern void enable_mmiotrace(void);
 extern void disable_mmiotrace(void);
 extern void mmio_trace_rw(struct mmiotrace_rw *rw);
 extern void mmio_trace_mapping(struct mmiotrace_map *map);
+extern int mmio_trace_printk(const char *fmt, va_list args);
 
 #endif /* MMIOTRACE_H */
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index ef02747b26d9..767d1faf56e5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -335,3 +335,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 	__trace_mmiotrace_map(tr, data, map);
 	preempt_enable();
 }
+
+int mmio_trace_printk(const char *fmt, va_list args)
+{
+	return trace_vprintk(0, fmt, args);
+}
-- 
cgit v1.2.3


From 4427414170a63331a9cc36b9598502c5cdfe453b Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 16 Sep 2008 22:03:56 +0300
Subject: mmiotrace: remove left-over marker cruft

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/mmio-mod.c    | 64 -----------------------------------------------
 include/linux/mmiotrace.h |  3 +--
 2 files changed, 1 insertion(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 5e2e2e72ee80..2c4baa88f2cb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
 static DEFINE_PER_CPU(struct trap_reason, pf_reason);
 static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
 
-#if 0 /* XXX: no way gather this info anymore */
-/* Access to this is not per-cpu. */
-static DEFINE_PER_CPU(atomic_t, dropped);
-#endif
-
-static struct dentry *marker_file;
-
 static DEFINE_MUTEX(mmiotrace_mutex);
 static DEFINE_SPINLOCK(trace_lock);
 static atomic_t mmiotrace_enabled;
@@ -97,44 +90,6 @@ static bool is_enabled(void)
 	return atomic_read(&mmiotrace_enabled);
 }
 
-#if 0 /* XXX: needs rewrite */
-/*
- * Write callback for the debugfs entry:
- * Read a marker and write it to the mmio trace log
- */
-static ssize_t write_marker(struct file *file, const char __user *buffer,
-						size_t count, loff_t *ppos)
-{
-	char *event = NULL;
-	struct mm_io_header *headp;
-	ssize_t len = (count > 65535) ? 65535 : count;
-
-	event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	headp = (struct mm_io_header *)event;
-	headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
-	headp->data_len = len;
-
-	if (copy_from_user(event + sizeof(*headp), buffer, len)) {
-		kfree(event);
-		return -EFAULT;
-	}
-
-	spin_lock_irq(&trace_lock);
-#if 0 /* XXX: convert this to use tracing */
-	if (is_enabled())
-		relay_write(chan, event, sizeof(*headp) + len);
-	else
-#endif
-		len = -EINVAL;
-	spin_unlock_irq(&trace_lock);
-	kfree(event);
-	return len;
-}
-#endif
-
 static void print_pte(unsigned long address)
 {
 	unsigned int level;
@@ -481,26 +436,12 @@ static void leave_uniprocessor(void)
 }
 #endif
 
-#if 0 /* XXX: out of order */
-static struct file_operations fops_marker = {
-	.owner =	THIS_MODULE,
-	.write =	write_marker
-};
-#endif
-
 void enable_mmiotrace(void)
 {
 	mutex_lock(&mmiotrace_mutex);
 	if (is_enabled())
 		goto out;
 
-#if 0 /* XXX: tracing does not support text entries */
-	marker_file = debugfs_create_file("marker", 0660, dir, NULL,
-								&fops_marker);
-	if (!marker_file)
-		pr_err(NAME "marker file creation failed.\n");
-#endif
-
 	if (nommiotrace)
 		pr_info(NAME "MMIO tracing disabled.\n");
 	enter_uniprocessor();
@@ -525,11 +466,6 @@ void disable_mmiotrace(void)
 
 	clear_trace_list(); /* guarantees: no more kmmio callbacks */
 	leave_uniprocessor();
-	if (marker_file) {
-		debugfs_remove(marker_file);
-		marker_file = NULL;
-	}
-
 	pr_info(NAME "disabled.\n");
 out:
 	mutex_unlock(&mmiotrace_mutex);
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 60cc3bf5c538..139d7c88d9c9 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -67,8 +67,7 @@ enum mm_io_opcode {
 	MMIO_WRITE = 0x2,    /* struct mmiotrace_rw */
 	MMIO_PROBE = 0x3,    /* struct mmiotrace_map */
 	MMIO_UNPROBE = 0x4,  /* struct mmiotrace_map */
-	MMIO_MARKER = 0x5,   /* raw char data */
-	MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
+	MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */
 };
 
 struct mmiotrace_rw {
-- 
cgit v1.2.3


From e98d0eabef2748d88fa58760d104e8e68517406b Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 29 Sep 2008 11:05:13 -0400
Subject: markers: marker_synchronize_unregister()

Create marker_synchronize_unregister() which must be called before the end of
exit() to make sure every probe caller has exited the non-preemptible section
and thus is not executing the probe code anymore.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 1290653f9241..889196c7fbb1 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -160,4 +160,11 @@ extern int marker_probe_unregister_private_data(marker_probe_func *probe,
 extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 	int num);
 
+/*
+ * marker_synchronize_unregister must be called between the last marker probe
+ * unregistration and the end of module exit to make sure there is no caller
+ * executing a probe when it is freed.
+ */
+#define marker_synchronize_unregister() synchronize_sched()
+
 #endif
-- 
cgit v1.2.3


From 53c8c8fdfd2d2d515bdcb3d0f2a11d1f3f42ece1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 3 Oct 2008 11:52:54 -0400
Subject: markers: turn marker_synchronize_unregister() into an inline

Turn marker synchronize unregister into a static inline. There is no
reason to keep it as a macro over a static inline.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 889196c7fbb1..38e32e781ed7 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -13,6 +13,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/rcupdate.h>
 
 struct module;
 struct marker;
@@ -165,6 +166,9 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
  * unregistration and the end of module exit to make sure there is no caller
  * executing a probe when it is freed.
  */
-#define marker_synchronize_unregister() synchronize_sched()
+static inline void marker_synchronize_unregister(void)
+{
+	synchronize_sched();
+}
 
 #endif
-- 
cgit v1.2.3


From d13744cd6e3fef373a3fe656ac349b4e7c49ff79 Mon Sep 17 00:00:00 2001
From: Frédéric Weisbecker <fweisbec@gmail.com>
Date: Tue, 23 Sep 2008 11:32:08 +0100
Subject: tracing/ftrace: add the boot tracer

Add the boot/initcall tracer.

Its primary purpose is to be able to trace the initcalls.

It is intended to be used with scripts/bootgraph.pl after some small
improvements.

Note that it is not active after its init. To avoid tracing (and so
crashing) before the whole tracing engine init, you have to explicitly
call start_boot_trace() after do_pre_smp_initcalls() to enable it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h    |  19 +++++++++
 kernel/trace/trace.h      |   4 ++
 kernel/trace/trace_boot.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 kernel/trace/trace_boot.c

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5de9903645d5..91954eb6460f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -5,6 +5,8 @@
 
 #include <linux/linkage.h>
 #include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
 
 extern int ftrace_enabled;
 extern int
@@ -209,4 +211,21 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 
+struct boot_trace {
+	pid_t			caller;
+	initcall_t		func;
+	int			result;
+	unsigned long long	duration;
+};
+
+#ifdef CONFIG_BOOT_TRACER
+extern void trace_boot(struct boot_trace *it);
+extern void start_boot_trace(void);
+#else
+static inline void trace_boot(struct boot_trace *it) { }
+static inline void start_boot_trace(void) { }
+#endif
+
+
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb2c3fb7dd54..b28bf8812efc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/clocksource.h>
 #include <linux/mmiotrace.h>
+#include <linux/ftrace.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -19,6 +20,7 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
+	TRACE_BOOT,
 
 	__TRACE_LAST_TYPE
 };
@@ -30,6 +32,7 @@ struct ftrace_entry {
 	unsigned long		ip;
 	unsigned long		parent_ip;
 };
+extern struct tracer boot_tracer;
 
 /*
  * Context switch trace entry - which task (and prio) we switched from/to:
@@ -108,6 +111,7 @@ struct trace_field {
 		struct print_entry		print;
 		struct mmiotrace_rw		mmiorw;
 		struct mmiotrace_map		mmiomap;
+		struct boot_trace		initcall;
 	};
 };
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..c65ef8ffd6b4
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,101 @@
+/*
+ * ring buffer based initcalls tracer
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array *boot_trace;
+static int trace_boot_enabled;
+
+
+/* Should be started after do_pre_smp_initcalls() in init/main.c */
+void start_boot_trace(void)
+{
+	trace_boot_enabled = 1;
+}
+
+void stop_boot_trace(struct trace_array *tr)
+{
+	trace_boot_enabled = 0;
+}
+
+static void boot_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	boot_trace = tr;
+
+	trace_boot_enabled = 0;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr->data[cpu]);
+}
+
+static void boot_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_boot_trace();
+	else
+		stop_boot_trace(tr);
+}
+
+static int initcall_print_line(struct trace_iterator *iter)
+{
+	int ret = 1;
+	struct trace_entry *entry = iter->ent;
+	struct boot_trace *it = &entry->field.initcall;
+	struct trace_seq *s = &iter->seq;
+
+	if (iter->ent->type == TRACE_BOOT)
+		ret = trace_seq_printf(s, "%pF called from %i "
+				       "returned %d after %lld msecs\n",
+				       it->func, it->caller, it->result,
+				       it->duration);
+	if (ret)
+		return 1;
+	return 0;
+}
+
+struct tracer boot_tracer __read_mostly =
+{
+	.name		= "initcall",
+	.init		= boot_trace_init,
+	.reset		= stop_boot_trace,
+	.ctrl_update	= boot_trace_ctrl_update,
+	.print_line	= initcall_print_line,
+};
+
+
+void trace_boot(struct boot_trace *it)
+{
+	struct trace_entry *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = boot_trace;
+
+	if (!trace_boot_enabled)
+		return;
+
+	preempt_disable();
+	data = tr->data[smp_processor_id()];
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type = TRACE_BOOT;
+	entry->field.initcall = *it;
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+	trace_wake_up();
+
+	preempt_enable();
+}
-- 
cgit v1.2.3


From 7a8e76a3829f1067b70f715771ff88baf2fbf3c3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 29 Sep 2008 23:02:38 -0400
Subject: tracing: unified trace buffer

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

  struct ring_buffer_event {
	u32 type:2, len:3, time_delta:27;
	u32 array[];
  };

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

 RINGBUF_TYPE_PADDING: this type is used to note extra space at the end
	of a buffer page.

 RINGBUF_TYPE_TIME_EXTEND: This type is used when the time between events
	is greater than the 27 bit delta can hold. We add another
	32 bits, and record that in its own event (8 byte size).

 RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
	help keep the buffer timestamps in sync.

 RINGBUF_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

	type = RINGBUF_TYPE_DATA
	len = 2
	time_delta: <time-stamp> - <prev_event-time-stamp>
	array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

	type = RINGBUF_TYPE_DATA
	len = 0
	time_delta: <time-stamp> - <prev_event-time-stamp>
	array[0]: 84 (Note the alignment)
	array[1..21]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
	This is the size of the memory used to record this
	event, and not the size of the data pay load.

ring_buffer_event_time_delta(event): get the time delta of the event
	This returns the delta time stamp since the last event.
	Note: Even though this is in the header, there should
		be no reason to access this directly, except
		for debugging.

ring_buffer_event_data(event): get the data from the event
	This is the function to use to get the actual data
	from the event. Note, it is only a pointer to the
	data inside the buffer. This data must be copied to
	another location otherwise you risk it being written
	over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
	overwrite or consumer/producer mode. Overwrite will
	overwrite old data, whereas consumer/producer will
	throw away new data if the consumer catches up with the
	producer.  The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
	buffer. Note, it is up to the caller to provide that
	the buffer is not being used while this is happening.
	This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
	entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
	the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
	consume it. That is, this function increments the head
	pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
	For now, this disables the cpu buffer, until you issue
	a finish. This is just because we do not want the iterator
	to be overwritten. This restriction may change in the future.
	But note, this is used for static reading of a buffer which
	is usually done "after" a trace. Live readings would want
	to use the ring_buffer_consume above, which will not
	disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
	the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
	of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
	of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
	Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
	cpu buffer of another buffer. This is handy when you
	want to take a snap shot of a running trace on just one
	cpu. Having a backup buffer, to swap with facilitates this.
	Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enable_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
	into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure.  But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |  130 ++++
 kernel/trace/Kconfig        |    4 +
 kernel/trace/Makefile       |    1 +
 kernel/trace/ring_buffer.c  | 1672 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1807 insertions(+)
 create mode 100644 include/linux/ring_buffer.h
 create mode 100644 kernel/trace/ring_buffer.c

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
new file mode 100644
index 000000000000..c52375b8330d
--- /dev/null
+++ b/include/linux/ring_buffer.h
@@ -0,0 +1,130 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use functions below.
+ */
+struct ring_buffer_event {
+	u32		type:2, len:3, time_delta:27;
+	u32		array[];
+};
+
+/**
+ * enum ring_buffer_type - internal ring buffer types
+ *
+ * @RINGBUF_TYPE_PADDING:	Left over page padding
+ *				 array is ignored
+ *				 size is variable depending on how much
+ *				  padding is needed
+ *
+ * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
+ *				 array[0] = time delta (28 .. 59)
+ *				 size = 8 bytes
+ *
+ * @RINGBUF_TYPE_TIME_STAMP:	Sync time stamp with external clock
+ *				 array[0] = tv_nsec
+ *				 array[1] = tv_sec
+ *				 size = 16 bytes
+ *
+ * @RINGBUF_TYPE_DATA:		Data record
+ *				 If len is zero:
+ *				  array[0] holds the actual length
+ *				  array[1..(length+3)/4-1] holds data
+ *				 else
+ *				  length = len << 2
+ *				  array[0..(length+3)/4] holds data
+ */
+enum ring_buffer_type {
+	RINGBUF_TYPE_PADDING,
+	RINGBUF_TYPE_TIME_EXTEND,
+	/* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
+	RINGBUF_TYPE_TIME_STAMP,
+	RINGBUF_TYPE_DATA,
+};
+
+unsigned ring_buffer_event_length(struct ring_buffer_event *event);
+void *ring_buffer_event_data(struct ring_buffer_event *event);
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+	return event->time_delta;
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+			 unsigned long length,
+			 unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event,
+			      unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+		      unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+			 struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+	RB_FL_OVERWRITE		= 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4feb3c81f94d..396aea11398e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,9 +18,13 @@ config HAVE_FTRACE_MCOUNT_RECORD
 config TRACER_MAX_TRACE
 	bool
 
+config RING_BUFFER
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
+	select RING_BUFFER
 	select STACKTRACE
 	select TRACEPOINTS
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 35a07f7cfa86..a85dfba88ba0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
 endif
 
 obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 000000000000..830a2930dd91
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,1672 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>	/* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/*
+ * FIXME!!!
+ * Return the raw time stamp used for events. Currently this is just
+ * sched_clock(); @cpu is not used.
+ */
+u64 ring_buffer_time_stamp(int cpu)
+{
+	u64 now = sched_clock();
+
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	return now << DEBUG_SHIFT;
+}
+
+/* Undo the DEBUG_SHIFT scaling applied by ring_buffer_time_stamp(). */
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+	u64 raw = *ts;
+
+	/* Just stupid testing the normalize function and deltas */
+	*ts = raw >> DEBUG_SHIFT;
+}
+
+/* Size in bytes of the fixed event header */
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+/* event->len counts units of RB_ALIGNMENT bytes */
+#define RB_ALIGNMENT_SHIFT	2
+#define RB_ALIGNMENT		(1 << RB_ALIGNMENT_SHIFT)
+/* Largest data size (bytes) encoded in event->len; bigger sizes go in array[0] */
+#define RB_MAX_SMALL_DATA	28
+
+/* Total length in bytes of the fixed-size internal events */
+enum {
+	RB_LEN_TIME_EXTEND = 8,
+	RB_LEN_TIME_STAMP = 16,
+};
+
+/*
+ * Total length in bytes of @event, header included.
+ * The result is undefined (-1 converted to unsigned) for padding.
+ * inline for ring buffer fast paths
+ */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+	unsigned payload;
+
+	switch (event->type) {
+	case RINGBUF_TYPE_DATA:
+		/* small sizes live in ->len (alignment units), large in array[0] */
+		payload = event->len ?
+			event->len << RB_ALIGNMENT_SHIFT : event->array[0];
+		return payload + RB_EVNT_HDR_SIZE;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		return RB_LEN_TIME_STAMP;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		return RB_LEN_TIME_EXTEND;
+
+	case RINGBUF_TYPE_PADDING:
+		/* undefined */
+		return -1;
+
+	default:
+		break;
+	}
+
+	BUG();
+	/* not hit */
+	return 0;
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ *
+ * Non-inline wrapper around rb_event_length(). For data events the
+ * returned length includes the event header.
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+	return rb_event_length(event);
+}
+
+/*
+ * Return a pointer to the payload of a data event.
+ * inline for ring buffer fast paths
+ */
+static inline void *
+rb_event_data(struct ring_buffer_event *event)
+{
+	BUG_ON(event->type != RINGBUF_TYPE_DATA);
+
+	/*
+	 * A non-zero len field means the payload starts at array[0];
+	 * otherwise array[0] holds the length and array[1] the payload.
+	 */
+	return (void *)&event->array[event->len ? 0 : 1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ *
+ * Non-inline wrapper around rb_event_data(); @event must be a data
+ * event (anything else trips the BUG_ON in rb_event_data()).
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+	return rb_event_data(event);
+}
+
+/* Iterate @cpu over every CPU set in @buffer's cpumask */
+#define for_each_buffer_cpu(buffer, cpu)		\
+	for_each_cpu_mask(cpu, buffer->cpumask)
+
+/* Deltas that need more than TS_SHIFT bits get a TIME_EXTEND event */
+#define TS_SHIFT	27
+#define TS_MASK		((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST	(~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ *
+ * The anonymous struct shares storage with struct page, which is why a
+ * struct page pointer is cast to struct buffer_page (rb_allocate_pages()).
+ * NOTE(review): flags and _count presumably mirror the leading fields of
+ * struct page and must stay untouched - confirm against struct page.
+ */
+struct buffer_page {
+	union {
+		struct {
+			unsigned long	 flags;		/* mandatory */
+			atomic_t	 _count;	/* mandatory */
+			u64		 time_stamp;	/* page time stamp */
+			unsigned	 size;		/* size of page data */
+			struct list_head list;		/* list of free pages */
+		};
+		struct page page;
+	};
+};
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ * Returns 1 if @delta has any bit set above TS_MASK, 0 otherwise.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+	return !!(delta & TS_DELTA_TEST);
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * Per-CPU part of a ring buffer.
+ *
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+	int				cpu;		/* CPU this buffer belongs to */
+	struct ring_buffer		*buffer;	/* owning ring buffer */
+	spinlock_t			lock;		/* held from reserve to commit */
+	struct lock_class_key		lock_key;
+	struct list_head		pages;		/* list of buffer_page */
+	unsigned long			head;	/* read from head */
+	unsigned long			tail;	/* write to tail */
+	struct buffer_page		*head_page;
+	struct buffer_page		*tail_page;
+	unsigned long			overrun;	/* data events lost to overwrite */
+	unsigned long			entries;	/* data events currently stored */
+	u64				write_stamp;	/* time stamp of last write */
+	u64				read_stamp;	/* time stamp at the read head */
+	atomic_t			record_disabled; /* non-zero: writes refused */
+};
+
+/* Top-level ring buffer: one ring_buffer_per_cpu per CPU in cpumask. */
+struct ring_buffer {
+	unsigned long			size;
+	unsigned			pages;		/* pages per CPU buffer */
+	unsigned			flags;		/* RB_FL_* flags */
+	int				cpus;		/* nr_cpu_ids at allocation */
+	cpumask_t			cpumask;	/* CPUs that have a buffer */
+	atomic_t			record_disabled; /* non-zero: writes refused */
+
+	struct mutex			mutex;		/* taken by ring_buffer_resize() */
+
+	struct ring_buffer_per_cpu	**buffers;	/* indexed by CPU number */
+};
+
+/* Read position of an iterator over one CPU buffer. */
+struct ring_buffer_iter {
+	struct ring_buffer_per_cpu	*cpu_buffer;	/* buffer being iterated */
+	unsigned long			head;		/* offset within head_page */
+	struct buffer_page		*head_page;	/* page currently being read */
+	u64				read_stamp;	/* time stamp at the iterator head */
+};
+
+/*
+ * If @cond is true: disable recording on @buffer, emit a warning and
+ * return -1 from the *calling* function. Because of the embedded
+ * return this may only be used in functions that return int.
+ *
+ * Wrapped in do { } while (0) so that it behaves as a single statement
+ * (safe in an unbraced if/else).
+ */
+#define RB_WARN_ON(buffer, cond)				\
+	do {							\
+		if (unlikely(cond)) {				\
+			atomic_inc(&(buffer)->record_disabled);	\
+			WARN_ON(1);				\
+			return -1;				\
+		}						\
+	} while (0)
+
+/**
+ * rb_check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safety measure, walk the page list and verify that the
+ * next/prev links of every node agree with each other. On the first
+ * inconsistency recording is disabled and -1 is returned; otherwise 0.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct list_head *head = &cpu_buffer->pages;
+	struct list_head *node;
+
+	RB_WARN_ON(cpu_buffer, head->next->prev != head);
+	RB_WARN_ON(cpu_buffer, head->prev->next != head);
+
+	list_for_each(node, head) {
+		RB_WARN_ON(cpu_buffer, node->next->prev != node);
+		RB_WARN_ON(cpu_buffer, node->prev->next != node);
+	}
+
+	return 0;
+}
+
+/* Size of the data stored on the current head (read) page. */
+static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return cpu_buffer->head_page->size;
+}
+
+/*
+ * Allocate @nr_pages pages and splice them onto cpu_buffer->pages.
+ * All or nothing: if an allocation fails, every page obtained so far
+ * is released again. Returns 0 on success, -ENOMEM otherwise.
+ */
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+			     unsigned nr_pages)
+{
+	struct buffer_page *bpage, *next;
+	unsigned long addr;
+	unsigned remaining;
+	LIST_HEAD(fresh);
+
+	for (remaining = nr_pages; remaining; remaining--) {
+		addr = __get_free_page(GFP_KERNEL);
+		if (!addr) {
+			/* undo the partial allocation */
+			list_for_each_entry_safe(bpage, next, &fresh, list) {
+				list_del_init(&bpage->list);
+				__free_page(&bpage->page);
+			}
+			return -ENOMEM;
+		}
+		bpage = (struct buffer_page *)virt_to_page(addr);
+		list_add(&bpage->list, &fresh);
+	}
+
+	list_splice(&fresh, &cpu_buffer->pages);
+
+	rb_check_pages(cpu_buffer);
+
+	return 0;
+}
+
+/*
+ * Allocate and initialize the per-CPU buffer of @buffer for @cpu,
+ * including its buffer->pages data pages. Returns NULL on failure.
+ */
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct buffer_page *first;
+
+	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+				  GFP_KERNEL, cpu_to_node(cpu));
+	if (!cpu_buffer)
+		return NULL;
+
+	cpu_buffer->cpu = cpu;
+	cpu_buffer->buffer = buffer;
+	spin_lock_init(&cpu_buffer->lock);
+	INIT_LIST_HEAD(&cpu_buffer->pages);
+
+	if (rb_allocate_pages(cpu_buffer, buffer->pages) < 0) {
+		kfree(cpu_buffer);
+		return NULL;
+	}
+
+	/* reader and writer both start on the first page */
+	first = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+	cpu_buffer->head_page = first;
+	cpu_buffer->tail_page = first;
+
+	return cpu_buffer;
+}
+
+/* Release every data page of @cpu_buffer, then the descriptor itself. */
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct list_head *pages = &cpu_buffer->pages;
+	struct buffer_page *bpage;
+
+	while (!list_empty(pages)) {
+		bpage = list_entry(pages->next, struct buffer_page, list);
+		list_del_init(&bpage->list);
+		__free_page(&bpage->page);
+	}
+	kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ *
+ * @size is rounded up to whole pages and to a minimum of two pages
+ * per CPU. Returns the new buffer, or NULL if an allocation failed.
+ */
+struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+	struct ring_buffer *buffer;
+	int bsize;
+	int cpu;
+
+	/* keep it in its own cache line */
+	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+			 GFP_KERNEL);
+	if (!buffer)
+		return NULL;
+
+	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	buffer->flags = flags;
+
+	/*
+	 * need at least two pages; this also covers size == 0, which
+	 * would otherwise leave the per-CPU page lists empty
+	 */
+	if (buffer->pages < 2)
+		buffer->pages = 2;
+
+	buffer->cpumask = cpu_possible_map;
+	buffer->cpus = nr_cpu_ids;
+
+	bsize = sizeof(void *) * nr_cpu_ids;
+	buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+				  GFP_KERNEL);
+	if (!buffer->buffers)
+		goto fail_free_buffer;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		buffer->buffers[cpu] =
+			rb_allocate_cpu_buffer(buffer, cpu);
+		if (!buffer->buffers[cpu])
+			goto fail_free_buffers;
+	}
+
+	mutex_init(&buffer->mutex);
+
+	return buffer;
+
+ fail_free_buffers:
+	/* the array was zeroed, so unallocated slots are NULL */
+	for_each_buffer_cpu(buffer, cpu) {
+		if (buffer->buffers[cpu])
+			rb_free_cpu_buffer(buffer->buffers[cpu]);
+	}
+	kfree(buffer->buffers);
+
+ fail_free_buffer:
+	kfree(buffer);
+	return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ *
+ * Releases every per-CPU buffer, the per-CPU pointer array and the
+ * ring buffer descriptor itself.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+	int cpu;
+
+	for_each_buffer_cpu(buffer, cpu)
+		rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+	/* the pointer array is a separate allocation (ring_buffer_alloc) */
+	kfree(buffer->buffers);
+	kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+/*
+ * Free @nr_pages pages from the front of @cpu_buffer's page list.
+ * Recording is disabled for the duration and the buffer is reset via
+ * rb_reset_cpu() afterwards. At least one page must remain.
+ */
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+	struct list_head *pages = &cpu_buffer->pages;
+	struct buffer_page *victim;
+	unsigned left;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	for (left = nr_pages; left > 0; left--) {
+		BUG_ON(list_empty(pages));
+		victim = list_entry(pages->next, struct buffer_page, list);
+		list_del_init(&victim->list);
+		__free_page(&victim->page);
+	}
+	BUG_ON(list_empty(pages));
+
+	rb_reset_cpu(cpu_buffer);
+
+	rb_check_pages(cpu_buffer);
+
+	atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/*
+ * Move @nr_pages pages from the front of @pages to the end of
+ * @cpu_buffer's page list. Recording is disabled for the duration and
+ * the buffer is reset via rb_reset_cpu() afterwards.
+ */
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct buffer_page *bpage;
+	unsigned left;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	for (left = nr_pages; left > 0; left--) {
+		BUG_ON(list_empty(pages));
+		bpage = list_entry(pages->next, struct buffer_page, list);
+		list_del_init(&bpage->list);
+		list_add_tail(&bpage->list, &cpu_buffer->pages);
+	}
+
+	rb_reset_cpu(cpu_buffer);
+
+	rb_check_pages(cpu_buffer);
+
+	atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ *  RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns the new size in bytes (rounded up to whole pages) on
+ * success, or -ENOMEM if the additional pages could not be allocated,
+ * in which case the buffer is left unchanged.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned nr_pages, rm_pages, new_pages;
+	struct buffer_page *page, *tmp;
+	unsigned long buffer_size;
+	unsigned long addr;
+	LIST_HEAD(pages);
+	unsigned i;
+	int cpu;
+
+	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	size *= BUF_PAGE_SIZE;
+	buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+	/* we need a minimum of two pages */
+	if (size < BUF_PAGE_SIZE * 2)
+		size = BUF_PAGE_SIZE * 2;
+
+	if (size == buffer_size)
+		return size;
+
+	mutex_lock(&buffer->mutex);
+
+	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+	if (size < buffer_size) {
+
+		/* easy case, just free pages */
+		BUG_ON(nr_pages >= buffer->pages);
+
+		rm_pages = buffer->pages - nr_pages;
+
+		for_each_buffer_cpu(buffer, cpu) {
+			cpu_buffer = buffer->buffers[cpu];
+			rb_remove_pages(cpu_buffer, rm_pages);
+		}
+		goto out;
+	}
+
+	/*
+	 * This is a bit more difficult. We only want to add pages
+	 * when we can allocate enough for all CPUs. We do this
+	 * by allocating all the pages and storing them on a local
+	 * link list. If we succeed in our allocation, then we
+	 * add these pages to the cpu_buffers. Otherwise we just free
+	 * them all and return -ENOMEM;
+	 */
+	BUG_ON(nr_pages <= buffer->pages);
+	new_pages = nr_pages - buffer->pages;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		for (i = 0; i < new_pages; i++) {
+			addr = __get_free_page(GFP_KERNEL);
+			if (!addr)
+				goto free_pages;
+			page = (struct buffer_page *)virt_to_page(addr);
+			list_add(&page->list, &pages);
+		}
+	}
+
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		rb_insert_pages(cpu_buffer, &pages, new_pages);
+	}
+
+	BUG_ON(!list_empty(&pages));
+
+ out:
+	buffer->pages = nr_pages;
+	mutex_unlock(&buffer->mutex);
+
+	return size;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, list) {
+		list_del_init(&page->list);
+		__free_page(&page->page);
+	}
+	/* the mutex was taken above; do not leave it held on failure */
+	mutex_unlock(&buffer->mutex);
+	return -ENOMEM;
+}
+
+/* Non-zero when the reader has caught up with the writer. */
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (cpu_buffer->head_page != cpu_buffer->tail_page)
+		return 0;
+
+	return cpu_buffer->head == cpu_buffer->tail;
+}
+
+/* Non-zero if @event is of type RINGBUF_TYPE_PADDING. */
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+	return event->type == RINGBUF_TYPE_PADDING;
+}
+
+/* Address of byte offset @index within the data of @page. */
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+	return page_address(&page->page) + index;
+}
+
+/* Event at the current read position of @cpu_buffer. */
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *page = cpu_buffer->head_page;
+	unsigned long offset = cpu_buffer->head;
+
+	return rb_page_index(page, offset);
+}
+
+/* Event at the current read position of @iter. */
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+	struct buffer_page *page = iter->head_page;
+	unsigned long offset = iter->head;
+
+	return rb_page_index(page, offset);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. Before that happens, walk the head page and
+ * move every data event from the entries count to the overrun count.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_event *event;
+	unsigned long offset = 0;
+
+	while (offset < rb_head_size(cpu_buffer)) {
+		event = rb_page_index(cpu_buffer->head_page, offset);
+		BUG_ON(rb_null_event(event));
+
+		/* Only count data entries */
+		if (event->type == RINGBUF_TYPE_DATA) {
+			cpu_buffer->overrun++;
+			cpu_buffer->entries--;
+		}
+
+		offset += rb_event_length(event);
+	}
+}
+
+/* Advance *@page to the next page of @cpu_buffer, wrapping around. */
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+			       struct buffer_page **page)
+{
+	struct list_head *next = (*page)->list.next;
+
+	/* the list head itself is not a page; step over it */
+	*page = list_entry(next == &cpu_buffer->pages ? next->next : next,
+			   struct buffer_page, list);
+}
+
+/* Record *@ts as both the tail page's time stamp and the write stamp. */
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+	u64 stamp = *ts;
+
+	cpu_buffer->write_stamp = stamp;
+	cpu_buffer->tail_page->time_stamp = stamp;
+}
+
+/* Restart reading at the beginning of the current head page. */
+static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct buffer_page *page = cpu_buffer->head_page;
+
+	cpu_buffer->head = 0;
+	cpu_buffer->read_stamp = page->time_stamp;
+}
+
+/* Restart the iterator at the beginning of its current head page. */
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+	struct buffer_page *page = iter->head_page;
+
+	iter->head = 0;
+	iter->read_stamp = page->time_stamp;
+}
+
+/**
+ * rb_update_event - update event type and data
+ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and length fields of the event. The length
+ * is the actual size that is written to the ring buffer
+ * (header included); from it we determine whether the data size
+ * fits into the len field or has to be stored in array[0].
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+			 unsigned type, unsigned length)
+{
+	event->type = type;
+
+	switch (type) {
+
+	case RINGBUF_TYPE_DATA:
+		length -= RB_EVNT_HDR_SIZE;
+		if (length <= RB_MAX_SMALL_DATA) {
+			/* small: size in alignment units goes in len */
+			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+			break;
+		}
+		/* large: len == 0 flags that array[0] holds the size */
+		event->len = 0;
+		event->array[0] = length;
+		break;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
+		break;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
+		break;
+
+	case RINGBUF_TYPE_PADDING:
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Convert a data length into the total space the event occupies:
+ * data, optional array[0] length word, header, rounded up to
+ * RB_ALIGNMENT.
+ */
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+	struct ring_buffer_event event; /* Used only for sizeof array */
+	unsigned total;
+
+	/* zero length can cause confusions */
+	total = length ? length : 1;
+
+	/* large data needs array[0] to hold its length */
+	if (total > RB_MAX_SMALL_DATA)
+		total += sizeof(event.array[0]);
+
+	total += RB_EVNT_HDR_SIZE;
+
+	return ALIGN(total, RB_ALIGNMENT);
+}
+
+/*
+ * Reserve @length bytes for an event of @type at the tail of
+ * @cpu_buffer and initialize its header. If the event does not fit on
+ * the current tail page, the tail moves to the next page and *@ts
+ * becomes that page's time stamp. cpu_buffer->tail is not advanced
+ * past the new event here; the callers do that when committing.
+ *
+ * Returns the event, or NULL when the next page is the head page and
+ * the buffer is not in overwrite mode.
+ */
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+		  unsigned type, unsigned long length, u64 *ts)
+{
+	struct buffer_page *head_page, *tail_page;
+	unsigned long tail;
+	struct ring_buffer *buffer = cpu_buffer->buffer;
+	struct ring_buffer_event *event;
+
+	tail_page = cpu_buffer->tail_page;
+	head_page = cpu_buffer->head_page;
+	tail = cpu_buffer->tail;
+
+	/* events never span pages: move on if this one does not fit */
+	if (tail + length > BUF_PAGE_SIZE) {
+		struct buffer_page *next_page = tail_page;
+
+		rb_inc_page(cpu_buffer, &next_page);
+
+		/* the writer has caught up with the reader */
+		if (next_page == head_page) {
+			if (!(buffer->flags & RB_FL_OVERWRITE))
+				return NULL;
+
+			/* count overflows */
+			rb_update_overflow(cpu_buffer);
+
+			/* drop the oldest page by pushing the reader forward */
+			rb_inc_page(cpu_buffer, &head_page);
+			cpu_buffer->head_page = head_page;
+			rb_reset_read_page(cpu_buffer);
+		}
+
+		/* mark the unused rest of the old page, if there is any */
+		if (tail != BUF_PAGE_SIZE) {
+			event = rb_page_index(tail_page, tail);
+			/* page padding */
+			event->type = RINGBUF_TYPE_PADDING;
+		}
+
+		tail_page->size = tail;
+		tail_page = next_page;
+		tail_page->size = 0;
+		tail = 0;
+		cpu_buffer->tail_page = tail_page;
+		cpu_buffer->tail = tail;
+		rb_add_stamp(cpu_buffer, ts);
+	}
+
+	BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+	event = rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
+
+	return event;
+}
+
+/*
+ * Insert a TIME_EXTEND event because *@delta does not fit into the
+ * time_delta field of a normal event.
+ *
+ * If the extend event lands on the current page it is committed
+ * immediately, the write stamp becomes *@ts and *@delta is cleared.
+ * If reserving it moved the tail to a new page (tail == 0), nothing is
+ * committed: the new page already carries *@ts as its time stamp.
+ *
+ * Returns 0 on success, -1 if no space could be reserved.
+ */
+static int
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		  u64 *ts, u64 *delta)
+{
+	struct ring_buffer_event *event;
+	static int once;
+
+	/* u64 is not unsigned long long everywhere; cast for %llu */
+	if (unlikely(*delta > (1ULL << 59) && !once++)) {
+		printk(KERN_WARNING "Delta way too big! %llu"
+		       " ts=%llu write stamp = %llu\n",
+		       (unsigned long long)*delta,
+		       (unsigned long long)*ts,
+		       (unsigned long long)cpu_buffer->write_stamp);
+		WARN_ON(1);
+	}
+
+	/*
+	 * The delta is too big, we need to add a
+	 * new timestamp.
+	 */
+	event = __rb_reserve_next(cpu_buffer,
+				  RINGBUF_TYPE_TIME_EXTEND,
+				  RB_LEN_TIME_EXTEND,
+				  ts);
+	if (!event)
+		return -1;
+
+	/* check to see if we went to the next page */
+	if (cpu_buffer->tail) {
+		/* Still on same page, update timestamp */
+		event->time_delta = *delta & TS_MASK;
+		event->array[0] = *delta >> TS_SHIFT;
+		/* commit the time event */
+		cpu_buffer->tail +=
+			rb_event_length(event);
+		cpu_buffer->write_stamp = *ts;
+		*delta = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Reserve an event of @type and @length (total event length) on
+ * @cpu_buffer and fill in its time_delta relative to the write stamp.
+ * A TIME_EXTEND event is inserted first when the delta is too large.
+ * Returns the reserved event, or NULL if no space was available.
+ */
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+		      unsigned type, unsigned long length)
+{
+	struct ring_buffer_event *event;
+	u64 ts, delta;
+
+	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+	if (cpu_buffer->tail) {
+		/* not the first event on the page: store a delta */
+		delta = ts - cpu_buffer->write_stamp;
+
+		if (test_time_stamp(delta)) {
+			int ret;
+
+			ret = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+			if (ret < 0)
+				return NULL;
+		}
+	} else {
+		/* first event on the page: the page stamp carries ts */
+		rb_add_stamp(cpu_buffer, &ts);
+		delta = 0;
+	}
+
+	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+	if (!event)
+		return NULL;
+
+	/* If the reserve went to the next page, our delta is zero */
+	if (!cpu_buffer->tail)
+		delta = 0;
+
+	event->time_delta = delta;
+
+	return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reserved event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * On success interrupts are disabled and the per-CPU buffer lock is
+ * held until ring_buffer_unlock_commit() is called.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ * NULL is returned when recording is disabled, the current CPU has no
+ * buffer, the event would not fit on a page, or the buffer is full.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+			 unsigned long length,
+			 unsigned long *flags)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	int cpu;
+
+	if (atomic_read(&buffer->record_disabled))
+		return NULL;
+
+	raw_local_irq_save(*flags);
+	cpu = raw_smp_processor_id();
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		goto out_irq;
+
+	cpu_buffer = buffer->buffers[cpu];
+	spin_lock(&cpu_buffer->lock);
+
+	if (atomic_read(&cpu_buffer->record_disabled))
+		goto no_record;
+
+	length = rb_calculate_event_length(length);
+	/* must unwind here: the lock is held and interrupts are off */
+	if (length > BUF_PAGE_SIZE)
+		goto no_record;
+
+	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	if (!event)
+		goto no_record;
+
+	return event;
+
+ no_record:
+	spin_unlock(&cpu_buffer->lock);
+ out_irq:
+	/* pairs with raw_local_irq_save() above */
+	raw_local_irq_restore(*flags);
+	return NULL;
+}
+
+/*
+ * Make a reserved @event visible: advance the tail past it, record the
+ * new page size, account the entry and add its delta to the write stamp.
+ */
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	unsigned long new_tail = cpu_buffer->tail + rb_event_length(event);
+
+	cpu_buffer->tail = new_tail;
+	cpu_buffer->tail_page->size = new_tail;
+	cpu_buffer->entries++;
+	cpu_buffer->write_stamp += event->time_delta;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ * The per-CPU buffer is looked up from the current CPU, which is the
+ * CPU the reserve ran on since interrupts have been off since then.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ *
+ * Always returns 0.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event,
+			      unsigned long flags)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu = raw_smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	assert_spin_locked(&cpu_buffer->lock);
+
+	rb_commit(cpu_buffer, event);
+
+	spin_unlock(&cpu_buffer->lock);
+	raw_local_irq_restore(flags);
+
+	return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ *
+ * Returns 0 on success, or -EBUSY if nothing was written: recording is
+ * disabled, the current CPU has no buffer, the event would not fit on
+ * a page, or no space could be reserved.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+			unsigned long length,
+			void *data)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	unsigned long event_length, flags;
+	void *body;
+	int ret = -EBUSY;
+	int cpu;
+
+	if (atomic_read(&buffer->record_disabled))
+		return -EBUSY;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		goto out_irq;
+
+	cpu_buffer = buffer->buffers[cpu];
+	spin_lock(&cpu_buffer->lock);
+
+	if (atomic_read(&cpu_buffer->record_disabled))
+		goto out;
+
+	event_length = rb_calculate_event_length(length);
+	/*
+	 * An event can never span pages. Reject it here, as
+	 * ring_buffer_lock_reserve() does, rather than tripping the
+	 * BUG_ON() in __rb_reserve_next() with the lock held.
+	 */
+	if (event_length > BUF_PAGE_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer,
+				      RINGBUF_TYPE_DATA, event_length);
+	if (!event)
+		goto out;
+
+	body = rb_event_data(event);
+
+	memcpy(body, data, length);
+
+	rb_commit(cpu_buffer, event);
+
+	ret = 0;
+ out:
+	spin_unlock(&cpu_buffer->lock);
+ out_irq:
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This disables interrupts and then takes the lock of every per CPU
+ * buffer, in the iteration order of for_each_buffer_cpu().
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	local_irq_save(*flags);
+
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		spin_lock(&cpu_buffer->lock);
+	}
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ *
+ * Releases the per CPU buffer locks from the highest CPU number down
+ * to 0, then restores the interrupt flags.
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+		/* CPUs outside the mask have no buffer and were never locked */
+		if (!cpu_isset(cpu, buffer->cpumask))
+			continue;
+		cpu_buffer = buffer->buffers[cpu];
+		spin_unlock(&cpu_buffer->lock);
+	}
+
+	local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ * Disables nest: each call increments a counter that
+ * ring_buffer_record_enable() decrements.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+	atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+	atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer of @cpu. Any attempt to
+ * write to it after this will fail and return NULL. Nothing happens
+ * if @cpu is not part of the buffer's cpumask.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		atomic_inc(&buffer->buffers[cpu]->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ * Nothing happens if @cpu is not part of the buffer's cpumask.
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+	if (cpu_isset(cpu, buffer->cpumask))
+		atomic_dec(&buffer->buffers[cpu]->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ *
+ * Returns 0 if @cpu is not part of the buffer's cpumask.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+	return cpu_isset(cpu, buffer->cpumask) ?
+		buffer->buffers[cpu]->entries : 0;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ *
+ * Returns 0 if @cpu is not part of the buffer's cpumask.
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+	return cpu_isset(cpu, buffer->cpumask) ?
+		buffer->buffers[cpu]->overrun : 0;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (the sum over all CPU buffers). The counters are read without
+ * taking any lock.
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	/* if you care about this being correct, lock the buffer */
+	for_each_buffer_cpu(buffer, cpu)
+		total += buffer->buffers[cpu]->entries;
+
+	return total;
+}
+
+/**
+ * ring_buffer_overruns - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (the sum over all CPU buffers). The counters are read without
+ * taking any lock.
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long overruns = 0;
+	int cpu;
+
+	/* if you care about this being correct, lock the buffer */
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		overruns += cpu_buffer->overrun;
+	}
+
+	return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again, i.e. from the current read position of the CPU buffer.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	iter->head_page = cpu_buffer->head_page;
+	iter->head = cpu_buffer->head;
+
+	/*
+	 * Part way into the head page the page time stamp is stale for
+	 * our position; the buffer's read stamp is the one that matches
+	 * cpu_buffer->head. At offset 0 the page stamp applies.
+	 */
+	if (iter->head)
+		iter->read_stamp = cpu_buffer->read_stamp;
+	else
+		iter->read_stamp = iter->head_page->time_stamp;
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ *
+ * Returns non-zero when the iterator has reached the write position
+ * (tail page and tail offset) of its CPU buffer.
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	if (iter->head_page != cpu_buffer->tail_page)
+		return 0;
+
+	return iter->head == cpu_buffer->tail;
+}
+
+/* Advance cpu_buffer->read_stamp by the time carried in @event. */
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		     struct ring_buffer_event *event)
+{
+	u64 delta;
+
+	switch (event->type) {
+	case RINGBUF_TYPE_DATA:
+		cpu_buffer->read_stamp += event->time_delta;
+		break;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* upper bits are in array[0], the low TS_SHIFT in time_delta */
+		delta = ((u64)event->array[0] << TS_SHIFT) + event->time_delta;
+		cpu_buffer->read_stamp += delta;
+		break;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+	case RINGBUF_TYPE_PADDING:
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/* Advance iter->read_stamp by the time carried in @event. */
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+			  struct ring_buffer_event *event)
+{
+	u64 delta;
+
+	switch (event->type) {
+	case RINGBUF_TYPE_DATA:
+		iter->read_stamp += event->time_delta;
+		break;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* upper bits are in array[0], the low TS_SHIFT in time_delta */
+		delta = ((u64)event->array[0] << TS_SHIFT) + event->time_delta;
+		iter->read_stamp += delta;
+		break;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+	case RINGBUF_TYPE_PADDING:
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Consume the event at the read head of @cpu_buffer: update the entry
+ * count and read stamp, then move the head past the event. If the head
+ * is already at the end of the head page's data, only step to the next
+ * page. Must not be called when the buffer is empty.
+ */
+static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_event *event;
+	unsigned length;
+
+	/*
+	 * Check if we are at the end of the buffer.
+	 */
+	if (cpu_buffer->head >= cpu_buffer->head_page->size) {
+		BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+		rb_reset_read_page(cpu_buffer);
+		return;
+	}
+
+	event = rb_head_event(cpu_buffer);
+
+	/* only data events are counted in entries */
+	if (event->type == RINGBUF_TYPE_DATA)
+		cpu_buffer->entries--;
+
+	length = rb_event_length(event);
+
+	/*
+	 * This should not be called to advance the header if we are
+	 * at the tail of the buffer.
+	 */
+	BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+	       (cpu_buffer->head + length > cpu_buffer->tail));
+
+	rb_update_read_stamp(cpu_buffer, event);
+
+	cpu_buffer->head += length;
+
+	/* check for end of page */
+	if ((cpu_buffer->head >= cpu_buffer->head_page->size) &&
+	    (cpu_buffer->head_page != cpu_buffer->tail_page))
+		rb_advance_head(cpu_buffer);
+}
+
+/*
+ * Move @iter past the event at its current position, updating the
+ * iterator's read stamp. If the iterator is already at the end of its
+ * page's data, only step to the next page. The buffer itself is not
+ * modified. Must not be called when the iterator is at the tail.
+ */
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	unsigned length;
+
+	cpu_buffer = iter->cpu_buffer;
+
+	/*
+	 * Check if we are at the end of the buffer.
+	 */
+	if (iter->head >= iter->head_page->size) {
+		BUG_ON(iter->head_page == cpu_buffer->tail_page);
+		rb_inc_page(cpu_buffer, &iter->head_page);
+		rb_reset_iter_read_page(iter);
+		return;
+	}
+
+	event = rb_iter_head_event(iter);
+
+	length = rb_event_length(event);
+
+	/*
+	 * This should not be called to advance the header if we are
+	 * at the tail of the buffer.
+	 */
+	BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+	       (iter->head + length > cpu_buffer->tail));
+
+	rb_update_iter_read_stamp(iter, event);
+
+	iter->head += length;
+
+	/* check for end of page padding */
+	if ((iter->head >= iter->head_page->size) &&
+	    (iter->head_page != cpu_buffer->tail_page))
+		rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peek at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return NULL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+ again:
+	if (rb_per_cpu_empty(cpu_buffer))
+		return NULL;
+
+	event = rb_head_event(cpu_buffer);
+
+	switch (event->type) {
+	case RINGBUF_TYPE_PADDING:
+		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+		rb_reset_read_page(cpu_buffer);
+		goto again;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* Internal data, OK to advance */
+		rb_advance_head(cpu_buffer);
+		goto again;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+		rb_advance_head(cpu_buffer);
+		goto again;
+
+	case RINGBUF_TYPE_DATA:
+		if (ts) {
+			*ts = cpu_buffer->read_stamp + event->time_delta;
+			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+		}
+		return event;
+
+	default:
+		BUG();
+	}
+
+	return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer *buffer;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+
+	if (ring_buffer_iter_empty(iter))
+		return NULL;
+
+	cpu_buffer = iter->cpu_buffer;
+	buffer = cpu_buffer->buffer;
+
+ again:
+	if (rb_per_cpu_empty(cpu_buffer))
+		return NULL;
+
+	event = rb_iter_head_event(iter);
+
+	switch (event->type) {
+	case RINGBUF_TYPE_PADDING:
+		rb_inc_page(cpu_buffer, &iter->head_page);
+		rb_reset_iter_read_page(iter);
+		goto again;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		/* Internal data, OK to advance */
+		rb_advance_iter(iter);
+		goto again;
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		/* FIXME: not implemented */
+		rb_advance_iter(iter);
+		goto again;
+
+	case RINGBUF_TYPE_DATA:
+		if (ts) {
+			*ts = iter->read_stamp + event->time_delta;
+			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+		}
+		return event;
+
+	default:
+		BUG();
+	}
+
+	return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return NULL;
+
+	event = ring_buffer_peek(buffer, cpu, ts);
+	if (!event)
+		return NULL;
+
+	cpu_buffer = buffer->buffers[cpu];
+	rb_advance_head(cpu_buffer);
+
+	return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_iter *iter;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return NULL;
+
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	iter->cpu_buffer = cpu_buffer;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+	synchronize_sched();
+
+	spin_lock(&cpu_buffer->lock);
+	iter->head = cpu_buffer->head;
+	iter->head_page = cpu_buffer->head_page;
+	rb_reset_iter_read_page(iter);
+	spin_unlock(&cpu_buffer->lock);
+
+	return iter;
+}
+
+/**
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_read_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	atomic_dec(&cpu_buffer->record_disabled);
+	kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer_event *event;
+
+	event = ring_buffer_iter_peek(iter, ts);
+	if (!event)
+		return NULL;
+
+	rb_advance_iter(iter);
+
+	return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+	return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	cpu_buffer->head_page
+		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+	cpu_buffer->tail_page
+		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+	cpu_buffer->head = cpu_buffer->tail = 0;
+	cpu_buffer->overrun = 0;
+	cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	unsigned long flags;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return;
+
+	raw_local_irq_save(flags);
+	spin_lock(&cpu_buffer->lock);
+
+	rb_reset_cpu(cpu_buffer);
+
+	spin_unlock(&cpu_buffer->lock);
+	raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+	unsigned long flags;
+	int cpu;
+
+	ring_buffer_lock(buffer, &flags);
+
+	for_each_buffer_cpu(buffer, cpu)
+		rb_reset_cpu(buffer->buffers[cpu]);
+
+	ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * ring_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* yes this is racy, but if you don't like the race, lock the buffer */
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		if (!rb_per_cpu_empty(cpu_buffer))
+			return 0;
+	}
+	return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+
+	if (!cpu_isset(cpu, buffer->cpumask))
+		return 1;
+
+	cpu_buffer = buffer->buffers[cpu];
+	return rb_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and have another backup buffer lying around.
+ * It is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+			 struct ring_buffer *buffer_b, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer_a;
+	struct ring_buffer_per_cpu *cpu_buffer_b;
+
+	if (!cpu_isset(cpu, buffer_a->cpumask) ||
+	    !cpu_isset(cpu, buffer_b->cpumask))
+		return -EINVAL;
+
+	/* At least make sure the two buffers are somewhat the same */
+	if (buffer_a->size != buffer_b->size ||
+	    buffer_a->pages != buffer_b->pages)
+		return -EINVAL;
+
+	cpu_buffer_a = buffer_a->buffers[cpu];
+	cpu_buffer_b = buffer_b->buffers[cpu];
+
+	/*
+	 * We can't do a synchronize_sched here because this
+	 * function can be called in atomic context.
+	 * Normally this will be called from the same CPU as cpu.
+	 * If not it's up to the caller to protect this.
+	 */
+	atomic_inc(&cpu_buffer_a->record_disabled);
+	atomic_inc(&cpu_buffer_b->record_disabled);
+
+	buffer_a->buffers[cpu] = cpu_buffer_b;
+	buffer_b->buffers[cpu] = cpu_buffer_a;
+
+	cpu_buffer_b->buffer = buffer_a;
+	cpu_buffer_a->buffer = buffer_b;
+
+	atomic_dec(&cpu_buffer_a->record_disabled);
+	atomic_dec(&cpu_buffer_b->record_disabled);
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From d769041f865330034131525ee6a7f72eb4af2a24 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 1 Oct 2008 00:29:53 -0400
Subject: ring_buffer: implement new locking

The old "lock always" scheme had issues with lockdep, and was not very
efficient anyways.

This patch introduces a new design that is partially lockless on writes.
Writes will add new entries to the per cpu pages by simply disabling
interrupts. When a write needs to go to another page, then it will
grab the lock.

A new "read page" has been added so that the reader can pull out a page
from the ring buffer to read without worrying about the writer writing over
it. This allows us to not take the lock for all reads. The lock is
now only taken when a read needs to go to a new page.

This is far from lockless, and interrupts still need to be disabled,
but it is a step towards a more lockless solution, and it also
solves a lot of the issues that were noticed by the first conversion
of ftrace to the ring buffers.

Note: the ring_buffer_{un}lock API has been removed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |   3 -
 kernel/trace/ring_buffer.c  | 298 +++++++++++++++++++++++++-------------------
 kernel/trace/trace.c        | 113 +++++++++++------
 3 files changed, 247 insertions(+), 167 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c52375b8330d..536b0ca46a03 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -63,9 +63,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }
 
-void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
-void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
-
 /*
  * size is in bytes for each per CPU buffer.
  */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8e7392fd0db9..9631abf2ae29 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -161,8 +161,10 @@ struct ring_buffer_per_cpu {
 	struct list_head		pages;
 	unsigned long			head;	/* read from head */
 	unsigned long			tail;	/* write to tail */
+	unsigned long			reader;
 	struct buffer_page		*head_page;
 	struct buffer_page		*tail_page;
+	struct buffer_page		*reader_page;
 	unsigned long			overrun;
 	unsigned long			entries;
 	u64				write_stamp;
@@ -260,6 +262,7 @@ static struct ring_buffer_per_cpu *
 rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long addr;
 	int ret;
 
 	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -272,9 +275,16 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	spin_lock_init(&cpu_buffer->lock);
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
+	addr = __get_free_page(GFP_KERNEL);
+	if (!addr)
+		goto fail_free_buffer;
+	cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
+	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+	cpu_buffer->reader_page->size = 0;
+
 	ret = rb_allocate_pages(cpu_buffer, buffer->pages);
 	if (ret < 0)
-		goto fail_free_buffer;
+		goto fail_free_reader;
 
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
@@ -283,6 +293,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
 	return cpu_buffer;
 
+ fail_free_reader:
+	free_buffer_page(cpu_buffer->reader_page);
+
  fail_free_buffer:
 	kfree(cpu_buffer);
 	return NULL;
@@ -293,6 +306,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *page, *tmp;
 
+	list_del_init(&cpu_buffer->reader_page->list);
+	free_buffer_page(cpu_buffer->reader_page);
+
 	list_for_each_entry_safe(page, tmp, head, list) {
 		list_del_init(&page->list);
 		free_buffer_page(page);
@@ -538,8 +554,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 
 static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	return cpu_buffer->head_page == cpu_buffer->tail_page &&
-		cpu_buffer->head == cpu_buffer->tail;
+	return (cpu_buffer->reader == cpu_buffer->reader_page->size &&
+		(cpu_buffer->tail_page == cpu_buffer->reader_page ||
+		 (cpu_buffer->tail_page == cpu_buffer->head_page &&
+		  cpu_buffer->head == cpu_buffer->tail)));
 }
 
 static inline int rb_null_event(struct ring_buffer_event *event)
@@ -555,10 +573,10 @@ static inline void *rb_page_index(struct buffer_page *page, unsigned index)
 }
 
 static inline struct ring_buffer_event *
-rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	return rb_page_index(cpu_buffer->head_page,
-			     cpu_buffer->head);
+	return rb_page_index(cpu_buffer->reader_page,
+			     cpu_buffer->reader);
 }
 
 static inline struct ring_buffer_event *
@@ -610,15 +628,32 @@ rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 	cpu_buffer->write_stamp = *ts;
 }
 
-static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp;
 	cpu_buffer->head = 0;
 }
 
-static void
-rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
+	cpu_buffer->reader = 0;
+}
+
+static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+	/*
+	 * The iterator could be on the reader page (it starts there).
+	 * But the head could have moved, since the reader was
+	 * found. Check for this case and assign the iterator
+	 * to the head page instead of next.
+	 */
+	if (iter->head_page == cpu_buffer->reader_page)
+		iter->head_page = cpu_buffer->head_page;
+	else
+		rb_inc_page(cpu_buffer, &iter->head_page);
+
 	iter->read_stamp = iter->head_page->time_stamp;
 	iter->head = 0;
 }
@@ -693,30 +728,39 @@ static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
-	struct buffer_page *head_page, *tail_page;
+	struct buffer_page *tail_page, *head_page, *reader_page;
 	unsigned long tail;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 
+	/* No locking needed for tail page */
 	tail_page = cpu_buffer->tail_page;
-	head_page = cpu_buffer->head_page;
 	tail = cpu_buffer->tail;
 
 	if (tail + length > BUF_PAGE_SIZE) {
 		struct buffer_page *next_page = tail_page;
 
+		spin_lock(&cpu_buffer->lock);
 		rb_inc_page(cpu_buffer, &next_page);
 
+		head_page = cpu_buffer->head_page;
+		reader_page = cpu_buffer->reader_page;
+
+		/* we grabbed the lock before incrementing */
+		WARN_ON(next_page == reader_page);
+
 		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE))
+			if (!(buffer->flags & RB_FL_OVERWRITE)) {
+				spin_unlock(&cpu_buffer->lock);
 				return NULL;
+			}
 
 			/* count overflows */
 			rb_update_overflow(cpu_buffer);
 
 			rb_inc_page(cpu_buffer, &head_page);
 			cpu_buffer->head_page = head_page;
-			rb_reset_read_page(cpu_buffer);
+			rb_reset_head_page(cpu_buffer);
 		}
 
 		if (tail != BUF_PAGE_SIZE) {
@@ -732,6 +776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->tail_page = tail_page;
 		cpu_buffer->tail = tail;
 		rb_add_stamp(cpu_buffer, ts);
+		spin_unlock(&cpu_buffer->lock);
 	}
 
 	BUG_ON(tail + length > BUF_PAGE_SIZE);
@@ -802,7 +847,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 				return NULL;
 		}
 	} else {
+		spin_lock(&cpu_buffer->lock);
 		rb_add_stamp(cpu_buffer, &ts);
+		spin_unlock(&cpu_buffer->lock);
 		delta = 0;
 	}
 
@@ -851,13 +898,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 	cpu = raw_smp_processor_id();
 
 	if (!cpu_isset(cpu, buffer->cpumask))
-		goto out_irq;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	spin_lock(&cpu_buffer->lock);
 
 	if (atomic_read(&cpu_buffer->record_disabled))
-		goto no_record;
+		goto out;
 
 	length = rb_calculate_event_length(length);
 	if (length > BUF_PAGE_SIZE)
@@ -865,13 +911,11 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 
 	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
 	if (!event)
-		goto no_record;
+		goto out;
 
 	return event;
 
- no_record:
-	spin_unlock(&cpu_buffer->lock);
- out_irq:
+ out:
 	local_irq_restore(*flags);
 	return NULL;
 }
@@ -904,11 +948,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	assert_spin_locked(&cpu_buffer->lock);
-
 	rb_commit(cpu_buffer, event);
 
-	spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
 	return 0;
@@ -945,10 +986,9 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	cpu = raw_smp_processor_id();
 
 	if (!cpu_isset(cpu, buffer->cpumask))
-		goto out_irq;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	spin_lock(&cpu_buffer->lock);
 
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
@@ -967,55 +1007,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
 	ret = 0;
  out:
-	spin_unlock(&cpu_buffer->lock);
- out_irq:
 	local_irq_restore(flags);
 
 	return ret;
 }
 
-/**
- * ring_buffer_lock - lock the ring buffer
- * @buffer: The ring buffer to lock
- * @flags: The place to store the interrupt flags
- *
- * This locks all the per CPU buffers.
- *
- * Must be unlocked by ring_buffer_unlock.
- */
-void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	int cpu;
-
-	local_irq_save(*flags);
-
-	for_each_buffer_cpu(buffer, cpu) {
-		cpu_buffer = buffer->buffers[cpu];
-		spin_lock(&cpu_buffer->lock);
-	}
-}
-
-/**
- * ring_buffer_unlock - unlock a locked buffer
- * @buffer: The locked buffer to unlock
- * @flags: The interrupt flags received by ring_buffer_lock
- */
-void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	int cpu;
-
-	for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
-		if (!cpu_isset(cpu, buffer->cpumask))
-			continue;
-		cpu_buffer = buffer->buffers[cpu];
-		spin_unlock(&cpu_buffer->lock);
-	}
-
-	local_irq_restore(flags);
-}
-
 /**
  * ring_buffer_record_disable - stop all writes into the buffer
  * @buffer: The ring buffer to stop writes to.
@@ -1169,9 +1165,18 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 
-	iter->head_page = cpu_buffer->head_page;
-	iter->head = cpu_buffer->head;
-	rb_reset_iter_read_page(iter);
+	/* Iterator usage is expected to have record disabled */
+	if (list_empty(&cpu_buffer->reader_page->list)) {
+		iter->head_page = cpu_buffer->head_page;
+		iter->head = cpu_buffer->head;
+	} else {
+		iter->head_page = cpu_buffer->reader_page;
+		iter->head = cpu_buffer->reader;
+	}
+	if (iter->head)
+		iter->read_stamp = cpu_buffer->read_stamp;
+	else
+		iter->read_stamp = iter->head_page->time_stamp;
 }
 
 /**
@@ -1250,43 +1255,84 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 	return;
 }
 
-static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	struct ring_buffer_event *event;
-	unsigned length;
+	struct buffer_page *reader = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->lock, flags);
+
+ again:
+	reader = cpu_buffer->reader_page;
+
+	/* If there's more to read, return this page */
+	if (cpu_buffer->reader < reader->size)
+		goto out;
+
+	/* Never should we have an index greater than the size */
+	WARN_ON(cpu_buffer->reader > reader->size);
+
+	/* check if we caught up to the tail */
+	reader = NULL;
+	if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+		goto out;
 
 	/*
-	 * Check if we are at the end of the buffer.
+	 * Splice the empty reader page into the list around the head.
+	 * Reset the reader page to size zero.
 	 */
-	if (cpu_buffer->head >= cpu_buffer->head_page->size) {
-		BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
-		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
-		rb_reset_read_page(cpu_buffer);
-		return;
-	}
 
-	event = rb_head_event(cpu_buffer);
+	reader = cpu_buffer->head_page;
+	cpu_buffer->reader_page->list.next = reader->list.next;
+	cpu_buffer->reader_page->list.prev = reader->list.prev;
+	cpu_buffer->reader_page->size = 0;
 
-	if (event->type == RINGBUF_TYPE_DATA)
-		cpu_buffer->entries--;
-
-	length = rb_event_length(event);
+	/* Make the reader page now replace the head */
+	reader->list.prev->next = &cpu_buffer->reader_page->list;
+	reader->list.next->prev = &cpu_buffer->reader_page->list;
 
 	/*
-	 * This should not be called to advance the header if we are
-	 * at the tail of the buffer.
+	 * If the tail is on the reader, then we must set the head
+	 * to the inserted page, otherwise we set it one before.
 	 */
-	BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
-	       (cpu_buffer->head + length > cpu_buffer->tail));
+	cpu_buffer->head_page = cpu_buffer->reader_page;
 
-	rb_update_read_stamp(cpu_buffer, event);
+	if (cpu_buffer->tail_page != reader)
+		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+
+	/* Finally update the reader page to the new head */
+	cpu_buffer->reader_page = reader;
+	rb_reset_reader_page(cpu_buffer);
+
+	goto again;
+
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+
+	return reader;
+}
+
+static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_event *event;
+	struct buffer_page *reader;
+	unsigned length;
+
+	reader = rb_get_reader_page(cpu_buffer);
 
-	cpu_buffer->head += length;
+	/* This function should not be called when buffer is empty */
+	BUG_ON(!reader);
 
-	/* check for end of page */
-	if ((cpu_buffer->head >= cpu_buffer->head_page->size) &&
-	    (cpu_buffer->head_page != cpu_buffer->tail_page))
-		rb_advance_head(cpu_buffer);
+	event = rb_reader_event(cpu_buffer);
+
+	if (event->type == RINGBUF_TYPE_DATA)
+		cpu_buffer->entries--;
+
+	rb_update_read_stamp(cpu_buffer, event);
+
+	length = rb_event_length(event);
+	cpu_buffer->reader += length;
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -1304,8 +1350,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 */
 	if (iter->head >= iter->head_page->size) {
 		BUG_ON(iter->head_page == cpu_buffer->tail_page);
-		rb_inc_page(cpu_buffer, &iter->head_page);
-		rb_reset_iter_read_page(iter);
+		rb_inc_iter(iter);
 		return;
 	}
 
@@ -1344,6 +1389,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
+	struct buffer_page *reader;
 
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return NULL;
@@ -1351,25 +1397,26 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	cpu_buffer = buffer->buffers[cpu];
 
  again:
-	if (rb_per_cpu_empty(cpu_buffer))
+	reader = rb_get_reader_page(cpu_buffer);
+	if (!reader)
 		return NULL;
 
-	event = rb_head_event(cpu_buffer);
+	event = rb_reader_event(cpu_buffer);
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
-		rb_reset_read_page(cpu_buffer);
-		goto again;
+		WARN_ON(1);
+		rb_advance_reader(cpu_buffer);
+		return NULL;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		/* Internal data, OK to advance */
-		rb_advance_head(cpu_buffer);
+		rb_advance_reader(cpu_buffer);
 		goto again;
 
 	case RINGBUF_TYPE_TIME_STAMP:
 		/* FIXME: not implemented */
-		rb_advance_head(cpu_buffer);
+		rb_advance_reader(cpu_buffer);
 		goto again;
 
 	case RINGBUF_TYPE_DATA:
@@ -1415,8 +1462,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		rb_inc_page(cpu_buffer, &iter->head_page);
-		rb_reset_iter_read_page(iter);
+		rb_inc_iter(iter);
 		goto again;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
@@ -1465,7 +1511,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 		return NULL;
 
 	cpu_buffer = buffer->buffers[cpu];
-	rb_advance_head(cpu_buffer);
+	rb_advance_reader(cpu_buffer);
 
 	return event;
 }
@@ -1487,6 +1533,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
+	unsigned long flags;
 
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return NULL;
@@ -1502,11 +1549,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	atomic_inc(&cpu_buffer->record_disabled);
 	synchronize_sched();
 
-	spin_lock(&cpu_buffer->lock);
-	iter->head = cpu_buffer->head;
-	iter->head_page = cpu_buffer->head_page;
-	rb_reset_iter_read_page(iter);
-	spin_unlock(&cpu_buffer->lock);
+	spin_lock_irqsave(&cpu_buffer->lock, flags);
+	ring_buffer_iter_reset(iter);
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
 
 	return iter;
 }
@@ -1562,10 +1607,14 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
-	cpu_buffer->tail_page
-		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+	cpu_buffer->head_page->size = 0;
+	cpu_buffer->tail_page = cpu_buffer->head_page;
+	cpu_buffer->tail_page->size = 0;
+	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+	cpu_buffer->reader_page->size = 0;
+
+	cpu_buffer->head = cpu_buffer->tail = cpu_buffer->reader = 0;
 
-	cpu_buffer->head = cpu_buffer->tail = 0;
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
 }
@@ -1583,13 +1632,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpu_isset(cpu, buffer->cpumask))
 		return;
 
-	local_irq_save(flags);
-	spin_lock(&cpu_buffer->lock);
+	spin_lock_irqsave(&cpu_buffer->lock, flags);
 
 	rb_reset_cpu(cpu_buffer);
 
-	spin_unlock(&cpu_buffer->lock);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
 }
 
 /**
@@ -1598,15 +1645,10 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
  */
 void ring_buffer_reset(struct ring_buffer *buffer)
 {
-	unsigned long flags;
 	int cpu;
 
-	ring_buffer_lock(buffer, &flags);
-
 	for_each_buffer_cpu(buffer, cpu)
-		rb_reset_cpu(buffer->buffers[cpu]);
-
-	ring_buffer_unlock(buffer, flags);
+		ring_buffer_reset_cpu(buffer, cpu);
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6a1c76bb56ba..b542f8837801 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,6 +42,20 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+
+static inline void ftrace_disable_cpu(void)
+{
+	preempt_disable();
+	local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+}
+
+static inline void ftrace_enable_cpu(void)
+{
+	local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+	preempt_enable();
+}
+
 static cpumask_t __read_mostly		tracing_buffer_mask;
 
 #define for_each_tracing_cpu(cpu)	\
@@ -406,7 +420,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tr->buffer = max_tr.buffer;
 	max_tr.buffer = buf;
 
+	ftrace_disable_cpu();
 	ring_buffer_reset(tr->buffer);
+	ftrace_enable_cpu();
 
 	__update_max_tr(tr, tsk, cpu);
 	__raw_spin_unlock(&ftrace_max_lock);
@@ -428,9 +444,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	WARN_ON_ONCE(!irqs_disabled());
 	__raw_spin_lock(&ftrace_max_lock);
 
+	ftrace_disable_cpu();
+
 	ring_buffer_reset(max_tr.buffer);
 	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
 
+	ftrace_enable_cpu();
+
 	WARN_ON_ONCE(ret);
 
 	__update_max_tr(tr, tsk, cpu);
@@ -543,7 +563,9 @@ void unregister_tracer(struct tracer *type)
 
 void tracing_reset(struct trace_array *tr, int cpu)
 {
+	ftrace_disable_cpu();
 	ring_buffer_reset_cpu(tr->buffer, cpu);
+	ftrace_enable_cpu();
 }
 
 #define SAVED_CMDLINES 128
@@ -654,6 +676,10 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	struct ftrace_entry *entry;
 	unsigned long irq_flags;
 
+	/* If we are reading the ring buffer, don't trace */
+	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+		return;
+
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					 &irq_flags);
 	if (!event)
@@ -870,8 +896,14 @@ enum trace_file_type {
 
 static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
 {
+	/* Don't allow ftrace to trace into the ring buffers */
+	ftrace_disable_cpu();
+
 	iter->idx++;
-	ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+	if (iter->buffer_iter[iter->cpu])
+		ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+
+	ftrace_enable_cpu();
 }
 
 static struct trace_entry *
@@ -880,9 +912,19 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 	struct ring_buffer_event *event;
 	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
 
-	event = ring_buffer_iter_peek(buf_iter, ts);
+	/* Don't allow ftrace to trace into the ring buffers */
+	ftrace_disable_cpu();
+
+	if (buf_iter)
+		event = ring_buffer_iter_peek(buf_iter, ts);
+	else
+		event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+
+	ftrace_enable_cpu();
+
 	return event ? ring_buffer_event_data(event) : NULL;
 }
+
 static struct trace_entry *
 __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 {
@@ -938,7 +980,10 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 
 static void trace_consume(struct trace_iterator *iter)
 {
+	/* Don't allow ftrace to trace into the ring buffers */
+	ftrace_disable_cpu();
 	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+	ftrace_enable_cpu();
 }
 
 static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -991,10 +1036,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		iter->cpu = 0;
 		iter->idx = -1;
 
+		ftrace_disable_cpu();
+
 		for_each_tracing_cpu(cpu) {
 			ring_buffer_iter_reset(iter->buffer_iter[cpu]);
 		}
 
+		ftrace_enable_cpu();
+
 		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
 			;
 
@@ -1242,7 +1291,16 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
 		cont = (struct trace_field_cont *)ent;
 		if (ok)
 			ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
-		ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+
+		ftrace_disable_cpu();
+
+		if (iter->buffer_iter[iter->cpu])
+			ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+		else
+			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+
+		ftrace_enable_cpu();
+
 		ent = peek_next_entry(iter, iter->cpu, NULL);
 	} while (ent && ent->type == TRACE_CONT);
 
@@ -1683,9 +1741,15 @@ static int trace_empty(struct trace_iterator *iter)
 	int cpu;
 
 	for_each_tracing_cpu(cpu) {
-		if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
-			return 0;
+		if (iter->buffer_iter[cpu]) {
+			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+				return 0;
+		} else {
+			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+				return 0;
+		}
 	}
+
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -1776,8 +1840,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	iter->pos = -1;
 
 	for_each_tracing_cpu(cpu) {
+
 		iter->buffer_iter[cpu] =
 			ring_buffer_read_start(iter->tr->buffer, cpu);
+
 		if (!iter->buffer_iter[cpu])
 			goto fail_buffer;
 	}
@@ -2341,7 +2407,6 @@ static atomic_t tracing_reader;
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
 	struct trace_iterator *iter;
-	int cpu;
 
 	if (tracing_disabled)
 		return -ENODEV;
@@ -2362,38 +2427,17 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	iter->trace = current_trace;
 	filp->private_data = iter;
 
-	for_each_tracing_cpu(cpu) {
-		iter->buffer_iter[cpu] =
-			ring_buffer_read_start(iter->tr->buffer, cpu);
-		if (!iter->buffer_iter[cpu])
-			goto fail_buffer;
-	}
-
 	if (iter->trace->pipe_open)
 		iter->trace->pipe_open(iter);
 	mutex_unlock(&trace_types_lock);
 
 	return 0;
-
- fail_buffer:
-	for_each_tracing_cpu(cpu) {
-		if (iter->buffer_iter[cpu])
-			ring_buffer_read_finish(iter->buffer_iter[cpu]);
-	}
-	mutex_unlock(&trace_types_lock);
-
-	return -ENOMEM;
 }
 
 static int tracing_release_pipe(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter = file->private_data;
-	int cpu;
 
-	for_each_tracing_cpu(cpu) {
-		if (iter->buffer_iter[cpu])
-			ring_buffer_read_finish(iter->buffer_iter[cpu]);
-	}
 	kfree(iter);
 	atomic_dec(&tracing_reader);
 
@@ -2429,7 +2473,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
 	struct trace_iterator *iter = filp->private_data;
-	unsigned long flags;
 #ifdef CONFIG_FTRACE
 	int ftrace_save;
 #endif
@@ -2528,7 +2571,6 @@ waitagain:
 	ftrace_enabled = 0;
 #endif
 	smp_wmb();
-	ring_buffer_lock(iter->tr->buffer, &flags);
 
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
@@ -2547,7 +2589,6 @@ waitagain:
 			break;
 	}
 
-	ring_buffer_unlock(iter->tr->buffer, flags);
 #ifdef CONFIG_FTRACE
 	ftrace_enabled = ftrace_save;
 #endif
@@ -3010,8 +3051,8 @@ void ftrace_dump(void)
 	static struct trace_iterator iter;
 	static cpumask_t mask;
 	static int dump_ran;
-	unsigned long flags, irq_flags;
-	int cnt = 0;
+	unsigned long flags;
+	int cnt = 0, cpu;
 
 	/* only one dump */
 	spin_lock_irqsave(&ftrace_dump_lock, flags);
@@ -3023,6 +3064,10 @@ void ftrace_dump(void)
 	/* No turning back! */
 	ftrace_kill_atomic();
 
+	for_each_tracing_cpu(cpu) {
+		atomic_inc(&global_trace.data[cpu]->disabled);
+	}
+
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
 	iter.tr = &global_trace;
@@ -3037,8 +3082,6 @@ void ftrace_dump(void)
 
 	cpus_clear(mask);
 
-	ring_buffer_lock(iter.tr->buffer, &irq_flags);
-
 	while (!trace_empty(&iter)) {
 
 		if (!cnt)
@@ -3066,8 +3109,6 @@ void ftrace_dump(void)
 	else
 		printk(KERN_TRACE "---------------------------------\n");
 
-	ring_buffer_unlock(iter.tr->buffer, irq_flags);
-
  out:
 	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
 }
-- 
cgit v1.2.3


From cb5ab74204a6e2579d1119bf1348eb806526b12b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 2 Oct 2008 12:59:20 +0200
Subject: tracing/fastboot: change the printing of boot tracer according to
 bootgraph.pl

Change the boot tracer printing to make it parsable for
the scripts/bootgraph.pl script.

We now have to output two lines for each initcall, matching the
printk calls in do_one_initcall() in init/main.c.
We now need both the call time and the return time.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h    |  2 ++
 init/main.c               | 20 +++++++++-----------
 kernel/trace/trace_boot.c | 22 +++++++++++++++-------
 3 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 91954eb6460f..4455490d91bd 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -216,6 +216,8 @@ struct boot_trace {
 	initcall_t		func;
 	int			result;
 	unsigned long long	duration;
+	ktime_t			calltime;
+	ktime_t			rettime;
 };
 
 #ifdef CONFIG_BOOT_TRACER
diff --git a/init/main.c b/init/main.c
index 1e39a1eab190..61eb66159391 100644
--- a/init/main.c
+++ b/init/main.c
@@ -706,34 +706,32 @@ __setup("initcall_debug", initcall_debug_setup);
 int do_one_initcall(initcall_t fn)
 {
 	int count = preempt_count();
-	ktime_t t0, t1, delta;
+	ktime_t delta;
 	char msgbuf[64];
-	int result;
 	struct boot_trace it;
 
 	if (initcall_debug) {
 		it.caller = task_pid_nr(current);
 		it.func = fn;
 		printk("calling  %pF @ %i\n", fn, it.caller);
-		t0 = ktime_get();
+		it.calltime = ktime_get();
 	}
 
-	result = fn();
+	it.result = fn();
 
 	if (initcall_debug) {
-		t1 = ktime_get();
-		delta = ktime_sub(t1, t0);
-		it.result = result;
+		it.rettime = ktime_get();
+		delta = ktime_sub(it.rettime, it.calltime);
 		it.duration = (unsigned long long) delta.tv64 >> 20;
 		printk("initcall %pF returned %d after %Ld msecs\n", fn,
-			result, it.duration);
+			it.result, it.duration);
 		trace_boot(&it);
 	}
 
 	msgbuf[0] = 0;
 
-	if (result && result != -ENODEV && initcall_debug)
-		sprintf(msgbuf, "error code %d ", result);
+	if (it.result && it.result != -ENODEV && initcall_debug)
+		sprintf(msgbuf, "error code %d ", it.result);
 
 	if (preempt_count() != count) {
 		strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -747,7 +745,7 @@ int do_one_initcall(initcall_t fn)
 		printk("initcall %pF returned with %s\n", fn, msgbuf);
 	}
 
-	return result;
+	return it.result;
 }
 
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index f2dac6f1cf06..7c15f3e68ba3 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -52,16 +52,24 @@ static enum print_line_t initcall_print_line(struct trace_iterator *iter)
 	struct trace_boot *field = (struct trace_boot *)entry;
 	struct boot_trace *it = &field->initcall;
 	struct trace_seq *s = &iter->seq;
+	struct timespec calltime = ktime_to_timespec(it->calltime);
+	struct timespec rettime = ktime_to_timespec(it->rettime);
 
 	if (entry->type == TRACE_BOOT) {
-		ret = trace_seq_printf(s, "%pF called from %i "
-				       "returned %d after %lld msecs\n",
-				       it->func, it->caller, it->result,
-				       it->duration);
-		if (ret)
-			return TRACE_TYPE_HANDLED;
-		else
+		ret = trace_seq_printf(s, "[%5ld.%06ld] calling  %pF @ %i\n",
+					  calltime.tv_sec,
+					  calltime.tv_nsec,
+					  it->func, it->caller);
+		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %pF "
+					  "returned %d after %lld msecs\n",
+					  rettime.tv_sec,
+					  rettime.tv_nsec,
+					  it->func, it->result, it->duration);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_HANDLED;
 	}
 	return TRACE_TYPE_UNHANDLED;
 }
-- 
cgit v1.2.3


From 5601020feb0c3010e9e3e0131e9697ac6a06777b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 2 Oct 2008 13:26:05 +0200
Subject: tracing/fastboot: get the initcall name before it disappears

After some initcall traces, some initcall names may be inconsistent.
That's because these functions will disappear from the .init section
and also their name from the symbols table.

So we have to copy the name of the function into a large enough buffer
when appending the trace. This is not costly for the ring_buffer because
the number of initcall entries is usually not very large.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h    |  7 ++++---
 init/main.c               |  3 +--
 kernel/trace/trace_boot.c | 14 ++++++++++----
 3 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4455490d91bd..e672e51c40a9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/kallsyms.h>
 
 extern int ftrace_enabled;
 extern int
@@ -213,7 +214,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	initcall_t		func;
+	char 			func[KSYM_NAME_LEN];
 	int			result;
 	unsigned long long	duration;
 	ktime_t			calltime;
@@ -221,10 +222,10 @@ struct boot_trace {
 };
 
 #ifdef CONFIG_BOOT_TRACER
-extern void trace_boot(struct boot_trace *it);
+extern void trace_boot(struct boot_trace *it, initcall_t fn);
 extern void start_boot_trace(void);
 #else
-static inline void trace_boot(struct boot_trace *it) { }
+static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
 static inline void start_boot_trace(void) { }
 #endif
 
diff --git a/init/main.c b/init/main.c
index 61eb66159391..8e96a0ef17f4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -712,7 +712,6 @@ int do_one_initcall(initcall_t fn)
 
 	if (initcall_debug) {
 		it.caller = task_pid_nr(current);
-		it.func = fn;
 		printk("calling  %pF @ %i\n", fn, it.caller);
 		it.calltime = ktime_get();
 	}
@@ -725,7 +724,7 @@ int do_one_initcall(initcall_t fn)
 		it.duration = (unsigned long long) delta.tv64 >> 20;
 		printk("initcall %pF returned %d after %Ld msecs\n", fn,
 			it.result, it.duration);
-		trace_boot(&it);
+		trace_boot(&it, fn);
 	}
 
 	msgbuf[0] = 0;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7c15f3e68ba3..b9dc2c0093ab 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
+#include <linux/kallsyms.h>
 
 #include "trace.h"
 
@@ -56,17 +57,19 @@ static enum print_line_t initcall_print_line(struct trace_iterator *iter)
 	struct timespec rettime = ktime_to_timespec(it->rettime);
 
 	if (entry->type == TRACE_BOOT) {
-		ret = trace_seq_printf(s, "[%5ld.%06ld] calling  %pF @ %i\n",
+		ret = trace_seq_printf(s, "[%5ld.%06ld] calling  %s @ %i\n",
 					  calltime.tv_sec,
 					  calltime.tv_nsec,
 					  it->func, it->caller);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-		ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %pF "
+
+		ret = trace_seq_printf(s, "[%5ld.%06ld] initcall %s "
 					  "returned %d after %lld msecs\n",
 					  rettime.tv_sec,
 					  rettime.tv_nsec,
 					  it->func, it->result, it->duration);
+
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 		return TRACE_TYPE_HANDLED;
@@ -83,8 +86,7 @@ struct tracer boot_tracer __read_mostly =
 	.print_line	= initcall_print_line,
 };
 
-
-void trace_boot(struct boot_trace *it)
+void trace_boot(struct boot_trace *it, initcall_t fn)
 {
 	struct ring_buffer_event *event;
 	struct trace_boot *entry;
@@ -95,6 +97,10 @@ void trace_boot(struct boot_trace *it)
 	if (!trace_boot_enabled)
 		return;
 
+	/* Get its name now since this function could
+	 * disappear because it is in the .init section.
+	 */
+	sprint_symbol(it->func, (unsigned long)fn);
 	preempt_disable();
 	data = tr->data[smp_processor_id()];
 
-- 
cgit v1.2.3


From 3e1932ad59726d794a865cc159c0593d54bf0cb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 2 Oct 2008 17:45:47 +0200
Subject: tracing/fastboot: build fix

fix:

 In file included from kernel/sysctl.c:52:
 include/linux/ftrace.h:217: error: 'KSYM_NAME_LEN' undeclared here (not in a function)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e672e51c40a9..deded114dffd 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,14 +1,14 @@
 #ifndef _LINUX_FTRACE_H
 #define _LINUX_FTRACE_H
 
-#ifdef CONFIG_FTRACE
-
 #include <linux/linkage.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kallsyms.h>
 
+#ifdef CONFIG_FTRACE
+
 extern int ftrace_enabled;
 extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-- 
cgit v1.2.3


From eb7fa935274bb233686fdf7a53f40c5d9ee76ed6 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Thu, 2 Oct 2008 12:00:07 -0700
Subject: ftrace: ktime.h not included in ftrace.h

Including <linux/ktime.h> eliminates the following error:

include/linux/ftrace.h:220: error: expected specifier-qualifier-list
before 'ktime_t'

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index deded114dffd..ed53265d1f63 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -3,6 +3,7 @@
 
 #include <linux/linkage.h>
 #include <linux/fs.h>
+#include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kallsyms.h>
-- 
cgit v1.2.3


From 097d036a2f25eecc42435c57e010aaf4a2eed2d9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 3 Oct 2008 15:39:21 +0200
Subject: tracing/fastboot: only trace non-module initcalls

At this time, only built-in initcalls interest us.
We can't really produce a relevant graph if we include
the module initcalls too.

I had good results after this patch (see svg in attachment).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h    |  2 ++
 init/main.c               |  1 +
 kernel/trace/trace_boot.c | 11 ++++++++---
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ed53265d1f63..5812dba4ee24 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -225,9 +225,11 @@ struct boot_trace {
 #ifdef CONFIG_BOOT_TRACER
 extern void trace_boot(struct boot_trace *it, initcall_t fn);
 extern void start_boot_trace(void);
+extern void stop_boot_trace(void);
 #else
 static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
 static inline void start_boot_trace(void) { }
+static inline void stop_boot_trace(void) { }
 #endif
 
 
diff --git a/init/main.c b/init/main.c
index 8e96a0ef17f4..e7939de80f3e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -886,6 +886,7 @@ static int __init kernel_init(void * unused)
 	 * we're essentially up and running. Get rid of the
 	 * initmem segments and start the user-mode stuff..
 	 */
+	stop_boot_trace();
 	init_post();
 	return 0;
 }
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index b9dc2c0093ab..a7efe3559654 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -22,11 +22,16 @@ void start_boot_trace(void)
 	trace_boot_enabled = 1;
 }
 
-void stop_boot_trace(struct trace_array *tr)
+void stop_boot_trace(void)
 {
 	trace_boot_enabled = 0;
 }
 
+void reset_boot_trace(struct trace_array *tr)
+{
+	stop_boot_trace();
+}
+
 static void boot_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -43,7 +48,7 @@ static void boot_trace_ctrl_update(struct trace_array *tr)
 	if (tr->ctrl)
 		start_boot_trace();
 	else
-		stop_boot_trace(tr);
+		stop_boot_trace();
 }
 
 static enum print_line_t initcall_print_line(struct trace_iterator *iter)
@@ -81,7 +86,7 @@ struct tracer boot_tracer __read_mostly =
 {
 	.name		= "initcall",
 	.init		= boot_trace_init,
-	.reset		= stop_boot_trace,
+	.reset		= reset_boot_trace,
 	.ctrl_update	= boot_trace_ctrl_update,
 	.print_line	= initcall_print_line,
 };
-- 
cgit v1.2.3


From ca538f6bbe583406f941f3041d40c41f9a13d1de Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Thu, 9 Oct 2008 15:23:05 -0700
Subject: tracing/fastboot: add better resolution to initcall debug/tracing

Change the time resolution for initcall_debug to microseconds, from
milliseconds.  This is handy to determine which initcalls you want to work
on for faster booting.

On one of my test machines, over 90% of the initcalls are less than a
millisecond and (without this patch) these are all reported as 0 msecs.
Working on the 900 us ones is more important than the 4 us ones.

With 'quiet' on the kernel command line, this adds no significant overhead
to kernel boot time.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 4 ++--
 init/main.c            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5812dba4ee24..a3d46151be19 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -215,9 +215,9 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	char 			func[KSYM_NAME_LEN];
+	char			func[KSYM_NAME_LEN];
 	int			result;
-	unsigned long long	duration;
+	unsigned long long	duration;		/* usecs */
 	ktime_t			calltime;
 	ktime_t			rettime;
 };
diff --git a/init/main.c b/init/main.c
index e7939de80f3e..b2e7ff4a5349 100644
--- a/init/main.c
+++ b/init/main.c
@@ -721,8 +721,8 @@ int do_one_initcall(initcall_t fn)
 	if (initcall_debug) {
 		it.rettime = ktime_get();
 		delta = ktime_sub(it.rettime, it.calltime);
-		it.duration = (unsigned long long) delta.tv64 >> 20;
-		printk("initcall %pF returned %d after %Ld msecs\n", fn,
+		it.duration = (unsigned long long) delta.tv64 >> 10;
+		printk("initcall %pF returned %d after %Ld usecs\n", fn,
 			it.result, it.duration);
 		trace_boot(&it, fn);
 	}
-- 
cgit v1.2.3


From bfadadfccc19e36f7d600c5ce7b3e5ba5197fbf0 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 10 Oct 2008 03:48:25 -0400
Subject: markers: fix synchronize marker unregister static inline

Use a #define for synchronize marker unregister to fix include dependencies.

Fixes the slab circular inclusion which triggers when slab.git is combined
with tracing.git, where rcupdate includes slab, which includes markers
which includes rcupdate.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 38e32e781ed7..889196c7fbb1 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -13,7 +13,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/rcupdate.h>
 
 struct module;
 struct marker;
@@ -166,9 +165,6 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
  * unregistration and the end of module exit to make sure there is no caller
  * executing a probe when it is freed.
  */
-static inline void marker_synchronize_unregister(void)
-{
-	synchronize_sched();
-}
+#define marker_synchronize_unregister() synchronize_sched()
 
 #endif
-- 
cgit v1.2.3


From f2461fc82a083dd60062e05e704c5fcc1c658ba1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 6 Oct 2008 10:33:00 -0400
Subject: tracepoints: tracepoint_synchronize_unregister()

Create tracepoint_synchronize_unregister(), which must be called before the end
of exit() to make sure every probe caller has exited the non-preemptible
section and thus is no longer executing the probe code.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index e623a6fca5c3..199f4c207c1e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -124,4 +124,11 @@ extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
 extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
 	struct tracepoint *begin, struct tracepoint *end);
 
+/*
+ * tracepoint_synchronize_unregister must be called between the last tracepoint
+ * probe unregistration and the end of module exit to make sure there is no
+ * caller executing a probe when it is freed.
+ */
+#define tracepoint_synchronize_unregister() synchronize_sched()
+
 #endif
-- 
cgit v1.2.3


From 231375cc5cc3549bb413f94a164bdcbd5f9ce943 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 3 Oct 2008 15:01:33 -0400
Subject: tracepoints: synchronize unregister static inline

Turn tracepoint synchronize unregister into a static inline. There is no
reason to keep it as a macro over a static inline.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 199f4c207c1e..c5bb39c7a770 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -129,6 +129,9 @@ extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
  * probe unregistration and the end of module exit to make sure there is no
  * caller executing a probe when it is freed.
  */
-#define tracepoint_synchronize_unregister() synchronize_sched()
+static inline void tracepoint_synchronize_unregister(void)
+{
+	synchronize_sched();
+}
 
 #endif
-- 
cgit v1.2.3


From 6028aa01f759a1dae11e5d0e495b3dc9d2b0a47b Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Date: Tue, 14 Oct 2008 21:23:26 +0900
Subject: [MTD] [NAND] sh_flctl: add support for Renesas SuperH FLCTL

Several Renesas SuperH CPUs have an FLCTL. The FLCTL supports NAND Flash.
This driver supports the SH7723.

Signed-off-by: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/Kconfig     |   7 +
 drivers/mtd/nand/Makefile    |   1 +
 drivers/mtd/nand/sh_flctl.c  | 301 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/sh_flctl.h | 125 ++++++++++++++++++
 4 files changed, 434 insertions(+)
 create mode 100644 drivers/mtd/nand/sh_flctl.c
 create mode 100644 include/linux/mtd/sh_flctl.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 82815dd64bf6..89b4d39386ab 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -407,4 +407,11 @@ config MTD_NAND_MXC
 	  This enables the driver for the NAND flash controller on the
 	  MXC processors.
 
+config MTD_NAND_SH_FLCTL
+	tristate "Support for NAND on Renesas SuperH FLCTL"
+	depends on MTD_NAND && SUPERH && CPU_SUBTYPE_SH7723
+	help
+	  Several Renesas SuperH CPUs have FLCTL. This option enables support
+	  for NAND Flash using FLCTL. This driver supports the SH7723.
+
 endif # MTD_NAND
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index e0fee048c1b4..9bfeca324b32 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_MTD_NAND_PASEMI)		+= pasemi_nand.o
 obj-$(CONFIG_MTD_NAND_ORION)		+= orion_nand.o
 obj-$(CONFIG_MTD_NAND_FSL_ELBC)		+= fsl_elbc_nand.o
 obj-$(CONFIG_MTD_NAND_FSL_UPM)		+= fsl_upm.o
+obj-$(CONFIG_MTD_NAND_SH_FLCTL)		+= sh_flctl.o
 obj-$(CONFIG_MTD_NAND_MXC)		+= mxc_nand.o
 
 nand-objs := nand_base.o nand_bbt.o
diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c
new file mode 100644
index 000000000000..600a76f5580e
--- /dev/null
+++ b/drivers/mtd/nand/sh_flctl.c
@@ -0,0 +1,301 @@
+/*
+ * SuperH FLCTL nand controller
+ *
+ * Copyright © 2008 Renesas Solutions Corp.
+ * Copyright © 2008 Atom Create Engineering Co., Ltd.
+ *
+ * Based on fsl_elbc_nand.c, Copyright © 2006-2007 Freescale Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/platform_device.h>
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/sh_flctl.h>
+
+static struct nand_ecclayout flctl_4secc_oob_16 = {
+	.eccbytes = 10,
+	.eccpos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+	.oobfree = {
+		{.offset = 12,
+		. length = 4} },
+};
+
+static struct nand_ecclayout flctl_4secc_oob_64 = {
+	.eccbytes = 10,
+	.eccpos = {48, 49, 50, 51, 52, 53, 54, 55, 56, 57},
+	.oobfree = {
+		{.offset = 60,
+		. length = 4} },
+};
+
+static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
+
+static struct nand_bbt_descr flctl_4secc_smallpage = {
+	.options = NAND_BBT_SCAN2NDPAGE,
+	.offs = 11,
+	.len = 1,
+	.pattern = scan_ff_pattern,
+};
+
+static struct nand_bbt_descr flctl_4secc_largepage = {
+	.options = 0,
+	.offs = 58,
+	.len = 2,
+	.pattern = scan_ff_pattern,
+};
+
+static void empty_fifo(struct sh_flctl *flctl)
+{
+	writel(0x000c0000, FLINTDMACR(flctl));	/* FIFO Clear */
+	writel(0x00000000, FLINTDMACR(flctl));	/* Clear Error flags */
+}
+
+static void start_translation(struct sh_flctl *flctl)
+{
+	writeb(TRSTRT, FLTRCR(flctl));
+}
+
+static void wait_completion(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		if (readb(FLTRCR(flctl)) & TREND) {
+			writeb(0x0, FLTRCR(flctl));
+			return;
+		}
+		udelay(1);
+	}
+
+	printk(KERN_ERR "wait_completion(): Timeout occured \n");
+	writeb(0x0, FLTRCR(flctl));
+}
+
+static void set_addr(struct mtd_info *mtd, int column, int page_addr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t addr = 0;
+
+	if (column == -1) {
+		addr = page_addr;	/* ERASE1 */
+	} else if (page_addr != -1) {
+		/* SEQIN, READ0, etc.. */
+		if (flctl->page_size) {
+			addr = column & 0x0FFF;
+			addr |= (page_addr & 0xff) << 16;
+			addr |= ((page_addr >> 8) & 0xff) << 24;
+			/* big than 128MB */
+			if (flctl->rw_ADRCNT == ADRCNT2_E) {
+				uint32_t 	addr2;
+				addr2 = (page_addr >> 16) & 0xff;
+				writel(addr2, FLADR2(flctl));
+			}
+		} else {
+			addr = column;
+			addr |= (page_addr & 0xff) << 8;
+			addr |= ((page_addr >> 8) & 0xff) << 16;
+			addr |= ((page_addr >> 16) & 0xff) << 24;
+		}
+	}
+	writel(addr, FLADR(flctl));
+}
+
+static void wait_rfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		uint32_t val;
+		/* check FIFO */
+		val = readl(FLDTCNTR(flctl)) >> 16;
+		if (val & 0xFF)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_rfifo_ready(): Timeout occured \n");
+}
+
+static void wait_wfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t len, timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		/* check FIFO */
+		len = (readl(FLDTCNTR(flctl)) >> 16) & 0xFF;
+		if (len >= 4)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n");
+}
+
+static int wait_recfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+	int checked[4];
+	void __iomem *ecc_reg[4];
+	int i;
+	uint32_t data, size;
+
+	memset(checked, 0, sizeof(checked));
+
+	while (timeout--) {
+		size = readl(FLDTCNTR(flctl)) >> 24;
+		if (size & 0xFF)
+			return 0;	/* success */
+
+		if (readl(FL4ECCCR(flctl)) & _4ECCFA)
+			return 1;	/* can't correct */
+
+		udelay(1);
+		if (!(readl(FL4ECCCR(flctl)) & _4ECCEND))
+			continue;
+
+		/* start error correction */
+		ecc_reg[0] = FL4ECCRESULT0(flctl);
+		ecc_reg[1] = FL4ECCRESULT1(flctl);
+		ecc_reg[2] = FL4ECCRESULT2(flctl);
+		ecc_reg[3] = FL4ECCRESULT3(flctl);
+
+		for (i = 0; i < 3; i++) {
+			data = readl(ecc_reg[i]);
+			if (data != INIT_FL4ECCRESULT_VAL && !checked[i]) {
+				uint8_t org;
+				int index;
+
+				index = data >> 16;
+				org = flctl->done_buff[index];
+				flctl->done_buff[index] = org ^ (data & 0xFF);
+				checked[i] = 1;
+			}
+		}
+
+		writel(0, FL4ECCCR(flctl));
+	}
+
+	printk(KERN_ERR "wait_recfifo_ready(): Timeout occured \n");
+	return 1;	/* timeout */
+}
+
+static void wait_wecfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+	uint32_t len;
+
+	while (timeout--) {
+		/* check FLECFIFO */
+		len = (readl(FLDTCNTR(flctl)) >> 24) & 0xFF;
+		if (len >= 4)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_wecfifo_ready(): Timeout occured \n");
+}
+
+static void read_datareg(struct sh_flctl *flctl, int offset)
+{
+	unsigned long data;
+	unsigned long *buf = (unsigned long *)&flctl->done_buff[offset];
+
+	wait_completion(flctl);
+
+	data = readl(FLDATAR(flctl));
+	*buf = le32_to_cpu(data);
+}
+
+static void read_fiforeg(struct sh_flctl *flctl, int rlen, int offset)
+{
+	int i, len_4align;
+	unsigned long *buf = (unsigned long *)&flctl->done_buff[offset];
+	void *fifo_addr = (void *)FLDTFIFO(flctl);
+
+	len_4align = (rlen + 3) / 4;
+
+	for (i = 0; i < len_4align; i++) {
+		wait_rfifo_ready(flctl);
+		buf[i] = readl(fifo_addr);
+		buf[i] = be32_to_cpu(buf[i]);
+	}
+}
+
+static int read_ecfiforeg(struct sh_flctl *flctl, uint8_t *buff)
+{
+	int i;
+	unsigned long *ecc_buf = (unsigned long *)buff;
+	void *fifo_addr = (void *)FLECFIFO(flctl);
+
+	for (i = 0; i < 4; i++) {
+		if (wait_recfifo_ready(flctl))
+			return 1;
+		ecc_buf[i] = readl(fifo_addr);
+		ecc_buf[i] = be32_to_cpu(ecc_buf[i]);
+	}
+
+	return 0;
+}
+
+static void write_fiforeg(struct sh_flctl *flctl, int rlen, int offset)
+{
+	int i, len_4align;
+	unsigned long *data = (unsigned long *)&flctl->done_buff[offset];
+	void *fifo_addr = (void *)FLDTFIFO(flctl);
+
+	len_4align = (rlen + 3) / 4;
+	for (i = 0; i < len_4align; i++) {
+		wait_wfifo_ready(flctl);
+		writel(cpu_to_be32(data[i]), fifo_addr);
+	}
+}
+
+static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_val)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t flcmncr_val = readl(FLCMNCR(flctl));
+	uint32_t flcmdcr_val, addr_len_bytes = 0;
+
+	/* Set SNAND bit if page size is 2048byte */
+	if (flctl->page_size)
+		flcmncr_val |= SNAND_E;
+	else
+		flcmncr_val &= ~SNAND_E;
+
+	/* default FLCMDCR val */
+	flcmdcr_val = DOCMD1_E | DOADR_E;
+
+	/* Set for FLCMDCR */
+	switch (cmd) {
+	case NAND_CMD_ERASE1:
+		addr_len_bytes = flctl->erase_ADRCNT;
+		flcmdcr_val |= DOCMD2_E;
+		break;
+	case NAND_CMD_READ0:
+	case NAND_CMD_READOOB:
+		addr_len_bytes = flctl->rw_ADRCNT;
+		flcmdcr_val |= CDSRC_E;
+		break;
+	case NAND_CMD_SEQIN:
+		/* This case is that cmd is READ0 or READ1 or READ00 */
+		flcmdcr_val &= ~DOADR_E;	/* ONLY execute 1st cmd */
+		break;
+	case NAND_CMD_PAGEPROG:
+		addr_len_bytes = flctl->rw_ADRCNT;
diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h
new file mode 100644
index 000000000000..e77c1cea404d
--- /dev/null
+++ b/include/linux/mtd/sh_flctl.h
@@ -0,0 +1,125 @@
+/*
+ * SuperH FLCTL nand controller
+ *
+ * Copyright © 2008 Renesas Solutions Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __SH_FLCTL_H__
+#define __SH_FLCTL_H__
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+
+/*
+ * FLCTL registers.
+ * Each macro yields the address of one register: the reg member of the
+ * given struct sh_flctl pointer plus the register's byte offset.  The
+ * argument is parenthesized so that any pointer expression may be passed.
+ */
+#define FLCMNCR(f)		((f)->reg + 0x0)
+#define FLCMDCR(f)		((f)->reg + 0x4)
+#define FLCMCDR(f)		((f)->reg + 0x8)
+#define FLADR(f)		((f)->reg + 0xC)
+#define FLADR2(f)		((f)->reg + 0x3C)
+#define FLDATAR(f)		((f)->reg + 0x10)
+#define FLDTCNTR(f)		((f)->reg + 0x14)
+#define FLINTDMACR(f)		((f)->reg + 0x18)
+#define FLBSYTMR(f)		((f)->reg + 0x1C)
+#define FLBSYCNT(f)		((f)->reg + 0x20)
+#define FLDTFIFO(f)		((f)->reg + 0x24)
+#define FLECFIFO(f)		((f)->reg + 0x28)
+#define FLTRCR(f)		((f)->reg + 0x2C)
+#define	FL4ECCRESULT0(f)	((f)->reg + 0x80)
+#define	FL4ECCRESULT1(f)	((f)->reg + 0x84)
+#define	FL4ECCRESULT2(f)	((f)->reg + 0x88)
+#define	FL4ECCRESULT3(f)	((f)->reg + 0x8C)
+#define	FL4ECCCR(f)		((f)->reg + 0x90)
+#define	FL4ECCCNT(f)		((f)->reg + 0x94)
+#define	FLERRADR(f)		((f)->reg + 0x98)
+
+/*
+ * FLCMNCR control bits.
+ * Bit masks for the value read from / written to the FLCMNCR register.
+ */
+#define ECCPOS2		(0x1 << 25)
+#define _4ECCCNTEN	(0x1 << 24)
+#define _4ECCEN		(0x1 << 23)
+#define _4ECCCORRECT	(0x1 << 22)
+#define SNAND_E		(0x1 << 18)	/* SNAND: page size (0 = 512, 1 = 2048) */
+#define QTSEL_E		(0x1 << 17)
+#define ENDIAN		(0x1 << 16)	/* 1 = little endian */
+#define FCKSEL_E	(0x1 << 15)
+#define ECCPOS_00	(0x00 << 12)
+#define ECCPOS_01	(0x01 << 12)
+#define ECCPOS_02	(0x02 << 12)
+#define ACM_SACCES_MODE	(0x01 << 10)
+#define NANWF_E		(0x1 << 9)
+#define SE_D		(0x1 << 8)	/* Spare area disable */
+#define	CE1_ENABLE	(0x1 << 4)	/* Chip Enable 1 */
+#define	CE0_ENABLE	(0x1 << 3)	/* Chip Enable 0 */
+#define	TYPESEL_SET	(0x1 << 0)
+
+/*
+ * FLCMDCR control bits.
+ * ADRCNT2_E occupies bit 31, so it is built from an unsigned constant:
+ * shifting a signed 1 into the sign bit of an int is undefined in ISO C.
+ */
+#define ADRCNT2_E	(0x1U << 31)	/* 5byte address enable */
+#define ADRMD_E		(0x1 << 26)	/* Sector address access */
+#define CDSRC_E		(0x1 << 25)	/* Data buffer selection */
+#define DOSR_E		(0x1 << 24)	/* Status read check */
+#define SELRW		(0x1 << 21)	/*  0:read 1:write */
+#define DOADR_E		(0x1 << 20)	/* Address stage execute */
+#define ADRCNT_1	(0x00 << 18)	/* Address data bytes: 1byte */
+#define ADRCNT_2	(0x01 << 18)	/* Address data bytes: 2byte */
+#define ADRCNT_3	(0x02 << 18)	/* Address data bytes: 3byte */
+#define ADRCNT_4	(0x03 << 18)	/* Address data bytes: 4byte */
+#define DOCMD2_E	(0x1 << 17)	/* 2nd cmd stage execute */
+#define DOCMD1_E	(0x1 << 16)	/* 1st cmd stage execute */
+
+/* FLTRCR control bits (masks for the register at FLTRCR(f)) */
+#define TRSTRT		(0x1 << 0)	/* translation start */
+#define TREND		(0x1 << 1)	/* translation end */
+
+/* FL4ECCCR control bits (masks for the register at FL4ECCCR(f)) */
+#define	_4ECCFA		(0x1 << 2)	/* 4 symbols correct fault */
+#define	_4ECCEND	(0x1 << 1)	/* 4 symbols end */
+#define	_4ECCEXST	(0x1 << 0)	/* 4 symbols exist */
+
+/* NOTE(review): presumably the initial value for the FL4ECCRESULTn registers -- confirm */
+#define INIT_FL4ECCRESULT_VAL	0x03FF03FF
+/* NOTE(review): presumably the iteration limit of the driver's polling loops -- confirm */
+#define LOOP_TIMEOUT_MAX	0x00010000
+
+/*
+ * Map a struct mtd_info pointer to the struct sh_flctl that embeds it.
+ *
+ * The macro parameter is deliberately not called "mtd": container_of()'s
+ * third argument is the member name, and a parameter of the same name
+ * would be substituted there too, so the macro would only work when the
+ * caller's variable happened to be named "mtd".
+ */
+#define mtd_to_flctl(mtdinfo)	container_of(mtdinfo, struct sh_flctl, mtd)
+
+/*
+ * Driver state for one FLCTL controller.
+ *
+ * The embedded mtd_info must remain a member named "mtd": mtd_to_flctl()
+ * recovers the enclosing sh_flctl from it with container_of().
+ */
+struct sh_flctl {
+	struct mtd_info		mtd;
+	struct nand_chip	chip;
+	void __iomem		*reg;	/* register base used by the FLxxx(f) macros */
+
+	/* data buffer; presumably page data plus spare area -- confirm */
+	uint8_t	done_buff[2048 + 64];	/* max size 2048 + 64 */
+	int	read_bytes;
+	int	index;
+	int	seqin_column;		/* column in SEQIN cmd */
+	int	seqin_page_addr;	/* page_addr in SEQIN cmd */
+	uint32_t seqin_read_cmd;		/* read cmd in SEQIN cmd */
+	int	erase1_page_addr;	/* page_addr in ERASE1 cmd */
+	uint32_t erase_ADRCNT;		/* bits of FLCMDCR in ERASE1 cmd */
+	uint32_t rw_ADRCNT;	/* bits of FLCMDCR in READ WRITE cmd */
+
+	int	hwecc_cant_correct[4];
+
+	unsigned page_size:1;	/* NAND page size (0 = 512, 1 = 2048) */
+	unsigned hwecc:1;	/* Hardware ECC (0 = disabled, 1 = enabled) */
+};
+
+/*
+ * Board-supplied configuration for the FLCTL driver.
+ * NOTE(review): field meanings below are inferred from names and types;
+ * confirm against the driver's probe code.
+ */
+struct sh_flctl_platform_data {
+	struct mtd_partition	*parts;		/* presumably the partition table */
+	int			nr_parts;	/* presumably the number of entries in parts */
+	unsigned long		flcmncr_val;	/* presumably a value for the FLCMNCR register */
+
+	unsigned has_hwecc:1;
+};
+
+#endif	/* __SH_FLCTL_H__ */
-- 
cgit v1.2.3


From 40b8606253552109815786e5d4b0de98782d31f5 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 15 Oct 2008 14:20:28 +1100
Subject: DECLARE_PER_CPU needs linux/percpu.h

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 include/linux/hrtimer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 508ce20b8f9c..1e6f731381d9 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -20,6 +20,8 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/percpu.h>
+
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
-- 
cgit v1.2.3


From 29d434b39c807320fbe4bcdce0ab98a0b9fcb285 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: add include protectors

Add include protectors to include/linux/fuse.h and fs/fuse/fuse_i.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/fuse_i.h     | 5 +++++
 include/linux/fuse.h | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3a876076bdd1..35accfdd747f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -6,6 +6,9 @@
   See the file COPYING.
 */
 
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
 #include <linux/fuse.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -655,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode);
 void fuse_release_nowrite(struct inode *inode);
 
 u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 265635dc9908..8bc1101e9b35 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -19,6 +19,9 @@
  *  - add file flags field to fuse_read_in and fuse_write_in
  */
 
+#ifndef _LINUX_FUSE_H
+#define _LINUX_FUSE_H
+
 #include <asm/types.h>
 #include <linux/major.h>
 
@@ -409,3 +412,5 @@ struct fuse_dirent {
 #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
 #define FUSE_DIRENT_SIZE(d) \
 	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+#endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3


From a7c1b990f71574e077b94ce4582e2cf11cb891fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: implement nonseekable open

Let the client request nonseekable open using FOPEN_NONSEEKABLE and
call nonseekable_open() on the file if requested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c       | 2 ++
 include/linux/fuse.h | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 98079aa800e8..34930a964b82 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
 		invalidate_inode_pages2(inode->i_mapping);
+	if (outarg->open_flags & FOPEN_NONSEEKABLE)
+		nonseekable_open(inode, file);
 	ff->fh = outarg->fh;
 	file->private_data = fuse_file_get(ff);
 }
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8bc1101e9b35..350fe9767bbc 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -17,6 +17,9 @@
  *  - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
  *  - add blksize field to fuse_attr
  *  - add file flags field to fuse_read_in and fuse_write_in
+ *
+ * 7.10
+ *  - add nonseekable open flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -29,7 +32,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 9
+#define FUSE_KERNEL_MINOR_VERSION 10
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -101,9 +104,11 @@ struct fuse_file_lock {
  *
  * FOPEN_DIRECT_IO: bypass page cache for this open file
  * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
  */
 #define FOPEN_DIRECT_IO		(1 << 0)
 #define FOPEN_KEEP_CACHE	(1 << 1)
+#define FOPEN_NONSEEKABLE	(1 << 2)
 
 /**
  * INIT request/reply flags
-- 
cgit v1.2.3


From 3ddfda11861d305b02ed810b522dcf48b74ca808 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:43 -0700
Subject: generic: add dyn_array support

Allow crazy big arrays via bootmem at init stage.
Architectures use CONFIG_HAVE_DYN_ARRAY to enable it.

usage:

| static struct irq_desc irq_desc_init __initdata = {
|        .status = IRQ_DISABLED,
|        .chip = &no_irq_chip,
|        .handle_irq = handle_bad_irq,
|        .depth = 1,
|        .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
| #ifdef CONFIG_SMP
|        .affinity = CPU_MASK_ALL
| #endif
| };
|
| static void __init init_work(void *data)
| {
|        struct dyn_array *da = data;
|        struct  irq_desc *desc;
|        int i;
|
|        desc = *da->name;
|
|        for (i = 0; i < *da->nr; i++)
|                memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc));
| }
|
| struct irq_desc *irq_desc;
| DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);

Once pre_alloc_dyn_array() has run (start_kernel() calls it right after
setup_arch()), the array is ready to be used.

Via this facility we can replace irq_desc[NR_IRQS] array with dyn_array
irq_desc[nr_irqs].

v2: remove _nopanic in pre_alloc_dyn_array()

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h |  7 +++++++
 include/linux/init.h              | 23 +++++++++++++++++++++++
 init/main.c                       | 24 ++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7440a0dceddb..7881406c03ec 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -210,6 +210,13 @@
  * All archs are supposed to use RO_DATA() */
 #define RODATA RO_DATA(4096)
 
+/*
+ * Emit the .dyn_array.init output section, aligned to 'align'.  It gathers
+ * every .dyn_array.init input section and brackets them with the
+ * __dyn_array_start and __dyn_array_end symbols.
+ */
+#define DYN_ARRAY_INIT(align)							\
+	. = ALIGN((align));						\
+	.dyn_array.init : AT(ADDR(.dyn_array.init) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__dyn_array_start) = .;			\
+		*(.dyn_array.init)					\
+		VMLINUX_SYMBOL(__dyn_array_end) = .;			\
+	}
 #define SECURITY_INIT							\
 	.security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \
 		VMLINUX_SYMBOL(__security_initcall_start) = .;		\
diff --git a/include/linux/init.h b/include/linux/init.h
index 93538b696e3d..cf9fa7f174af 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -246,6 +246,29 @@ struct obs_kernel_param {
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
+
+/*
+ * Descriptor for an array whose element count is only known at boot.
+ * pre_alloc_dyn_array() walks all descriptors and allocates the storage.
+ */
+struct dyn_array {
+	void **name;		/* location that receives the array's address */
+	unsigned long size;	/* size of one element, in bytes */
+	unsigned int *nr;	/* points to the number of elements */
+	unsigned long align;	/* alignment requested for the allocation */
+	void (*init_work)(void *);	/* optional; called with the descriptor */
+};
+/* bounds of the descriptor-pointer table placed in .dyn_array.init */
+extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
+
+/*
+ * Define an __initdata descriptor for the pointer variable nameX and place
+ * a pointer to that descriptor in the .dyn_array.init section.
+ * nrX must be an lvalue, since its address is stored in the descriptor.
+ */
+#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+		static struct dyn_array __dyn_array_##nameX __initdata = \
+		{	.name = (void **)&nameX,\
+			.size = sizeX,\
+			.nr   = &nrX,\
+			.align = alignX,\
+			.init_work = init_workX,\
+		}; \
+		static struct dyn_array *__dyn_array_ptr_##nameX __used \
+		__attribute__((__section__(".dyn_array.init"))) = \
+			&__dyn_array_##nameX
+
+extern void pre_alloc_dyn_array(void);
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/init/main.c b/init/main.c
index 27f6bf6108e9..638d3a786412 100644
--- a/init/main.c
+++ b/init/main.c
@@ -536,6 +536,29 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+/*
+ * Allocate every array registered with DEFINE_DYN_ARRAY().
+ *
+ * Walks the descriptor pointers between __dyn_array_start and
+ * __dyn_array_end.  For each descriptor it allocates size * (*nr) bytes
+ * with __alloc_bootmem(), stores the result through da->name, logs the
+ * resulting physical range and, if init_work is set, calls it with the
+ * descriptor as its argument.
+ *
+ * The body is empty unless CONFIG_HAVE_DYN_ARRAY is defined.
+ */
+void pre_alloc_dyn_array(void)
+{
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned long size, phys = 0;
+	struct dyn_array **daa;
+
+	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
+		struct dyn_array *da = *daa;
+
+		size = da->size * (*da->nr);
+		print_fn_descriptor_symbol("dyna_array %s ", da->name);
+		/* NOTE(review): *da->nr is an unsigned int but is printed with %d */
+		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
+			da->size, *da->nr, da->align);
+		/*
+		 * phys is 0 for the first entry and the physical address of
+		 * the previous allocation afterwards.
+		 */
+		*da->name = __alloc_bootmem(size, da->align, phys);
+		phys = virt_to_phys(*da->name);
+		printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
+
+		if (da->init_work)
+			da->init_work(da);
+	}
+#endif
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -567,6 +590,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE);
 	printk(linux_banner);
 	setup_arch(&command_line);
+	pre_alloc_dyn_array();
 	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
 	unwind_setup();
-- 
cgit v1.2.3


From 1f3fcd4b1adc972d5c6a34cfed98931c46575b49 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:44 -0700
Subject: add per_cpu_dyn_array support

allow dyn-array in per_cpu area, allocated dynamically.

usage:

|  /* in .h */
| struct kernel_stat {
|        struct cpu_usage_stat   cpustat;
|        unsigned int *irqs;
| };
|
|  /* in .c */
| DEFINE_PER_CPU(struct kernel_stat, kstat);
|
| DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL);

after setup_percpu()/per_cpu_alloc_dyn_array(), the dyn_array in
per_cpu area is ready to use.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c    |  7 +++--
 include/asm-generic/vmlinux.lds.h |  6 ++++
 include/linux/init.h              | 27 +++++++++++++++--
 init/main.c                       | 63 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 96 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0e67f72d9316..13ba7a83808d 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size = PERCPU_ENOUGH_ROOM;
+	ssize_t size, old_size;
 	char *ptr;
 	int cpu;
 
@@ -148,7 +148,8 @@ void __init setup_per_cpu_areas(void)
 	setup_cpu_pda_map();
 
 	/* Copy section for each CPU (we discard the original) */
-	size = PERCPU_ENOUGH_ROOM;
+	old_size = PERCPU_ENOUGH_ROOM;
+	size = old_size + per_cpu_dyn_array_size();
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
@@ -176,6 +177,8 @@ void __init setup_per_cpu_areas(void)
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 
+		per_cpu_alloc_dyn_array(cpu, ptr + old_size);
+
 	}
 
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7881406c03ec..c68eda9d9a90 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -216,6 +216,12 @@
 		VMLINUX_SYMBOL(__dyn_array_start) = .;			\
 		*(.dyn_array.init)					\
 		VMLINUX_SYMBOL(__dyn_array_end) = .;			\
+	}								\
+	. = ALIGN((align));						\
+	.per_cpu_dyn_array.init : AT(ADDR(.per_cpu_dyn_array.init) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__per_cpu_dyn_array_start) = .;		\
+		*(.per_cpu_dyn_array.init)				\
+		VMLINUX_SYMBOL(__per_cpu_dyn_array_end) = .;		\
 	}
 #define SECURITY_INIT							\
 	.security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \
diff --git a/include/linux/init.h b/include/linux/init.h
index cf9fa7f174af..332806826b8e 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -255,12 +255,13 @@ struct dyn_array {
 	void (*init_work)(void *);
 };
 extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
+extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[];
 
-#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
 		static struct dyn_array __dyn_array_##nameX __initdata = \
-		{	.name = (void **)&nameX,\
+		{	.name = (void **)&(nameX),\
 			.size = sizeX,\
-			.nr   = &nrX,\
+			.nr   = &(nrX),\
 			.align = alignX,\
 			.init_work = init_workX,\
 		}; \
@@ -268,7 +269,27 @@ extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
 		__attribute__((__section__(".dyn_array.init"))) = \
 			&__dyn_array_##nameX
 
+#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+	DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX)
+
+/*
+ * Define an __initdata descriptor for a per-cpu dynamic array and place a
+ * pointer to it in the .per_cpu_dyn_array.init section.
+ *
+ * nameX is used only to build the identifiers of the two generated
+ * objects; addrX is the pointer lvalue whose address is recorded in .name.
+ * nrX must be an lvalue as well, since its address is stored in .nr.
+ */
+#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
+		static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \
+		{	.name = (void **)&(addrX),\
+			.size = sizeX,\
+			.nr   = &(nrX),\
+			.align = alignX,\
+			.init_work = init_workX,\
+		}; \
+		static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \
+		__attribute__((__section__(".per_cpu_dyn_array.init"))) = \
+			&__per_cpu_dyn_array_##nameX
+
+/*
+ * Short form of DEFINE_PER_CPU_DYN_ARRAY_ADDR() for the case where the
+ * pointer variable itself also serves as the descriptor name.  All six
+ * arguments of the _ADDR variant are forwarded, including sizeX.
+ */
+#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX)
+
 extern void pre_alloc_dyn_array(void);
+extern unsigned long per_cpu_dyn_array_size(void);
+extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/init/main.c b/init/main.c
index 638d3a786412..416bca4f734f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -391,17 +391,19 @@ EXPORT_SYMBOL(__per_cpu_offset);
 
 static void __init setup_per_cpu_areas(void)
 {
-	unsigned long size, i;
+	unsigned long size, i, old_size;
 	char *ptr;
 	unsigned long nr_possible_cpus = num_possible_cpus();
 
 	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	old_size = PERCPU_ENOUGH_ROOM;
+	size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE);
 	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		per_cpu_alloc_dyn_array(i, ptr + old_size);
 		ptr += size;
 	}
 }
@@ -559,6 +561,63 @@ void pre_alloc_dyn_array(void)
 #endif
 }
 
+/*
+ * Compute how many extra bytes one per-cpu area needs for all arrays
+ * registered in the .per_cpu_dyn_array.init table.
+ *
+ * Each entry contributes size * (*nr) bytes, rounded up to its own
+ * alignment.  Every entry and a non-zero total are logged.
+ *
+ * Returns the total, which is 0 when CONFIG_HAVE_DYN_ARRAY is not defined
+ * or no entries are registered.
+ */
+unsigned long per_cpu_dyn_array_size(void)
+{
+	unsigned long total_size = 0;
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned long size;
+	struct dyn_array **daa;
+
+	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
+		struct dyn_array *da = *daa;
+
+		size = da->size * (*da->nr);
+		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
+		/* NOTE(review): *da->nr is an unsigned int but is printed with %d */
+		printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
+			da->size, *da->nr, da->align);
+		total_size += roundup(size, da->align);
+	}
+	if (total_size)
+		printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n",
+			 total_size);
+#endif
+	return total_size;
+}
+
+/*
+ * Hand out the per-cpu dynamic arrays of one CPU from the memory at ptr.
+ *
+ * @cpu: CPU whose per-cpu pointers are filled in
+ * @ptr: start of the memory to carve the arrays from
+ *
+ * For every registered entry the running address is rounded up to the
+ * entry's alignment, the resulting virtual address is stored in this
+ * CPU's copy of the pointer, the address is advanced by size * (*nr),
+ * and init_work, if set, is called with the descriptor.
+ *
+ * The body is empty unless CONFIG_HAVE_DYN_ARRAY is defined.
+ */
+void per_cpu_alloc_dyn_array(int cpu, char *ptr)
+{
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned long size, phys;
+	struct dyn_array **daa;
+	unsigned long addr;
+	void **array;
+
+	phys = virt_to_phys(ptr);
+
+	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
+		struct dyn_array *da = *daa;
+
+		size = da->size * (*da->nr);
+		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
+		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
+			da->size, *da->nr, da->align);
+
+		phys = roundup(phys, da->align);
+		/* locate this CPU's copy of the pointer recorded in da->name */
+		addr = (unsigned long)da->name;
+		addr += per_cpu_offset(cpu);
+		array = (void **)addr;
+		*array = phys_to_virt(phys);
+		/* also written on every call, so it ends up holding the last CPU's value */
+		*da->name = *array; /* so init_work could use it directly */
+		printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size);
+		phys += size;
+
+		if (da->init_work) {
+			da->init_work(da);
+		}
+	}
+#endif
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
-- 
cgit v1.2.3


From 1f8ff037a871690c762d267d8a052529d3102fc9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:45 -0700
Subject: x86: alloc dyn_array all together

Allocate all dyn_arrays in a single bootmem block, so that arrays with a
small alignment no longer waste memory on separate allocations.

Also tighten the alignment checking and print less debug info.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 16 +++++++----
 include/linux/init.h           |  2 +-
 init/main.c                    | 65 ++++++++++++++++++++++++++++++++----------
 3 files changed, 62 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 13ba7a83808d..2b7dab699e83 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,26 +140,31 @@ static void __init setup_cpu_pda_map(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size, old_size;
+	ssize_t size, old_size, da_size;
 	char *ptr;
 	int cpu;
+	unsigned long align = 1;
 
 	/* Setup cpu_pda map */
 	setup_cpu_pda_map();
 
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
-	size = old_size + per_cpu_dyn_array_size();
+	da_size = per_cpu_dyn_array_size(&align);
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = roundup(old_size + da_size, align);
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
+		ptr = __alloc_bootmem(size, align,
+				 __pa(MAX_DMA_ADDRESS));
 #else
 		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
+			ptr = __alloc_bootmem(size, align,
+					 __pa(MAX_DMA_ADDRESS));
 			printk(KERN_INFO
 			       "cpu %d has no node %d or node-local memory\n",
 				cpu, node);
@@ -168,7 +173,8 @@ void __init setup_per_cpu_areas(void)
 					 cpu, __pa(ptr));
 		}
 		else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+							__pa(MAX_DMA_ADDRESS));
 			if (ptr)
 				printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
 					 cpu, node, __pa(ptr));
diff --git a/include/linux/init.h b/include/linux/init.h
index 332806826b8e..59fbb4aaba6a 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -288,7 +288,7 @@ extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]
 	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
 
 extern void pre_alloc_dyn_array(void);
-extern unsigned long per_cpu_dyn_array_size(void);
+extern unsigned long per_cpu_dyn_array_size(unsigned long *align);
 extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
diff --git a/init/main.c b/init/main.c
index 416bca4f734f..ab97d0877acc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -394,10 +394,14 @@ static void __init setup_per_cpu_areas(void)
 	unsigned long size, i, old_size;
 	char *ptr;
 	unsigned long nr_possible_cpus = num_possible_cpus();
+	unsigned long align = 1;
+	unsigned da_size;
 
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
-	size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE);
+	da_size = per_cpu_dyn_array_size(&align);
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = ALIGN(old_size + da_size, align);
 	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
@@ -541,45 +545,78 @@ void __init __weak thread_info_cache_init(void)
 void pre_alloc_dyn_array(void)
 {
 #ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned long size, phys = 0;
+	unsigned long total_size = 0, size, phys;
+	unsigned long max_align = 1;
 	struct dyn_array **daa;
+	char *ptr;
 
+	/* get the total size at first */
 	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("dyna_array %s ", da->name);
-		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
+		print_fn_descriptor_symbol("dyn_array %s ", da->name);
+		printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
 			da->size, *da->nr, da->align);
-		*da->name = __alloc_bootmem(size, da->align, phys);
-		phys = virt_to_phys(*da->name);
+		total_size += roundup(size, da->align);
+		if (da->align > max_align)
+			max_align = da->align;
+	}
+	if (total_size)
+		printk(KERN_DEBUG "dyn_array total_size: %#lx\n",
+			 total_size);
+	else
+		return;
+
+	/* allocate them all together */
+	max_align = max_t(unsigned long, max_align, PAGE_SIZE);
+	ptr = __alloc_bootmem_nopanic(total_size, max_align, 0);
+	if (!ptr)
+		panic("Can not alloc dyn_alloc\n");
+
+	phys = virt_to_phys(ptr);
+	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
+		struct dyn_array *da = *daa;
+
+		size = da->size * (*da->nr);
+		print_fn_descriptor_symbol("dyn_array %s ", da->name);
+
+		phys = roundup(phys, da->align);
+		*da->name = phys_to_virt(phys);
 		printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
 
+		phys += size;
+
 		if (da->init_work)
 			da->init_work(da);
 	}
 #endif
 }
 
-unsigned long per_cpu_dyn_array_size(void)
+unsigned long per_cpu_dyn_array_size(unsigned long *align)
 {
 	unsigned long total_size = 0;
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned long size;
 	struct dyn_array **daa;
+	unsigned max_align = 1;
 
 	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
+		print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
 		printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
 			da->size, *da->nr, da->align);
 		total_size += roundup(size, da->align);
+		if (da->align > max_align)
+			max_align = da->align;
 	}
-	if (total_size)
-		printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n",
+	if (total_size) {
+		printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n",
 			 total_size);
+		*align = max_align;
+	}
 #endif
 	return total_size;
 }
@@ -593,14 +630,11 @@ void per_cpu_alloc_dyn_array(int cpu, char *ptr)
 	void **array;
 
 	phys = virt_to_phys(ptr);
-
 	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
-		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
-			da->size, *da->nr, da->align);
+		print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
 
 		phys = roundup(phys, da->align);
 		addr = (unsigned long)da->name;
@@ -608,7 +642,8 @@ void per_cpu_alloc_dyn_array(int cpu, char *ptr)
 		array = (void **)addr;
 		*array = phys_to_virt(phys);
 		*da->name = *array; /* so init_work could use it directly */
-		printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size);
+		printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
+
 		phys += size;
 
 		if (da->init_work) {
-- 
cgit v1.2.3


From 85c0f90978bf50596dbd23854648020f1f9b5bfd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:47 -0700
Subject: irq: introduce nr_irqs

At this point nr_irqs is equal to NR_IRQS.

convert a few easy users from NR_IRQS to dynamic nr_irqs.

v2: according to Eric, we need to take care of arch without generic_hardirqs

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/m68k/kernel/ints.c   |  2 ++
 arch/s390/kernel/irq.c    |  2 ++
 arch/sparc/kernel/irq.c   |  3 +++
 include/linux/interrupt.h |  2 ++
 kernel/irq/autoprobe.c    | 10 +++++-----
 kernel/irq/chip.c         | 20 ++++++++++----------
 kernel/irq/handle.c       |  3 ++-
 kernel/irq/manage.c       | 16 ++++++++--------
 kernel/irq/proc.c         |  2 +-
 kernel/irq/resend.c       |  4 ++--
 kernel/irq/spurious.c     |  4 ++--
 11 files changed, 39 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/kernel/ints.c b/arch/m68k/kernel/ints.c
index 7e8a0d394e61..74453d15692e 100644
--- a/arch/m68k/kernel/ints.c
+++ b/arch/m68k/kernel/ints.c
@@ -46,6 +46,8 @@
 #include <asm/q40ints.h>
 #endif
 
+int nr_irqs = NR_IRQS;
+
 extern u32 auto_irqhandler_fixup[];
 extern u32 user_irqhandler_fixup[];
 extern u16 user_irqvec_fixup[];
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index e7c5bfb7c755..14eb5496c8a8 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -17,6 +17,8 @@
 #include <linux/proc_fs.h>
 #include <linux/profile.h>
 
+int nr_irqs = NR_IRQS;
+
 /*
  * show_interrupts is needed by /proc/interrupts.
  */
diff --git a/arch/sparc/kernel/irq.c b/arch/sparc/kernel/irq.c
index 93e1d1c65290..059598b7e0f0 100644
--- a/arch/sparc/kernel/irq.c
+++ b/arch/sparc/kernel/irq.c
@@ -55,6 +55,9 @@
 #define SMP_NOP2
 #define SMP_NOP3
 #endif /* SMP */
+
+int nr_irqs = NR_IRQS;
+
 unsigned long __raw_local_irq_save(void)
 {
 	unsigned long retval;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 58ff4e74b2f3..511803853a5b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,6 +15,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 533068cfb607..c689e9851a80 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -38,7 +38,7 @@ unsigned long probe_irq_on(void)
 	 * something may have generated an irq long ago and we want to
 	 * flush such a longstanding irq before considering it as spurious.
 	 */
-	for (i = NR_IRQS-1; i > 0; i--) {
+	for (i = nr_irqs-1; i > 0; i--) {
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
@@ -68,7 +68,7 @@ unsigned long probe_irq_on(void)
 	 * (we must startup again here because if a longstanding irq
 	 * happened in the previous stage, it may have masked itself)
 	 */
-	for (i = NR_IRQS-1; i > 0; i--) {
+	for (i = nr_irqs-1; i > 0; i--) {
 		desc = irq_desc + i;
 
 		spin_lock_irq(&desc->lock);
@@ -89,7 +89,7 @@ unsigned long probe_irq_on(void)
 	 * Now filter out any obviously spurious interrupts
 	 */
 	mask = 0;
-	for (i = 0; i < NR_IRQS; i++) {
+	for (i = 0; i < nr_irqs; i++) {
 		unsigned int status;
 
 		desc = irq_desc + i;
@@ -130,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val)
 	int i;
 
 	mask = 0;
-	for (i = 0; i < NR_IRQS; i++) {
+	for (i = 0; i < nr_irqs; i++) {
 		struct irq_desc *desc = irq_desc + i;
 		unsigned int status;
 
@@ -173,7 +173,7 @@ int probe_irq_off(unsigned long val)
 {
 	int i, irq_found = 0, nr_irqs = 0;
 
-	for (i = 0; i < NR_IRQS; i++) {
+	for (i = 0; i < nr_irqs; i++) {
 		struct irq_desc *desc = irq_desc + i;
 		unsigned int status;
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5203a599d211..bba66e098703 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -27,7 +27,7 @@ void dynamic_irq_init(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
 	}
@@ -60,7 +60,7 @@ void dynamic_irq_cleanup(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
 		return;
 	}
@@ -92,7 +92,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
 		return -EINVAL;
 	}
@@ -121,7 +121,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
 	unsigned long flags;
 	int ret = -ENXIO;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
 		return -ENODEV;
 	}
@@ -149,7 +149,7 @@ int set_irq_data(unsigned int irq, void *data)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR
 		       "Trying to install controller data for IRQ%d\n", irq);
 		return -EINVAL;
@@ -175,7 +175,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR
 		       "Trying to install msi data for IRQ%d\n", irq);
 		return -EINVAL;
@@ -201,7 +201,7 @@ int set_irq_chip_data(unsigned int irq, void *data)
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS || !desc->chip) {
+	if (irq >= nr_irqs || !desc->chip) {
 		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
 		return -EINVAL;
 	}
@@ -545,7 +545,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR
 		       "Trying to install type control for IRQ%d\n", irq);
 		return;
@@ -610,7 +610,7 @@ void __init set_irq_noprobe(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
 
 		return;
@@ -628,7 +628,7 @@ void __init set_irq_probe(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
+	if (irq >= nr_irqs) {
 		printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
 
 		return;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f4c8a03a9fbb..e9d022cf593e 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -47,6 +47,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
  *
  * Controller mappings for all interrupt sources:
  */
+int nr_irqs = NR_IRQS;
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -265,7 +266,7 @@ void early_init_irq_lock_class(void)
 {
 	int i;
 
-	for (i = 0; i < NR_IRQS; i++)
+	for (i = 0; i < nr_irqs; i++)
 		lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d363f32dba7d..d5a4333d8f1f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -34,7 +34,7 @@ void synchronize_irq(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned int status;
 
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return;
 
 	do {
@@ -143,7 +143,7 @@ void disable_irq_nosync(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
@@ -171,7 +171,7 @@ void disable_irq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return;
 
 	disable_irq_nosync(irq);
@@ -214,7 +214,7 @@ void enable_irq(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
@@ -290,7 +290,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
 	struct irqaction *action;
 
-	if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
+	if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST)
 		return 0;
 
 	action = irq_desc[irq].action;
@@ -356,7 +356,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	int shared = 0;
 	int ret;
 
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return -EINVAL;
 
 	if (desc->chip == &no_irq_chip)
@@ -515,7 +515,7 @@ void free_irq(unsigned int irq, void *dev_id)
 	unsigned long flags;
 
 	WARN_ON(in_interrupt());
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return;
 
 	desc = irq_desc + irq;
@@ -630,7 +630,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	 */
 	if ((irqflags & IRQF_SHARED) && !dev_id)
 		return -EINVAL;
-	if (irq >= NR_IRQS)
+	if (irq >= nr_irqs)
 		return -EINVAL;
 	if (irq_desc[irq].status & IRQ_NOREQUEST)
 		return -EINVAL;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a09dd29c2fd7..e5225a65a4f9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -240,7 +240,7 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for (i = 0; i < NR_IRQS; i++)
+	for (i = 0; i < nr_irqs; i++)
 		register_irq_proc(i);
 }
 
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index a8046791ba2d..cba8aa5bc7f4 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -33,8 +33,8 @@ static void resend_irqs(unsigned long arg)
 	struct irq_desc *desc;
 	int irq;
 
-	while (!bitmap_empty(irqs_resend, NR_IRQS)) {
-		irq = find_first_bit(irqs_resend, NR_IRQS);
+	while (!bitmap_empty(irqs_resend, nr_irqs)) {
+		irq = find_first_bit(irqs_resend, nr_irqs);
 		clear_bit(irq, irqs_resend);
 		desc = irq_desc + irq;
 		local_irq_disable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 19fe9d6ebfe8..e26ca1e90c08 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,7 +91,7 @@ static int misrouted_irq(int irq)
 	int i;
 	int ok = 0;
 
-	for (i = 1; i < NR_IRQS; i++) {
+	for (i = 1; i < nr_irqs; i++) {
 		struct irq_desc *desc = irq_desc + i;
 
 		if (i == irq)	/* Already tried */
@@ -107,7 +107,7 @@ static int misrouted_irq(int irq)
 static void poll_spurious_irqs(unsigned long dummy)
 {
 	int i;
-	for (i = 1; i < NR_IRQS; i++) {
+	for (i = 1; i < nr_irqs; i++) {
 		struct irq_desc *desc = irq_desc + i;
 		unsigned int status;
 
-- 
cgit v1.2.3


From d60458b224d6b997a582a05cb8c4b9bed9e17a1d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:00 -0700
Subject: irq: make irq_desc to use dyn_array

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h |  4 ++++
 kernel/irq/handle.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1d73d1abb834..5f4b013624dc 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -179,7 +179,11 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
+extern struct irq_desc *irq_desc;
+#else
 extern struct irq_desc irq_desc[NR_IRQS];
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e9d022cf593e..e94eeca09ea9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,6 +48,36 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
  * Controller mappings for all interrupt sources:
  */
 int nr_irqs = NR_IRQS;
+
+#ifdef CONFIG_HAVE_DYN_ARRAY
+static struct irq_desc irq_desc_init __initdata = {
+	.status = IRQ_DISABLED,
+	.chip = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth = 1,
+	.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity = CPU_MASK_ALL
+#endif
+};
+
+static void __init init_work(void *data)
+{
+	struct dyn_array *da = data;
+	int i;
+	struct  irq_desc *desc;
+
+	desc = *da->name;
+
+	for (i = 0; i < *da->nr; i++)
+		memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc));
+}
+
+struct irq_desc *irq_desc;
+DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -60,6 +90,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 #endif
 	}
 };
+#endif
 
 /*
  * What should we do if we get a hw irq event on an illegal vector?
-- 
cgit v1.2.3


From d17a55ded3393ad3878010bb3a8243a15a8d8df5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:01 -0700
Subject: irq: make irqs in kernel stat use per_cpu_dyn_array

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h | 4 ++++
 kernel/sched.c              | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a91c9c..fe1f7fe534b4 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,11 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned int *irqs;
+#else
 	unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f230596bd0c..b9d713781b5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4048,9 +4048,12 @@ static inline void idle_balance(int cpu, struct rq *rq)
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
-
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
+DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL);
+#endif
+
 /*
  * Return p->sum_exec_runtime plus any more ns on the sched_clock
  * that have not yet been banked in case the task is currently running.
-- 
cgit v1.2.3


From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:05 -0700
Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead
 of irq_desc[]

Add CONFIG_HAVE_SPARSE_IRQ to enable use of a condensed array.
Get rid of irq_desc[] array assumptions.

Preallocate 32 irq_desc, and irq_desc() will try to get more.

( No change in functionality is expected anywhere, except the odd build
  failure where we missed a code site or where a crossing commit introduces
  new irq_desc[] usage. )

v2: according to Eric, change get_irq_desc() to irq_desc()

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                        |   4 ++
 arch/x86/Kconfig                    |   1 +
 arch/x86/kernel/io_apic_32.c        |  46 ++++++++----
 arch/x86/kernel/io_apic_64.c        |  75 +++++++++++++-------
 arch/x86/kernel/irq_32.c            |  24 ++++---
 arch/x86/kernel/irq_64.c            |  35 ++++-----
 arch/x86/kernel/irqinit_64.c        |  10 +--
 arch/x86/kernel/visws_quirks.c      |  30 ++++----
 arch/x86/mach-voyager/voyager_smp.c |   4 +-
 drivers/gpio/gpiolib.c              |   2 +-
 drivers/mfd/asic3.c                 |   4 +-
 drivers/mfd/htc-egpio.c             |   2 +-
 drivers/parisc/dino.c               |   6 +-
 drivers/parisc/eisa.c               |   4 +-
 drivers/parisc/gsc.c                |  12 ++--
 drivers/parisc/iosapic.c            |   4 +-
 drivers/parisc/superio.c            |   4 +-
 drivers/pcmcia/hd64465_ss.c         |  12 +++-
 drivers/xen/events.c                |   8 ++-
 include/linux/irq.h                 |  32 ++++++---
 kernel/irq/autoprobe.c              |  10 +--
 kernel/irq/chip.c                   |  32 +++++----
 kernel/irq/handle.c                 | 138 ++++++++++++++++++++++++++++++++----
 kernel/irq/manage.c                 |  35 +++++----
 kernel/irq/migration.c              |  14 ++--
 kernel/irq/proc.c                   |  36 +++++-----
 kernel/irq/resend.c                 |   2 +-
 kernel/irq/spurious.c               |   5 +-
 28 files changed, 404 insertions(+), 187 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index c1f9febb404f..b36762246265 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -105,3 +105,7 @@ config HAVE_CLK
 
 config HAVE_DYN_ARRAY
 	def_bool n
+
+config HAVE_SPARSE_IRQ
+	def_bool n
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42f98009d752..1004888e9b13 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_DYN_ARRAY
+	select HAVE_SPARSE_IRQ if X86_64
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 7f2bcc3dad82..c2160cfdec9b 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, cpumask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -365,7 +366,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 			break;
 		entry = irq_2_pin + entry->next;
 	}
-	irq_desc[irq].affinity = cpumask;
+	desc = irq_to_desc(irq);
+	desc->affinity = cpumask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -475,10 +477,12 @@ static inline void balance_irq(int cpu, int irq)
 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
 {
 	int i, j;
+	struct irq_desc *desc;
 
 	for_each_online_cpu(i) {
 		for (j = 0; j < nr_irqs; j++) {
-			if (!irq_desc[j].action)
+			desc = irq_to_desc(j);
+			if (!desc->action)
 				continue;
 			/* Is it a significant load ?  */
 			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
@@ -505,6 +509,7 @@ static void do_irq_balance(void)
 	unsigned long tmp_cpu_irq;
 	unsigned long imbalance = 0;
 	cpumask_t allowed_mask, target_cpu_mask, tmp;
+	struct irq_desc *desc;
 
 	for_each_possible_cpu(i) {
 		int package_index;
@@ -515,7 +520,8 @@ static void do_irq_balance(void)
 		for (j = 0; j < nr_irqs; j++) {
 			unsigned long value_now, delta;
 			/* Is this an active IRQ or balancing disabled ? */
-			if (!irq_desc[j].action || irq_balancing_disabled(j))
+			desc = irq_to_desc(j);
+			if (!desc->action || irq_balancing_disabled(j))
 				continue;
 			if (package_index == i)
 				IRQ_DELTA(package_index, j) = 0;
@@ -609,7 +615,8 @@ tryanotherirq:
 	selected_irq = -1;
 	for (j = 0; j < nr_irqs; j++) {
 		/* Is this an active IRQ? */
-		if (!irq_desc[j].action)
+		desc = irq_to_desc(j);
+		if (!desc->action)
 			continue;
 		if (imbalance <= IRQ_DELTA(max_loaded, j))
 			continue;
@@ -682,10 +689,12 @@ static int balanced_irq(void *unused)
 	int i;
 	unsigned long prev_balance_time = jiffies;
 	long time_remaining = balanced_irq_interval;
+	struct irq_desc *desc;
 
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < nr_irqs ; i++) {
-		irq_desc[i].pending_mask = cpumask_of_cpu(0);
+		desc = irq_to_desc(i);
+		desc->pending_mask = cpumask_of_cpu(0);
 		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
@@ -1254,13 +1263,16 @@ static struct irq_chip ioapic_chip;
 
 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL) {
-		irq_desc[irq].status |= IRQ_LEVEL;
+		desc->status |= IRQ_LEVEL;
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_fasteoi_irq, "fasteoi");
 	} else {
-		irq_desc[irq].status &= ~IRQ_LEVEL;
+		desc->status &= ~IRQ_LEVEL;
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_edge_irq, "edge");
 	}
@@ -2027,6 +2039,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
 static inline void init_IO_APIC_traps(void)
 {
 	int irq;
+	struct irq_desc *desc;
 
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -2048,9 +2061,11 @@ static inline void init_IO_APIC_traps(void)
 			 */
 			if (irq < 16)
 				make_8259A_irq(irq);
-			else
+			else {
+				desc = irq_to_desc(irq);
 				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_chip;
+				desc->chip = &no_irq_chip;
+			}
 		}
 	}
 }
@@ -2089,7 +2104,10 @@ static struct irq_chip lapic_chip __read_mostly = {
 
 static void lapic_register_intr(int irq, int vector)
 {
-	irq_desc[irq].status &= ~IRQ_LEVEL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
 	set_intr_gate(vector, interrupt[irq]);
@@ -2556,6 +2574,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	unsigned int dest;
 	cpumask_t tmp;
 	int vector;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2575,7 +2594,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
 
@@ -2649,6 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 {
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2659,7 +2680,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 	dest = cpu_mask_to_apicid(mask);
 
 	target_ht_irq(irq, dest);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif
 
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 93a3ffabfe6a..cab5a25d81b1 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -361,9 +362,10 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
 	__target_IO_APIC_irq(irq, dest, cfg->vector);
-	irq_desc[irq].affinity = mask;
+	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 #endif
@@ -933,14 +935,17 @@ static struct irq_chip ir_ioapic_chip;
 
 static void ioapic_register_intr(int irq, unsigned long trigger)
 {
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
 	if (trigger)
-		irq_desc[irq].status |= IRQ_LEVEL;
+		desc->status |= IRQ_LEVEL;
 	else
-		irq_desc[irq].status &= ~IRQ_LEVEL;
+		desc->status &= ~IRQ_LEVEL;
 
 #ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
-		irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
+		desc->status |= IRQ_MOVE_PCNTXT;
 		if (trigger)
 			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
 						      handle_fasteoi_irq,
@@ -1596,10 +1601,10 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 static void migrate_ioapic_irq(int irq, cpumask_t mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
-	int modify_ioapic_rte = desc->status & IRQ_LEVEL;
+	int modify_ioapic_rte;
 	unsigned int dest;
 	unsigned long flags;
 
@@ -1616,6 +1621,8 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
+	desc = irq_to_desc(irq);
+	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
 		__target_IO_APIC_irq(irq, dest, cfg->vector);
@@ -1637,12 +1644,13 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
 		cfg->move_in_progress = 0;
 	}
 
-	irq_desc[irq].affinity = mask;
+	desc->affinity = mask;
 }
 
 static int migrate_irq_remapped_level(int irq)
 {
 	int ret = -1;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	mask_IO_APIC_irq(irq);
 
@@ -1658,11 +1666,11 @@ static int migrate_irq_remapped_level(int irq)
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
+	migrate_ioapic_irq(irq, desc->pending_mask);
 
 	ret = 0;
-	irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
-	cpus_clear(irq_desc[irq].pending_mask);
+	desc->status &= ~IRQ_MOVE_PENDING;
+	cpus_clear(desc->pending_mask);
 
 unmask:
 	unmask_IO_APIC_irq(irq);
@@ -1674,7 +1682,7 @@ static void ir_irq_migration(struct work_struct *work)
 	int irq;
 
 	for (irq = 0; irq < nr_irqs; irq++) {
-		struct irq_desc *desc = irq_desc + irq;
+		struct irq_desc *desc = irq_to_desc(irq);
 		if (desc->status & IRQ_MOVE_PENDING) {
 			unsigned long flags;
 
@@ -1686,8 +1694,7 @@ static void ir_irq_migration(struct work_struct *work)
 				continue;
 			}
 
-			desc->chip->set_affinity(irq,
-					         irq_desc[irq].pending_mask);
+			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -1698,9 +1705,11 @@ static void ir_irq_migration(struct work_struct *work)
  */
 static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
-	if (irq_desc[irq].status & IRQ_LEVEL) {
-		irq_desc[irq].status |= IRQ_MOVE_PENDING;
-		irq_desc[irq].pending_mask = mask;
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc->status & IRQ_LEVEL) {
+		desc->status |= IRQ_MOVE_PENDING;
+		desc->pending_mask = mask;
 		migrate_irq_remapped_level(irq);
 		return;
 	}
@@ -1725,7 +1734,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		if (irq >= nr_irqs)
 			continue;
 
-		desc = irq_desc + irq;
+		desc = irq_to_desc(irq);
 		cfg = irq_cfg + irq;
 		spin_lock(&desc->lock);
 		if (!cfg->move_cleanup_count)
@@ -1791,7 +1800,7 @@ static void ack_apic_level(unsigned int irq)
 	irq_complete_move(irq);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
 		mask_IO_APIC_irq(irq);
 	}
@@ -1868,6 +1877,7 @@ static struct irq_chip ir_ioapic_chip __read_mostly = {
 static inline void init_IO_APIC_traps(void)
 {
 	int irq;
+	struct irq_desc *desc;
 
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1889,9 +1899,11 @@ static inline void init_IO_APIC_traps(void)
 			 */
 			if (irq < 16)
 				make_8259A_irq(irq);
-			else
+			else {
+				desc = irq_to_desc(irq);
 				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_chip;
+				desc->chip = &no_irq_chip;
+			}
 		}
 	}
 }
@@ -1926,7 +1938,10 @@ static struct irq_chip lapic_chip __read_mostly = {
 
 static void lapic_register_intr(int irq)
 {
-	irq_desc[irq].status &= ~IRQ_LEVEL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
 }
@@ -2402,6 +2417,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2421,7 +2437,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 
 #ifdef CONFIG_INTR_REMAP
@@ -2435,6 +2452,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	unsigned int dest;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2469,7 +2487,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 		cfg->move_in_progress = 0;
 	}
 
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif
 #endif /* CONFIG_SMP */
@@ -2543,7 +2562,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 
 #ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
-		struct irq_desc *desc = irq_desc + irq;
+		struct irq_desc *desc = irq_to_desc(irq);
 		/*
 		 * irq migration in process context
 		 */
@@ -2655,6 +2674,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2674,7 +2694,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
 
@@ -2731,6 +2752,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 	struct irq_cfg *cfg = irq_cfg + irq;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2743,7 +2765,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif
 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4c7ffb32854c..ede513be517d 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -224,7 +224,7 @@ unsigned int do_IRQ(struct pt_regs *regs)
 	struct pt_regs *old_regs;
 	/* high bit used in ret_from_ code */
 	int overflow, irq = ~regs->orig_ax;
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (unlikely((unsigned)irq >= nr_irqs)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
@@ -273,15 +273,16 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i < nr_irqs) {
 		unsigned any_count = 0;
+		struct irq_desc *desc = irq_to_desc(i);
 
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
+		spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 		any_count = kstat_irqs(i);
 #else
 		for_each_online_cpu(j)
 			any_count |= kstat_cpu(j).irqs[i];
 #endif
-		action = irq_desc[i].action;
+		action = desc->action;
 		if (!action && !any_count)
 			goto skip;
 		seq_printf(p, "%3d: ",i);
@@ -291,8 +292,8 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %8s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
+		seq_printf(p, " %8s", desc->chip->name);
+		seq_printf(p, "-%-8s", desc->name);
 
 		if (action) {
 			seq_printf(p, "  %s", action->name);
@@ -302,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 		seq_putc(p, '\n');
 skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		spin_unlock_irqrestore(&desc->lock, flags);
 	} else if (i == nr_irqs) {
 		seq_printf(p, "NMI: ");
 		for_each_online_cpu(j)
@@ -398,17 +399,20 @@ void fixup_irqs(cpumask_t map)
 
 	for (irq = 0; irq < nr_irqs; irq++) {
 		cpumask_t mask;
+		struct irq_desc *desc;
+
 		if (irq == 2)
 			continue;
 
-		cpus_and(mask, irq_desc[irq].affinity, map);
+		desc = irq_to_desc(irq);
+		cpus_and(mask, desc->affinity, map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, mask);
-		else if (irq_desc[irq].action && !(warned++))
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, mask);
+		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e1f0839430d2..738eb65a924e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -83,15 +83,16 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i < nr_irqs) {
 		unsigned any_count = 0;
+		struct irq_desc *desc = irq_to_desc(i);
 
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
+		spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 		any_count = kstat_irqs(i);
 #else
 		for_each_online_cpu(j)
 			any_count |= kstat_cpu(j).irqs[i];
 #endif
-		action = irq_desc[i].action;
+		action = desc->action;
 		if (!action && !any_count)
 			goto skip;
 		seq_printf(p, "%3d: ",i);
@@ -101,8 +102,8 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %8s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
+		seq_printf(p, " %8s", desc->chip->name);
+		seq_printf(p, "-%-8s", desc->name);
 
 		if (action) {
 			seq_printf(p, "  %s", action->name);
@@ -111,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		}
 		seq_putc(p, '\n');
 skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		spin_unlock_irqrestore(&desc->lock, flags);
 	} else if (i == nr_irqs) {
 		seq_printf(p, "NMI: ");
 		for_each_online_cpu(j)
@@ -228,37 +229,39 @@ void fixup_irqs(cpumask_t map)
 		cpumask_t mask;
 		int break_affinity = 0;
 		int set_affinity = 1;
+		struct irq_desc *desc;
 
 		if (irq == 2)
 			continue;
 
+		desc = irq_to_desc(irq);
 		/* interrupt's are disabled at this point */
-		spin_lock(&irq_desc[irq].lock);
+		spin_lock(&desc->lock);
 
 		if (!irq_has_action(irq) ||
-		    cpus_equal(irq_desc[irq].affinity, map)) {
-			spin_unlock(&irq_desc[irq].lock);
+		    cpus_equal(desc->affinity, map)) {
+			spin_unlock(&desc->lock);
 			continue;
 		}
 
-		cpus_and(mask, irq_desc[irq].affinity, map);
+		cpus_and(mask, desc->affinity, map);
 		if (cpus_empty(mask)) {
 			break_affinity = 1;
 			mask = map;
 		}
 
-		if (irq_desc[irq].chip->mask)
-			irq_desc[irq].chip->mask(irq);
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
 
-		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, mask);
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (irq_desc[irq].chip->unmask)
-			irq_desc[irq].chip->unmask(irq);
+		if (desc->chip->unmask)
+			desc->chip->unmask(irq);
 
-		spin_unlock(&irq_desc[irq].lock);
+		spin_unlock(&desc->lock);
 
 		if (break_affinity && set_affinity)
 			printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 165c5d9b0d1a..0744b49b4d12 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -143,9 +143,11 @@ void __init init_ISA_irqs(void)
 	init_8259A(0);
 
 	for (i = 0; i < nr_irqs; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = NULL;
-		irq_desc[i].depth = 1;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
 
 		if (i < 16) {
 			/*
@@ -157,7 +159,7 @@ void __init init_ISA_irqs(void)
 			/*
 			 * 'high' PCI IRQs filled in on demand
 			 */
-			irq_desc[i].chip = &no_irq_chip;
+			desc->chip = &no_irq_chip;
 		}
 	}
 }
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 61a97e616f70..9d85ab384435 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
 static unsigned int startup_cobalt_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&cobalt_lock, flags);
-	if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
-		irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+	if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+		desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
 	enable_cobalt_irq(irq);
 	spin_unlock_irqrestore(&cobalt_lock, flags);
 	return 0;
@@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
 static void end_cobalt_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&cobalt_lock, flags);
-	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+	if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
 		enable_cobalt_irq(irq);
 	spin_unlock_irqrestore(&cobalt_lock, flags);
 }
@@ -626,7 +628,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 
-	desc = irq_desc + realirq;
+	desc = irq_to_desc(realirq);
 
 	/*
 	 * handle this 'virtual interrupt' as a Cobalt one now.
@@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
 	int i;
 
 	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = 0;
-		irq_desc[i].depth = 1;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = 0;
+		desc->depth = 1;
 
 		if (i == 0) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE0) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE1) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_8259) {
-			irq_desc[i].chip = &piix4_master_irq_type;
+			desc->chip = &piix4_master_irq_type;
 		}
 		else if (i < CO_IRQ_APIC0) {
-			irq_desc[i].chip = &piix4_virtual_irq_type;
+			desc->chip = &piix4_virtual_irq_type;
 		}
 		else if (IS_CO_APIC(i)) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 	}
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 199a5f4a873c..0f6e8a6523ae 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq)
  * the interrupt off to another CPU */
 static void before_handle_vic_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = irq_to_desc(irq);
 	__u8 cpu = smp_processor_id();
 
 	_raw_spin_lock(&vic_irq_lock);
@@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq)
 /* Finish the VIC interrupt: basically mask */
 static void after_handle_vic_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = irq_to_desc(irq);
 
 	_raw_spin_lock(&vic_irq_lock);
 	{
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 8d2940517c99..572d372899d3 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1058,7 +1058,7 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip)
 
 		if (!is_out) {
 			int		irq = gpio_to_irq(gpio);
-			struct irq_desc	*desc = irq_desc + irq;
+			struct irq_desc	*desc = irq_to_desc(irq);
 
 			/* This races with request_irq(), set_irq_type(),
 			 * and set_irq_wake() ... but those are "rare".
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index ba5aa2008273..e4c0db4dc7b1 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -123,7 +123,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc)
 					irqnr = asic->irq_base +
 						(ASIC3_GPIOS_PER_BANK * bank)
 						+ i;
-					desc = irq_desc + irqnr;
+					desc = irq_to_desc(irqnr);
 					desc->handle_irq(irqnr, desc);
 					if (asic->irq_bothedge[bank] & bit)
 						asic3_irq_flip_edge(asic, base,
@@ -136,7 +136,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc)
 		for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) {
 			/* They start at bit 4 and go up */
 			if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) {
-				desc = irq_desc + asic->irq_base + i;
+				desc = irq_to_desc(asic->irq_base + i);
 				desc->handle_irq(asic->irq_base + i,
 						 desc);
 			}
diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c
index 6be43172dc65..ad3379fcd194 100644
--- a/drivers/mfd/htc-egpio.c
+++ b/drivers/mfd/htc-egpio.c
@@ -112,7 +112,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc)
 		/* Run irq handler */
 		pr_debug("got IRQ %d\n", irqpin);
 		irq = ei->irq_start + irqpin;
-		desc = &irq_desc[irq];
+		desc = irq_to_desc(irq);
 		desc->handle_irq(irq, desc);
 	}
 }
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index fd56128525d1..3bc54b30c3a1 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -298,7 +298,8 @@ struct pci_port_ops dino_port_ops = {
 
 static void dino_disable_irq(unsigned int irq)
 {
-	struct dino_device *dino_dev = irq_desc[irq].chip_data;
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct dino_device *dino_dev = desc->chip_data;
 	int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 
 	DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, irq);
@@ -310,7 +311,8 @@ static void dino_disable_irq(unsigned int irq)
 
 static void dino_enable_irq(unsigned int irq)
 {
-	struct dino_device *dino_dev = irq_desc[irq].chip_data;
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct dino_device *dino_dev = desc->chip_data;
 	int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS);
 	u32 tmp;
 
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 771cef592542..7891db50c483 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -346,10 +346,10 @@ static int __init eisa_probe(struct parisc_device *dev)
 	}
 	
 	/* Reserve IRQ2 */
-	irq_desc[2].action = &irq2_action;
+	irq_to_desc(2)->action = &irq2_action;
 	
 	for (i = 0; i < 16; i++) {
-		irq_desc[i].chip = &eisa_interrupt_type;
+		irq_to_desc(i)->chip = &eisa_interrupt_type;
 	}
 	
 	EISA_bus = 1;
diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c
index f7d088b897ee..e76db9e4d504 100644
--- a/drivers/parisc/gsc.c
+++ b/drivers/parisc/gsc.c
@@ -108,7 +108,8 @@ int gsc_find_local_irq(unsigned int irq, int *global_irqs, int limit)
 
 static void gsc_asic_disable_irq(unsigned int irq)
 {
-	struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct gsc_asic *irq_dev = desc->chip_data;
 	int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
 	u32 imr;
 
@@ -123,7 +124,8 @@ static void gsc_asic_disable_irq(unsigned int irq)
 
 static void gsc_asic_enable_irq(unsigned int irq)
 {
-	struct gsc_asic *irq_dev = irq_desc[irq].chip_data;
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct gsc_asic *irq_dev = desc->chip_data;
 	int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32);
 	u32 imr;
 
@@ -159,12 +161,14 @@ static struct hw_interrupt_type gsc_asic_interrupt_type = {
 int gsc_assign_irq(struct hw_interrupt_type *type, void *data)
 {
 	static int irq = GSC_IRQ_BASE;
+	struct irq_desc *desc;
 
 	if (irq > GSC_IRQ_MAX)
 		return NO_IRQ;
 
-	irq_desc[irq].chip = type;
-	irq_desc[irq].chip_data = data;
+	desc = irq_to_desc(irq);
+	desc->chip = type;
+	desc->chip_data = data;
 	return irq++;
 }
 
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 6fb3f7979f21..7beffcab2745 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -619,7 +619,9 @@ iosapic_set_irt_data( struct vector_info *vi, u32 *dp0, u32 *dp1)
 
 static struct vector_info *iosapic_get_vector(unsigned int irq)
 {
-	return irq_desc[irq].chip_data;
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc->chip_data;
 }
 
 static void iosapic_disable_irq(unsigned int irq)
diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c
index 1e8d2d17f04c..1e93c837514f 100644
--- a/drivers/parisc/superio.c
+++ b/drivers/parisc/superio.c
@@ -363,7 +363,9 @@ int superio_fixup_irq(struct pci_dev *pcidev)
 #endif
 
 	for (i = 0; i < 16; i++) {
-		irq_desc[i].chip = &superio_interrupt_type;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->chip = &superio_interrupt_type;
 	}
 
 	/*
diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c
index 117dc12ab438..9ef69cdb3183 100644
--- a/drivers/pcmcia/hd64465_ss.c
+++ b/drivers/pcmcia/hd64465_ss.c
@@ -233,15 +233,18 @@ static struct hw_interrupt_type hd64465_ss_irq_type = {
  */
 static void hs_map_irq(hs_socket_t *sp, unsigned int irq)
 {
+	struct irq_desc *desc;
+
     	DPRINTK("hs_map_irq(sock=%d irq=%d)\n", sp->number, irq);
 	
 	if (irq >= HS_NUM_MAPPED_IRQS)
 	    return;
 
+	desc = irq_to_desc(irq);
     	hs_mapped_irq[irq].sock = sp;
 	/* insert ourselves as the irq controller */
-	hs_mapped_irq[irq].old_handler = irq_desc[irq].chip;
-	irq_desc[irq].chip = &hd64465_ss_irq_type;
+	hs_mapped_irq[irq].old_handler = desc->chip;
+	desc->chip = &hd64465_ss_irq_type;
 }
 
 
@@ -250,13 +253,16 @@ static void hs_map_irq(hs_socket_t *sp, unsigned int irq)
  */
 static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq)
 {
+	struct irq_desc *desc;
+
     	DPRINTK("hs_unmap_irq(sock=%d irq=%d)\n", sp->number, irq);
 	
 	if (irq >= HS_NUM_MAPPED_IRQS)
 	    return;
 		
+	desc = irq_to_desc(irq);
 	/* restore the original irq controller */
-	irq_desc[irq].chip = hs_mapped_irq[irq].old_handler;
+	desc->chip = hs_mapped_irq[irq].old_handler;
 }
 
 /*============================================================*/
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index ed8235187dc0..56ace47f24d6 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+	irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
 #endif
 
 	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
@@ -139,8 +139,10 @@ static void init_evtchn_cpu_bindings(void)
 #ifdef CONFIG_SMP
 	int i;
 	/* By default all event channels notify CPU#0. */
-	for (i = 0; i < nr_irqs; i++)
-		irq_desc[i].affinity = cpumask_of_cpu(0);
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_desc *desc = irq_to_desc(i);
+		desc->affinity = cpumask_of_cpu(0);
+	}
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5f4b013624dc..80b8200f2adb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -152,6 +152,10 @@ struct irq_chip {
  * @name:		flow handler name for /proc/interrupts output
  */
 struct irq_desc {
+	unsigned int		irq;
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	struct irq_desc		*next;
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -179,9 +183,9 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-extern struct irq_desc *irq_desc;
-#else
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+#ifndef CONFIG_HAVE_DYN_ARRAY
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
@@ -249,7 +253,10 @@ extern int no_irq_affinity;
 
 static inline int irq_balancing_disabled(unsigned int irq)
 {
-	return irq_desc[irq].status & IRQ_NO_BALANCING_MASK;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	return desc->status & IRQ_NO_BALANCING_MASK;
 }
 
 /* Handle irq action chains: */
@@ -281,7 +288,7 @@ extern unsigned int __do_IRQ(unsigned int irq);
  */
 static inline void generic_handle_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 	desc->handle_irq(irq, desc);
@@ -325,7 +332,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 static inline void __set_irq_handler_unlocked(int irq,
 					      irq_flow_handler_t handler)
 {
-	irq_desc[irq].handle_irq = handler;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->handle_irq = handler;
 }
 
 /*
@@ -359,7 +369,7 @@ extern void destroy_irq(unsigned int irq);
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_has_action(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	return desc->action != NULL;
 }
 
@@ -374,10 +384,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data);
 extern int set_irq_type(unsigned int irq, unsigned int type);
 extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
-#define get_irq_chip(irq)	(irq_desc[irq].chip)
-#define get_irq_chip_data(irq)	(irq_desc[irq].chip_data)
-#define get_irq_data(irq)	(irq_desc[irq].handler_data)
-#define get_irq_msi(irq)	(irq_desc[irq].msi_desc)
+#define get_irq_chip(irq)	(irq_to_desc(irq)->chip)
+#define get_irq_chip_data(irq)	(irq_to_desc(irq)->chip_data)
+#define get_irq_data(irq)	(irq_to_desc(irq)->handler_data)
+#define get_irq_msi(irq)	(irq_to_desc(irq)->msi_desc)
 
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index c689e9851a80..c45ab718cf07 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -39,7 +39,7 @@ unsigned long probe_irq_on(void)
 	 * flush such a longstanding irq before considering it as spurious.
 	 */
 	for (i = nr_irqs-1; i > 0; i--) {
-		desc = irq_desc + i;
+		desc = irq_to_desc(i);
 
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
@@ -69,7 +69,7 @@ unsigned long probe_irq_on(void)
 	 * happened in the previous stage, it may have masked itself)
 	 */
 	for (i = nr_irqs-1; i > 0; i--) {
-		desc = irq_desc + i;
+		desc = irq_to_desc(i);
 
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
@@ -92,7 +92,7 @@ unsigned long probe_irq_on(void)
 	for (i = 0; i < nr_irqs; i++) {
 		unsigned int status;
 
-		desc = irq_desc + i;
+		desc = irq_to_desc(i);
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -131,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
 
 	mask = 0;
 	for (i = 0; i < nr_irqs; i++) {
-		struct irq_desc *desc = irq_desc + i;
+		struct irq_desc *desc = irq_to_desc(i);
 		unsigned int status;
 
 		spin_lock_irq(&desc->lock);
@@ -174,7 +174,7 @@ int probe_irq_off(unsigned long val)
 	int i, irq_found = 0, nr_irqs = 0;
 
 	for (i = 0; i < nr_irqs; i++) {
-		struct irq_desc *desc = irq_desc + i;
+		struct irq_desc *desc = irq_to_desc(i);
 		unsigned int status;
 
 		spin_lock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index bba66e098703..76c225cf4b26 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -33,7 +33,7 @@ void dynamic_irq_init(unsigned int irq)
 	}
 
 	/* Ensure we don't have left over values from a previous use of this irq */
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->status = IRQ_DISABLED;
 	desc->chip = &no_irq_chip;
@@ -65,7 +65,7 @@ void dynamic_irq_cleanup(unsigned int irq)
 		return;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	if (desc->action) {
 		spin_unlock_irqrestore(&desc->lock, flags);
@@ -100,7 +100,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	if (!chip)
 		chip = &no_irq_chip;
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->chip = chip;
@@ -126,7 +126,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
 		return -ENODEV;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	if (type == IRQ_TYPE_NONE)
 		return 0;
 
@@ -155,7 +155,7 @@ int set_irq_data(unsigned int irq, void *data)
 		return -EINVAL;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->handler_data = data;
 	spin_unlock_irqrestore(&desc->lock, flags);
@@ -180,7 +180,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 		       "Trying to install msi data for IRQ%d\n", irq);
 		return -EINVAL;
 	}
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->msi_desc = entry;
 	if (entry)
@@ -198,9 +198,10 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
  */
 int set_irq_chip_data(unsigned int irq, void *data)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	desc = irq_to_desc(irq);
 	if (irq >= nr_irqs || !desc->chip) {
 		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
 		return -EINVAL;
@@ -219,8 +220,9 @@ EXPORT_SYMBOL(set_irq_chip_data);
  */
 static void default_enable(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc;
 
+	desc = irq_to_desc(irq);
 	desc->chip->unmask(irq);
 	desc->status &= ~IRQ_MASKED;
 }
@@ -237,7 +239,10 @@ static void default_disable(unsigned int irq)
  */
 static unsigned int default_startup(unsigned int irq)
 {
-	irq_desc[irq].chip->enable(irq);
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->chip->enable(irq);
 
 	return 0;
 }
@@ -247,8 +252,9 @@ static unsigned int default_startup(unsigned int irq)
  */
 static void default_shutdown(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc;
 
+	desc = irq_to_desc(irq);
 	desc->chip->mask(irq);
 	desc->status |= IRQ_MASKED;
 }
@@ -551,7 +557,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		return;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 
 	if (!handle)
 		handle = handle_bad_irq;
@@ -616,7 +622,7 @@ void __init set_irq_noprobe(unsigned int irq)
 		return;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->status |= IRQ_NOPROBE;
@@ -634,7 +640,7 @@ void __init set_irq_probe(unsigned int irq)
 		return;
 	}
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->status &= ~IRQ_NOPROBE;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6ce3bcc2b8f7..9fc33b3378e6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,6 +18,14 @@
 
 #include "internals.h"
 
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+#endif
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -51,7 +59,8 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_HAVE_DYN_ARRAY
-static struct irq_desc irq_desc_init __initdata = {
+static struct irq_desc irq_desc_init = {
+	.irq = -1U,
 	.status = IRQ_DISABLED,
 	.chip = &no_irq_chip,
 	.handle_irq = handle_bad_irq,
@@ -62,6 +71,27 @@ static struct irq_desc irq_desc_init __initdata = {
 #endif
 };
 
+
+static void init_one_irq_desc(struct irq_desc *desc)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+#ifdef CONFIG_TRACE_IRQFLAGS
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+#endif
+}
+
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static int nr_irq_desc = 32;
+
+static int __init parse_nr_irq_desc(char *arg)
+{
+	if (arg)
+		nr_irq_desc = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("nr_irq_desc", parse_nr_irq_desc);
+
 static void __init init_work(void *data)
 {
 	struct dyn_array *da = data;
@@ -71,12 +101,83 @@ static void __init init_work(void *data)
 	desc = *da->name;
 
 	for (i = 0; i < *da->nr; i++)
-		memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc));
+		init_one_irq_desc(&desc[i]);
+
+	for (i = 1; i < *da->nr; i++)
+		desc[i-1].next = &desc[i];
 }
 
-struct irq_desc *irq_desc;
+static struct irq_desc *sparse_irqs;
+DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
+
+extern int after_bootmem;
+extern void *__alloc_bootmem_nopanic(unsigned long size,
+			     unsigned long align,
+			     unsigned long goal);
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc, *desc_pri;
+	int i;
+	int count = 0;
+
+	BUG_ON(irq == -1U);
+
+	desc_pri = desc = &sparse_irqs[0];
+	while (desc) {
+		if (desc->irq == irq)
+			return desc;
+
+		if (desc->irq == -1U) {
+			desc->irq = irq;
+			return desc;
+		}
+		desc_pri = desc;
+		desc = desc->next;
+		count++;
+	}
+
+	/*
+	 *  we run out of pre-allocate ones, allocate more
+	 */
+	printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
+
+	if (after_bootmem)
+		desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC);
+	else
+		desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0);
+
+	if (!desc)
+		panic("please boot with nr_irq_desc= %d\n", count * 2);
+
+	for (i = 0; i < nr_irq_desc; i++)
+		init_one_irq_desc(&desc[i]);
+
+	for (i = 1; i < nr_irq_desc; i++)
+		desc[i-1].next = &desc[i];
+
+	desc->irq = irq;
+	desc_pri->next = desc;
+
+	return desc;
+}
+#else
+static void __init init_work(void *data)
+{
+	struct dyn_array *da = data;
+	int i;
+	struct  irq_desc *desc;
+
+	desc = *da->name;
+
+	for (i = 0; i < *da->nr; i++)
+		init_one_irq_desc(&desc[i]);
+
+}
+static struct irq_desc *irq_desc;
 DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
 
+#endif
+
 #else
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
@@ -85,12 +186,23 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
+		.lock = __SPIN_LOCK_UNLOCKED(sparse_irqs->lock),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
 	}
 };
+
+#endif
+
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_desc[irq];
+
+	return NULL;
+}
 #endif
 
 /*
@@ -99,7 +211,10 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
  */
 static void ack_bad(unsigned int irq)
 {
-	print_irq_desc(irq, irq_desc + irq);
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	print_irq_desc(irq, desc);
 	ack_bad_irq(irq);
 }
 
@@ -196,7 +311,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
  */
 unsigned int __do_IRQ(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action;
 	unsigned int status;
 
@@ -287,19 +402,16 @@ out:
 }
 #endif
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
 
+#ifdef CONFIG_TRACE_IRQFLAGS
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_HAVE_DYN_ARRAY
 	int i;
 
 	for (i = 0; i < nr_irqs; i++)
 		lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
+#endif
 }
-
 #endif
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d5a4333d8f1f..b5943e9f95aa 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;
  */
 void synchronize_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned int status;
 
 	if (irq >= nr_irqs)
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq);
  */
 int irq_can_set_affinity(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
 	    !desc->chip->set_affinity)
@@ -81,7 +81,7 @@ int irq_can_set_affinity(unsigned int irq)
  */
 int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (!desc->chip->set_affinity)
 		return -EINVAL;
@@ -111,14 +111,16 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 int irq_select_affinity(unsigned int irq)
 {
 	cpumask_t mask;
+	struct irq_desc *desc;
 
 	if (!irq_can_set_affinity(irq))
 		return 0;
 
 	cpus_and(mask, cpu_online_map, irq_default_affinity);
 
-	irq_desc[irq].affinity = mask;
-	irq_desc[irq].chip->set_affinity(irq, mask);
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
+	desc->chip->set_affinity(irq, mask);
 
 	set_balance_irq_affinity(irq, mask);
 	return 0;
@@ -140,7 +142,7 @@ int irq_select_affinity(unsigned int irq)
  */
 void disable_irq_nosync(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 
 	if (irq >= nr_irqs)
@@ -169,7 +171,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
  */
 void disable_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (irq >= nr_irqs)
 		return;
@@ -211,7 +213,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
  */
 void enable_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 
 	if (irq >= nr_irqs)
@@ -225,7 +227,7 @@ EXPORT_SYMBOL(enable_irq);
 
 static int set_irq_wake_real(unsigned int irq, unsigned int on)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	int ret = -ENXIO;
 
 	if (desc->chip->set_wake)
@@ -248,7 +250,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
  */
 int set_irq_wake(unsigned int irq, unsigned int on)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 	int ret = 0;
 
@@ -288,12 +290,13 @@ EXPORT_SYMBOL(set_irq_wake);
  */
 int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action;
 
-	if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST)
+	if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST)
 		return 0;
 
-	action = irq_desc[irq].action;
+	action = desc->action;
 	if (action)
 		if (irqflags & action->flags & IRQF_SHARED)
 			action = NULL;
@@ -349,7 +352,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
  */
 int setup_irq(unsigned int irq, struct irqaction *new)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *old, **p;
 	const char *old_name = NULL;
 	unsigned long flags;
@@ -518,7 +521,7 @@ void free_irq(unsigned int irq, void *dev_id)
 	if (irq >= nr_irqs)
 		return;
 
-	desc = irq_desc + irq;
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&desc->lock, flags);
 	p = &desc->action;
 	for (;;) {
@@ -615,6 +618,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 {
 	struct irqaction *action;
 	int retval;
+	struct irq_desc *desc;
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -632,7 +636,8 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		return -EINVAL;
 	if (irq >= nr_irqs)
 		return -EINVAL;
-	if (irq_desc[irq].status & IRQ_NOREQUEST)
+	desc = irq_to_desc(irq);
+	if (desc->status & IRQ_NOREQUEST)
 		return -EINVAL;
 	if (!handler)
 		return -EINVAL;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc875c5..90b920d3f52b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,18 +3,18 @@
 
 void set_pending_irq(unsigned int irq, cpumask_t mask)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 
 	spin_lock_irqsave(&desc->lock, flags);
 	desc->status |= IRQ_MOVE_PENDING;
-	irq_desc[irq].pending_mask = mask;
+	desc->pending_mask = mask;
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 void move_masked_irq(int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -30,7 +30,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
+	if (unlikely(cpus_empty(desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -38,7 +38,7 @@ void move_masked_irq(int irq)
 
 	assert_spin_locked(&desc->lock);
 
-	cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
+	cpus_and(tmp, desc->pending_mask, cpu_online_map);
 
 	/*
 	 * If there was a valid mask to work with, please
@@ -55,12 +55,12 @@ void move_masked_irq(int irq)
 	if (likely(!cpus_empty(tmp))) {
 		desc->chip->set_affinity(irq,tmp);
 	}
-	cpus_clear(irq_desc[irq].pending_mask);
+	cpus_clear(desc->pending_mask);
 }
 
 void move_native_irq(int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e5225a65a4f9..c2f356c808f6 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
 
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
-	struct irq_desc *desc = irq_desc + (long)m->private;
+	struct irq_desc *desc = irq_to_desc((long)m->private);
 	cpumask_t *mask = &desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	cpumask_t new_value;
 	int err;
 
-	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+	if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
 		return -EIO;
 
@@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = {
 static int irq_spurious_read(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
 {
-	struct irq_desc *d = &irq_desc[(long) data];
+	struct irq_desc *desc = irq_to_desc((long) data);
 	return sprintf(page, "count %u\n"
 			     "unhandled %u\n"
 			     "last_unhandled %u ms\n",
-			d->irq_count,
-			d->irqs_unhandled,
-			jiffies_to_msecs(d->last_unhandled));
+			desc->irq_count,
+			desc->irqs_unhandled,
+			jiffies_to_msecs(desc->last_unhandled));
 }
 
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action;
 	unsigned long flags;
 	int ret = 1;
@@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
 void register_handler_proc(unsigned int irq, struct irqaction *action)
 {
 	char name [MAX_NAMELEN];
+	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!irq_desc[irq].dir || action->dir || !action->name ||
+	if (!desc->dir || action->dir || !action->name ||
 					!name_unique(irq, action))
 		return;
 
@@ -174,7 +175,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 	snprintf(name, MAX_NAMELEN, "%s", action->name);
 
 	/* create /proc/irq/1234/handler/ */
-	action->dir = proc_mkdir(name, irq_desc[irq].dir);
+	action->dir = proc_mkdir(name, desc->dir);
 }
 
 #undef MAX_NAMELEN
@@ -185,25 +186,24 @@ void register_irq_proc(unsigned int irq)
 {
 	char name [MAX_NAMELEN];
 	struct proc_dir_entry *entry;
+	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!root_irq_dir ||
-		(irq_desc[irq].chip == &no_irq_chip) ||
-			irq_desc[irq].dir)
+	if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
 		return;
 
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
-	irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
+	desc->dir = proc_mkdir(name, root_irq_dir);
 
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
-	proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
+	proc_create_data("smp_affinity", 0600, desc->dir,
 			 &irq_affinity_proc_fops, (void *)(long)irq);
 #endif
 
-	entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
+	entry = create_proc_entry("spurious", 0444, desc->dir);
 	if (entry) {
 		entry->data = (void *)(long)irq;
 		entry->read_proc = irq_spurious_read;
@@ -214,8 +214,10 @@ void register_irq_proc(unsigned int irq)
 
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 {
-	if (action->dir)
-		remove_proc_entry(action->dir->name, irq_desc[irq].dir);
+	if (action->dir) {
+		struct irq_desc *desc = irq_to_desc(irq);
+		remove_proc_entry(action->dir->name, desc->dir);
+	}
 }
 
 void register_default_affinity_proc(void)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index cba8aa5bc7f4..89c7117acf2b 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -36,7 +36,7 @@ static void resend_irqs(unsigned long arg)
 	while (!bitmap_empty(irqs_resend, nr_irqs)) {
 		irq = find_first_bit(irqs_resend, nr_irqs);
 		clear_bit(irq, irqs_resend);
-		desc = irq_desc + irq;
+		desc = irq_to_desc(irq);
 		local_irq_disable();
 		desc->handle_irq(irq, desc);
 		local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e26ca1e90c08..b5d906002e1d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -92,11 +92,12 @@ static int misrouted_irq(int irq)
 	int ok = 0;
 
 	for (i = 1; i < nr_irqs; i++) {
-		struct irq_desc *desc = irq_desc + i;
+		struct irq_desc *desc;
 
 		if (i == irq)	/* Already tried */
 			continue;
 
+		desc = irq_to_desc(i);
 		if (try_one_irq(i, desc))
 			ok = 1;
 	}
@@ -108,7 +109,7 @@ static void poll_spurious_irqs(unsigned long dummy)
 {
 	int i;
 	for (i = 1; i < nr_irqs; i++) {
-		struct irq_desc *desc = irq_desc + i;
+		struct irq_desc *desc = irq_to_desc(i);
 		unsigned int status;
 
 		/* Racy but it doesn't matter */
-- 
cgit v1.2.3


From 3060d6fe28570640c2d7d66d38b9eaa848c3b9e3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:08 -0700
Subject: x86: put timer_rand_state pointer into irq_desc

irq_timer_state[] is an NR_IRQS-sized array that sits side by side with
the real irq_desc[] array.

Integrate that field into the (now dynamic) irq_desc array and
save some RAM.

v2: keep the old way to support architectures that do not support irq_desc

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/char/random.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/irq.h   |  2 ++
 2 files changed, 63 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 1610aa64c7cf..60c9c7ee6b2c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,7 +558,7 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
-static struct timer_rand_state input_timer_state;
+#ifndef CONFIG_HAVE_SPARSE_IRQ
 
 #ifdef CONFIG_HAVE_DYN_ARRAY
 static struct timer_rand_state **irq_timer_state;
@@ -567,6 +567,51 @@ DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PA
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 #endif
 
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	if (irq >= nr_irqs)
+		return NULL;
+
+	return irq_timer_state[irq];
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	if (irq >= nr_irqs)
+		return;
+
+	irq_timer_state[irq] = state;
+}
+
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
+static struct timer_rand_state input_timer_state;
+
 /*
  * This function adds entropy to the entropy "pool" by using timing
  * delays.  It uses the timer_rand_state structure to make an estimate
@@ -654,11 +699,15 @@ EXPORT_SYMBOL_GPL(add_input_randomness);
 
 void add_interrupt_randomness(int irq)
 {
-	if (irq >= nr_irqs || irq_timer_state[irq] == NULL)
+	struct timer_rand_state *state;
+
+	state = get_timer_rand_state(irq);
+
+	if (state == NULL)
 		return;
 
 	DEBUG_ENT("irq event %d\n", irq);
-	add_timer_randomness(irq_timer_state[irq], 0x100 + irq);
+	add_timer_randomness(state, 0x100 + irq);
 }
 
 #ifdef CONFIG_BLOCK
@@ -918,7 +967,14 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
-	if (irq >= nr_irqs || irq_timer_state[irq])
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+	if (irq >= nr_irqs)
+		return;
+#endif
+
+	state = get_timer_rand_state(irq);
+
+	if (state)
 		return;
 
 	/*
@@ -927,7 +983,7 @@ void rand_initialize_irq(int irq)
 	 */
 	state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL);
 	if (state)
-		irq_timer_state[irq] = state;
+		set_timer_rand_state(irq, state);
 }
 
 #ifdef CONFIG_BLOCK
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 80b8200f2adb..60c856aaac0f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -127,6 +127,7 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +156,7 @@ struct irq_desc {
 	unsigned int		irq;
 #ifdef CONFIG_HAVE_SPARSE_IRQ
 	struct irq_desc		*next;
+	struct timer_rand_state *timer_rand_state;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
-- 
cgit v1.2.3


From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:09 -0700
Subject: x86: move kstat_irqs from kstat to irq_desc

based on Eric's patch ...

Mold it together with dyn_array for irq_desc; this will allocate kstat_irqs
for all nr_irq_desc entries together if needed -- at that point nr_cpus is
already known.

v2: make sure systems without generic_hardirqs work; they don't have irq_desc
v3: fix merging
v4: [mingo@elte.hu] fix typo

[ mingo@elte.hu ] irq: build fix

fix:

 arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow':
 arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs'

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_32.c   |  2 +-
 arch/x86/kernel/irq_32.c       |  4 +-
 arch/x86/kernel/irq_64.c       |  4 +-
 arch/x86/kernel/visws_quirks.c |  2 +-
 arch/x86/xen/spinlock.c        |  2 +-
 fs/proc/proc_misc.c            |  2 +-
 include/linux/irq.h            |  7 +++
 include/linux/kernel_stat.h    | 22 +++++++---
 kernel/irq/chip.c              | 15 +++----
 kernel/irq/handle.c            | 97 ++++++++++++++++++++++++++++++------------
 kernel/sched.c                 |  5 +--
 11 files changed, 106 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c2160cfdec9b..204884b1415a 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -526,7 +526,7 @@ static void do_irq_balance(void)
 			if (package_index == i)
 				IRQ_DELTA(package_index, j) = 0;
 			/* Determine the total count per processor per IRQ */
-			value_now = (unsigned long) kstat_cpu(i).irqs[j];
+			value_now = (unsigned long) kstat_irqs_cpu(j, i);
 
 			/* Determine the activity per processor per IRQ */
 			delta = value_now - LAST_CPU_IRQ(i, j);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index ede513be517d..576c5df6cad8 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -280,7 +280,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		any_count = kstat_irqs(i);
 #else
 		for_each_online_cpu(j)
-			any_count |= kstat_cpu(j).irqs[i];
+			any_count |= kstat_irqs_cpu(i, j);
 #endif
 		action = desc->action;
 		if (!action && !any_count)
@@ -290,7 +290,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
 		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
 #endif
 		seq_printf(p, " %8s", desc->chip->name);
 		seq_printf(p, "-%-8s", desc->name);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 738eb65a924e..4a0a4eb44dcb 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -90,7 +90,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		any_count = kstat_irqs(i);
 #else
 		for_each_online_cpu(j)
-			any_count |= kstat_cpu(j).irqs[i];
+			any_count |= kstat_irqs_cpu(i, j);
 #endif
 		action = desc->action;
 		if (!action && !any_count)
@@ -100,7 +100,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
 		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
 #endif
 		seq_printf(p, " %8s", desc->chip->name);
 		seq_printf(p, "-%-8s", desc->name);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 9d85ab384435..817aa55a1209 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 	/*
 	 * handle this 'virtual interrupt' as a Cobalt one now.
 	 */
-	kstat_cpu(smp_processor_id()).irqs[realirq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	if (likely(desc->action != NULL))
 		handle_IRQ_event(realirq, desc->action);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index dd71e3a021cd..bb6bc721b13d 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
 		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
 	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
 
-	kstat_this_cpu.irqs[irq]++;
+	kstat_irqs_this_cpu(irq_to_desc(irq))++;
 
 out:
 	raw_local_irq_restore(flags);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a2173a2a5625..aa069acf61a0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -532,7 +532,7 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		for (j = 0; j < nr_irqs; j++) {
-			unsigned int temp = kstat_cpu(i).irqs[j];
+			unsigned int temp = kstat_irqs_cpu(j, i);
 			sum += temp;
 			per_irq_sum[j] += temp;
 		}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 60c856aaac0f..cbf471aee1ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -157,6 +157,11 @@ struct irq_desc {
 #ifdef CONFIG_HAVE_SPARSE_IRQ
 	struct irq_desc		*next;
 	struct timer_rand_state *timer_rand_state;
+#endif
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned int            *kstat_irqs;
+#else
+	unsigned int            kstat_irqs[NR_CPUS];
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
@@ -190,6 +195,8 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
 
 /*
  * Migration helpers for obsolete names, they will go away:
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index fe1f7fe534b4..f10616712de5 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,10 +28,8 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned int *irqs;
-#else
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_GENERIC_HARDIRQS
+       unsigned int irqs[NR_IRQS];
 #endif
 };
 
@@ -43,15 +41,25 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+       return kstat_cpu(cpu).irqs[irq];
+}
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
+
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
-static inline int kstat_irqs(int irq)
+static inline unsigned int kstat_irqs(unsigned int irq)
 {
-	int cpu, sum = 0;
+	unsigned int sum = 0;
+	int cpu;
 
 	for_each_possible_cpu(cpu)
-		sum += kstat_cpu(cpu).irqs[irq];
+		sum += kstat_irqs_cpu(irq, cpu);
 
 	return sum;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 76c225cf4b26..2aa3d4b2fce8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -312,14 +312,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 {
 	struct irqaction *action;
 	irqreturn_t action_ret;
-	const unsigned int cpu = smp_processor_id();
 
 	spin_lock(&desc->lock);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	action = desc->action;
 	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -351,7 +350,6 @@ out_unlock:
 void
 handle_level_irq(unsigned int irq, struct irq_desc *desc)
 {
-	unsigned int cpu = smp_processor_id();
 	struct irqaction *action;
 	irqreturn_t action_ret;
 
@@ -361,7 +359,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	/*
 	 * If its disabled or no action available
@@ -399,7 +397,6 @@ out_unlock:
 void
 handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 {
-	unsigned int cpu = smp_processor_id();
 	struct irqaction *action;
 	irqreturn_t action_ret;
 
@@ -409,7 +406,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		goto out;
 
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	/*
 	 * If its disabled or no action available
@@ -458,8 +455,6 @@ out:
 void
 handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 {
-	const unsigned int cpu = smp_processor_id();
-
 	spin_lock(&desc->lock);
 
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -476,7 +471,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		goto out_unlock;
 	}
 
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
@@ -531,7 +526,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
 	irqreturn_t action_ret;
 
-	kstat_this_cpu.irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9fc33b3378e6..1f346990f3f8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -37,7 +37,7 @@ void
 handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 {
 	print_irq_desc(irq, desc);
-	kstat_this_cpu.irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 	ack_bad_irq(irq);
 }
 
@@ -80,17 +80,38 @@ static void init_one_irq_desc(struct irq_desc *desc)
 #endif
 }
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-static int nr_irq_desc = 32;
+extern int after_bootmem;
+extern void *__alloc_bootmem_nopanic(unsigned long size,
+			     unsigned long align,
+			     unsigned long goal);
 
-static int __init parse_nr_irq_desc(char *arg)
+static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr)
 {
-	if (arg)
-		nr_irq_desc = simple_strtoul(arg, NULL, 0);
-	return 0;
+	unsigned long bytes, total_bytes;
+	char *ptr;
+	int i;
+	unsigned long phys;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+	total_bytes = bytes * nr_desc;
+	if (after_bootmem)
+		ptr = kzalloc(total_bytes, GFP_ATOMIC);
+	else
+		ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
+
+	if (!ptr)
+		panic(" can not allocate kstat_irqs\n");
+
+	phys = __pa(ptr);
+	printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
+
+	for (i = 0; i < nr_desc; i++) {
+		desc[i].kstat_irqs = (unsigned int *)ptr;
+		ptr += bytes;
+	}
 }
 
-early_param("nr_irq_desc", parse_nr_irq_desc);
 
 static void __init init_work(void *data)
 {
@@ -100,25 +121,44 @@ static void __init init_work(void *data)
 
 	desc = *da->name;
 
-	for (i = 0; i < *da->nr; i++)
+	for (i = 0; i < *da->nr; i++) {
 		init_one_irq_desc(&desc[i]);
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+		desc[i].irq = i;
+#endif
+	}
 
+#ifdef CONFIG_HAVE_SPARSE_IRQ
 	for (i = 1; i < *da->nr; i++)
 		desc[i-1].next = &desc[i];
+#endif
+
+	/* init kstat_irqs, nr_cpu_ids is ready already */
+	init_kstat_irqs(desc, *da->nr, nr_cpu_ids);
 }
 
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static int nr_irq_desc = 32;
+
+static int __init parse_nr_irq_desc(char *arg)
+{
+	if (arg)
+		nr_irq_desc = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("nr_irq_desc", parse_nr_irq_desc);
+
 static struct irq_desc *sparse_irqs;
 DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
 
-extern int after_bootmem;
-extern void *__alloc_bootmem_nopanic(unsigned long size,
-			     unsigned long align,
-			     unsigned long goal);
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	struct irq_desc *desc, *desc_pri;
 	int i;
 	int count = 0;
+	unsigned long phys;
+	unsigned long total_bytes;
 
 	BUG_ON(irq == -1U);
 
@@ -141,38 +181,34 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	 */
 	printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
 
+	total_bytes = sizeof(struct irq_desc) * nr_irq_desc;
 	if (after_bootmem)
-		desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC);
+		desc = kzalloc(total_bytes, GFP_ATOMIC);
 	else
-		desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0);
+		desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
 
 	if (!desc)
 		panic("please boot with nr_irq_desc= %d\n", count * 2);
 
+	phys = __pa(desc);
+	printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
+
 	for (i = 0; i < nr_irq_desc; i++)
 		init_one_irq_desc(&desc[i]);
 
 	for (i = 1; i < nr_irq_desc; i++)
 		desc[i-1].next = &desc[i];
 
+	/* init kstat_irqs, nr_cpu_ids is ready already */
+	init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids);
+
 	desc->irq = irq;
 	desc_pri->next = desc;
 
 	return desc;
 }
 #else
-static void __init init_work(void *data)
-{
-	struct dyn_array *da = data;
-	int i;
-	struct  irq_desc *desc;
-
-	desc = *da->name;
 
-	for (i = 0; i < *da->nr; i++)
-		init_one_irq_desc(&desc[i]);
-
-}
 static struct irq_desc *irq_desc;
 DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
 
@@ -315,7 +351,7 @@ unsigned int __do_IRQ(unsigned int irq)
 	struct irqaction *action;
 	unsigned int status;
 
-	kstat_this_cpu.irqs[irq]++;
+	kstat_irqs_this_cpu(desc)++;
 	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
@@ -415,3 +451,10 @@ void early_init_irq_lock_class(void)
 }
 #endif
 
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
+}
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
diff --git a/kernel/sched.c b/kernel/sched.c
index b9d713781b5b..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4048,11 +4048,8 @@ static inline void idle_balance(int cpu, struct rq *rq)
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
-EXPORT_PER_CPU_SYMBOL(kstat);
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL);
-#endif
+EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
  * Return p->sum_exec_runtime plus any more ns on the sched_clock
-- 
cgit v1.2.3


From 9059d8fa4a3a9153da53da890039f7f956cc9d19 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:10 -0700
Subject: irq: add irq_desc_without_new

add an irq_desc accessor that will not allocate any sparse entry
but returns failure if there's no entry present.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 13 +++++++++++++
 kernel/irq/handle.c | 28 ++++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index cbf471aee1ce..c9ffef7c3b44 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -191,10 +191,23 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *__irq_to_desc(unsigned int irq);
+
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+
 #ifndef CONFIG_HAVE_DYN_ARRAY
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
+#else
+extern struct irq_desc *irq_desc;
 #endif
+
+#else
+
+extern struct irq_desc *sparse_irqs;
+
+#endif
+
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 1f346990f3f8..8e55dbe50afc 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -112,7 +112,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr)
 	}
 }
 
-
 static void __init init_work(void *data)
 {
 	struct dyn_array *da = data;
@@ -149,9 +148,27 @@ static int __init parse_nr_irq_desc(char *arg)
 
 early_param("nr_irq_desc", parse_nr_irq_desc);
 
-static struct irq_desc *sparse_irqs;
+struct irq_desc *sparse_irqs;
 DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
 
+struct irq_desc *__irq_to_desc(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	BUG_ON(irq == -1U);
+
+	desc = &sparse_irqs[0];
+	while (desc) {
+		if (desc->irq == irq)
+			return desc;
+
+		if (desc->irq == -1U)
+			return NULL;
+
+		desc = desc->next;
+	}
+	return NULL;
+}
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	struct irq_desc *desc, *desc_pri;
@@ -208,8 +225,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return desc;
 }
 #else
-
-static struct irq_desc *irq_desc;
+struct irq_desc *irq_desc;
 DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
 
 #endif
@@ -239,6 +255,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 
 	return NULL;
 }
+struct irq_desc *__irq_to_desc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
 #endif
 
 /*
-- 
cgit v1.2.3


From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:11 -0700
Subject: irq: replace loop with nr_irqs with for_each_irq_desc

There are a handful of loops that go from 0 to nr_irqs and use
irq_to_desc() on them. These would allocate all the irq_desc
entries, regardless of the need for them.

Use the smarter for_each_irq_desc() iterator that will only iterate
over the present ones.

v2: make sure archs without GENERIC_HARDIRQS work too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c |  6 +++---
 arch/x86/kernel/irq_64.c     |  5 ++---
 arch/x86/kernel/irqinit_64.c | 17 +++++------------
 include/linux/irq.h          |  7 +++++++
 kernel/irq/internals.h       |  4 ++--
 kernel/irq/manage.c          |  2 +-
 kernel/irq/proc.c            | 10 +++++-----
 7 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 51ef7eb75f2e..708be9724daf 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1871,10 +1871,10 @@ unmask:
 
 static void ir_irq_migration(struct work_struct *work)
 {
-	int irq;
+	unsigned int irq;
+	struct irq_desc *desc;
 
-	for (irq = 0; irq < nr_irqs; irq++) {
-		struct irq_desc *desc = irq_to_desc(irq);
+	for_each_irq_desc(irq, desc) {
 		if (desc->status & IRQ_MOVE_PENDING) {
 			unsigned long flags;
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 4a0a4eb44dcb..b3cf55e325f5 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -224,17 +224,16 @@ void fixup_irqs(cpumask_t map)
 {
 	unsigned int irq;
 	static int warned;
+	struct irq_desc *desc;
 
-	for (irq = 0; irq < nr_irqs; irq++) {
+	for_each_irq_desc(irq, desc) {
 		cpumask_t mask;
 		int break_affinity = 0;
 		int set_affinity = 1;
-		struct irq_desc *desc;
 
 		if (irq == 2)
 			continue;
 
-		desc = irq_to_desc(irq);
 		/* interrupt's are disabled at this point */
 		spin_lock(&desc->lock);
 
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0744b49b4d12..cd9f42d028d9 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,25 +142,18 @@ void __init init_ISA_irqs(void)
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < nr_irqs; i++) {
+	for (i = 0; i < 16; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
 		desc->action = NULL;
 		desc->depth = 1;
 
-		if (i < 16) {
-			/*
-			 * 16 old-style INTA-cycle interrupts:
-			 */
-			set_irq_chip_and_handler_name(i, &i8259A_chip,
+		/*
+		 * 16 old-style INTA-cycle interrupts:
+		 */
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
 						      handle_level_irq, "XT");
-		} else {
-			/*
-			 * 'high' PCI IRQs filled in on demand
-			 */
-			desc->chip = &no_irq_chip;
-		}
 	}
 }
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c9ffef7c3b44..9de16ca8b8e5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -202,9 +202,16 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
+#ifdef CONFIG_GENERIC_HARDIRQS
+#define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq])
+#endif
+
 #else
 
 extern struct irq_desc *sparse_irqs;
+#define for_each_irq_desc(irqX, desc)		\
+	for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U)
 
 #endif
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 422dd00c8bd3..c9767e641980 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -14,11 +14,11 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 
 #ifdef CONFIG_PROC_FS
-extern void register_irq_proc(unsigned int irq);
+extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
 extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
 #else
-static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
 static inline void register_handler_proc(unsigned int irq,
 					 struct irqaction *action) { }
 static inline void unregister_handler_proc(unsigned int irq,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b5943e9f95aa..5070f55fdc16 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -478,7 +478,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	new->irq = irq;
-	register_irq_proc(irq);
+	register_irq_proc(irq, desc);
 	new->dir = NULL;
 	register_handler_proc(irq, new);
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f356c808f6..bc0993d86c8b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -182,11 +182,10 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 
 #define MAX_NAMELEN 10
 
-void register_irq_proc(unsigned int irq)
+void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
 	char name [MAX_NAMELEN];
 	struct proc_dir_entry *entry;
-	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
 		return;
@@ -230,7 +229,8 @@ void register_default_affinity_proc(void)
 
 void init_irq_proc(void)
 {
-	int i;
+	unsigned int irq;
+	struct irq_desc *desc;
 
 	/* create /proc/irq */
 	root_irq_dir = proc_mkdir("irq", NULL);
@@ -242,7 +242,7 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for (i = 0; i < nr_irqs; i++)
-		register_irq_proc(i);
+	for_each_irq_desc(irq, desc)
+		register_irq_proc(irq, desc);
 }
 
-- 
cgit v1.2.3


From c7fb03a475bd80c642c1345d85c7c550f63514b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:12 -0700
Subject: irq, fs/proc: replace loop with nr_irqs for proc/stat

Replace another nr_irqs loop to avoid the allocation of all sparse
irq entries - use for_each_irq_desc instead.

v2: make sure arch without GENERIC_HARDIRQS works too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c       | 42 ++++++++++++++++++++++++++++--------------
 include/linux/interrupt.h |  5 +++++
 2 files changed, 33 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index aa069acf61a0..c3cbabe8b38e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -30,6 +30,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
+#include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -501,17 +502,16 @@ static const struct file_operations proc_vmalloc_operations = {
 
 static int show_stat(struct seq_file *p, void *v)
 {
-	int i;
+	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
 	struct timespec boottime;
-	unsigned int *per_irq_sum;
-
-	per_irq_sum = kzalloc(sizeof(unsigned int)*nr_irqs, GFP_KERNEL);
-	if (!per_irq_sum)
-		return -ENOMEM;
+	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -520,8 +520,6 @@ static int show_stat(struct seq_file *p, void *v)
 	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
-		int j;
-
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
@@ -531,10 +529,12 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		for (j = 0; j < nr_irqs; j++) {
-			unsigned int temp = kstat_irqs_cpu(j, i);
+		for_each_irq_desc(j, desc)
+		{
+			unsigned int temp;
+
+			temp = kstat_irqs_cpu(j, i);
 			sum += temp;
-			per_irq_sum[j] += temp;
 		}
 		sum += arch_irq_stat_cpu(i);
 	}
@@ -577,8 +577,23 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
-	for (i = 0; i < nr_irqs; i++)
-		seq_printf(p, " %u", per_irq_sum[i]);
+	/* sum again ? it could be updated? */
+	for_each_irq_desc(j, desc)
+	{
+		per_irq_sum = 0;
+		for_each_possible_cpu(i) {
+			unsigned int temp;
+
+			temp = kstat_irqs_cpu(j, i);
+			per_irq_sum += temp;
+		}
+
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+		seq_printf(p, " %u:%u", j, per_irq_sum);
+#else
+		seq_printf(p, " %u", per_irq_sum);
+#endif
+	}
 
 	seq_printf(p,
 		"\nctxt %llu\n"
@@ -592,7 +607,6 @@ static int show_stat(struct seq_file *p, void *v)
 		nr_running(),
 		nr_iowait());
 
-	kfree(per_irq_sum);
 	return 0;
 }
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 511803853a5b..d4039a0b23f4 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -17,6 +17,11 @@
 
 extern int nr_irqs;
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+#define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#endif
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
-- 
cgit v1.2.3


From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:15 -0700
Subject: generic: add irq_desc in function in parameter

So callers that already hold the irq_desc can pass it in, avoiding a
duplicate irq_to_desc() lookup.

v2: make sure irq_desc in  init/main.c is not used without generic_hardirqs

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_64.c |  6 ++++--
 include/linux/irq.h      |  9 ++++++---
 init/main.c              |  7 +++++++
 kernel/irq/handle.c      | 30 ++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index a3e36336d914..f58b995b30ee 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -189,6 +189,7 @@ u64 arch_irq_stat(void)
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct irq_desc *desc;
 
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
@@ -202,8 +203,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	stack_overflow_check(regs);
 #endif
 
-	if (likely(__irq_to_desc(irq)))
-		generic_handle_irq(irq);
+	desc = __irq_to_desc(irq);
+	if (likely(desc))
+		generic_handle_irq_desc(irq, desc);
 	else {
 		if (!disable_apic)
 			ack_APIC_irq();
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9de16ca8b8e5..7b59e193a119 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -315,10 +315,8 @@ extern unsigned int __do_IRQ(unsigned int irq);
  * irqchip-style controller then we call the ->handle_irq() handler,
  * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
  */
-static inline void generic_handle_irq(unsigned int irq)
+static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 	desc->handle_irq(irq, desc);
 #else
@@ -329,6 +327,11 @@ static inline void generic_handle_irq(unsigned int irq)
 #endif
 }
 
+static inline void generic_handle_irq(unsigned int irq)
+{
+	generic_handle_irq_desc(irq, irq_to_desc(irq));
+}
+
 /* Handling of unhandled and spurious interrupts: */
 extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret);
diff --git a/init/main.c b/init/main.c
index ab97d0877acc..0d2e60144f83 100644
--- a/init/main.c
+++ b/init/main.c
@@ -590,6 +590,13 @@ void pre_alloc_dyn_array(void)
 		if (da->init_work)
 			da->init_work(da);
 	}
+#else
+#ifdef CONFIG_GENERIC_HARDIRQS
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS; i++)
+		irq_desc[i].irq = i;
+#endif /* CONFIG_GENERIC_HARDIRQS */
 #endif
 }
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 8e55dbe50afc..e1d787e9169b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -197,6 +197,21 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	 *  we run out of pre-allocate ones, allocate more
 	 */
 	printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
+	{
+		/* double check if some one mess up the list */
+		struct irq_desc *desc;
+		int count = 0;
+
+		desc = &sparse_irqs[0];
+		while (desc) {
+			printk(KERN_DEBUG "found irq_desc for irq %d\n", desc->irq);
+			if (desc->next)
+				printk(KERN_DEBUG "found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq);
+			desc = desc->next;
+			count++;
+		}
+		printk(KERN_DEBUG "all preallocted %d\n", count);
+	}
 
 	total_bytes = sizeof(struct irq_desc) * nr_irq_desc;
 	if (after_bootmem)
@@ -221,6 +236,21 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 
 	desc->irq = irq;
 	desc_pri->next = desc;
+	{
+		/* double check if some one mess up the list */
+		struct irq_desc *desc;
+		int count = 0;
+
+		desc = &sparse_irqs[0];
+		while (desc) {
+			printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq);
+			if (desc->next)
+				printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq);
+			desc = desc->next;
+			count++;
+		}
+		printk(KERN_DEBUG "1 all preallocted %d\n", count);
+	}
 
 	return desc;
 }
-- 
cgit v1.2.3


From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:17 -0700
Subject: x86_64: rename irq_desc/irq_desc_alloc

change names:

	  irq_to_desc() ==> irq_to_desc_alloc()
	__irq_to_desc() ==> irq_to_desc()

Also split a few of the uses in lowlevel x86 code.

v2: need to check if desc is null in smp_irq_move_cleanup

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 10 +++++++++-
 arch/x86/kernel/irq_64.c     |  4 ++--
 arch/x86/kernel/irqinit_64.c |  3 ++-
 include/linux/irq.h          |  2 +-
 kernel/irq/chip.c            | 21 +++++++++++----------
 kernel/irq/handle.c          | 23 +++++------------------
 kernel/irq/manage.c          | 16 ++++++++--------
 7 files changed, 38 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 1b8cccb5ba25..a054db9ef190 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -1124,7 +1124,12 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
 {
 	struct irq_desc *desc;
 
-	desc = irq_to_desc(irq);
+	/* first time to use this irq_desc */
+	if (irq < 16)
+		desc = irq_to_desc(irq);
+	else
+		desc = irq_to_desc_alloc(irq);
+
 	if (trigger)
 		desc->status |= IRQ_LEVEL;
 	else
@@ -1919,6 +1924,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		irq = __get_cpu_var(vector_irq)[vector];
 
 		desc = irq_to_desc(irq);
+		if (!desc)
+			continue;
+
 		cfg = irq_cfg(irq);
 		spin_lock(&desc->lock);
 		if (!cfg->move_cleanup_count)
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index f337f87c1e16..5d5976e0311a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -83,7 +83,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i < nr_irqs) {
 		unsigned any_count = 0;
-		struct irq_desc *desc = __irq_to_desc(i);
+		struct irq_desc *desc = irq_to_desc(i);
 
 		if (!desc)
 			return 0;
@@ -206,7 +206,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	stack_overflow_check(regs);
 #endif
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (likely(desc))
 		generic_handle_irq_desc(irq, desc);
 	else {
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index cd9f42d028d9..d17fbc26d96f 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -143,7 +143,8 @@ void __init init_ISA_irqs(void)
 	init_8259A(0);
 
 	for (i = 0; i < 16; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
+		/* first time call this irq_desc */
+		struct irq_desc *desc = irq_to_desc_alloc(i);
 
 		desc->status = IRQ_DISABLED;
 		desc->action = NULL;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7b59e193a119..5fe1b01c11fe 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -191,7 +191,7 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *__irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
 
 #ifndef CONFIG_HAVE_SPARSE_IRQ
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a4bb0da9c88c..9fc5e69213de 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -27,7 +27,8 @@ void dynamic_irq_init(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = irq_to_desc(irq);
+	/* first time to use this irq_desc */
+	desc = irq_to_desc_alloc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
@@ -60,7 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
 		return;
@@ -92,7 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
 		return -EINVAL;
@@ -122,7 +123,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
 	unsigned long flags;
 	int ret = -ENXIO;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
 		return -ENODEV;
@@ -150,7 +151,7 @@ int set_irq_data(unsigned int irq, void *data)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR
 		       "Trying to install controller data for IRQ%d\n", irq);
@@ -176,7 +177,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR
 		       "Trying to install msi data for IRQ%d\n", irq);
@@ -203,7 +204,7 @@ int set_irq_chip_data(unsigned int irq, void *data)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR
 		       "Trying to install chip data for IRQ%d\n", irq);
@@ -554,7 +555,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR
 		       "Trying to install type control for IRQ%d\n", irq);
@@ -618,7 +619,7 @@ void __init set_irq_noprobe(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
 
@@ -635,7 +636,7 @@ void __init set_irq_probe(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e1d787e9169b..d44e3515eae1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -151,7 +151,7 @@ early_param("nr_irq_desc", parse_nr_irq_desc);
 struct irq_desc *sparse_irqs;
 DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
 
-struct irq_desc *__irq_to_desc(unsigned int irq)
+struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	struct irq_desc *desc;
 
@@ -169,7 +169,7 @@ struct irq_desc *__irq_to_desc(unsigned int irq)
 	}
 	return NULL;
 }
-struct irq_desc *irq_to_desc(unsigned int irq)
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 {
 	struct irq_desc *desc, *desc_pri;
 	int i;
@@ -186,6 +186,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 
 		if (desc->irq == -1U) {
 			desc->irq = irq;
+			printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq);
 			return desc;
 		}
 		desc_pri = desc;
@@ -236,21 +237,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 
 	desc->irq = irq;
 	desc_pri->next = desc;
-	{
-		/* double check if some one mess up the list */
-		struct irq_desc *desc;
-		int count = 0;
-
-		desc = &sparse_irqs[0];
-		while (desc) {
-			printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq);
-			if (desc->next)
-				printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq);
-			desc = desc->next;
-			count++;
-		}
-		printk(KERN_DEBUG "1 all preallocted %d\n", count);
-	}
+	printk(KERN_DEBUG "1 found new irq_desc for irq %d and pri will be irq %d\n", desc->irq, desc_pri->irq);
 
 	return desc;
 }
@@ -285,7 +272,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 
 	return NULL;
 }
-struct irq_desc *__irq_to_desc(unsigned int irq)
+struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 {
 	return irq_to_desc(irq);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c0b4d4df6de2..6df49218632a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;
  */
 void synchronize_irq(unsigned int irq)
 {
-	struct irq_desc *desc = __irq_to_desc(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned int status;
 
 	if (!desc)
@@ -145,7 +145,7 @@ void disable_irq_nosync(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return;
 
@@ -174,7 +174,7 @@ void disable_irq(unsigned int irq)
 {
 	struct irq_desc *desc;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return;
 
@@ -218,7 +218,7 @@ void enable_irq(unsigned int irq)
 	struct irq_desc *desc;
 	unsigned long flags;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return;
 
@@ -296,7 +296,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 	struct irq_desc *desc;
 	struct irqaction *action;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return 0;
 
@@ -366,7 +366,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	int shared = 0;
 	int ret;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return -EINVAL;
 
@@ -527,7 +527,7 @@ void free_irq(unsigned int irq, void *dev_id)
 
 	WARN_ON(in_interrupt());
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return;
 
@@ -644,7 +644,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	if ((irqflags & IRQF_SHARED) && !dev_id)
 		return -EINVAL;
 
-	desc = __irq_to_desc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 67fb283e148e9bd761f73691d3173b6eab9ba8db Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:18 -0700
Subject: irq: separate sparse_irqs from sparse_irqs_free

so that later lookups do not need to compare against -1U

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h |   2 +-
 kernel/irq/handle.c | 115 ++++++++++++++++++++++++++++------------------------
 2 files changed, 62 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5fe1b01c11fe..d5749852ee69 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -211,7 +211,7 @@ extern struct irq_desc *irq_desc;
 
 extern struct irq_desc *sparse_irqs;
 #define for_each_irq_desc(irqX, desc)		\
-	for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U)
+	for (desc = sparse_irqs, irqX = desc ? desc->irq : -1U; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) /* NULL-safe on an empty list */
 
 #endif
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d44e3515eae1..6d174390f3a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -112,6 +112,11 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr)
 	}
 }
 
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static struct irq_desc *sparse_irqs_free;
+struct irq_desc *sparse_irqs;
+#endif
+
 static void __init init_work(void *data)
 {
 	struct dyn_array *da = data;
@@ -127,13 +132,16 @@ static void __init init_work(void *data)
 #endif
 	}
 
+	/* init kstat_irqs, nr_cpu_ids is ready already */
+	init_kstat_irqs(desc, *da->nr, nr_cpu_ids);
+
 #ifdef CONFIG_HAVE_SPARSE_IRQ
 	for (i = 1; i < *da->nr; i++)
 		desc[i-1].next = &desc[i];
-#endif
 
-	/* init kstat_irqs, nr_cpu_ids is ready already */
-	init_kstat_irqs(desc, *da->nr, nr_cpu_ids);
+	sparse_irqs_free = sparse_irqs;
+	sparse_irqs = NULL;
+#endif
 }
 
 #ifdef CONFIG_HAVE_SPARSE_IRQ
@@ -148,23 +156,17 @@ static int __init parse_nr_irq_desc(char *arg)
 
 early_param("nr_irq_desc", parse_nr_irq_desc);
 
-struct irq_desc *sparse_irqs;
 DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	struct irq_desc *desc;
 
-	BUG_ON(irq == -1U);
-
-	desc = &sparse_irqs[0];
+	desc = sparse_irqs;
 	while (desc) {
 		if (desc->irq == irq)
 			return desc;
 
-		if (desc->irq == -1U)
-			return NULL;
-
 		desc = desc->next;
 	}
 	return NULL;
@@ -174,21 +176,12 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 	struct irq_desc *desc, *desc_pri;
 	int i;
 	int count = 0;
-	unsigned long phys;
-	unsigned long total_bytes;
 
-	BUG_ON(irq == -1U);
-
-	desc_pri = desc = &sparse_irqs[0];
+	desc_pri = desc = sparse_irqs;
 	while (desc) {
 		if (desc->irq == irq)
 			return desc;
 
-		if (desc->irq == -1U) {
-			desc->irq = irq;
-			printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq);
-			return desc;
-		}
 		desc_pri = desc;
 		desc = desc->next;
 		count++;
@@ -197,48 +190,62 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 	/*
 	 *  we run out of pre-allocate ones, allocate more
 	 */
-	printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
-	{
-		/* double check if some one mess up the list */
-		struct irq_desc *desc;
-		int count = 0;
-
-		desc = &sparse_irqs[0];
-		while (desc) {
-			printk(KERN_DEBUG "found irq_desc for irq %d\n", desc->irq);
-			if (desc->next)
-				printk(KERN_DEBUG "found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq);
-			desc = desc->next;
-			count++;
-		}
-		printk(KERN_DEBUG "all preallocted %d\n", count);
-	}
+	if (!sparse_irqs_free) {
+		unsigned long phys;
+		unsigned long total_bytes;
 
-	total_bytes = sizeof(struct irq_desc) * nr_irq_desc;
-	if (after_bootmem)
-		desc = kzalloc(total_bytes, GFP_ATOMIC);
-	else
-		desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
+		printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
 
-	if (!desc)
-		panic("please boot with nr_irq_desc= %d\n", count * 2);
+		total_bytes = sizeof(struct irq_desc) * nr_irq_desc;
+		if (after_bootmem)
+			desc = kzalloc(total_bytes, GFP_ATOMIC);
+		else
+			desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
 
-	phys = __pa(desc);
-	printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
+		if (!desc)
+			panic("please boot with nr_irq_desc= %d\n", count * 2);
 
-	for (i = 0; i < nr_irq_desc; i++)
-		init_one_irq_desc(&desc[i]);
+		phys = __pa(desc);
+		printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
 
-	for (i = 1; i < nr_irq_desc; i++)
-		desc[i-1].next = &desc[i];
+		for (i = 0; i < nr_irq_desc; i++)
+			init_one_irq_desc(&desc[i]);
 
-	/* init kstat_irqs, nr_cpu_ids is ready already */
-	init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids);
+		for (i = 1; i < nr_irq_desc; i++)
+			desc[i-1].next = &desc[i];
 
-	desc->irq = irq;
-	desc_pri->next = desc;
-	printk(KERN_DEBUG "1 found new irq_desc for irq %d and pri will be irq %d\n", desc->irq, desc_pri->irq);
+		/* init kstat_irqs, nr_cpu_ids is ready already */
+		init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids);
 
+		sparse_irqs_free = desc;
+	}
+
+	desc = sparse_irqs_free;
+	sparse_irqs_free = sparse_irqs_free->next;
+	desc->next = NULL;
+	if (desc_pri)
+		desc_pri->next = desc;
+	else
+		sparse_irqs = desc;
+	desc->irq = irq;
+	printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq);
+#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG
+	{
+		/* dump the results */
+		struct irq_desc *desc;
+		unsigned long phys;
+		unsigned long bytes = sizeof(struct irq_desc);
+		unsigned int irqx;
+
+		printk(KERN_DEBUG "=========================== %d\n", irq);
+		printk(KERN_DEBUG "irq_desc dump after get that for %d\n", irq);
+		for_each_irq_desc(irqx, desc) {
+			phys = __pa(desc);
+			printk(KERN_DEBUG "irq_desc %d ==> [%#lx - %#lx]\n", irqx, phys, phys + bytes);
+		}
+		printk(KERN_DEBUG "===========================\n");
+	}
+#endif
 	return desc;
 }
 #else
-- 
cgit v1.2.3


From e420dfb40c453a9760b86c7f338052bdb4dfa755 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:21 -0700
Subject: x86: put irq_2_iommu pointer into irq_desc

When CONFIG_HAVE_SPARSE_IRQ is set, preallocate some irq_2_iommu entries
and use get_one_free_irq_2_iommu() to get a new one and link it to the
irq_desc when needed.

Otherwise, use a dyn_array or a static array.

v2: <= nr_irqs fix

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/pci/intr_remapping.c | 213 +++++++++++++++++++++++++++++++++----------
 include/linux/irq.h          |   4 +
 2 files changed, 169 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 6961be807684..23372c811159 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -19,41 +19,136 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-#ifdef CONFIG_HAVE_DYNA_ARRAY
-static struct irq_2_iommu *irq_2_iommu;
-DEFINE_DYN_ARRAY(irq_2_iommu, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL);
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+static struct irq_2_iommu *irq_2_iommuX;
+/* fill one page ? */
+static int nr_irq_2_iommu = 0x100;
+static int irq_2_iommu_index;
+DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irq_2_iommu, PAGE_SIZE, NULL);
+
+extern void *__alloc_bootmem_nopanic(unsigned long size,
+				     unsigned long align,
+				     unsigned long goal);
+
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int not_used)
+{
+	struct irq_2_iommu *iommu;
+	unsigned long total_bytes;
+
+	if (irq_2_iommu_index >= nr_irq_2_iommu) {
+		/*
+		 *  we run out of pre-allocate ones, allocate more
+		 */
+		printk(KERN_DEBUG "try to get more irq_2_iommu %d\n", nr_irq_2_iommu);
+
+		total_bytes = sizeof(struct irq_2_iommu)*nr_irq_2_iommu;
+
+		if (after_bootmem)
+			iommu = kzalloc(total_bytes, GFP_ATOMIC);
+		else
+			iommu = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
+
+		if (!iommu)
+			panic("can not get more irq_2_iommu\n");
+
+		irq_2_iommuX = iommu;
+		irq_2_iommu_index = 0;
+	}
+
+	iommu = &irq_2_iommuX[irq_2_iommu_index];
+	irq_2_iommu_index++;
+	return iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	BUG_ON(!desc);
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	desc = irq_to_desc(irq);
+
+	BUG_ON(!desc);
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(irq);
+
+	return desc->irq_2_iommu;
+}
+
+#else /* !CONFIG_HAVE_SPARSE_IRQ */
+
+#ifdef CONFIG_HAVE_DYN_ARRAY
+static struct irq_2_iommu *irq_2_iommuX;
+DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL);
 #else
-static struct irq_2_iommu irq_2_iommu[NR_IRQS];
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#endif
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu(irq);
+}
 #endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
-int irq_remapped(int irq)
+static struct irq_2_iommu *valid_irq_2_iommu(unsigned int irq)
 {
-	if (irq > nr_irqs)
-		return 0;
+	struct irq_2_iommu *irq_iommu;
+
+	irq_iommu = irq_2_iommu(irq);
 
-	if (!irq_2_iommu[irq].iommu)
-		return 0;
+	if (!irq_iommu)
+		return NULL;
 
-	return 1;
+	if (!irq_iommu->iommu)
+		return NULL;
+
+	return irq_iommu;
+}
+
+int irq_remapped(int irq)
+{
+	return valid_irq_2_iommu(irq) != NULL;
 }
 
 int get_irte(int irq, struct irte *entry)
 {
 	int index;
+	struct irq_2_iommu *irq_iommu;
 
-	if (!entry || irq > nr_irqs)
+	if (!entry)
 		return -1;
 
 	spin_lock(&irq_2_ir_lock);
-	if (!irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
-	*entry = *(irq_2_iommu[irq].iommu->ir_table->base + index);
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	*entry = *(irq_iommu->iommu->ir_table->base + index);
 
 	spin_unlock(&irq_2_ir_lock);
 	return 0;
@@ -62,6 +157,7 @@ int get_irte(int irq, struct irte *entry)
 int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 {
 	struct ir_table *table = iommu->ir_table;
+	struct irq_2_iommu *irq_iommu;
 	u16 index, start_index;
 	unsigned int mask = 0;
 	int i;
@@ -69,6 +165,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	if (!count)
 		return -1;
 
+#ifndef	CONFIG_HAVE_SPARSE_IRQ
+	/* protect irq_2_iommu_alloc later */
+	if (irq >= nr_irqs)
+		return -1;
+#endif
+
 	/*
 	 * start the IRTE search from index 0.
 	 */
@@ -108,10 +210,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	for (i = index; i < index + count; i++)
 		table->base[i].present = 1;
 
-	irq_2_iommu[irq].iommu = iommu;
-	irq_2_iommu[irq].irte_index =  index;
-	irq_2_iommu[irq].sub_handle = 0;
-	irq_2_iommu[irq].irte_mask = mask;
+	irq_iommu = irq_2_iommu_alloc(irq);
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index =  index;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = mask;
 
 	spin_unlock(&irq_2_ir_lock);
 
@@ -132,31 +235,36 @@ static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
 int map_irq_to_irte_handle(int irq, u16 *sub_handle)
 {
 	int index;
+	struct irq_2_iommu *irq_iommu;
 
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || !irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	*sub_handle = irq_2_iommu[irq].sub_handle;
-	index = irq_2_iommu[irq].irte_index;
+	*sub_handle = irq_iommu->sub_handle;
+	index = irq_iommu->irte_index;
 	spin_unlock(&irq_2_ir_lock);
 	return index;
 }
 
 int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 {
+	struct irq_2_iommu *irq_iommu;
+
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || irq_2_iommu[irq].iommu) {
+	irq_iommu = irq_2_iommu_alloc(irq);
+	if (!irq_iommu || irq_iommu->iommu) {	/* bad irq or already mapped */
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	irq_2_iommu[irq].iommu = iommu;
-	irq_2_iommu[irq].irte_index = index;
-	irq_2_iommu[irq].sub_handle = subhandle;
-	irq_2_iommu[irq].irte_mask = 0;
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index = index;
+	irq_iommu->sub_handle = subhandle;
+	irq_iommu->irte_mask = 0;
 
 	spin_unlock(&irq_2_ir_lock);
 
@@ -165,16 +273,19 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 
 int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
 {
+	struct irq_2_iommu *irq_iommu;
+
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || !irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	irq_2_iommu[irq].iommu = NULL;
-	irq_2_iommu[irq].irte_index = 0;
-	irq_2_iommu[irq].sub_handle = 0;
-	irq_2_iommu[irq].irte_mask = 0;
+	irq_iommu->iommu = NULL;
+	irq_iommu->irte_index = 0;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = 0;	/* reuse the entry validated above */
 
 	spin_unlock(&irq_2_ir_lock);
 
@@ -186,16 +297,18 @@ int modify_irte(int irq, struct irte *irte_modified)
 	int index;
 	struct irte *irte;
 	struct intel_iommu *iommu;
+	struct irq_2_iommu *irq_iommu;
 
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || !irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	iommu = irq_2_iommu[irq].iommu;
+	iommu = irq_iommu->iommu;
 
-	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
 	irte = &iommu->ir_table->base[index];
 
 	set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
@@ -211,18 +324,20 @@ int flush_irte(int irq)
 {
 	int index;
 	struct intel_iommu *iommu;
+	struct irq_2_iommu *irq_iommu;
 
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || !irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	iommu = irq_2_iommu[irq].iommu;
+	iommu = irq_iommu->iommu;
 
-	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
 
-	qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+	qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 	spin_unlock(&irq_2_ir_lock);
 
 	return 0;
@@ -254,28 +369,30 @@ int free_irte(int irq)
 	int index, i;
 	struct irte *irte;
 	struct intel_iommu *iommu;
+	struct irq_2_iommu *irq_iommu;
 
 	spin_lock(&irq_2_ir_lock);
-	if (irq >= nr_irqs || !irq_2_iommu[irq].iommu) {
+	irq_iommu = valid_irq_2_iommu(irq);
+	if (!irq_iommu) {
 		spin_unlock(&irq_2_ir_lock);
 		return -1;
 	}
 
-	iommu = irq_2_iommu[irq].iommu;
+	iommu = irq_iommu->iommu;
 
-	index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle;
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
 	irte = &iommu->ir_table->base[index];
 
-	if (!irq_2_iommu[irq].sub_handle) {
-		for (i = 0; i < (1 << irq_2_iommu[irq].irte_mask); i++)
+	if (!irq_iommu->sub_handle) {
+		for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
 			set_64bit((unsigned long *)irte, 0);
-		qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask);
+		qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 	}
 
-	irq_2_iommu[irq].iommu = NULL;
-	irq_2_iommu[irq].irte_index = 0;
-	irq_2_iommu[irq].sub_handle = 0;
-	irq_2_iommu[irq].irte_mask = 0;
+	irq_iommu->iommu = NULL;
+	irq_iommu->irte_index = 0;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = 0;
 
 	spin_unlock(&irq_2_ir_lock);
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d5749852ee69..788d5a35a580 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -128,6 +128,7 @@ struct irq_chip {
 };
 
 struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -162,6 +163,9 @@ struct irq_desc {
 	unsigned int            *kstat_irqs;
 #else
 	unsigned int            kstat_irqs[NR_CPUS];
+#endif
+#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
+       struct irq_2_iommu      *irq_2_iommu;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
-- 
cgit v1.2.3


From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:22 -0700
Subject: x86: use 28 bits irq NR for pci msi/msix and ht

Also print the irq number in /proc/interrupts and /proc/stat in hex, so the
bus/dev/func can be read from it.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic_64.c | 64 ++++++++++++++++++++++++++++++++++----------
 arch/x86/kernel/irq_64.c     |  2 +-
 drivers/pci/htirq.c          | 22 +++++++++++++--
 fs/proc/proc_misc.c          |  2 +-
 include/linux/irq.h          |  1 +
 5 files changed, 73 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 8ab7ae01773f..b0d4abc55a11 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -2520,17 +2520,21 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-int create_irq(void)
+unsigned int create_irq_nr(unsigned int irq_want)
 {
 	/* Allocate an unused irq */
-	int irq;
-	int new;
+	unsigned int irq;
+	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new;
 
-	irq = -ENOSPC;
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+	irq_want = nr_irqs - 1;
+#endif
+
+	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
-	for (new = (nr_irqs - 1); new >= 0; new--) {
+	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
 		cfg_new = irq_cfg(new);
@@ -2545,12 +2549,24 @@ int create_irq(void)
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq >= 0) {
+	if (irq > 0) {
 		dynamic_irq_init(irq);
 	}
 	return irq;
 }
 
+int create_irq(void)
+{
+	int irq;
+
+	irq = create_irq_nr(nr_irqs - 1);
+
+	if (irq == 0)
+		irq = -1;
+
+	return irq;
+}
+
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
@@ -2803,13 +2819,29 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 	return 0;
 }
 
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
-	int irq, ret;
+	unsigned int irq;
+	int ret;
+	unsigned int irq_want;
 
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
+	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+
+	irq = create_irq_nr(irq_want);
+	if (irq == 0)
+		return -1;
 
 #ifdef CONFIG_INTR_REMAP
 	if (!intr_remapping_enabled)
@@ -2836,18 +2868,22 @@ error:
 
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq, ret, sub_handle;
+	unsigned int irq;
+	int ret, sub_handle;
 	struct msi_desc *desc;
+	unsigned int irq_want;
+
 #ifdef CONFIG_INTR_REMAP
 	struct intel_iommu *iommu = 0;
 	int index = 0;
 #endif
 
+	irq_want = build_irq_for_pci_dev(dev) + 0x100;
 	sub_handle = 0;
 	list_for_each_entry(desc, &dev->msi_list, list) {
-		irq = create_irq();
-		if (irq < 0)
-			return irq;
+		irq = create_irq_nr(irq_want--);
+		if (irq == 0)
+			return -1;
 #ifdef CONFIG_INTR_REMAP
 		if (!intr_remapping_enabled)
 			goto no_ir;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 7bd841a9c640..348a11168c2b 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		action = desc->action;
 		if (!action && !any_count)
 			goto skip;
-		seq_printf(p, "%3d: ",i);
+		seq_printf(p, "%#x: ",i);
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 279c940a0039..7c5aef13fcdb 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -97,7 +109,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	u32 data;
 	int max_irq;
 	int pos;
-	int irq;
+	unsigned int irq;
+	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,8 +138,13 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
+	irq_want= build_irq_for_pci_dev(dev);
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	irq = create_irq_nr(irq_want + idx);
+#else
 	irq = create_irq();
-	if (irq < 0) {
+#endif
+	if (irq == 0) {
 		kfree(cfg);
 		return -EBUSY;
 	}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 72dd739a7f8a..d68c3592fe4a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -589,7 +589,7 @@ static int show_stat(struct seq_file *p, void *v)
 		}
 
 #ifdef CONFIG_HAVE_SPARSE_IRQ
-		seq_printf(p, " %u:%u", j, per_irq_sum);
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
 #else
 		seq_printf(p, " %u", per_irq_sum);
 #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 788d5a35a580..704136138dc7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -399,6 +399,7 @@ extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
 /* Handle dynamic irq creation and destruction */
+extern unsigned int create_irq_nr(unsigned int irq_want);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
-- 
cgit v1.2.3


From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:23 -0700
Subject: x86: remove irqbalance in kernel for 32 bit

This has been deprecated for years, the user space irqbalanced utility
works better with numa, has configurable policies, etc...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                |   8 -
 arch/x86/configs/i386_defconfig |   1 -
 arch/x86/kernel/io_apic_32.c    | 402 ----------------------------------------
 arch/x86/kernel/quirks.c        |   3 -
 include/linux/irq.h             |  14 +-
 kernel/irq/manage.c             |   3 -
 6 files changed, 3 insertions(+), 428 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1004888e9b13..3e0eaaa1a339 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1254,14 +1254,6 @@ config EFI
   	resultant kernel should continue to boot on existing non-EFI
   	platforms.
 
-config IRQBALANCE
-	def_bool y
-	prompt "Enable kernel irq balancing"
-	depends on X86_32 && SMP && X86_IO_APIC
-	help
-	  The default yes will allow the kernel to do irq load balancing.
-	  Saying no will keep the kernel from doing irq load balancing.
-
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 52d0359719d7..13b8c86ae985 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -287,7 +287,6 @@ CONFIG_MTRR=y
 # CONFIG_MTRR_SANITIZER is not set
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
-# CONFIG_IRQBALANCE is not set
 CONFIG_SECCOMP=y
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 204884b1415a..668edf226067 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -371,408 +371,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h>	/* kernel_thread() */
-# include <linux/kernel_stat.h>	/* kstat */
-# include <linux/slab.h>		/* kmalloc() */
-# include <linux/timer.h>
-
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
-#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
-#define BALANCED_IRQ_LESS_DELTA		(HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
-	unsigned long *last_irq;
-	unsigned long *irq_delta;
-	unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu)		(irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu, irq)   (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu, irq) 	(irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
-	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
-
-static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL;
-
-static cpumask_t *balance_irq_affinity;
-
-
-static void __init irq_affinity_init_work(void *data)
-{
-	struct dyn_array *da = data;
-
-	int i;
-	struct  balance_irq_affinity *affinity;
-
-	affinity = *da->name;
-
-	for (i = 0; i < *da->nr; i++)
-		memcpy(&affinity[i], &balance_irq_affinity_init,
-			 sizeof(struct balance_irq_affinity));
-
-}
-
-DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work);
-
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-	balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
-			unsigned long now, int direction)
-{
-	int search_idle = 1;
-	int cpu = curr_cpu;
-
-	goto inside;
-
-	do {
-		if (unlikely(cpu == curr_cpu))
-			search_idle = 0;
-inside:
-		if (direction == 1) {
-			cpu++;
-			if (cpu >= NR_CPUS)
-				cpu = 0;
-		} else {
-			cpu--;
-			if (cpu == -1)
-				cpu = NR_CPUS-1;
-		}
-	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
-			(search_idle && !IDLE_ENOUGH(cpu, now)));
-
-	return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
-	unsigned long now = jiffies;
-	cpumask_t allowed_mask;
-	unsigned int new_cpu;
-
-	if (irqbalance_disabled)
-		return;
-
-	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
-	new_cpu = move(cpu, allowed_mask, now, 1);
-	if (cpu != new_cpu)
-		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
-	int i, j;
-	struct irq_desc *desc;
-
-	for_each_online_cpu(i) {
-		for (j = 0; j < nr_irqs; j++) {
-			desc = irq_to_desc(j);
-			if (!desc->action)
-				continue;
-			/* Is it a significant load ?  */
-			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
-						useful_load_threshold)
-				continue;
-			balance_irq(i, j);
-		}
-	}
-	balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
-	return;
-}
-
-static void do_irq_balance(void)
-{
-	int i, j;
-	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
-	unsigned long move_this_load = 0;
-	int max_loaded = 0, min_loaded = 0;
-	int load;
-	unsigned long useful_load_threshold = balanced_irq_interval + 10;
-	int selected_irq;
-	int tmp_loaded, first_attempt = 1;
-	unsigned long tmp_cpu_irq;
-	unsigned long imbalance = 0;
-	cpumask_t allowed_mask, target_cpu_mask, tmp;
-	struct irq_desc *desc;
-
-	for_each_possible_cpu(i) {
-		int package_index;
-		CPU_IRQ(i) = 0;
-		if (!cpu_online(i))
-			continue;
-		package_index = CPU_TO_PACKAGEINDEX(i);
-		for (j = 0; j < nr_irqs; j++) {
-			unsigned long value_now, delta;
-			/* Is this an active IRQ or balancing disabled ? */
-			desc = irq_to_desc(j);
-			if (!desc->action || irq_balancing_disabled(j))
-				continue;
-			if (package_index == i)
-				IRQ_DELTA(package_index, j) = 0;
-			/* Determine the total count per processor per IRQ */
-			value_now = (unsigned long) kstat_irqs_cpu(j, i);
-
-			/* Determine the activity per processor per IRQ */
-			delta = value_now - LAST_CPU_IRQ(i, j);
-
-			/* Update last_cpu_irq[][] for the next time */
-			LAST_CPU_IRQ(i, j) = value_now;
-
-			/* Ignore IRQs whose rate is less than the clock */
-			if (delta < useful_load_threshold)
-				continue;
-			/* update the load for the processor or package total */
-			IRQ_DELTA(package_index, j) += delta;
-
-			/* Keep track of the higher numbered sibling as well */
-			if (i != package_index)
-				CPU_IRQ(i) += delta;
-			/*
-			 * We have sibling A and sibling B in the package
-			 *
-			 * cpu_irq[A] = load for cpu A + load for cpu B
-			 * cpu_irq[B] = load for cpu B
-			 */
-			CPU_IRQ(package_index) += delta;
-		}
-	}
-	/* Find the least loaded processor package */
-	for_each_online_cpu(i) {
-		if (i != CPU_TO_PACKAGEINDEX(i))
-			continue;
-		if (min_cpu_irq > CPU_IRQ(i)) {
-			min_cpu_irq = CPU_IRQ(i);
-			min_loaded = i;
-		}
-	}
-	max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
-	/*
-	 * Look for heaviest loaded processor.
-	 * We may come back to get the next heaviest loaded processor.
-	 * Skip processors with trivial loads.
-	 */
-	tmp_cpu_irq = 0;
-	tmp_loaded = -1;
-	for_each_online_cpu(i) {
-		if (i != CPU_TO_PACKAGEINDEX(i))
-			continue;
-		if (max_cpu_irq <= CPU_IRQ(i))
-			continue;
-		if (tmp_cpu_irq < CPU_IRQ(i)) {
-			tmp_cpu_irq = CPU_IRQ(i);
-			tmp_loaded = i;
-		}
-	}
-
-	if (tmp_loaded == -1) {
-	 /*
-	  * In the case of small number of heavy interrupt sources,
-	  * loading some of the cpus too much. We use Ingo's original
-	  * approach to rotate them around.
-	  */
-		if (!first_attempt && imbalance >= useful_load_threshold) {
-			rotate_irqs_among_cpus(useful_load_threshold);
-			return;
-		}
-		goto not_worth_the_effort;
-	}
-
-	first_attempt = 0;		/* heaviest search */
-	max_cpu_irq = tmp_cpu_irq;	/* load */
-	max_loaded = tmp_loaded;	/* processor */
-	imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
-	/*
-	 * if imbalance is less than approx 10% of max load, then
-	 * observe diminishing returns action. - quit
-	 */
-	if (imbalance < (max_cpu_irq >> 3))
-		goto not_worth_the_effort;
-
-tryanotherirq:
-	/* if we select an IRQ to move that can't go where we want, then
-	 * see if there is another one to try.
-	 */
-	move_this_load = 0;
-	selected_irq = -1;
-	for (j = 0; j < nr_irqs; j++) {
-		/* Is this an active IRQ? */
-		desc = irq_to_desc(j);
-		if (!desc->action)
-			continue;
-		if (imbalance <= IRQ_DELTA(max_loaded, j))
-			continue;
-		/* Try to find the IRQ that is closest to the imbalance
-		 * without going over.
-		 */
-		if (move_this_load < IRQ_DELTA(max_loaded, j)) {
-			move_this_load = IRQ_DELTA(max_loaded, j);
-			selected_irq = j;
-		}
-	}
-	if (selected_irq == -1)
-		goto tryanothercpu;
-
-	imbalance = move_this_load;
-
-	/* For physical_balance case, we accumulated both load
-	 * values in the one of the siblings cpu_irq[],
-	 * to use the same code for physical and logical processors
-	 * as much as possible.
-	 *
-	 * NOTE: the cpu_irq[] array holds the sum of the load for
-	 * sibling A and sibling B in the slot for the lowest numbered
-	 * sibling (A), _AND_ the load for sibling B in the slot for
-	 * the higher numbered sibling.
-	 *
-	 * We seek the least loaded sibling by making the comparison
-	 * (A+B)/2 vs B
-	 */
-	load = CPU_IRQ(min_loaded) >> 1;
-	for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
-		if (load > CPU_IRQ(j)) {
-			/* This won't change cpu_sibling_map[min_loaded] */
-			load = CPU_IRQ(j);
-			min_loaded = j;
-		}
-	}
-
-	cpus_and(allowed_mask,
-		cpu_online_map,
-		balance_irq_affinity[selected_irq]);
-	target_cpu_mask = cpumask_of_cpu(min_loaded);
-	cpus_and(tmp, target_cpu_mask, allowed_mask);
-
-	if (!cpus_empty(tmp)) {
-		/* mark for change destination */
-		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
-		/* Since we made a change, come back sooner to
-		 * check for more variation.
-		 */
-		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
-		return;
-	}
-	goto tryanotherirq;
-
-not_worth_the_effort:
-	/*
-	 * if we did not find an IRQ to move, then adjust the time interval
-	 * upward
-	 */
-	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
-	return;
-}
-
-static int balanced_irq(void *unused)
-{
-	int i;
-	unsigned long prev_balance_time = jiffies;
-	long time_remaining = balanced_irq_interval;
-	struct irq_desc *desc;
-
-	/* push everything to CPU 0 to give us a starting point.  */
-	for (i = 0 ; i < nr_irqs ; i++) {
-		desc = irq_to_desc(i);
-		desc->pending_mask = cpumask_of_cpu(0);
-		set_pending_irq(i, cpumask_of_cpu(0));
-	}
-
-	set_freezable();
-	for ( ; ; ) {
-		time_remaining = schedule_timeout_interruptible(time_remaining);
-		try_to_freeze();
-		if (time_after(jiffies,
-				prev_balance_time+balanced_irq_interval)) {
-			preempt_disable();
-			do_irq_balance();
-			prev_balance_time = jiffies;
-			time_remaining = balanced_irq_interval;
-			preempt_enable();
-		}
-	}
-	return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
-	int i;
-	struct cpuinfo_x86 *c;
-	cpumask_t tmp;
-
-	cpus_shift_right(tmp, cpu_online_map, 2);
-	c = &boot_cpu_data;
-	/* When not overwritten by the command line ask subarchitecture. */
-	if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
-		irqbalance_disabled = NO_BALANCE_IRQ;
-	if (irqbalance_disabled)
-		return 0;
-
-	 /* disable irqbalance completely if there is only one processor online */
-	if (num_online_cpus() < 2) {
-		irqbalance_disabled = 1;
-		return 0;
-	}
-	/*
-	 * Enable physical balance only if more than 1 physical processor
-	 * is present
-	 */
-	if (smp_num_siblings > 1 && !cpus_empty(tmp))
-		physical_balance = 1;
-
-	for_each_online_cpu(i) {
-		irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL);
-		irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL);
-		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
-			printk(KERN_ERR "balanced_irq_init: out of memory");
-			goto failed;
-		}
-	}
-
-	printk(KERN_INFO "Starting balanced_irq\n");
-	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
-		return 0;
-	printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
-	for_each_possible_cpu(i) {
-		kfree(irq_cpu_data[i].irq_delta);
-		irq_cpu_data[i].irq_delta = NULL;
-		kfree(irq_cpu_data[i].last_irq);
-		irq_cpu_data[i].last_irq = NULL;
-	}
-	return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
-	irqbalance_disabled = 1;
-	return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
 #endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index f6a11b9b1f98..67465ed89310 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	if (!(word & (1 << 13))) {
 		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
 			"disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
-		irqbalance_disable("");
-#endif
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 704136138dc7..2445d2b3d5dc 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -185,7 +185,7 @@ struct irq_desc {
 	cpumask_t		affinity;
 	unsigned int		cpu;
 #endif
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_t		pending_mask;
 #endif
 #ifdef CONFIG_PROC_FS
@@ -241,13 +241,13 @@ extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_SMP
 
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
 
 void set_pending_irq(unsigned int irq, cpumask_t mask);
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
 
-#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */
+#else /* CONFIG_GENERIC_PENDING_IRQ */
 
 static inline void move_irq(int irq)
 {
@@ -274,14 +274,6 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
 
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_IRQBALANCE
-extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
-#else
-static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-}
-#endif
-
 extern int no_irq_affinity;
 
 static inline int irq_balancing_disabled(unsigned int irq)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6df49218632a..ddc956861a58 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -86,8 +86,6 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	if (!desc->chip->set_affinity)
 		return -EINVAL;
 
-	set_balance_irq_affinity(irq, cpumask);
-
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT) {
 		unsigned long flags;
@@ -122,7 +120,6 @@ int irq_select_affinity(unsigned int irq)
 	desc->affinity = mask;
 	desc->chip->set_affinity(irq, mask);
 
-	set_balance_irq_affinity(irq, mask);
 	return 0;
 }
 #endif
-- 
cgit v1.2.3


From 42379b1122bab7f9aefdbd4b7004a6fa89dfbae5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:45 -0700
Subject: pci: change msi-x vector to 32bit

We are using a 28-bit value (PCI bus/dev/fn + 12 bits) as the irq number,
so the field that caches the irq number should be 32 bits wide too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/scsi/qla2xxx/qla_def.h | 2 +-
 include/linux/pci.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 83c819216771..f25f41a499e5 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -2108,7 +2108,7 @@ struct scsi_qla_host;
 
 struct qla_msix_entry {
 	int have_irq;
-	uint16_t msix_vector;
+	uint32_t msix_vector;
 	uint16_t msix_entry;
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 98dc6243a706..1f8db240ca48 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -723,7 +723,7 @@ enum pci_dma_burst_strategy {
 };
 
 struct msix_entry {
-	u16 	vector;	/* kernel uses to write allocated vector */
+	u32	vector;	/* kernel uses to write allocated vector */
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
-- 
cgit v1.2.3


From 8c464a4b23ca283b414022ebc77787f3c7040fa7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 25 Aug 2008 12:41:19 -0700
Subject: sparseirq: move kstat_irqs from kstat to irq_desc - fix

fix non-sparseirq architectures.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h         |  4 ++--
 include/linux/kernel_stat.h | 10 ++++++++--
 kernel/irq/chip.c           | 21 ++++++++++++++++++++-
 kernel/irq/handle.c         | 10 ++++++++++
 4 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2445d2b3d5dc..93fe9a943e71 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -161,8 +161,6 @@ struct irq_desc {
 #endif
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned int            *kstat_irqs;
-#else
-	unsigned int            kstat_irqs[NR_CPUS];
 #endif
 #if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
        struct irq_2_iommu      *irq_2_iommu;
@@ -219,8 +217,10 @@ extern struct irq_desc *sparse_irqs;
 
 #endif
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index f10616712de5..21249d8c1293 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifndef CONFIG_GENERIC_HARDIRQS
+#ifndef CONFIG_HAVE_DYN_ARRAY
        unsigned int irqs[NR_IRQS];
 #endif
 };
@@ -41,7 +41,13 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
-#ifndef CONFIG_GENERIC_HARDIRQS
+#ifndef CONFIG_HAVE_DYN_ARRAY
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+#endif
+
+
+#ifndef CONFIG_HAVE_DYN_ARRAY
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9fc5e69213de..4ef555c50db8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -327,7 +327,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 
 	action = desc->action;
 	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -368,7 +372,11 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 
 	/*
 	 * If its disabled or no action available
@@ -415,7 +423,11 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		goto out;
 
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 
 	/*
 	 * If its disabled or no action available
@@ -479,8 +491,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		mask_ack_irq(desc, irq);
 		goto out_unlock;
 	}
-
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
@@ -535,7 +550,11 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
 	irqreturn_t action_ret;
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d638a911cbc1..eae69373a9c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -34,7 +34,11 @@ void
 handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 {
 	print_irq_desc(irq, desc);
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 	ack_bad_irq(irq);
 }
 
@@ -401,7 +405,11 @@ unsigned int __do_IRQ(unsigned int irq)
 	struct irqaction *action;
 	unsigned int status;
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
 	kstat_irqs_this_cpu(desc)++;
+#else
+	kstat_irqs_this_cpu(irq)++;
+#endif
 	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
@@ -501,10 +509,12 @@ void early_init_irq_lock_class(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	return desc->kstat_irqs[cpu];
 }
+#endif
 EXPORT_SYMBOL(kstat_irqs_cpu);
 
-- 
cgit v1.2.3


From f6dd5c3106fb283e37d915eeb33019ef40510f85 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 3 Sep 2008 16:58:32 -0700
Subject: dmar: fix using early fixmap mapping for DMAR table parsing

Very early detection of the DMAR tables will setup fixmap mapping. For
parsing these tables later (while enabling dma and/or interrupt remapping),
early fixmap mapping shouldn't be used. Fix it by calling table detection
routines again, which will call generic acpi_get_table() for setting up
the correct mapping.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/pci/dmar.c   | 49 ++++++++++++++++++++++++++++---------------------
 include/linux/dmar.h |  1 -
 2 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index bd2c01674f5e..f2c5eb6e78f7 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -289,6 +289,24 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 	}
 }
 
+/**
+ * dmar_table_detect - checks to see if the platform supports DMAR devices
+ */
+static int __init dmar_table_detect(void)
+{
+	acpi_status status = AE_OK;
+
+	/* if we could find DMAR table, then there are DMAR devices */
+	status = acpi_get_table(ACPI_SIG_DMAR, 0,
+				(struct acpi_table_header **)&dmar_tbl);
+
+	if (ACPI_SUCCESS(status) && !dmar_tbl) {
+		printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
+		status = AE_NOT_FOUND;
+	}
+
+	return (ACPI_SUCCESS(status) ? 1 : 0);
+}
 
 /**
  * parse_dmar_table - parses the DMA reporting table
@@ -300,6 +318,12 @@ parse_dmar_table(void)
 	struct acpi_dmar_header *entry_header;
 	int ret = 0;
 
+	/*
+	 * Do it again, earlier dmar_tbl mapping could be mapped with
+	 * fixed map.
+	 */
+	dmar_table_detect();
+
 	dmar = (struct acpi_table_dmar *)dmar_tbl;
 	if (!dmar)
 		return -ENODEV;
@@ -430,30 +454,11 @@ int __init dmar_table_init(void)
 	return 0;
 }
 
-/**
- * early_dmar_detect - checks to see if the platform supports DMAR devices
- */
-int __init early_dmar_detect(void)
-{
-	acpi_status status = AE_OK;
-
-	/* if we could find DMAR table, then there are DMAR devices */
-	status = acpi_get_table(ACPI_SIG_DMAR, 0,
-				(struct acpi_table_header **)&dmar_tbl);
-
-	if (ACPI_SUCCESS(status) && !dmar_tbl) {
-		printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
-		status = AE_NOT_FOUND;
-	}
-
-	return (ACPI_SUCCESS(status) ? 1 : 0);
-}
-
 void __init detect_intel_iommu(void)
 {
 	int ret;
 
-	ret = early_dmar_detect();
+	ret = dmar_table_detect();
 
 #ifdef CONFIG_DMAR
 	{
@@ -479,14 +484,16 @@ void __init detect_intel_iommu(void)
 			       " x2apic support\n");
 
 			dmar_disabled = 1;
-			return;
+			goto end;
 		}
 
 		if (ret && !no_iommu && !iommu_detected && !swiotlb &&
 		    !dmar_disabled)
 			iommu_detected = 1;
 	}
+end:
 #endif
+	dmar_tbl = NULL;
 }
 
 
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index c360c558e59e..f1984fc3e06d 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -45,7 +45,6 @@ extern struct list_head dmar_drhd_units;
 	list_for_each_entry(drhd, &dmar_drhd_units, list)
 
 extern int dmar_table_init(void);
-extern int early_dmar_detect(void);
 extern int dmar_dev_scope_init(void);
 
 /* Intel IOMMU detection */
-- 
cgit v1.2.3


From a50f70b17541c0060967c6df61133e968bad3652 Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Fri, 3 Oct 2008 11:58:54 -0500
Subject: x86: Add UV EFI table entry v4

Look for a UV entry in the EFI tables.

Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/efi.c | 4 ++++
 include/linux/efi.h   | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 945a31cdd81f..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,6 +366,10 @@ void __init efi_init(void)
 					SMBIOS_TABLE_GUID)) {
 			efi.smbios = config_tables[i].table;
 			printk(" SMBIOS=0x%lx ", config_tables[i].table);
+		} else if (!efi_guidcmp(config_tables[i].guid,
+					UV_SYSTEM_TABLE_GUID)) {
+			efi.uv_systab = config_tables[i].table;
+			printk(" UVsystab=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					HCDP_TABLE_GUID)) {
 			efi.hcdp = config_tables[i].table;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 807373d467f7..bb66feb164bd 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -208,6 +208,9 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz
 #define EFI_GLOBAL_VARIABLE_GUID \
     EFI_GUID(  0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c )
 
+#define UV_SYSTEM_TABLE_GUID \
+    EFI_GUID(  0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 )
+
 typedef struct {
 	efi_guid_t guid;
 	unsigned long table;
@@ -255,6 +258,7 @@ extern struct efi {
 	unsigned long boot_info;	/* boot info table */
 	unsigned long hcdp;		/* HCDP table */
 	unsigned long uga;		/* UGA table */
+	unsigned long uv_systab;	/* UV system table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
 	efi_get_wakeup_time_t *get_wakeup_time;
-- 
cgit v1.2.3


From 7ef0c30dbf96a8d9a234e90c248eb19df3c031be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 13:07:35 +0200
Subject: genirq: define nr_irqs for architectures with GENERIC_HARDIRQS=n

Revert the sparse irq changes in m68k/s390/sparc and just define
nr_irqs as NR_IRQS for those architectures.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/m68k/kernel/ints.c   | 3 ---
 arch/s390/kernel/irq.c    | 3 ---
 arch/sparc/kernel/irq.c   | 4 ----
 include/linux/interrupt.h | 8 +++++---
 4 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/kernel/ints.c b/arch/m68k/kernel/ints.c
index 44169e4cd91d..7e8a0d394e61 100644
--- a/arch/m68k/kernel/ints.c
+++ b/arch/m68k/kernel/ints.c
@@ -46,9 +46,6 @@
 #include <asm/q40ints.h>
 #endif
 
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL(nr_irqs);
-
 extern u32 auto_irqhandler_fixup[];
 extern u32 user_irqhandler_fixup[];
 extern u16 user_irqvec_fixup[];
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 3624c4a0037a..e7c5bfb7c755 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -17,9 +17,6 @@
 #include <linux/proc_fs.h>
 #include <linux/profile.h>
 
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL(nr_irqs);
-
 /*
  * show_interrupts is needed by /proc/interrupts.
  */
diff --git a/arch/sparc/kernel/irq.c b/arch/sparc/kernel/irq.c
index 4b99e3ce3916..93e1d1c65290 100644
--- a/arch/sparc/kernel/irq.c
+++ b/arch/sparc/kernel/irq.c
@@ -55,10 +55,6 @@
 #define SMP_NOP2
 #define SMP_NOP3
 #endif /* SMP */
-
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL(nr_irqs);
-
 unsigned long __raw_local_irq_save(void)
 {
 	unsigned long retval;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d4039a0b23f4..5a57df2ee922 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,11 +15,13 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-extern int nr_irqs;
-
 #ifndef CONFIG_GENERIC_HARDIRQS
-#define for_each_irq_desc(irq, desc)		\
+# define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
+
+# define nr_irqs		NR_IRQS
+#else
+extern int nr_irqs;
 #endif
 
 /*
-- 
cgit v1.2.3


From 70dd4d992ab324a59cdcd6bedc3f4e729863d514 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 15:39:27 +0200
Subject: genirq: consolidate nr_irqs and for_each_irq_desc()

Move all of those to linux/irq.h where they belong.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  9 ---------
 include/linux/irq.h       | 17 ++++++++++++-----
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5a57df2ee922..58ff4e74b2f3 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,15 +15,6 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-#ifndef CONFIG_GENERIC_HARDIRQS
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0; irq < nr_irqs; irq++)
-
-# define nr_irqs		NR_IRQS
-#else
-extern int nr_irqs;
-#endif
-
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 93fe9a943e71..dbe8734ae86c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -11,6 +11,18 @@
 
 #include <linux/smp.h>
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+# define nr_irqs		NR_IRQS
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#else
+extern int nr_irqs;
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+#endif
+
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
@@ -204,11 +216,6 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
-#ifdef CONFIG_GENERIC_HARDIRQS
-#define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq])
-#endif
-
 #else
 
 extern struct irq_desc *sparse_irqs;
-- 
cgit v1.2.3


From c6b7674f323622d86316bf7951ad9cae1ce24642 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:31:29 +0200
Subject: genirq: use inline function for irq_to_desc

For the non-sparse irq case an inline function is perfectly fine.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 15 +++++++++++++--
 kernel/irq/handle.c | 14 --------------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index dbe8734ae86c..7d1adacaadb4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -204,8 +204,6 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
 
 #ifndef CONFIG_HAVE_SPARSE_IRQ
 
@@ -216,8 +214,21 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
+static inline struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	return (irq < nr_irqs) ? irq_desc + irq : NULL;
+}
+
+static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+
 #else
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+
 extern struct irq_desc *sparse_irqs;
 #define for_each_irq_desc(irqX, desc)		\
 	for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fb6bdb602a93..c19896f895f9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -262,20 +262,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 
 #endif
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	if (irq < nr_irqs)
-		return &irq_desc[irq];
-
-	return NULL;
-}
-struct irq_desc *irq_to_desc_alloc(unsigned int irq)
-{
-	return irq_to_desc(irq);
-}
-#endif
-
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
-- 
cgit v1.2.3


From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:16:55 +0200
Subject: genirq: remove sparse irq code

This code is not ready, but we need to rip it out instead of rebasing
as we would lose the APIC/IO_APIC unification otherwise.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/io_apic.c    | 130 ++-----------------------------------------
 arch/x86/kernel/irq_32.c     |   8 ---
 arch/x86/kernel/irq_64.c     |   8 ---
 drivers/char/random.c        |  31 -----------
 drivers/pci/htirq.c          |  19 +------
 drivers/pci/intr_remapping.c |  75 -------------------------
 fs/proc/proc_misc.c          |  43 ++------------
 include/linux/irq.h          |  20 -------
 kernel/irq/handle.c          | 114 -------------------------------------
 9 files changed, 10 insertions(+), 438 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index f959acbc0db2..683610517d2a 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -111,9 +111,6 @@ struct irq_cfg;
 struct irq_pin_list;
 struct irq_cfg {
 	unsigned int irq;
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_cfg *next;
-#endif
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
@@ -151,15 +148,6 @@ static void init_one_irq_cfg(struct irq_cfg *cfg)
 
 static struct irq_cfg *irq_cfgx;
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-/*
- * Protect the irq_cfgx_free freelist:
- */
-static DEFINE_SPINLOCK(irq_cfg_lock);
-
-static struct irq_cfg *irq_cfgx_free;
-#endif
-
 static void __init init_work(void *data)
 {
 	struct dyn_array *da = data;
@@ -174,114 +162,7 @@ static void __init init_work(void *data)
 	legacy_count = ARRAY_SIZE(irq_cfg_legacy);
 	for (i = legacy_count; i < *da->nr; i++)
 		init_one_irq_cfg(&cfg[i]);
-
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	for (i = 1; i < *da->nr; i++)
-		cfg[i-1].next = &cfg[i];
-
-	irq_cfgx_free = &irq_cfgx[legacy_count];
-	irq_cfgx[legacy_count - 1].next = NULL;
-#endif
-}
-
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-/* need to be biger than size of irq_cfg_legacy */
-static int nr_irq_cfg = 32;
-
-static int __init parse_nr_irq_cfg(char *arg)
-{
-	if (arg) {
-		nr_irq_cfg = simple_strtoul(arg, NULL, 0);
-		if (nr_irq_cfg < 32)
-			nr_irq_cfg = 32;
-	}
-	return 0;
-}
-
-early_param("nr_irq_cfg", parse_nr_irq_cfg);
-
-#define for_each_irq_cfg(irqX, cfg)           \
-        for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U)
-
-
-DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work);
-
-static struct irq_cfg *irq_cfg(unsigned int irq)
-{
-	struct irq_cfg *cfg;
-
-	cfg = irq_cfgx;
-	while (cfg) {
-		if (cfg->irq == irq)
-			return cfg;
-
-		cfg = cfg->next;
-	}
-
-	return NULL;
-}
-
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
-{
-	struct irq_cfg *cfg, *cfg_pri;
-	unsigned long flags;
-	int count = 0;
-	int i;
-
-	cfg_pri = cfg = irq_cfgx;
-	while (cfg) {
-		if (cfg->irq == irq)
-			return cfg;
-
-		cfg_pri = cfg;
-		cfg = cfg->next;
-		count++;
-	}
-
-	spin_lock_irqsave(&irq_cfg_lock, flags);
-	if (!irq_cfgx_free) {
-		unsigned long phys;
-		unsigned long total_bytes;
-		/*
-		 *  we run out of pre-allocate ones, allocate more
-		 */
-		printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg);
-
-		total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg;
-		if (after_bootmem)
-			cfg = kzalloc(total_bytes, GFP_ATOMIC);
-		else
-			cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
-
-		if (!cfg)
-			panic("please boot with nr_irq_cfg= %d\n", count * 2);
-
-		phys = __pa(cfg);
-		printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
-
-		for (i = 0; i < nr_irq_cfg; i++)
-			init_one_irq_cfg(&cfg[i]);
-
-		for (i = 1; i < nr_irq_cfg; i++)
-			cfg[i-1].next = &cfg[i];
-
-		irq_cfgx_free = cfg;
-	}
-
-	cfg = irq_cfgx_free;
-	irq_cfgx_free = irq_cfgx_free->next;
-	cfg->next = NULL;
-	if (cfg_pri)
-		cfg_pri->next = cfg;
-	else
-		irq_cfgx = cfg;
-	cfg->irq = irq;
-
-	spin_unlock_irqrestore(&irq_cfg_lock, flags);
-
-	return cfg;
 }
-#else
 
 #define for_each_irq_cfg(irq, cfg)		\
 	for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq])
@@ -290,17 +171,16 @@ DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work
 
 struct irq_cfg *irq_cfg(unsigned int irq)
 {
-        if (irq < nr_irqs)
-                return &irq_cfgx[irq];
+	if (irq < nr_irqs)
+		return &irq_cfgx[irq];
 
-        return NULL;
+	return NULL;
 }
 struct irq_cfg *irq_cfg_alloc(unsigned int irq)
 {
-        return irq_cfg(irq);
+	return irq_cfg(irq);
 }
 
-#endif
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -3068,9 +2948,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 	unsigned long flags;
 	struct irq_cfg *cfg_new;
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
 	irq_want = nr_irqs - 1;
-#endif
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 001772ffc918..ccf6c1bf7120 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -272,20 +272,12 @@ int show_interrupts(struct seq_file *p, void *v)
 	struct irq_desc *desc = NULL;
 	int tail = 0;
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	desc = (struct irq_desc *)v;
-	entries = -1U;
-	i = desc->irq;
-	if (!desc->next)
-		tail = 1;
-#else
 	entries = nr_irqs - 1;
 	i = *(loff_t *) v;
 	if (i == nr_irqs)
 		tail = 1;
 	else
 		desc = irq_to_desc(i);
-#endif
 
 	if (i == 0) {
 		seq_printf(p, "           ");
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index ec2661091283..21f53b911113 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -77,20 +77,12 @@ int show_interrupts(struct seq_file *p, void *v)
 	struct irq_desc *desc = NULL;
 	int tail = 0;
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	desc = (struct irq_desc *)v;
-	entries = -1U;
-	i = desc->irq;
-	if (!desc->next)
-		tail = 1;
-#else
 	entries = nr_irqs - 1;
 	i = *(loff_t *) v;
 	if (i == nr_irqs)
 		tail = 1;
 	else
 		desc = irq_to_desc(i);
-#endif
 
 	if (i == 0) {
 		seq_printf(p, "           ");
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 60c9c7ee6b2c..9ce80213007b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,8 +558,6 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 static struct timer_rand_state **irq_timer_state;
 DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL);
@@ -583,33 +581,6 @@ static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *stat
 	irq_timer_state[irq] = state;
 }
 
-#else
-
-static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return NULL;
-
-	return desc->timer_rand_state;
-}
-
-static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return;
-
-	desc->timer_rand_state = state;
-}
-#endif
-
 static struct timer_rand_state input_timer_state;
 
 /*
@@ -967,10 +938,8 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
-#endif
 
 	state = get_timer_rand_state(irq);
 
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 9e4929a00832..bf7d6ce9bbb3 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -82,18 +82,6 @@ void unmask_ht_irq(unsigned int irq)
 	write_ht_irq_msg(irq, &msg);
 }
 
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
-	unsigned int irq;
-
-	irq = dev->bus->number;
-	irq <<= 8;
-	irq |= dev->devfn;
-	irq <<= 12;
-
-	return irq;
-}
-
 /**
  * __ht_create_irq - create an irq and attach it to a device.
  * @dev: The hypertransport device to find the irq capability on.
@@ -110,7 +98,6 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	int max_irq;
 	int pos;
 	int irq;
-	unsigned int irq_want;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -138,12 +125,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
-	irq_want= build_irq_for_pci_dev(dev);
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	irq = create_irq_nr(irq_want + idx);
-#else
 	irq = create_irq();
-#endif
+
 	if (irq <= 0) {
 		kfree(cfg);
 		return -EBUSY;
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 2dcf973890c4..0f43b265eee6 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -19,78 +19,6 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-static struct irq_2_iommu *irq_2_iommuX;
-/* fill one page ? */
-static int nr_irq_2_iommu = 0x100;
-static int irq_2_iommu_index;
-DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irq_2_iommu, PAGE_SIZE, NULL);
-
-extern void *__alloc_bootmem_nopanic(unsigned long size,
-				     unsigned long align,
-				     unsigned long goal);
-
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int not_used)
-{
-	struct irq_2_iommu *iommu;
-	unsigned long total_bytes;
-
-	if (irq_2_iommu_index >= nr_irq_2_iommu) {
-		/*
-		 *  we run out of pre-allocate ones, allocate more
-		 */
-		printk(KERN_DEBUG "try to get more irq_2_iommu %d\n", nr_irq_2_iommu);
-
-		total_bytes = sizeof(struct irq_2_iommu)*nr_irq_2_iommu;
-
-		if (after_bootmem)
-			iommu = kzalloc(total_bytes, GFP_ATOMIC);
-		else
-			iommu = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
-
-		if (!iommu)
-			panic("can not get more irq_2_iommu\n");
-
-		irq_2_iommuX = iommu;
-		irq_2_iommu_index = 0;
-	}
-
-	iommu = &irq_2_iommuX[irq_2_iommu_index];
-	irq_2_iommu_index++;
-	return iommu;
-}
-
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	BUG_ON(!desc);
-
-	return desc->irq_2_iommu;
-}
-
-static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
-{
-	struct irq_desc *desc;
-	struct irq_2_iommu *irq_iommu;
-
-	/*
-	 * alloc irq desc if not allocated already.
-	 */
-	desc = irq_to_desc_alloc(irq);
-
-	irq_iommu = desc->irq_2_iommu;
-
-	if (!irq_iommu)
-		desc->irq_2_iommu = get_one_free_irq_2_iommu(irq);
-
-	return desc->irq_2_iommu;
-}
-
-#else /* !CONFIG_HAVE_SPARSE_IRQ */
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 static struct irq_2_iommu *irq_2_iommuX;
 DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL);
@@ -109,7 +37,6 @@ static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
-#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -166,11 +93,9 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	if (!count)
 		return -1;
 
-#ifndef	CONFIG_HAVE_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
-#endif
 
 	/*
 	 * start the IRTE search from index 0.
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index d68c3592fe4a..3f5c7b9d1a70 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -529,13 +529,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+
 		for_each_irq_desc(j, desc)
-		{
-			unsigned int temp;
+			sum += kstat_irqs_cpu(j, i);
 
-			temp = kstat_irqs_cpu(j, i);
-			sum += temp;
-		}
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -578,21 +575,13 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_desc(j, desc)
-	{
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-		for_each_possible_cpu(i) {
-			unsigned int temp;
 
-			temp = kstat_irqs_cpu(j, i);
-			per_irq_sum += temp;
-		}
+		for_each_possible_cpu(i)
+			per_irq_sum += kstat_irqs_cpu(j, i);
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-		seq_printf(p, " %#x:%u", j, per_irq_sum);
-#else
 		seq_printf(p, " %u", per_irq_sum);
-#endif
 	}
 
 	seq_printf(p,
@@ -645,36 +634,14 @@ static const struct file_operations proc_stat_operations = {
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc *desc;
-	int irq;
-	int count = *pos;
-
-	for_each_irq_desc(irq, desc) {
-		if (count-- == 0)
-			return desc;
-	}
-
-	return NULL;
-#else
 	return (*pos <= nr_irqs) ? pos : NULL;
-#endif
 }
 
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc *desc;
-
-	desc = ((struct irq_desc *)v)->next;
-	(*pos)++;
-
-	return desc;
-#else
 	(*pos)++;
 	return (*pos <= nr_irqs) ? pos : NULL;
-#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7d1adacaadb4..68e0f3f9df30 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -167,15 +167,8 @@ struct irq_2_iommu;
  */
 struct irq_desc {
 	unsigned int		irq;
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc		*next;
-	struct timer_rand_state *timer_rand_state;
-#endif
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned int            *kstat_irqs;
-#endif
-#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
-       struct irq_2_iommu      *irq_2_iommu;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
@@ -205,8 +198,6 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
-
 #ifndef CONFIG_HAVE_DYN_ARRAY
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
@@ -224,17 +215,6 @@ static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 	return irq_to_desc(irq);
 }
 
-#else
-
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
-
-extern struct irq_desc *sparse_irqs;
-#define for_each_irq_desc(irqX, desc)		\
-	for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U)
-
-#endif
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c19896f895f9..f837133cdfbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,15 +111,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr)
 	}
 }
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-/*
- * Protect the sparse_irqs_free freelist:
- */
-static DEFINE_SPINLOCK(sparse_irq_lock);
-static struct irq_desc *sparse_irqs_free;
-struct irq_desc *sparse_irqs;
-#endif
-
 static void __init init_work(void *data)
 {
 	struct dyn_array *da = data;
@@ -130,121 +121,16 @@ static void __init init_work(void *data)
 
 	for (i = 0; i < *da->nr; i++) {
 		init_one_irq_desc(&desc[i]);
-#ifndef CONFIG_HAVE_SPARSE_IRQ
 		desc[i].irq = i;
-#endif
 	}
 
 	/* init kstat_irqs, nr_cpu_ids is ready already */
 	init_kstat_irqs(desc, *da->nr, nr_cpu_ids);
-
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	for (i = 1; i < *da->nr; i++)
-		desc[i-1].next = &desc[i];
-
-	sparse_irqs_free = sparse_irqs;
-	sparse_irqs = NULL;
-#endif
-}
-
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-static int nr_irq_desc = 32;
-
-static int __init parse_nr_irq_desc(char *arg)
-{
-	if (arg)
-		nr_irq_desc = simple_strtoul(arg, NULL, 0);
-	return 0;
-}
-
-early_param("nr_irq_desc", parse_nr_irq_desc);
-
-DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work);
-
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	struct irq_desc *desc;
-
-	desc = sparse_irqs;
-	while (desc) {
-		if (desc->irq == irq)
-			return desc;
-
-		desc = desc->next;
-	}
-	return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc(unsigned int irq)
-{
-	struct irq_desc *desc, *desc_pri;
-	unsigned long flags;
-	int count = 0;
-	int i;
-
-	desc_pri = desc = sparse_irqs;
-	while (desc) {
-		if (desc->irq == irq)
-			return desc;
-
-		desc_pri = desc;
-		desc = desc->next;
-		count++;
-	}
-
-	spin_lock_irqsave(&sparse_irq_lock, flags);
-	/*
-	 *  we run out of pre-allocate ones, allocate more
-	 */
-	if (!sparse_irqs_free) {
-		unsigned long phys;
-		unsigned long total_bytes;
-
-		printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc);
-
-		total_bytes = sizeof(struct irq_desc) * nr_irq_desc;
-		if (after_bootmem)
-			desc = kzalloc(total_bytes, GFP_ATOMIC);
-		else
-			desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
-
-		if (!desc)
-			panic("please boot with nr_irq_desc= %d\n", count * 2);
-
-		phys = __pa(desc);
-		printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
-
-		for (i = 0; i < nr_irq_desc; i++)
-			init_one_irq_desc(&desc[i]);
-
-		for (i = 1; i < nr_irq_desc; i++)
-			desc[i-1].next = &desc[i];
-
-		/* init kstat_irqs, nr_cpu_ids is ready already */
-		init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids);
-
-		sparse_irqs_free = desc;
-	}
-
-	desc = sparse_irqs_free;
-	sparse_irqs_free = sparse_irqs_free->next;
-	desc->next = NULL;
-	if (desc_pri)
-		desc_pri->next = desc;
-	else
-		sparse_irqs = desc;
-	desc->irq = irq;
-
-	spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
-	return desc;
-}
-#else
 struct irq_desc *irq_desc;
 DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
 
-#endif
-
 #else
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
-- 
cgit v1.2.3


From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:34:09 +0200
Subject: genirq: remove irq_to_desc_alloc

Remove the leftover of the sparse irq code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/io_apic.c    | 6 +-----
 arch/x86/kernel/irqinit_32.c | 2 +-
 arch/x86/kernel/irqinit_64.c | 2 +-
 include/linux/irq.h          | 5 -----
 kernel/irq/chip.c            | 2 +-
 5 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 683610517d2a..e03bc0f87eef 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -1257,11 +1257,7 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
 {
 	struct irq_desc *desc;
 
-	/* first time to use this irq_desc */
-	if (irq < 16)
-		desc = irq_to_desc(irq);
-	else
-		desc = irq_to_desc_alloc(irq);
+	desc = irq_to_desc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9092103a18eb..a8d35998d308 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -70,7 +70,7 @@ void __init init_ISA_irqs (void)
 	 */
 	for (i = 0; i < 16; i++) {
 		/* first time call this irq_desc */
-		struct irq_desc *desc = irq_to_desc_alloc(i);
+		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
 		desc->action = NULL;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index d17fbc26d96f..ff0235391285 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -144,7 +144,7 @@ void __init init_ISA_irqs(void)
 
 	for (i = 0; i < 16; i++) {
 		/* first time call this irq_desc */
-		struct irq_desc *desc = irq_to_desc_alloc(i);
+		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
 		desc->action = NULL;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 68e0f3f9df30..3f33c7790300 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -210,11 +210,6 @@ static inline struct irq_desc *irq_to_desc(unsigned int irq)
 	return (irq < nr_irqs) ? irq_desc + irq : NULL;
 }
 
-static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
-{
-	return irq_to_desc(irq);
-}
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 570d1ea1db5d..e6f73dbfcc3d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,7 +28,7 @@ void dynamic_irq_init(unsigned int irq)
 	unsigned long flags;
 
 	/* first time to use this irq_desc */
-	desc = irq_to_desc_alloc(irq);
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
-- 
cgit v1.2.3


From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 15:27:23 +0200
Subject: genirq: revert dynarray

Revert the dynarray changes. They need more thought and polishing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/Kconfig                      |   4 -
 arch/x86/Kconfig                  |   1 -
 arch/x86/kernel/io_apic.c         | 199 ++++++++++++++------------------------
 arch/x86/kernel/setup_percpu.c    |   8 +-
 arch/x86/kernel/visws_quirks.c    |   2 +-
 arch/x86/kernel/vmlinux_32.lds.S  |   1 -
 arch/x86/kernel/vmlinux_64.lds.S  |   2 -
 arch/x86/xen/spinlock.c           |   2 +-
 drivers/char/random.c             |   5 -
 drivers/pci/intr_remapping.c      |  11 +--
 include/asm-generic/vmlinux.lds.h |  13 ---
 include/linux/init.h              |  43 --------
 include/linux/irq.h               |  15 ---
 include/linux/kernel_stat.h       |  16 ++-
 init/Makefile                     |   2 +-
 init/dyn_array.c                  | 120 -----------------------
 init/main.c                       |  11 +--
 kernel/irq/chip.c                 |  30 +-----
 kernel/irq/handle.c               | 114 ++--------------------
 19 files changed, 103 insertions(+), 496 deletions(-)
 delete mode 100644 init/dyn_array.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index c8a7c2eb6490..071004d3a1b1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -102,7 +102,3 @@ config HAVE_CLK
 	help
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
-
-config HAVE_DYN_ARRAY
-	def_bool n
-
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8636ddf2f4a4..8da6123a60d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -33,7 +33,6 @@ config X86
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select HAVE_DYN_ARRAY
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index e03bc0f87eef..6f80dc2f137e 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -107,7 +107,6 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-struct irq_cfg;
 struct irq_pin_list;
 struct irq_cfg {
 	unsigned int irq;
@@ -120,7 +119,7 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfg_legacy[] __initdata = {
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
 	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
@@ -139,48 +138,26 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = {
 	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-static struct irq_cfg irq_cfg_init = { .irq =  -1U, };
-
-static void init_one_irq_cfg(struct irq_cfg *cfg)
-{
-	memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg));
-}
-
-static struct irq_cfg *irq_cfgx;
-
-static void __init init_work(void *data)
-{
-	struct dyn_array *da = data;
-	struct irq_cfg *cfg;
-	int legacy_count;
-	int i;
-
-	cfg = *da->name;
-
-	memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy));
-
-	legacy_count = ARRAY_SIZE(irq_cfg_legacy);
-	for (i = legacy_count; i < *da->nr; i++)
-		init_one_irq_cfg(&cfg[i]);
-}
-
 #define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq])
+	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
 
-DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work);
-
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	if (irq < nr_irqs)
-		return &irq_cfgx[irq];
-
-	return NULL;
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
 }
-struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+
+static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
 {
 	return irq_cfg(irq);
 }
 
+/*
+ * Rough estimation of how many shared IRQs there are, can be changed
+ * anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -193,59 +170,29 @@ struct irq_pin_list {
 	struct irq_pin_list *next;
 };
 
-static struct irq_pin_list *irq_2_pin_head;
-/* fill one page ? */
-static int nr_irq_2_pin = 0x100;
+static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
 static struct irq_pin_list *irq_2_pin_ptr;
-static void __init irq_2_pin_init_work(void *data)
+
+static void __init irq_2_pin_init(void)
 {
-	struct dyn_array *da = data;
-	struct irq_pin_list *pin;
+	struct irq_pin_list *pin = irq_2_pin_head;
 	int i;
 
-	pin = *da->name;
-
-	for (i = 1; i < *da->nr; i++)
+	for (i = 1; i < PIN_MAP_SIZE; i++)
 		pin[i-1].next = &pin[i];
 
 	irq_2_pin_ptr = &pin[0];
 }
-DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work);
 
 static struct irq_pin_list *get_one_free_irq_2_pin(void)
 {
-	struct irq_pin_list *pin;
-	int i;
-
-	pin = irq_2_pin_ptr;
-
-	if (pin) {
-		irq_2_pin_ptr = pin->next;
-		pin->next = NULL;
-		return pin;
-	}
-
-	/*
-	 *  we run out of pre-allocate ones, allocate more
-	 */
-	printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin);
-
-	if (after_bootmem)
-		pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin,
-				 GFP_ATOMIC);
-	else
-		pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) *
-				nr_irq_2_pin, PAGE_SIZE, 0);
+	struct irq_pin_list *pin = irq_2_pin_ptr;
 
 	if (!pin)
 		panic("can not get more irq_2_pin\n");
 
-	for (i = 1; i < nr_irq_2_pin; i++)
-		pin[i-1].next = &pin[i];
-
 	irq_2_pin_ptr = pin->next;
 	pin->next = NULL;
-
 	return pin;
 }
 
@@ -284,8 +231,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
 static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
-        if (sis_apic_bug)
-                writel(reg, &io_apic->index);
+
+	if (sis_apic_bug)
+		writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
 
@@ -1044,11 +992,11 @@ static int pin_2_irq(int idx, int apic, int pin)
 		while (i < apic)
 			irq += nr_ioapic_registers[i++];
 		irq += pin;
-                /*
+		/*
                  * For MPS mode, so far only needed by ES7000 platform
                  */
-                if (ioapic_renumber_irq)
-                        irq = ioapic_renumber_irq(apic, irq);
+		if (ioapic_renumber_irq)
+			irq = ioapic_renumber_irq(apic, irq);
 	}
 
 #ifdef CONFIG_X86_32
@@ -1232,19 +1180,19 @@ static struct irq_chip ir_ioapic_chip;
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
 {
-        int apic, idx, pin;
+	int apic, idx, pin;
 
-        for (apic = 0; apic < nr_ioapics; apic++) {
-                for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-                        idx = find_irq_entry(apic, pin, mp_INT);
-                        if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
-                                return irq_trigger(idx);
-                }
-        }
-        /*
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			idx = find_irq_entry(apic, pin, mp_INT);
+			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+				return irq_trigger(idx);
+		}
+	}
+	/*
          * nonexistent IRQs are edge default
          */
-        return 0;
+	return 0;
 }
 #else
 static inline int IO_APIC_irq_trigger(int irq)
@@ -1509,8 +1457,8 @@ __apicdebuginit(void) print_IO_APIC(void)
 	reg_01.raw = io_apic_read(apic, 1);
 	if (reg_01.bits.version >= 0x10)
 		reg_02.raw = io_apic_read(apic, 2);
-        if (reg_01.bits.version >= 0x20)
-                reg_03.raw = io_apic_read(apic, 3);
+	if (reg_01.bits.version >= 0x20)
+		reg_03.raw = io_apic_read(apic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
@@ -2089,9 +2037,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
 #else
 static int ioapic_retrigger_irq(unsigned int irq)
 {
-        send_IPI_self(irq_cfg(irq)->vector);
+	send_IPI_self(irq_cfg(irq)->vector);
 
-        return 1;
+	return 1;
 }
 #endif
 
@@ -2189,7 +2137,7 @@ static int migrate_irq_remapped_level(int irq)
 
 	if (io_apic_level_ack_pending(irq)) {
 		/*
-	 	 * Interrupt in progress. Migrating irq now will change the
+		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
 		 * the EOI broadcast performed by cpu.
 		 * So, delay the irq migration to the next instance.
@@ -2426,28 +2374,28 @@ static void ack_apic_level(unsigned int irq)
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
-	.name 		= "IO-APIC",
-	.startup 	= startup_ioapic_irq,
-	.mask	 	= mask_IO_APIC_irq,
-	.unmask	 	= unmask_IO_APIC_irq,
-	.ack 		= ack_apic_edge,
-	.eoi 		= ack_apic_level,
+	.name		= "IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= mask_IO_APIC_irq,
+	.unmask		= unmask_IO_APIC_irq,
+	.ack		= ack_apic_edge,
+	.eoi		= ack_apic_level,
 #ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity_irq,
+	.set_affinity	= set_ioapic_affinity_irq,
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
 
 #ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip __read_mostly = {
-	.name 		= "IR-IO-APIC",
-	.startup 	= startup_ioapic_irq,
-	.mask	 	= mask_IO_APIC_irq,
-	.unmask	 	= unmask_IO_APIC_irq,
-	.ack 		= ack_x2apic_edge,
-	.eoi 		= ack_x2apic_level,
+	.name		= "IR-IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= mask_IO_APIC_irq,
+	.unmask		= unmask_IO_APIC_irq,
+	.ack		= ack_x2apic_edge,
+	.eoi		= ack_x2apic_level,
 #ifdef CONFIG_SMP
-	.set_affinity 	= set_ir_ioapic_affinity_irq,
+	.set_affinity	= set_ir_ioapic_affinity_irq,
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
@@ -2636,8 +2584,8 @@ static inline void __init check_timer(void)
 
 	local_irq_save(flags);
 
-        ver = apic_read(APIC_LVR);
-        ver = GET_APIC_VERSION(ver);
+	ver = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(ver);
 
 	/*
 	 * get/set the timer IRQ vector:
@@ -2822,12 +2770,12 @@ void __init setup_IO_APIC(void)
 	io_apic_irqs = ~PIC_IRQS;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-        /*
+	/*
          * Set up IO-APIC IRQ routing.
          */
 #ifdef CONFIG_X86_32
-        if (!acpi_ioapic)
-                setup_ioapic_ids_from_mpc();
+	if (!acpi_ioapic)
+		setup_ioapic_ids_from_mpc();
 #endif
 	sync_Arb_IDs();
 	setup_IO_APIC_irqs();
@@ -2842,9 +2790,9 @@ void __init setup_IO_APIC(void)
 
 static int __init io_apic_bug_finalize(void)
 {
-        if (sis_apic_bug == -1)
-                sis_apic_bug = 0;
-        return 0;
+	if (sis_apic_bug == -1)
+		sis_apic_bug = 0;
+	return 0;
 }
 
 late_initcall(io_apic_bug_finalize);
@@ -3199,7 +3147,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 	if (index < 0) {
 		printk(KERN_ERR
 		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
-		        pci_name(dev));
+		       pci_name(dev));
 		return -ENOSPC;
 	}
 	return index;
@@ -3885,23 +3833,24 @@ static struct resource * __init ioapic_setup_resources(void)
 void __init ioapic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-	int i;
 	struct resource *ioapic_res;
+	int i;
 
+	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
 			ioapic_phys = mp_ioapics[i].mp_apicaddr;
 #ifdef CONFIG_X86_32
-                        if (!ioapic_phys) {
-                                printk(KERN_ERR
-                                       "WARNING: bogus zero IO-APIC "
-                                       "address found in MPTABLE, "
-                                       "disabling IO/APIC support!\n");
-                                smp_found_config = 0;
-                                skip_ioapic_setup = 1;
-                                goto fake_ioapic_page;
-                        }
+			if (!ioapic_phys) {
+				printk(KERN_ERR
+				       "WARNING: bogus zero IO-APIC "
+				       "address found in MPTABLE, "
+				       "disabling IO/APIC support!\n");
+				smp_found_config = 0;
+				skip_ioapic_setup = 1;
+				goto fake_ioapic_page;
+			}
 #endif
 		} else {
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2b7dab699e83..410c88f0bfeb 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size, old_size, da_size;
+	ssize_t size, old_size;
 	char *ptr;
 	int cpu;
 	unsigned long align = 1;
@@ -150,9 +150,8 @@ void __init setup_per_cpu_areas(void)
 
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
-	da_size = per_cpu_dyn_array_size(&align);
 	align = max_t(unsigned long, PAGE_SIZE, align);
-	size = roundup(old_size + da_size, align);
+	size = roundup(old_size, align);
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
@@ -182,9 +181,6 @@ void __init setup_per_cpu_areas(void)
 #endif
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
-		per_cpu_alloc_dyn_array(cpu, ptr + old_size);
-
 	}
 
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 817aa55a1209..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 	/*
 	 * handle this 'virtual interrupt' as a Cobalt one now.
 	 */
-	kstat_irqs_this_cpu(desc)++;
+	kstat_incr_irqs_this_cpu(realirq, desc);
 
 	if (likely(desc->action != NULL))
 		handle_IRQ_event(realirq, desc->action);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index c36007ab3940..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -145,7 +145,6 @@ SECTIONS
 	*(.x86_cpu_dev.init)
 	__x86_cpu_dev_end = .;
   }
-  DYN_ARRAY_INIT(8)
   SECURITY_INIT
   . = ALIGN(4);
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 30973dbac8c2..3245ad72594a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -174,8 +174,6 @@ SECTIONS
   }
   __x86_cpu_dev_end = .;
 
-  DYN_ARRAY_INIT(8)
-
   SECURITY_INIT
 
   . = ALIGN(8);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index bb6bc721b13d..5601506f2dd9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
 		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
 	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
 
-	kstat_irqs_this_cpu(irq_to_desc(irq))++;
+	kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
 
 out:
 	raw_local_irq_restore(flags);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 9ce80213007b..1137d2976043 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,12 +558,7 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-static struct timer_rand_state **irq_timer_state;
-DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL);
-#else
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
-#endif
 
 static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
 {
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 0f43b265eee6..950769e87475 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -19,20 +19,13 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-static struct irq_2_iommu *irq_2_iommuX;
-DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL);
-#else
 static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
-#endif
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	if (irq < nr_irqs)
-		return &irq_2_iommuX[irq];
-
-	return NULL;
+	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
 }
+
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c68eda9d9a90..7440a0dceddb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -210,19 +210,6 @@
  * All archs are supposed to use RO_DATA() */
 #define RODATA RO_DATA(4096)
 
-#define DYN_ARRAY_INIT(align)							\
-	. = ALIGN((align));						\
-	.dyn_array.init : AT(ADDR(.dyn_array.init) - LOAD_OFFSET) {	\
-		VMLINUX_SYMBOL(__dyn_array_start) = .;			\
-		*(.dyn_array.init)					\
-		VMLINUX_SYMBOL(__dyn_array_end) = .;			\
-	}								\
-	. = ALIGN((align));						\
-	.per_cpu_dyn_array.init : AT(ADDR(.per_cpu_dyn_array.init) - LOAD_OFFSET) {	\
-		VMLINUX_SYMBOL(__per_cpu_dyn_array_start) = .;		\
-		*(.per_cpu_dyn_array.init)				\
-		VMLINUX_SYMBOL(__per_cpu_dyn_array_end) = .;		\
-	}
 #define SECURITY_INIT							\
 	.security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \
 		VMLINUX_SYMBOL(__security_initcall_start) = .;		\
diff --git a/include/linux/init.h b/include/linux/init.h
index 59fbb4aaba6a..70ad53e1eab8 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -247,49 +247,6 @@ struct obs_kernel_param {
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
 
-struct dyn_array {
-	void **name;
-	unsigned long size;
-	unsigned int *nr;
-	unsigned long align;
-	void (*init_work)(void *);
-};
-extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
-extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[];
-
-#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
-		static struct dyn_array __dyn_array_##nameX __initdata = \
-		{	.name = (void **)&(nameX),\
-			.size = sizeX,\
-			.nr   = &(nrX),\
-			.align = alignX,\
-			.init_work = init_workX,\
-		}; \
-		static struct dyn_array *__dyn_array_ptr_##nameX __used \
-		__attribute__((__section__(".dyn_array.init"))) = \
-			&__dyn_array_##nameX
-
-#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
-	DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX)
-
-#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
-		static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \
-		{	.name = (void **)&(addrX),\
-			.size = sizeX,\
-			.nr   = &(nrX),\
-			.align = alignX,\
-			.init_work = init_workX,\
-		}; \
-		static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \
-		__attribute__((__section__(".per_cpu_dyn_array.init"))) = \
-			&__per_cpu_dyn_array_##nameX
-
-#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
-	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
-
-extern void pre_alloc_dyn_array(void);
-extern unsigned long per_cpu_dyn_array_size(unsigned long *align);
-extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3f33c7790300..38bf89f2ade0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -139,8 +139,6 @@ struct irq_chip {
 	const char	*typename;
 };
 
-struct timer_rand_state;
-struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -167,9 +165,6 @@ struct irq_2_iommu;
  */
 struct irq_desc {
 	unsigned int		irq;
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned int            *kstat_irqs;
-#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -198,23 +193,13 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
-/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
-#else
-extern struct irq_desc *irq_desc;
-#endif
 
 static inline struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	return (irq < nr_irqs) ? irq_desc + irq : NULL;
 }
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-#define kstat_irqs_this_cpu(DESC) \
-	((DESC)->kstat_irqs[smp_processor_id()])
-#endif
-
 /*
  * Migration helpers for obsolete names, they will go away:
  */
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 21249d8c1293..a9d0d360b776 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,9 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifndef CONFIG_HAVE_DYN_ARRAY
        unsigned int irqs[NR_IRQS];
-#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -41,20 +39,18 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
-#define kstat_irqs_this_cpu(irq) \
-	(kstat_this_cpu.irqs[irq])
-#endif
+struct irq_desc;
 
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
+					    struct irq_desc *desc)
+{
+	kstat_this_cpu.irqs[irq]++;
+}
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
-#else
-extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
-#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
diff --git a/init/Makefile b/init/Makefile
index dc5eeca6eb6d..4a243df426f7 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y                          := main.o dyn_array.o version.o mounts.o
+obj-y                          := main.o version.o mounts.o
 ifneq ($(CONFIG_BLK_DEV_INITRD),y)
 obj-y                          += noinitramfs.o
 else
diff --git a/init/dyn_array.c b/init/dyn_array.c
deleted file mode 100644
index c8d5e2a18588..000000000000
--- a/init/dyn_array.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/kallsyms.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/irq.h>
-
-void __init pre_alloc_dyn_array(void)
-{
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned long total_size = 0, size, phys;
-	unsigned long max_align = 1;
-	struct dyn_array **daa;
-	char *ptr;
-
-	/* get the total size at first */
-	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
-		struct dyn_array *da = *daa;
-
-		printk(KERN_DEBUG "dyn_array %pF size:%#lx nr:%d align:%#lx\n",
-			da->name, da->size, *da->nr, da->align);
-		size = da->size * (*da->nr);
-		total_size += roundup(size, da->align);
-		if (da->align > max_align)
-			max_align = da->align;
-	}
-	if (total_size)
-		printk(KERN_DEBUG "dyn_array total_size: %#lx\n",
-			 total_size);
-	else
-		return;
-
-	/* allocate them all together */
-	max_align = max_t(unsigned long, max_align, PAGE_SIZE);
-	ptr = __alloc_bootmem(total_size, max_align, 0);
-	phys = virt_to_phys(ptr);
-
-	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
-		struct dyn_array *da = *daa;
-
-		size = da->size * (*da->nr);
-		phys = roundup(phys, da->align);
-		printk(KERN_DEBUG "dyn_array %pF ==> [%#lx - %#lx]\n",
-			da->name, phys, phys + size);
-		*da->name = phys_to_virt(phys);
-
-		phys += size;
-
-		if (da->init_work)
-			da->init_work(da);
-	}
-#else
-#ifdef CONFIG_GENERIC_HARDIRQS
-	unsigned int i;
-
-	for (i = 0; i < NR_IRQS; i++)
-		irq_desc[i].irq = i;
-#endif
-#endif
-}
-
-unsigned long __init per_cpu_dyn_array_size(unsigned long *align)
-{
-	unsigned long total_size = 0;
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned long size;
-	struct dyn_array **daa;
-	unsigned max_align = 1;
-
-	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
-		struct dyn_array *da = *daa;
-
-		printk(KERN_DEBUG "per_cpu_dyn_array %pF size:%#lx nr:%d align:%#lx\n",
-			da->name, da->size, *da->nr, da->align);
-		size = da->size * (*da->nr);
-		total_size += roundup(size, da->align);
-		if (da->align > max_align)
-			max_align = da->align;
-	}
-	if (total_size) {
-		printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n",
-			 total_size);
-		*align = max_align;
-	}
-#endif
-	return total_size;
-}
-
-#ifdef CONFIG_SMP
-void __init per_cpu_alloc_dyn_array(int cpu, char *ptr)
-{
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned long size, phys;
-	struct dyn_array **daa;
-	unsigned long addr;
-	void **array;
-
-	phys = virt_to_phys(ptr);
-	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
-		struct dyn_array *da = *daa;
-
-		size = da->size * (*da->nr);
-		phys = roundup(phys, da->align);
-		printk(KERN_DEBUG "per_cpu_dyn_array %pF ==> [%#lx - %#lx]\n",
-			da->name, phys, phys + size);
-
-		addr = (unsigned long)da->name;
-		addr += per_cpu_offset(cpu);
-		array = (void **)addr;
-		*array = phys_to_virt(phys);
-		*da->name = *array; /* so init_work could use it directly */
-
-		phys += size;
-
-		if (da->init_work)
-			da->init_work(da);
-	}
-#endif
-}
-#endif
diff --git a/init/main.c b/init/main.c
index e81cf427d9c7..27f6bf6108e9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -391,23 +391,17 @@ EXPORT_SYMBOL(__per_cpu_offset);
 
 static void __init setup_per_cpu_areas(void)
 {
-	unsigned long size, i, old_size;
+	unsigned long size, i;
 	char *ptr;
 	unsigned long nr_possible_cpus = num_possible_cpus();
-	unsigned long align = 1;
-	unsigned da_size;
 
 	/* Copy section for each CPU (we discard the original) */
-	old_size = PERCPU_ENOUGH_ROOM;
-	da_size = per_cpu_dyn_array_size(&align);
-	align = max_t(unsigned long, PAGE_SIZE, align);
-	size = ALIGN(old_size + da_size, align);
+	size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
 	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-		per_cpu_alloc_dyn_array(i, ptr + old_size);
 		ptr += size;
 	}
 }
@@ -573,7 +567,6 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE);
 	printk(linux_banner);
 	setup_arch(&command_line);
-	pre_alloc_dyn_array();
 	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
 	unwind_setup();
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e6f73dbfcc3d..d96d6f687c48 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -326,11 +326,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 
 	action = desc->action;
 	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -371,11 +367,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/*
 	 * If its disabled or no action available
@@ -422,11 +414,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		goto out;
 
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/*
 	 * If its disabled or no action available
@@ -490,11 +478,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		mask_ack_irq(desc, irq);
 		goto out_unlock;
 	}
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
@@ -549,11 +533,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
 	irqreturn_t action_ret;
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f837133cdfbe..9fe86b3a60a5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,11 +18,6 @@
 
 #include "internals.h"
 
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -30,15 +25,10 @@ static struct lock_class_key irq_desc_lock_class;
  *
  * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
  */
-void
-handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 {
 	print_irq_desc(irq, desc);
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
 	ack_bad_irq(irq);
 }
 
@@ -59,80 +49,6 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-static struct irq_desc irq_desc_init = {
-	.irq = -1U,
-	.status = IRQ_DISABLED,
-	.chip = &no_irq_chip,
-	.handle_irq = handle_bad_irq,
-	.depth = 1,
-	.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-	.affinity = CPU_MASK_ALL
-#endif
-};
-
-
-static void init_one_irq_desc(struct irq_desc *desc)
-{
-	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
-	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-}
-
-extern int after_bootmem;
-extern void *__alloc_bootmem_nopanic(unsigned long size,
-			     unsigned long align,
-			     unsigned long goal);
-
-static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr)
-{
-	unsigned long bytes, total_bytes;
-	char *ptr;
-	int i;
-	unsigned long phys;
-
-	/* Compute how many bytes we need per irq and allocate them */
-	bytes = nr * sizeof(unsigned int);
-	total_bytes = bytes * nr_desc;
-	if (after_bootmem)
-		ptr = kzalloc(total_bytes, GFP_ATOMIC);
-	else
-		ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0);
-
-	if (!ptr)
-		panic(" can not allocate kstat_irqs\n");
-
-	phys = __pa(ptr);
-	printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes);
-
-	for (i = 0; i < nr_desc; i++) {
-		desc[i].kstat_irqs = (unsigned int *)ptr;
-		ptr += bytes;
-	}
-}
-
-static void __init init_work(void *data)
-{
-	struct dyn_array *da = data;
-	int i;
-	struct  irq_desc *desc;
-
-	desc = *da->name;
-
-	for (i = 0; i < *da->nr; i++) {
-		init_one_irq_desc(&desc[i]);
-		desc[i].irq = i;
-	}
-
-	/* init kstat_irqs, nr_cpu_ids is ready already */
-	init_kstat_irqs(desc, *da->nr, nr_cpu_ids);
-}
-
-struct irq_desc *irq_desc;
-DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);
-
-#else
-
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -146,8 +62,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
-#endif
-
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
@@ -258,11 +172,8 @@ unsigned int __do_IRQ(unsigned int irq)
 	struct irqaction *action;
 	unsigned int status;
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	kstat_irqs_this_cpu(desc)++;
-#else
-	kstat_irqs_this_cpu(irq)++;
-#endif
+	kstat_incr_irqs_this_cpu(irq, desc);
+
 	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
@@ -351,23 +262,16 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 void early_init_irq_lock_class(void)
 {
-#ifndef CONFIG_HAVE_DYN_ARRAY
 	int i;
 
 	for (i = 0; i < nr_irqs; i++)
 		lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
-#endif
 }
 #endif
-
-#ifdef CONFIG_HAVE_DYN_ARRAY
-unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	return desc->kstat_irqs[cpu];
-}
-#endif
-EXPORT_SYMBOL(kstat_irqs_cpu);
-
-- 
cgit v1.2.3


From a1aca5de08a0cb840a90fb3f729a5940f8d21185 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 15 Oct 2008 19:29:15 +0200
Subject: genirq: remove artifacts from sparseirq removal

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                     | 1 +
 arch/x86/kernel/acpi/boot.c      | 1 -
 arch/x86/kernel/irqinit_32.c     | 1 -
 arch/x86/kernel/vmlinux_64.lds.S | 1 -
 arch/x86/mm/init_32.c            | 3 ---
 include/linux/init.h             | 1 -
 include/linux/kernel_stat.h      | 2 +-
 7 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 071004d3a1b1..0267babe5eb9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -102,3 +102,4 @@ config HAVE_CLK
 	help
 	  The <linux/clk.h> calls support software clock gating and
 	  thus are a key power management tool on many systems.
+
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 5fef4fece4a5..0d1c26a583c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1254,7 +1254,6 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 		return count;
 	}
 
-
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
 				  nr_irqs);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index a8d35998d308..845aa9803e80 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -184,4 +184,3 @@ void __init native_init_IRQ(void)
 
 	irq_ctx_init(smp_processor_id());
 }
-
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 3245ad72594a..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -173,7 +173,6 @@ SECTIONS
 	*(.x86_cpu_dev.init)
   }
   __x86_cpu_dev_end = .;
-
   SECURITY_INIT
 
   . = ALIGN(8);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 91343c2694b4..8396868e82c5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -66,7 +66,6 @@ static unsigned long __meminitdata table_end;
 static unsigned long __meminitdata table_top;
 
 static int __initdata after_init_bootmem;
-int after_bootmem;
 
 static __init void *alloc_low_page(unsigned long *phys)
 {
@@ -989,8 +988,6 @@ void __init mem_init(void)
 
 	set_highmem_pages_init();
 
-	after_bootmem = 1;
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/include/linux/init.h b/include/linux/init.h
index 70ad53e1eab8..93538b696e3d 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -246,7 +246,6 @@ struct obs_kernel_param {
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
-
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a9d0d360b776..89b6ecd41473 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-       unsigned int irqs[NR_IRQS];
+	unsigned int irqs[NR_IRQS];
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
-- 
cgit v1.2.3


From 811410fdb6b9d82a518542289efe9b2a51e3cbfb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 14:16:11 +0200
Subject: genirq: add reverse iterator for irq_desc

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 38bf89f2ade0..31632aa65d16 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -21,6 +21,10 @@ extern int nr_irqs;
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+
+# define for_each_irq_desc_reverse(irq, desc)			\
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
+	     irq >= 0; irq--, desc--)
 #endif
 
 #ifndef CONFIG_S390
-- 
cgit v1.2.3


From 2be3b52a5785a6a5c5349fbd315f57595f7074be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 14:50:27 +0200
Subject: proc: fixup irq iterator

There is no need for irq_desc here. Even for sparse_irq we can
handle this cleverly in for_each_irq_nr().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/proc_misc.c | 7 ++-----
 include/linux/irq.h | 3 +++
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 3f5c7b9d1a70..97b4579134d5 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -509,9 +509,6 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
-#ifdef CONFIG_GENERIC_HARDIRQS
-	struct irq_desc *desc;
-#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -530,7 +527,7 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 
-		for_each_irq_desc(j, desc)
+		for_each_irq_nr(j)
 			sum += kstat_irqs_cpu(j, i);
 
 		sum += arch_irq_stat_cpu(i);
@@ -575,7 +572,7 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_desc(j, desc) {
+	for_each_irq_nr(j) {
 		per_irq_sum = 0;
 
 		for_each_possible_cpu(i)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 31632aa65d16..0618fb362cb4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -27,6 +27,9 @@ extern int nr_irqs;
 	     irq > 0; irq--, desc--)
 #endif
 
+#define for_each_irq_nr(irq)			\
+	for (irq = 0; irq < nr_irqs; irq++)
+
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
-- 
cgit v1.2.3


From 3481f21097cb560392c411377893b5109fbde557 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Thu, 16 Oct 2008 16:31:55 -0700
Subject: dmar: context cache and IOTLB invalidation using queued invalidation

Implement context cache invalidation and IOTLB invalidation using
the queued invalidation interface. This interface will be used by
DMA remapping when queued invalidation is supported.

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c          | 56 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/intel-iommu.h | 21 +++++++++++++++++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index b64cec190542..0f409e23631e 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -645,6 +645,62 @@ void qi_global_iec(struct intel_iommu *iommu)
 	qi_submit_sync(&desc, iommu);
 }
 
+int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		     u64 type, int non_present_entry_flush)
+{
+
+	struct qi_desc desc;
+
+	if (non_present_entry_flush) {
+		if (!cap_caching_mode(iommu->cap))
+			return 1;
+		else
+			did = 0;
+	}
+
+	desc.low = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did)
+			| QI_CC_GRAN(type) | QI_CC_TYPE;
+	desc.high = 0;
+
+	qi_submit_sync(&desc, iommu);
+
+	return 0;
+
+}
+
+int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+		   unsigned int size_order, u64 type,
+		   int non_present_entry_flush)
+{
+	u8 dw = 0, dr = 0;
+
+	struct qi_desc desc;
+	int ih = 0;
+
+	if (non_present_entry_flush) {
+		if (!cap_caching_mode(iommu->cap))
+			return 1;
+		else
+			did = 0;
+	}
+
+	if (cap_write_drain(iommu->cap))
+		dw = 1;
+
+	if (cap_read_drain(iommu->cap))
+		dr = 1;
+
+	desc.low = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+		| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+		| QI_IOTLB_AM(size_order);
+
+	qi_submit_sync(&desc, iommu);
+
+	return 0;
+
+}
+
 /*
  * Enable Queued Invalidation interface. This is a must to support
  * interrupt-remapping. Also used by DMA-remapping, which replaces
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 2e117f30a76c..0c5f5e49107b 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -127,6 +127,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 
 
 /* IOTLB_REG */
+#define DMA_TLB_FLUSH_GRANU_OFFSET  60
 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
 #define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
 #define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
@@ -140,6 +141,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define DMA_TLB_MAX_SIZE (0x3f)
 
 /* INVALID_DESC */
+#define DMA_CCMD_INVL_GRANU_OFFSET  61
 #define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
 #define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
 #define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
@@ -238,6 +240,19 @@ enum {
 #define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
 #define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
 
+#define QI_IOTLB_DID(did) 	(((u64)did) << 16)
+#define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
+#define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
+#define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
+#define QI_IOTLB_ADDR(addr)	(((u64)addr) & PAGE_MASK_4K)
+#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
+#define QI_IOTLB_AM(am)		(((u8)am))
+
+#define QI_CC_FM(fm)		(((u64)fm) << 48)
+#define QI_CC_SID(sid)		(((u64)sid) << 32)
+#define QI_CC_DID(did)		(((u64)did) << 16)
+#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
+
 struct qi_desc {
 	u64 low, high;
 };
@@ -303,6 +318,12 @@ extern void free_iommu(struct intel_iommu *iommu);
 extern int dmar_enable_qi(struct intel_iommu *iommu);
 extern void qi_global_iec(struct intel_iommu *iommu);
 
+extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
+			        u8 fm, u64 type, int non_present_entry_flush);
+extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			  unsigned int size_order, u64 type,
+			  int non_present_entry_flush);
+
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
 void intel_iommu_domain_exit(struct dmar_domain *domain);
-- 
cgit v1.2.3


From a77b67d4023770805141014b8fa9eb5467457817 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Thu, 16 Oct 2008 16:31:56 -0700
Subject: dmar: Use queued invalidation interface for IOTLB and context
 invalidation

If the queued invalidation interface is available and enabled, it
will be used instead of the register-based interface.

According to the VT-d2 specification, when queued invalidation is enabled,
invalidation commands can be submitted only through the invalidation queue
and not through the command register interface.

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c   | 95 +++++++++++++++++++++------------------------
 include/linux/intel-iommu.h |  8 ++++
 2 files changed, 53 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index fc5f2dbf5323..509470419130 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -567,27 +567,6 @@ static int __iommu_flush_context(struct intel_iommu *iommu,
 	return 0;
 }
 
-static int inline iommu_flush_context_global(struct intel_iommu *iommu,
-	int non_present_entry_flush)
-{
-	return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
-		non_present_entry_flush);
-}
-
-static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
-	int non_present_entry_flush)
-{
-	return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
-		non_present_entry_flush);
-}
-
-static int inline iommu_flush_context_device(struct intel_iommu *iommu,
-	u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
-{
-	return __iommu_flush_context(iommu, did, source_id, function_mask,
-		DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
-}
-
 /* return value determine if we need a write buffer flush */
 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 	u64 addr, unsigned int size_order, u64 type,
@@ -660,20 +639,6 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 	return 0;
 }
 
-static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
-	int non_present_entry_flush)
-{
-	return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
-		non_present_entry_flush);
-}
-
-static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
-	int non_present_entry_flush)
-{
-	return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
-		non_present_entry_flush);
-}
-
 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	u64 addr, unsigned int pages, int non_present_entry_flush)
 {
@@ -684,8 +649,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 
 	/* Fallback to domain selective flush if no PSI support */
 	if (!cap_pgsel_inv(iommu->cap))
-		return iommu_flush_iotlb_dsi(iommu, did,
-			non_present_entry_flush);
+		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
+						DMA_TLB_DSI_FLUSH,
+						non_present_entry_flush);
 
 	/*
 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
@@ -694,11 +660,12 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	mask = ilog2(__roundup_pow_of_two(pages));
 	/* Fallback to domain selective flush if size is too big */
 	if (mask > cap_max_amask_val(iommu->cap))
-		return iommu_flush_iotlb_dsi(iommu, did,
-			non_present_entry_flush);
+		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
+			DMA_TLB_DSI_FLUSH, non_present_entry_flush);
 
-	return __iommu_flush_iotlb(iommu, did, addr, mask,
-		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
+	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
+					DMA_TLB_PSI_FLUSH,
+					non_present_entry_flush);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1204,11 +1171,13 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	__iommu_flush_cache(iommu, context, sizeof(*context));
 
 	/* it's a non-present to present mapping */
-	if (iommu_flush_context_device(iommu, domain->id,
-			(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
+	if (iommu->flush.flush_context(iommu, domain->id,
+		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
+		DMA_CCMD_DEVICE_INVL, 1))
 		iommu_flush_write_buffer(iommu);
 	else
-		iommu_flush_iotlb_dsi(iommu, 0, 0);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
+
 	spin_unlock_irqrestore(&iommu->lock, flags);
 	return 0;
 }
@@ -1310,8 +1279,10 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
 {
 	clear_context_table(domain->iommu, bus, devfn);
-	iommu_flush_context_global(domain->iommu, 0);
-	iommu_flush_iotlb_global(domain->iommu, 0);
+	domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
+					   DMA_CCMD_GLOBAL_INVL, 0);
+	domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH, 0);
 }
 
 static void domain_remove_dev_info(struct dmar_domain *domain)
@@ -1662,6 +1633,28 @@ int __init init_dmars(void)
 		}
 	}
 
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+
+		iommu = drhd->iommu;
+		if (dmar_enable_qi(iommu)) {
+			/*
+			 * Queued Invalidate not enabled, use Register Based
+			 * Invalidate
+			 */
+			iommu->flush.flush_context = __iommu_flush_context;
+			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
+			printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
+			       "invalidation\n", drhd->reg_base_addr);
+		} else {
+			iommu->flush.flush_context = qi_flush_context;
+			iommu->flush.flush_iotlb = qi_flush_iotlb;
+			printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
+			       "invalidation\n", drhd->reg_base_addr);
+		}
+	}
+
 	/*
 	 * For each rmrr
 	 *   for each dev attached to rmrr
@@ -1714,9 +1707,10 @@ int __init init_dmars(void)
 
 		iommu_set_root_entry(iommu);
 
-		iommu_flush_context_global(iommu, 0);
-		iommu_flush_iotlb_global(iommu, 0);
-
+		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
+					   0);
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
+					 0);
 		iommu_disable_protect_mem_regions(iommu);
 
 		ret = iommu_enable_translation(iommu);
@@ -1891,7 +1885,8 @@ static void flush_unmaps(void)
 			struct intel_iommu *iommu =
 				deferred_flush[i].domain[0]->iommu;
 
-			iommu_flush_iotlb_global(iommu, 0);
+			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+						 DMA_TLB_GLOBAL_FLUSH, 0);
 			for (j = 0; j < deferred_flush[i].next; j++) {
 				__free_iova(&deferred_flush[i].domain[j]->iovad,
 						deferred_flush[i].iova[j]);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0c5f5e49107b..afb0d2a5b7cd 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -278,6 +278,13 @@ struct ir_table {
 };
 #endif
 
+struct iommu_flush {
+	int (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
+		u64 type, int non_present_entry_flush);
+	int (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
+		unsigned int size_order, u64 type, int non_present_entry_flush);
+};
+
 struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
 	u64		cap;
@@ -297,6 +304,7 @@ struct intel_iommu {
 	unsigned char name[7];    /* Device Name */
 	struct msi_msg saved_msg;
 	struct sys_device sysdev;
+	struct iommu_flush flush;
 #endif
 	struct q_inval  *qi;            /* Queued invalidation info */
 #ifdef CONFIG_INTR_REMAP
-- 
cgit v1.2.3


From e62b4853983d032dcb3cde9fb20407dc556f47bc Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Thu, 16 Oct 2008 21:14:11 -0700
Subject: sched: kill unused scheduler decl.

I noticed this while investigating the tbench regressions.
Please apply.

sched: Remove hrtick_resched() extern decl.

This function was removed by 31656519e132f6612584815f128c83976a9aaaef
("sched, x86: clean up hrtick implementation").

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c226c7b82946..6eda6ad735dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -287,7 +287,6 @@ extern void trap_init(void);
 extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
-extern void hrtick_resched(void);
 
 extern void sched_show_task(struct task_struct *p);
 
-- 
cgit v1.2.3


From fe11edfaabf1787c05d782a7b33e6497d1118b1d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_MEDIA_CHANGED -> IDE_DFLAG_MEDIA_CHANGED

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c       | 6 +++---
 drivers/ide/ide-cd_ioctl.c | 4 ++--
 drivers/ide/ide-floppy.c   | 6 +++---
 include/linux/ide.h        | 4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3308b1cd3a33..7dc1a17a4dd8 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -99,7 +99,7 @@ static void ide_cd_put(struct cdrom_info *cd)
 /* Mark that we've seen a media change and invalidate our internal buffers. */
 static void cdrom_saw_media_change(ide_drive_t *drive)
 {
-	drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED;
+	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 	drive->atapi_flags &= ~IDE_AFLAG_TOC_VALID;
 }
 
@@ -1986,8 +1986,8 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 	if (!drive->queue->unplug_delay)
 		drive->queue->unplug_delay = 1;
 
-	drive->atapi_flags = IDE_AFLAG_MEDIA_CHANGED | IDE_AFLAG_NO_EJECT |
-		       ide_cd_flags(id);
+	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
+	drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
 
 	if ((drive->atapi_flags & IDE_AFLAG_VERTOS_300_SSD) &&
 	    fw_rev[4] == '1' && fw_rev[6] <= '2')
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 74231b41f611..37d89ead13dd 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -86,8 +86,8 @@ int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi,
 
 	if (slot_nr == CDSL_CURRENT) {
 		(void) cdrom_check_status(drive, NULL);
-		retval = (drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED) ? 1 : 0;
-		drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED;
+		retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0;
+		drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
 		return retval;
 	} else {
 		return -EINVAL;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3271e56d091c..df410c7191ac 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -689,8 +689,8 @@ static int idefloppy_open(struct inode *inode, struct file *filp)
 			goto out_put_floppy;
 		}
 
-		drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED;
 		ide_set_media_lock(drive, disk, 1);
+		drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 		check_disk_change(inode->i_bdev);
 	} else if (drive->atapi_flags & IDE_AFLAG_FORMAT_IN_PROGRESS) {
 		ret = -EBUSY;
@@ -747,8 +747,8 @@ static int idefloppy_media_changed(struct gendisk *disk)
 		drive->dev_flags &= ~IDE_DFLAG_ATTACH;
 		return 0;
 	}
-	ret = !!(drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED);
-	drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED;
+	ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED);
+	drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
 	return ret;
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c47e371554c1..155a57f55c60 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -464,7 +464,6 @@ struct ide_acpi_hwif_link;
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
-	IDE_AFLAG_MEDIA_CHANGED		= (1 << 1),
 	/* Drive cannot lock the door. */
 	IDE_AFLAG_NO_DOORLOCK		= (1 << 2),
 
@@ -578,7 +577,8 @@ enum {
 	/* don't unload heads */
 	IDE_DFLAG_NO_UNLOAD		= (1 << 27),
 	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= (1 << 28)
+	IDE_DFLAG_PARKED		= (1 << 28),
+	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From da167876bd0f71f1c646e5dd98997544d8d90e8e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_WP -> IDE_DFLAG_WP

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-floppy.c | 8 ++++----
 include/linux/ide.h      | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index df410c7191ac..8078e0826cd3 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -410,11 +410,11 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive)
 	}
 
 	if (pc.buf[3] & 0x80)
-		drive->atapi_flags |= IDE_AFLAG_WP;
+		drive->dev_flags |= IDE_DFLAG_WP;
 	else
-		drive->atapi_flags &= ~IDE_AFLAG_WP;
+		drive->dev_flags &= ~IDE_DFLAG_WP;
 
-	set_disk_ro(disk, !!(drive->atapi_flags & IDE_AFLAG_WP));
+	set_disk_ro(disk, !!(drive->dev_flags & IDE_DFLAG_WP));
 
 	page = &pc.buf[8];
 
@@ -684,7 +684,7 @@ static int idefloppy_open(struct inode *inode, struct file *filp)
 			goto out_put_floppy;
 		}
 
-		if ((drive->atapi_flags & IDE_AFLAG_WP) && (filp->f_mode & 2)) {
+		if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) {
 			ret = -EROFS;
 			goto out_put_floppy;
 		}
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 155a57f55c60..bd0a4d36b6d3 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -503,8 +503,6 @@ enum {
 	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
 	/* Requires BH algorithm for packets */
 	IDE_AFLAG_ZIP_DRIVE		= (1 << 20),
-	/* Write protect */
-	IDE_AFLAG_WP			= (1 << 21),
 	/* Supports format progress report */
 	IDE_AFLAG_SRFP			= (1 << 22),
 
@@ -579,6 +577,8 @@ enum {
 	/* heads unloaded, please don't reset port */
 	IDE_DFLAG_PARKED		= (1 << 28),
 	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
+	/* write protect */
+	IDE_DFLAG_WP			= (1 << 30),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From e01286282eef85e4783b06fb2e0ed84fc111eb32 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_FORMAT_IN_PROGRESS -> IDE_DFLAG_FORMAT_IN_PROGRESS

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-floppy.c       | 6 +++---
 drivers/ide/ide-floppy_ioctl.c | 6 +++---
 include/linux/ide.h            | 3 +--
 3 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 8078e0826cd3..2cf98b531fd9 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -664,7 +664,7 @@ static int idefloppy_open(struct inode *inode, struct file *filp)
 	floppy->openers++;
 
 	if (floppy->openers == 1) {
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
 		/* Just in case */
 
 		if (ide_do_test_unit_ready(drive, disk))
@@ -692,7 +692,7 @@ static int idefloppy_open(struct inode *inode, struct file *filp)
 		ide_set_media_lock(drive, disk, 1);
 		drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 		check_disk_change(inode->i_bdev);
-	} else if (drive->atapi_flags & IDE_AFLAG_FORMAT_IN_PROGRESS) {
+	} else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
 		ret = -EBUSY;
 		goto out_put_floppy;
 	}
@@ -714,7 +714,7 @@ static int idefloppy_release(struct inode *inode, struct file *filp)
 
 	if (floppy->openers == 1) {
 		ide_set_media_lock(drive, disk, 0);
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
 	}
 
 	floppy->openers--;
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index a3a7a0809e2b..b1f391df6cca 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -138,11 +138,11 @@ static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg)
 
 	if (floppy->openers > 1) {
 		/* Don't format if someone is using the disk */
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
 		return -EBUSY;
 	}
 
-	drive->atapi_flags |= IDE_AFLAG_FORMAT_IN_PROGRESS;
+	drive->dev_flags |= IDE_DFLAG_FORMAT_IN_PROGRESS;
 
 	/*
 	 * Send ATAPI_FORMAT_UNIT to the drive.
@@ -174,7 +174,7 @@ static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg)
 
 out:
 	if (err)
-		drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS;
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
 	return err;
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index bd0a4d36b6d3..d111c3ebbbae 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -497,8 +497,6 @@ enum {
 	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 17),
 
 	/* ide-floppy */
-	/* Format in progress */
-	IDE_AFLAG_FORMAT_IN_PROGRESS	= (1 << 18),
 	/* Avoid commands not supported in Clik drive */
 	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
 	/* Requires BH algorithm for packets */
@@ -579,6 +577,7 @@ enum {
 	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
 	/* write protect */
 	IDE_DFLAG_WP			= (1 << 30),
+	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 31),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From 42619d35c7af2f88cad56425fe3981f1f65ff0bd Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: remove IDE_AFLAG_NO_DOORLOCKING

Just use IDE_DFLAG_DOORLOCKING instead.

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c    | 2 +-
 drivers/ide/ide-cd.c       | 2 +-
 drivers/ide/ide-cd_ioctl.c | 4 ++--
 drivers/ide/ide-floppy.c   | 2 +-
 drivers/ide/ide-probe.c    | 1 +
 drivers/ide/ide-tape.c     | 2 +-
 include/linux/ide.h        | 2 --
 7 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2e305714c209..4e58b9e7a58a 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,7 +191,7 @@ int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on)
 {
 	struct ide_atapi_pc pc;
 
-	if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK)
+	if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
 		return 0;
 
 	ide_init_pc(&pc);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 7dc1a17a4dd8..2f4cc10391e5 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1732,7 +1732,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 		return 0;
 
 	if ((buf[8 + 6] & 0x01) == 0)
-		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
 	if (buf[8 + 6] & 0x08)
 		drive->atapi_flags &= ~IDE_AFLAG_NO_EJECT;
 	if (buf[8 + 3] & 0x01)
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 37d89ead13dd..df3df0041eb6 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -136,7 +136,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
 		sense = &my_sense;
 
 	/* If the drive cannot lock the door, just pretend. */
-	if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK) {
+	if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
 		stat = 0;
 	} else {
 		unsigned char cmd[BLK_MAX_CDB];
@@ -157,7 +157,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
 	    (sense->asc == 0x24 || sense->asc == 0x20)) {
 		printk(KERN_ERR "%s: door locking not supported\n",
 			drive->name);
-		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
 		stat = 0;
 	}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 2cf98b531fd9..791a9d6f371c 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -592,7 +592,7 @@ static void idefloppy_setup(ide_drive_t *drive)
 		blk_queue_max_sectors(drive->queue, 64);
 		drive->atapi_flags |= IDE_AFLAG_CLIK_DRIVE;
 		/* IOMEGA Clik! drives do not support lock/unlock commands */
-		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
 	}
 
 	(void) ide_floppy_get_capacity(drive);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 19f8c7770a25..1649ea54f76c 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -208,6 +208,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd)
 		drive->ready_stat = 0;
 		if (ata_id_cdb_intr(id))
 			drive->atapi_flags |= IDE_AFLAG_DRQ_INTERRUPT;
+		drive->dev_flags |= IDE_DFLAG_DOORLOCKING;
 		/* we don't do head unloading on ATAPI devices */
 		drive->dev_flags |= IDE_DFLAG_NO_UNLOAD;
 		return;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d879c7797cde..a99e28f45156 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2108,7 +2108,7 @@ static void idetape_get_mode_sense_results(ide_drive_t *drive)
 
 	/* device lacks locking support according to capabilities page */
 	if ((caps[6] & 1) == 0)
-		drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK;
+		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
 
 	if (caps[7] & 0x02)
 		tape->blk_size = 512;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index d111c3ebbbae..ba51a93fa547 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -464,8 +464,6 @@ struct ide_acpi_hwif_link;
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
-	/* Drive cannot lock the door. */
-	IDE_AFLAG_NO_DOORLOCK		= (1 << 2),
 
 	/* ide-cd */
 	/* Drive cannot eject the disc. */
-- 
cgit v1.2.3


From 79cb380397c834a35952d8497651d93b543ef968 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:13 +0200
Subject: ide: allow device drivers to specify per-device type /proc settings

Turn ide_driver_t's 'proc' field into a ->proc_entries method
(and also the 'settings' field into a ->proc_devsets method).  Then
update all device drivers accordingly.

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c        | 14 ++++++++++++--
 drivers/ide/ide-gd-floppy.c | 16 ++++++++++++++--
 drivers/ide/ide-gd.c        | 16 ++++++++++++++--
 drivers/ide/ide-proc.c      |  6 +++---
 drivers/ide/ide-tape.c      | 14 ++++++++++++--
 drivers/scsi/ide-scsi.c     | 26 +++++++++++++++++---------
 include/linux/ide.h         |  4 ++--
 7 files changed, 74 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 2f4cc10391e5..32073666b9ca 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1908,6 +1908,16 @@ static const struct ide_proc_devset idecd_settings[] = {
 	IDE_PROC_DEVSET(dsc_overlap, 0, 1),
 	{ 0 },
 };
+
+static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive)
+{
+	return idecd_proc;
+}
+
+static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive)
+{
+	return idecd_settings;
+}
 #endif
 
 static const struct cd_list_entry ide_cd_quirks_list[] = {
@@ -2069,8 +2079,8 @@ static ide_driver_t ide_cdrom_driver = {
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
-	.proc			= idecd_proc,
-	.settings		= idecd_settings,
+	.proc_entries		= ide_cd_proc_entries,
+	.proc_devsets		= ide_cd_proc_devsets,
 #endif
 };
 
diff --git a/drivers/ide/ide-gd-floppy.c b/drivers/ide/ide-gd-floppy.c
index 986253418794..082800b9a558 100644
--- a/drivers/ide/ide-gd-floppy.c
+++ b/drivers/ide/ide-gd-floppy.c
@@ -77,6 +77,18 @@ static void ide_disk_release(struct kref *kref)
 	kfree(idkp);
 }
 
+#ifdef CONFIG_IDE_PROC_FS
+static ide_proc_entry_t *ide_floppy_proc_entries(ide_drive_t *drive)
+{
+	return ide_floppy_proc;
+}
+
+static const struct ide_proc_devset *ide_floppy_proc_devsets(ide_drive_t *drive)
+{
+	return ide_floppy_settings;
+}
+#endif
+
 static ide_driver_t ide_gd_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
@@ -90,8 +102,8 @@ static ide_driver_t ide_gd_driver = {
 	.end_request		= ide_floppy_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
-	.proc			= ide_floppy_proc,
-	.settings		= ide_floppy_settings,
+	.proc_entries		= ide_floppy_proc_entries,
+	.proc_devsets		= ide_floppy_proc_devsets,
 #endif
 };
 
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index c08500270b9d..a3d4ad7db2af 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -119,6 +119,18 @@ static void ide_gd_shutdown(ide_drive_t *drive)
 	drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND);
 }
 
+#ifdef CONFIG_IDE_PROC_FS
+static ide_proc_entry_t *ide_disk_proc_entries(ide_drive_t *drive)
+{
+	return ide_disk_proc;
+}
+
+static const struct ide_proc_devset *ide_disk_proc_devsets(ide_drive_t *drive)
+{
+	return ide_disk_settings;
+}
+#endif
+
 static ide_driver_t ide_gd_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
@@ -134,8 +146,8 @@ static ide_driver_t ide_gd_driver = {
 	.end_request		= ide_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
-	.proc			= ide_disk_proc,
-	.settings		= ide_disk_settings,
+	.proc_entries		= ide_disk_proc_entries,
+	.proc_devsets		= ide_disk_proc_devsets,
 #endif
 };
 
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index b26926487cc0..c31d0dd7a532 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -567,10 +567,10 @@ static void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t
 void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver)
 {
 	mutex_lock(&ide_setting_mtx);
-	drive->settings = driver->settings;
+	drive->settings = driver->proc_devsets(drive);
 	mutex_unlock(&ide_setting_mtx);
 
-	ide_add_proc_entries(drive->proc, driver->proc, drive);
+	ide_add_proc_entries(drive->proc, driver->proc_entries(drive), drive);
 }
 
 EXPORT_SYMBOL(ide_proc_register_driver);
@@ -591,7 +591,7 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver)
 {
 	unsigned long flags;
 
-	ide_remove_proc_entries(drive->proc, driver->proc);
+	ide_remove_proc_entries(drive->proc, driver->proc_entries(drive));
 
 	mutex_lock(&ide_setting_mtx);
 	spin_lock_irqsave(&ide_lock, flags);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index a99e28f45156..b2b2e5e8d38e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2298,6 +2298,16 @@ static ide_proc_entry_t idetape_proc[] = {
 	{ "name",	S_IFREG|S_IRUGO,	proc_idetape_read_name,	NULL },
 	{ NULL, 0, NULL, NULL }
 };
+
+static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive)
+{
+	return idetape_proc;
+}
+
+static const struct ide_proc_devset *ide_tape_proc_devsets(ide_drive_t *drive)
+{
+	return idetape_settings;
+}
 #endif
 
 static int ide_tape_probe(ide_drive_t *);
@@ -2315,8 +2325,8 @@ static ide_driver_t idetape_driver = {
 	.end_request		= idetape_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
-	.proc			= idetape_proc,
-	.settings		= idetape_settings,
+	.proc_entries		= ide_tape_proc_entries,
+	.proc_devsets		= ide_tape_proc_devsets,
 #endif
 };
 
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 740bad435995..afc96e844a25 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -343,6 +343,11 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *r
 }
 
 #ifdef CONFIG_IDE_PROC_FS
+static ide_proc_entry_t idescsi_proc[] = {
+	{ "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
+	{ NULL, 0, NULL, NULL }
+};
+
 #define ide_scsi_devset_get(name, field) \
 static int get_##name(ide_drive_t *drive) \
 { \
@@ -378,6 +383,16 @@ static const struct ide_proc_devset idescsi_settings[] = {
 	IDE_PROC_DEVSET(transform, 0,	 3),
 	{ 0 },
 };
+
+static ide_proc_entry_t *ide_scsi_proc_entries(ide_drive_t *drive)
+{
+	return idescsi_proc;
+}
+
+static const struct ide_proc_devset *ide_scsi_proc_devsets(ide_drive_t *drive)
+{
+	return idescsi_settings;
+}
 #endif
 
 /*
@@ -419,13 +434,6 @@ static void ide_scsi_remove(ide_drive_t *drive)
 
 static int ide_scsi_probe(ide_drive_t *);
 
-#ifdef CONFIG_IDE_PROC_FS
-static ide_proc_entry_t idescsi_proc[] = {
-	{ "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL },
-	{ NULL, 0, NULL, NULL }
-};
-#endif
-
 static ide_driver_t idescsi_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
@@ -439,8 +447,8 @@ static ide_driver_t idescsi_driver = {
 	.end_request		= idescsi_end_request,
 	.error                  = idescsi_atapi_error,
 #ifdef CONFIG_IDE_PROC_FS
-	.proc			= idescsi_proc,
-	.settings		= idescsi_settings,
+	.proc_entries		= ide_scsi_proc_entries,
+	.proc_devsets		= ide_scsi_proc_devsets,
 #endif
 };
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ba51a93fa547..488808891acb 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1120,8 +1120,8 @@ struct ide_driver_s {
 	void		(*resume)(ide_drive_t *);
 	void		(*shutdown)(ide_drive_t *);
 #ifdef CONFIG_IDE_PROC_FS
-	ide_proc_entry_t		*proc;
-	const struct ide_proc_devset	*settings;
+	ide_proc_entry_t *		(*proc_entries)(ide_drive_t *);
+	const struct ide_proc_devset *	(*proc_devsets)(ide_drive_t *);
 #endif
 };
 
-- 
cgit v1.2.3


From 806f80a6fc203ad0bde84e5a9e94572617d2ae45 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:14 +0200
Subject: ide: add generic ATA/ATAPI disk driver

* Add struct ide_disk_ops containing protocol-specific methods.

* Add 'struct ide_disk_ops *' to ide_drive_t.

* Convert ide-{disk,floppy} drivers to use struct ide_disk_ops.

* Merge ide-{disk,floppy} drivers into generic ide-gd driver.

While at it:
- ide_disk_init_capacity() -> ide_disk_get_capacity()

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig            |  64 ++++-----
 drivers/ide/Makefile           |  19 ++-
 drivers/ide/ide-disk.c         |  39 +++++-
 drivers/ide/ide-disk.h         |  24 ++--
 drivers/ide/ide-disk_ioctl.c   |   4 +-
 drivers/ide/ide-floppy.c       |  45 ++++--
 drivers/ide/ide-floppy.h       |  58 ++------
 drivers/ide/ide-floppy_ioctl.c |   9 +-
 drivers/ide/ide-gd-floppy.c    | 309 -----------------------------------------
 drivers/ide/ide-gd.c           | 122 +++++++++++++---
 drivers/ide/ide-gd.h           |  44 ++++++
 drivers/leds/Kconfig           |   2 +-
 include/linux/ide.h            |  19 +++
 13 files changed, 303 insertions(+), 455 deletions(-)
 delete mode 100644 drivers/ide/ide-gd-floppy.c
 create mode 100644 drivers/ide/ide-gd.h

(limited to 'include/linux')

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 74a369a6116f..faa974e615da 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -84,21 +84,40 @@ config BLK_DEV_IDE_SATA
 
 	  If unsure, say N.
 
-config BLK_DEV_IDEDISK
-	tristate "Include IDE/ATA-2 DISK support"
-	---help---
-	  This will include enhanced support for MFM/RLL/IDE hard disks.  If
-	  you have a MFM/RLL/IDE disk, and there is no special reason to use
-	  the old hard disk driver instead, say Y.  If you have an SCSI-only
-	  system, you can say N here.
+config IDE_GD
+	tristate "generic ATA/ATAPI disk support"
+	default y
+	help
+	  Support for ATA/ATAPI disks (including ATAPI floppy drives).
 
-	  To compile this driver as a module, choose M here: the
-	  module will be called ide-disk.
-	  Do not compile this driver as a module if your root file system
-	  (the one containing the directory /) is located on the IDE disk.
+	  To compile this driver as a module, choose M here.
+	  The module will be called ide-gd_mod.
 
 	  If unsure, say Y.
 
+config IDE_GD_ATA
+	bool "ATA disk support"
+	depends on IDE_GD
+	default y
+	help
+	  This will include support for ATA hard disks.
+
+	  If unsure, say Y.
+
+config IDE_GD_ATAPI
+	bool "ATAPI floppy support"
+	depends on IDE_GD
+	select IDE_ATAPI
+	help
+	  This will include support for ATAPI floppy drives
+	  (i.e. Iomega ZIP or MKE LS-120).
+
+	  For information about jumper settings and the question
+	  of when a ZIP drive uses a partition table, see
+	  <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>.
+
+	  If unsure, say N.
+
 config BLK_DEV_IDECS
 	tristate "PCMCIA IDE support"
 	depends on PCMCIA
@@ -163,29 +182,6 @@ config BLK_DEV_IDETAPE
 	  To compile this driver as a module, choose M here: the
 	  module will be called ide-tape.
 
-config BLK_DEV_IDEFLOPPY
-	tristate "Include IDE/ATAPI FLOPPY support"
-	select IDE_ATAPI
-	---help---
-	  If you have an IDE floppy drive which uses the ATAPI protocol,
-	  answer Y.  ATAPI is a newer protocol used by IDE CD-ROM/tape/floppy
-	  drives, similar to the SCSI protocol.
-
-	  The LS-120 and the IDE/ATAPI Iomega ZIP drive are also supported by
-	  this driver. For information about jumper settings and the question
-	  of when a ZIP drive uses a partition table, see
-	  <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>.
-	  (ATAPI PD-CD/CDR drives are not supported by this driver; support
-	  for PD-CD/CDR drives is available if you answer Y to
-	  "SCSI emulation support", below).
-
-	  If you say Y here, the FLOPPY drive will be identified along with
-	  other IDE devices, as "hdb" or "hdc", or something similar (check
-	  the boot messages with dmesg).
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ide-floppy.
-
 config BLK_DEV_IDESCSI
 	tristate "SCSI emulation support (DEPRECATED)"
 	depends on SCSI
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index 7eeeab597959..093d3248ca89 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -37,18 +37,25 @@ obj-$(CONFIG_IDE_H8300)			+= h8300/
 obj-$(CONFIG_IDE_GENERIC)		+= ide-generic.o
 obj-$(CONFIG_BLK_DEV_IDEPNP)		+= ide-pnp.o
 
-ide-disk_mod-y += ide-gd.o ide-disk.o ide-disk_ioctl.o
+ide-gd_mod-y += ide-gd.o
 ide-cd_mod-y += ide-cd.o ide-cd_ioctl.o ide-cd_verbose.o
-ide-floppy_mod-y += ide-gd-floppy.o ide-floppy.o ide-floppy_ioctl.o
 
+ifeq ($(CONFIG_IDE_GD_ATA), y)
+	ide-gd_mod-y += ide-disk.o ide-disk_ioctl.o
 ifeq ($(CONFIG_IDE_PROC_FS), y)
-	ide-disk_mod-y += ide-disk_proc.o
-	ide-floppy_mod-y += ide-floppy_proc.o
+	ide-gd_mod-y += ide-disk_proc.o
+endif
+endif
+
+ifeq ($(CONFIG_IDE_GD_ATAPI), y)
+	ide-gd_mod-y += ide-floppy.o ide-floppy_ioctl.o
+ifeq ($(CONFIG_IDE_PROC_FS), y)
+	ide-gd_mod-y += ide-floppy_proc.o
+endif
 endif
 
-obj-$(CONFIG_BLK_DEV_IDEDISK)		+= ide-disk_mod.o
+obj-$(CONFIG_IDE_GD)			+= ide-gd_mod.o
 obj-$(CONFIG_BLK_DEV_IDECD)		+= ide-cd_mod.o
-obj-$(CONFIG_BLK_DEV_IDEFLOPPY)		+= ide-floppy_mod.o
 obj-$(CONFIG_BLK_DEV_IDETAPE)		+= ide-tape.o
 
 ifeq ($(CONFIG_BLK_DEV_IDECS), y)
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 751be7af22c2..223750c1b5a6 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -184,8 +184,8 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
  * 1073741822 == 549756 MB or 48bit addressing fake drive
  */
 
-ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
-			       sector_t block)
+static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
+				      sector_t block)
 {
 	ide_hwif_t *hwif = HWIF(drive);
 
@@ -333,7 +333,7 @@ static void idedisk_check_hpa(ide_drive_t *drive)
 	}
 }
 
-void ide_disk_init_capacity(ide_drive_t *drive)
+static int ide_disk_get_capacity(ide_drive_t *drive)
 {
 	u16 *id = drive->id;
 	int lba;
@@ -382,6 +382,8 @@ void ide_disk_init_capacity(ide_drive_t *drive)
 		} else
 			drive->dev_flags &= ~IDE_DFLAG_LBA48;
 	}
+
+	return 0;
 }
 
 static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -590,7 +592,12 @@ ide_ext_devset_rw(wcache, wcache);
 
 ide_ext_devset_rw_sync(nowerr, nowerr);
 
-void ide_disk_setup(ide_drive_t *drive)
+static int ide_disk_check(ide_drive_t *drive, const char *s)
+{
+	return 1;
+}
+
+static void ide_disk_setup(ide_drive_t *drive)
 {
 	struct ide_disk_obj *idkp = drive->driver_data;
 	ide_hwif_t *hwif = drive->hwif;
@@ -626,7 +633,7 @@ void ide_disk_setup(ide_drive_t *drive)
 			 drive->queue->max_sectors / 2);
 
 	/* calculate drive capacity, and select LBA if possible */
-	ide_disk_init_capacity(drive);
+	ide_disk_get_capacity(drive);
 
 	/*
 	 * if possible, give fdisk access to more of the drive,
@@ -682,7 +689,7 @@ void ide_disk_setup(ide_drive_t *drive)
 		drive->dev_flags |= IDE_DFLAG_ATTACH;
 }
 
-void ide_disk_flush(ide_drive_t *drive)
+static void ide_disk_flush(ide_drive_t *drive)
 {
 	if (ata_id_flush_enabled(drive->id) == 0 ||
 	    (drive->dev_flags & IDE_DFLAG_WCACHE) == 0)
@@ -692,7 +699,13 @@ void ide_disk_flush(ide_drive_t *drive)
 		printk(KERN_INFO "%s: wcache flush failed!\n", drive->name);
 }
 
-int ide_disk_set_doorlock(ide_drive_t *drive, int on)
+static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk)
+{
+	return 0;
+}
+
+static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
+				 int on)
 {
 	ide_task_t task;
 	int ret;
@@ -711,3 +724,15 @@ int ide_disk_set_doorlock(ide_drive_t *drive, int on)
 
 	return ret;
 }
+
+const struct ide_disk_ops ide_ata_disk_ops = {
+	.check		= ide_disk_check,
+	.get_capacity	= ide_disk_get_capacity,
+	.setup		= ide_disk_setup,
+	.flush		= ide_disk_flush,
+	.init_media	= ide_disk_init_media,
+	.set_doorlock	= ide_disk_set_doorlock,
+	.do_request	= ide_do_rw_disk,
+	.end_request	= ide_end_request,
+	.ioctl		= ide_disk_ioctl,
+};
diff --git a/drivers/ide/ide-disk.h b/drivers/ide/ide-disk.h
index 104ad71288a5..b234b0feaf7b 100644
--- a/drivers/ide/ide-disk.h
+++ b/drivers/ide/ide-disk.h
@@ -1,22 +1,11 @@
 #ifndef __IDE_DISK_H
 #define __IDE_DISK_H
 
-struct ide_disk_obj {
-	ide_drive_t	*drive;
-	ide_driver_t	*driver;
-	struct gendisk	*disk;
-	struct kref	kref;
-	unsigned int	openers;	/* protected by BKL for now */
-};
-
-sector_t ide_gd_capacity(ide_drive_t *);
+#include "ide-gd.h"
 
+#ifdef CONFIG_IDE_GD_ATA
 /* ide-disk.c */
-void ide_disk_init_capacity(ide_drive_t *);
-void ide_disk_setup(ide_drive_t *);
-void ide_disk_flush(ide_drive_t *);
-int ide_disk_set_doorlock(ide_drive_t *, int);
-ide_startstop_t ide_do_rw_disk(ide_drive_t *, struct request *, sector_t);
+extern const struct ide_disk_ops ide_ata_disk_ops;
 ide_decl_devset(address);
 ide_decl_devset(multcount);
 ide_decl_devset(nowerr);
@@ -24,12 +13,17 @@ ide_decl_devset(wcache);
 ide_decl_devset(acoustic);
 
 /* ide-disk_ioctl.c */
-int ide_disk_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+int ide_disk_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
+		   unsigned long);
 
 #ifdef CONFIG_IDE_PROC_FS
 /* ide-disk_proc.c */
 extern ide_proc_entry_t ide_disk_proc[];
 extern const struct ide_proc_devset ide_disk_settings[];
 #endif
+#else
+#define ide_disk_proc		NULL
+#define ide_disk_settings	NULL
+#endif
 
 #endif /* __IDE_DISK_H */
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
index e6624eda9e69..a49698bcf966 100644
--- a/drivers/ide/ide-disk_ioctl.c
+++ b/drivers/ide/ide-disk_ioctl.c
@@ -13,12 +13,10 @@ static const struct ide_ioctl_devset ide_disk_ioctl_settings[] = {
 { 0 }
 };
 
-int ide_disk_ioctl(struct inode *inode, struct file *file,
+int ide_disk_ioctl(ide_drive_t *drive, struct inode *inode, struct file *file,
 		   unsigned int cmd, unsigned long arg)
 {
 	struct block_device *bdev = inode->i_bdev;
-	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
 	int err;
 
 	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 802e0968e32f..58746c748c12 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -68,7 +68,7 @@
  * Used to finish servicing a request. For read/write requests, we will call
  * ide_end_request to pass to the next buffer.
  */
-int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs)
+static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
 	struct request *rq = HWGROUP(drive)->rq;
@@ -280,13 +280,12 @@ static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy,
 	pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
-ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, struct request *rq,
-				      sector_t block_s)
+static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
+					     struct request *rq, sector_t block)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
 	ide_hwif_t *hwif = drive->hwif;
 	struct ide_atapi_pc *pc;
-	unsigned long block = (unsigned long)block_s;
 
 	ide_debug_log(IDE_DBG_FUNC, "%s: dev: %s, cmd: 0x%x, cmd_type: %x, "
 		      "errors: %d\n",
@@ -316,7 +315,7 @@ ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, struct request *rq,
 			return ide_stopped;
 		}
 		pc = &floppy->queued_pc;
-		idefloppy_create_rw_cmd(drive, pc, rq, block);
+		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 	} else if (blk_special_request(rq)) {
 		pc = (struct ide_atapi_pc *) rq->buffer;
 	} else if (blk_pc_request(rq)) {
@@ -406,7 +405,7 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive)
  * Determine if a media is present in the floppy drive, and if so, its LBA
  * capacity.
  */
-int ide_floppy_get_capacity(ide_drive_t *drive)
+static int ide_floppy_get_capacity(ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy = drive->driver_data;
 	struct gendisk *disk = floppy->disk;
@@ -505,9 +504,9 @@ int ide_floppy_get_capacity(ide_drive_t *drive)
 	return rc;
 }
 
-void ide_floppy_setup(ide_drive_t *drive)
+static void ide_floppy_setup(ide_drive_t *drive)
 {
-	struct ide_floppy_obj *floppy = drive->driver_data;
+	struct ide_disk_obj *floppy = drive->driver_data;
 	u16 *id = drive->id;
 
 	drive->pc_callback	 = ide_floppy_callback;
@@ -547,3 +546,33 @@ void ide_floppy_setup(ide_drive_t *drive)
 
 	drive->dev_flags |= IDE_DFLAG_ATTACH;
 }
+
+static void ide_floppy_flush(ide_drive_t *drive)
+{
+}
+
+static int ide_floppy_init_media(ide_drive_t *drive, struct gendisk *disk)
+{
+	int ret = 0;
+
+	if (ide_do_test_unit_ready(drive, disk))
+		ide_do_start_stop(drive, disk, 1);
+
+	ret = ide_floppy_get_capacity(drive);
+
+	set_capacity(disk, ide_gd_capacity(drive));
+
+	return ret;
+}
+
+const struct ide_disk_ops ide_atapi_disk_ops = {
+	.check		= ide_check_atapi_device,
+	.get_capacity	= ide_floppy_get_capacity,
+	.setup		= ide_floppy_setup,
+	.flush		= ide_floppy_flush,
+	.init_media	= ide_floppy_init_media,
+	.set_doorlock	= ide_set_media_lock,
+	.do_request	= ide_floppy_do_request,
+	.end_request	= ide_floppy_end_request,
+	.ioctl		= ide_floppy_ioctl,
+};
diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h
index b965da2f41ce..acebc8c5a827 100644
--- a/drivers/ide/ide-floppy.h
+++ b/drivers/ide/ide-floppy.h
@@ -1,48 +1,10 @@
 #ifndef __IDE_FLOPPY_H
 #define __IDE_FLOPPY_H
 
-#define DRV_NAME "ide-floppy"
-#define PFX DRV_NAME ": "
+#include "ide-gd.h"
 
-/* define to see debug info */
-#define IDEFLOPPY_DEBUG_LOG	0
-
-#if IDEFLOPPY_DEBUG_LOG
-#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args)
-#else
-#define ide_debug_log(lvl, fmt, args...) do {} while (0)
-#endif
-
-/*
- * Most of our global data which we need to save even as we leave the driver
- * due to an interrupt or a timer event is stored in a variable of type
- * idefloppy_floppy_t, defined below.
- */
-typedef struct ide_floppy_obj {
-	ide_drive_t	*drive;
-	ide_driver_t	*driver;
-	struct gendisk	*disk;
-	struct kref	kref;
-	unsigned int	openers;	/* protected by BKL for now */
-
-	/* Last failed packet command */
-	struct ide_atapi_pc *failed_pc;
-	/* used for blk_{fs,pc}_request() requests */
-	struct ide_atapi_pc queued_pc;
-
-	/* Last error information */
-	u8 sense_key, asc, ascq;
-
-	int progress_indication;
-
-	/* Device information */
-	/* Current format */
-	int blocks, block_size, bs_factor;
-	/* Last format capacity descriptor */
-	u8 cap_desc[8];
-	/* Copy of the flexible disk page */
-	u8 flexible_disk_page[32];
-} idefloppy_floppy_t;
+#ifdef CONFIG_IDE_GD_ATAPI
+typedef struct ide_disk_obj idefloppy_floppy_t;
 
 /*
  * Pages of the SELECT SENSE / MODE SENSE packet commands.
@@ -57,23 +19,23 @@ typedef struct ide_floppy_obj {
 #define	IDEFLOPPY_IOCTL_FORMAT_START		0x4602
 #define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS	0x4603
 
-sector_t ide_gd_capacity(ide_drive_t *);
-
 /* ide-floppy.c */
+extern const struct ide_disk_ops ide_atapi_disk_ops;
 void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8);
 void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *);
-int ide_floppy_get_capacity(ide_drive_t *);
-void ide_floppy_setup(ide_drive_t *);
-ide_startstop_t ide_floppy_do_request(ide_drive_t *, struct request *, sector_t);
-int ide_floppy_end_request(ide_drive_t *, int, int);
 
 /* ide-floppy_ioctl.c */
-int ide_floppy_ioctl(struct inode *, struct file *, unsigned, unsigned long);
+int ide_floppy_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
+		     unsigned long);
 
 #ifdef CONFIG_IDE_PROC_FS
 /* ide-floppy_proc.c */
 extern ide_proc_entry_t ide_floppy_proc[];
 extern const struct ide_proc_devset ide_floppy_settings[];
 #endif
+#else
+#define ide_floppy_proc		NULL
+#define ide_floppy_settings	NULL
+#endif
 
 #endif /*__IDE_FLOPPY_H */
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index b1f391df6cca..e8aa0a5bf5dc 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -33,7 +33,7 @@
 
 static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg)
 {
-	struct ide_floppy_obj *floppy = drive->driver_data;
+	struct ide_disk_obj *floppy = drive->driver_data;
 	struct ide_atapi_pc pc;
 	u8 header_len, desc_cnt;
 	int i, blocks, length, u_array_size, u_index;
@@ -260,13 +260,10 @@ static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
 	}
 }
 
-int ide_floppy_ioctl(struct inode *inode, struct file *file,
-		    unsigned int cmd, unsigned long arg)
+int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
+		     struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct block_device *bdev = inode->i_bdev;
-	struct ide_floppy_obj *floppy = ide_drv_g(bdev->bd_disk,
-						     ide_floppy_obj);
-	ide_drive_t *drive = floppy->drive;
 	struct ide_atapi_pc pc;
 	void __user *argp = (void __user *)arg;
 	int err;
diff --git a/drivers/ide/ide-gd-floppy.c b/drivers/ide/ide-gd-floppy.c
deleted file mode 100644
index 082800b9a558..000000000000
--- a/drivers/ide/ide-gd-floppy.c
+++ /dev/null
@@ -1,309 +0,0 @@
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/mutex.h>
-#include <linux/ide.h>
-#include <linux/hdreg.h>
-
-#include "ide-floppy.h"
-
-#define IDEFLOPPY_VERSION "1.00"
-
-/* module parameters */
-static unsigned long debug_mask;
-module_param(debug_mask, ulong, 0644);
-
-static DEFINE_MUTEX(ide_disk_ref_mutex);
-
-static void ide_disk_release(struct kref *);
-
-static struct ide_floppy_obj *ide_disk_get(struct gendisk *disk)
-{
-	struct ide_floppy_obj *idkp = NULL;
-
-	mutex_lock(&ide_disk_ref_mutex);
-	idkp = ide_drv_g(disk, ide_floppy_obj);
-	if (idkp) {
-		if (ide_device_get(idkp->drive))
-			idkp = NULL;
-		else
-			kref_get(&idkp->kref);
-	}
-	mutex_unlock(&ide_disk_ref_mutex);
-	return idkp;
-}
-
-static void ide_disk_put(struct ide_floppy_obj *idkp)
-{
-	ide_drive_t *drive = idkp->drive;
-
-	mutex_lock(&ide_disk_ref_mutex);
-	kref_put(&idkp->kref, ide_disk_release);
-	ide_device_put(drive);
-	mutex_unlock(&ide_disk_ref_mutex);
-}
-
-sector_t ide_gd_capacity(ide_drive_t *drive)
-{
-	return drive->capacity64;
-}
-
-static int ide_gd_probe(ide_drive_t *);
-
-static void ide_gd_remove(ide_drive_t *drive)
-{
-	struct ide_floppy_obj *idkp = drive->driver_data;
-	struct gendisk *g = idkp->disk;
-
-	ide_proc_unregister_driver(drive, idkp->driver);
-
-	del_gendisk(g);
-
-	ide_disk_put(idkp);
-}
-
-static void ide_disk_release(struct kref *kref)
-{
-	struct ide_floppy_obj *idkp = to_ide_drv(kref, ide_floppy_obj);
-	ide_drive_t *drive = idkp->drive;
-	struct gendisk *g = idkp->disk;
-
-	drive->driver_data = NULL;
-	g->private_data = NULL;
-	put_disk(g);
-	kfree(idkp);
-}
-
-#ifdef CONFIG_IDE_PROC_FS
-static ide_proc_entry_t *ide_floppy_proc_entries(ide_drive_t *drive)
-{
-	return ide_floppy_proc;
-}
-
-static const struct ide_proc_devset *ide_floppy_proc_devsets(ide_drive_t *drive)
-{
-	return ide_floppy_settings;
-}
-#endif
-
-static ide_driver_t ide_gd_driver = {
-	.gen_driver = {
-		.owner		= THIS_MODULE,
-		.name		= "ide-floppy",
-		.bus		= &ide_bus_type,
-	},
-	.probe			= ide_gd_probe,
-	.remove			= ide_gd_remove,
-	.version		= IDEFLOPPY_VERSION,
-	.do_request		= ide_floppy_do_request,
-	.end_request		= ide_floppy_end_request,
-	.error			= __ide_error,
-#ifdef CONFIG_IDE_PROC_FS
-	.proc_entries		= ide_floppy_proc_entries,
-	.proc_devsets		= ide_floppy_proc_devsets,
-#endif
-};
-
-static int ide_gd_open(struct inode *inode, struct file *filp)
-{
-	struct gendisk *disk = inode->i_bdev->bd_disk;
-	struct ide_floppy_obj *idkp;
-	ide_drive_t *drive;
-	int ret = 0;
-
-	idkp = ide_disk_get(disk);
-	if (idkp == NULL)
-		return -ENXIO;
-
-	drive = idkp->drive;
-
-	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
-
-	idkp->openers++;
-
-	if (idkp->openers == 1) {
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-		/* Just in case */
-
-		if (ide_do_test_unit_ready(drive, disk))
-			ide_do_start_stop(drive, disk, 1);
-
-		ret = ide_floppy_get_capacity(drive);
-
-		set_capacity(disk, ide_gd_capacity(drive));
-
-		if (ret && (filp->f_flags & O_NDELAY) == 0) {
-		    /*
-		     * Allow O_NDELAY to open a drive without a disk, or with an
-		     * unreadable disk, so that we can get the format capacity
-		     * of the drive or begin the format - Sam
-		     */
-			ret = -EIO;
-			goto out_put_idkp;
-		}
-
-		if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) {
-			ret = -EROFS;
-			goto out_put_idkp;
-		}
-
-		ide_set_media_lock(drive, disk, 1);
-		drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
-		check_disk_change(inode->i_bdev);
-	} else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
-		ret = -EBUSY;
-		goto out_put_idkp;
-	}
-	return 0;
-
-out_put_idkp:
-	idkp->openers--;
-	ide_disk_put(idkp);
-	return ret;
-}
-
-static int ide_gd_release(struct inode *inode, struct file *filp)
-{
-	struct gendisk *disk = inode->i_bdev->bd_disk;
-	struct ide_floppy_obj *idkp = ide_drv_g(disk, ide_floppy_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
-
-	if (idkp->openers == 1) {
-		ide_set_media_lock(drive, disk, 0);
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-	}
-
-	idkp->openers--;
-
-	ide_disk_put(idkp);
-
-	return 0;
-}
-
-static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct ide_floppy_obj *idkp = ide_drv_g(bdev->bd_disk, ide_floppy_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	geo->heads = drive->bios_head;
-	geo->sectors = drive->bios_sect;
-	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
-	return 0;
-}
-
-static int ide_gd_media_changed(struct gendisk *disk)
-{
-	struct ide_floppy_obj *idkp = ide_drv_g(disk, ide_floppy_obj);
-	ide_drive_t *drive = idkp->drive;
-	int ret;
-
-	/* do not scan partitions twice if this is a removable device */
-	if (drive->dev_flags & IDE_DFLAG_ATTACH) {
-		drive->dev_flags &= ~IDE_DFLAG_ATTACH;
-		return 0;
-	}
-
-	ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED);
-	drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
-
-	return ret;
-}
-
-static int ide_gd_revalidate_disk(struct gendisk *disk)
-{
-	struct ide_floppy_obj *idkp = ide_drv_g(disk, ide_floppy_obj);
-	set_capacity(disk, ide_gd_capacity(idkp->drive));
-	return 0;
-}
-
-static struct block_device_operations ide_gd_ops = {
-	.owner			= THIS_MODULE,
-	.open			= ide_gd_open,
-	.release		= ide_gd_release,
-	.ioctl			= ide_floppy_ioctl,
-	.getgeo			= ide_gd_getgeo,
-	.media_changed		= ide_gd_media_changed,
-	.revalidate_disk	= ide_gd_revalidate_disk
-};
-
-static int ide_gd_probe(ide_drive_t *drive)
-{
-	struct ide_floppy_obj *idkp;
-	struct gendisk *g;
-
-	if (!strstr("ide-floppy", drive->driver_req))
-		goto failed;
-
-	if (drive->media != ide_floppy)
-		goto failed;
-
-	if (!ide_check_atapi_device(drive, DRV_NAME)) {
-		printk(KERN_ERR PFX "%s: not supported by this version of "
-		       DRV_NAME "\n", drive->name);
-		goto failed;
-	}
-	idkp = kzalloc(sizeof(*idkp), GFP_KERNEL);
-	if (!idkp) {
-		printk(KERN_ERR PFX "%s: Can't allocate a floppy structure\n",
-		       drive->name);
-		goto failed;
-	}
-
-	g = alloc_disk_node(1 << PARTN_BITS, hwif_to_node(drive->hwif));
-	if (!g)
-		goto out_free_idkp;
-
-	ide_init_disk(g, drive);
-
-	kref_init(&idkp->kref);
-
-	idkp->drive = drive;
-	idkp->driver = &ide_gd_driver;
-	idkp->disk = g;
-
-	g->private_data = &idkp->driver;
-
-	drive->driver_data = idkp;
-
-	drive->debug_mask = debug_mask;
-
-	ide_floppy_setup(drive);
-
-	set_capacity(g, ide_gd_capacity(drive));
-
-	g->minors = 1 << PARTN_BITS;
-	g->driverfs_dev = &drive->gendev;
-	if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
-		g->flags = GENHD_FL_REMOVABLE;
-	g->fops = &ide_gd_ops;
-	add_disk(g);
-	return 0;
-
-out_free_idkp:
-	kfree(idkp);
-failed:
-	return -ENODEV;
-}
-
-static int __init ide_gd_init(void)
-{
-	printk(KERN_INFO DRV_NAME " driver " IDEFLOPPY_VERSION "\n");
-	return driver_register(&ide_gd_driver.gen_driver);
-}
-
-static void __exit ide_gd_exit(void)
-{
-	driver_unregister(&ide_gd_driver.gen_driver);
-}
-
-MODULE_ALIAS("ide:*m-floppy*");
-MODULE_ALIAS("ide-floppy");
-module_init(ide_gd_init);
-module_exit(ide_gd_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ATAPI FLOPPY Driver");
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index a3d4ad7db2af..d44898f46c33 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -15,9 +15,14 @@
 #endif
 
 #include "ide-disk.h"
+#include "ide-floppy.h"
 
 #define IDE_GD_VERSION	"1.18"
 
+/* module parameters */
+static unsigned long debug_mask;
+module_param(debug_mask, ulong, 0644);
+
 static DEFINE_MUTEX(ide_disk_ref_mutex);
 
 static void ide_disk_release(struct kref *);
@@ -64,7 +69,7 @@ static void ide_gd_remove(ide_drive_t *drive)
 
 	del_gendisk(g);
 
-	ide_disk_flush(drive);
+	drive->disk_ops->flush(drive);
 
 	ide_disk_put(idkp);
 }
@@ -75,6 +80,7 @@ static void ide_disk_release(struct kref *kref)
 	ide_drive_t *drive = idkp->drive;
 	struct gendisk *g = idkp->disk;
 
+	drive->disk_ops = NULL;
 	drive->driver_data = NULL;
 	g->private_data = NULL;
 	put_disk(g);
@@ -89,7 +95,7 @@ static void ide_disk_release(struct kref *kref)
 static void ide_gd_resume(ide_drive_t *drive)
 {
 	if (ata_id_hpa_enabled(drive->id))
-		ide_disk_init_capacity(drive);
+		(void)drive->disk_ops->get_capacity(drive);
 }
 
 static void ide_gd_shutdown(ide_drive_t *drive)
@@ -110,7 +116,7 @@ static void ide_gd_shutdown(ide_drive_t *drive)
 #else
 	if (system_state == SYSTEM_RESTART) {
 #endif
-		ide_disk_flush(drive);
+		drive->disk_ops->flush(drive);
 		return;
 	}
 
@@ -122,19 +128,31 @@ static void ide_gd_shutdown(ide_drive_t *drive)
 #ifdef CONFIG_IDE_PROC_FS
 static ide_proc_entry_t *ide_disk_proc_entries(ide_drive_t *drive)
 {
-	return ide_disk_proc;
+	return (drive->media == ide_disk) ? ide_disk_proc : ide_floppy_proc;
 }
 
 static const struct ide_proc_devset *ide_disk_proc_devsets(ide_drive_t *drive)
 {
-	return ide_disk_settings;
+	return (drive->media == ide_disk) ? ide_disk_settings
+					  : ide_floppy_settings;
 }
 #endif
 
+static ide_startstop_t ide_gd_do_request(ide_drive_t *drive,
+					 struct request *rq, sector_t sector)
+{
+	return drive->disk_ops->do_request(drive, rq, sector);
+}
+
+static int ide_gd_end_request(ide_drive_t *drive, int uptodate, int nrsecs)
+{
+	return drive->disk_ops->end_request(drive, uptodate, nrsecs);
+}
+
 static ide_driver_t ide_gd_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
-		.name		= "ide-disk",
+		.name		= "ide-gd",
 		.bus		= &ide_bus_type,
 	},
 	.probe			= ide_gd_probe,
@@ -142,8 +160,8 @@ static ide_driver_t ide_gd_driver = {
 	.resume			= ide_gd_resume,
 	.shutdown		= ide_gd_shutdown,
 	.version		= IDE_GD_VERSION,
-	.do_request		= ide_do_rw_disk,
-	.end_request		= ide_end_request,
+	.do_request		= ide_gd_do_request,
+	.end_request		= ide_gd_end_request,
 	.error			= __ide_error,
 #ifdef CONFIG_IDE_PROC_FS
 	.proc_entries		= ide_disk_proc_entries,
@@ -156,6 +174,7 @@ static int ide_gd_open(struct inode *inode, struct file *filp)
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct ide_disk_obj *idkp;
 	ide_drive_t *drive;
+	int ret = 0;
 
 	idkp = ide_disk_get(disk);
 	if (idkp == NULL)
@@ -163,19 +182,49 @@ static int ide_gd_open(struct inode *inode, struct file *filp)
 
 	drive = idkp->drive;
 
+	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+
 	idkp->openers++;
 
 	if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
+		/* Just in case */
+
+		ret = drive->disk_ops->init_media(drive, disk);
+
+		/*
+		 * Allow O_NDELAY to open a drive without a disk, or with an
+		 * unreadable disk, so that we can get the format capacity
+		 * of the drive or begin the format - Sam
+		 */
+		if (ret && (filp->f_flags & O_NDELAY) == 0) {
+			ret = -EIO;
+			goto out_put_idkp;
+		}
+
+		if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) {
+			ret = -EROFS;
+			goto out_put_idkp;
+		}
+
 		/*
 		 * Ignore the return code from door_lock,
 		 * since the open() has already succeeded,
 		 * and the door_lock is irrelevant at this point.
 		 */
-		ide_disk_set_doorlock(drive, 1);
+		drive->disk_ops->set_doorlock(drive, disk, 1);
 		drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 		check_disk_change(inode->i_bdev);
+	} else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
+		ret = -EBUSY;
+		goto out_put_idkp;
 	}
 	return 0;
+
+out_put_idkp:
+	idkp->openers--;
+	ide_disk_put(idkp);
+	return ret;
 }
 
 static int ide_gd_release(struct inode *inode, struct file *filp)
@@ -184,11 +233,15 @@ static int ide_gd_release(struct inode *inode, struct file *filp)
 	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 
+	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
+
 	if (idkp->openers == 1)
-		ide_disk_flush(drive);
+		drive->disk_ops->flush(drive);
 
-	if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1)
-		ide_disk_set_doorlock(drive, 0);
+	if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
+		drive->disk_ops->set_doorlock(drive, disk, 0);
+		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
+	}
 
 	idkp->openers--;
 
@@ -233,11 +286,21 @@ static int ide_gd_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
+static int ide_gd_ioctl(struct inode *inode, struct file *file,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct block_device *bdev = inode->i_bdev;
+	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
+	ide_drive_t *drive = idkp->drive;
+
+	return drive->disk_ops->ioctl(drive, inode, file, cmd, arg);
+}
+
 static struct block_device_operations ide_gd_ops = {
 	.owner			= THIS_MODULE,
 	.open			= ide_gd_open,
 	.release		= ide_gd_release,
-	.ioctl			= ide_disk_ioctl,
+	.ioctl			= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
 	.media_changed		= ide_gd_media_changed,
 	.revalidate_disk	= ide_gd_revalidate_disk
@@ -245,19 +308,37 @@ static struct block_device_operations ide_gd_ops = {
 
 static int ide_gd_probe(ide_drive_t *drive)
 {
+	const struct ide_disk_ops *disk_ops = NULL;
 	struct ide_disk_obj *idkp;
 	struct gendisk *g;
 
 	/* strstr("foo", "") is non-NULL */
-	if (!strstr("ide-disk", drive->driver_req))
+	if (!strstr("ide-gd", drive->driver_req))
+		goto failed;
+
+#ifdef CONFIG_IDE_GD_ATA
+	if (drive->media == ide_disk)
+		disk_ops = &ide_ata_disk_ops;
+#endif
+#ifdef CONFIG_IDE_GD_ATAPI
+	if (drive->media == ide_floppy)
+		disk_ops = &ide_atapi_disk_ops;
+#endif
+	if (disk_ops == NULL)
 		goto failed;
 
-	if (drive->media != ide_disk)
+	if (disk_ops->check(drive, DRV_NAME) == 0) {
+		printk(KERN_ERR PFX "%s: not supported by this driver\n",
+			drive->name);
 		goto failed;
+	}
 
 	idkp = kzalloc(sizeof(*idkp), GFP_KERNEL);
-	if (!idkp)
+	if (!idkp) {
+		printk(KERN_ERR PFX "%s: can't allocate a disk structure\n",
+			drive->name);
 		goto failed;
+	}
 
 	g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif));
 	if (!g)
@@ -274,8 +355,10 @@ static int ide_gd_probe(ide_drive_t *drive)
 	g->private_data = &idkp->driver;
 
 	drive->driver_data = idkp;
+	drive->debug_mask = debug_mask;
+	drive->disk_ops = disk_ops;
 
-	ide_disk_setup(drive);
+	disk_ops->setup(drive);
 
 	set_capacity(g, ide_gd_capacity(drive));
 
@@ -296,6 +379,7 @@ failed:
 
 static int __init ide_gd_init(void)
 {
+	printk(KERN_INFO DRV_NAME " driver " IDE_GD_VERSION "\n");
 	return driver_register(&ide_gd_driver.gen_driver);
 }
 
@@ -306,7 +390,9 @@ static void __exit ide_gd_exit(void)
 
 MODULE_ALIAS("ide:*m-disk*");
 MODULE_ALIAS("ide-disk");
+MODULE_ALIAS("ide:*m-floppy*");
+MODULE_ALIAS("ide-floppy");
 module_init(ide_gd_init);
 module_exit(ide_gd_exit);
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ATA DISK Driver");
+MODULE_DESCRIPTION("generic ATA/ATAPI disk driver");
diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h
new file mode 100644
index 000000000000..7d3d101713e0
--- /dev/null
+++ b/drivers/ide/ide-gd.h
@@ -0,0 +1,44 @@
+#ifndef __IDE_GD_H
+#define __IDE_GD_H
+
+#define DRV_NAME "ide-gd"
+#define PFX DRV_NAME ": "
+
+/* set to 1 to route ide_debug_log() to __ide_debug_log(); 0 compiles it out */
+#define IDE_GD_DEBUG_LOG	0
+
+#if IDE_GD_DEBUG_LOG
+#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args)
+#else
+#define ide_debug_log(lvl, fmt, args...) do {} while (0)
+#endif
+
+struct ide_disk_obj {
+	ide_drive_t	*drive;
+	ide_driver_t	*driver;
+	struct gendisk	*disk;
+	struct kref	kref;
+	unsigned int	openers;	/* protected by BKL for now */
+
+	/* Last failed packet command */
+	struct ide_atapi_pc *failed_pc;
+	/* used for blk_{fs,pc}_request() requests */
+	struct ide_atapi_pc queued_pc;
+
+	/* Last error information */
+	u8 sense_key, asc, ascq;
+
+	int progress_indication;
+
+	/* Device information */
+	/* Current format */
+	int blocks, block_size, bs_factor;
+	/* Last format capacity descriptor */
+	u8 cap_desc[8];
+	/* Copy of the flexible disk page */
+	u8 flexible_disk_page[32];
+};
+
+sector_t ide_gd_capacity(ide_drive_t *);
+
+#endif /* __IDE_GD_H */
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index e3e40427e00e..c7ff1e11ea85 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -179,7 +179,7 @@ config LEDS_TRIGGER_TIMER
 
 config LEDS_TRIGGER_IDE_DISK
 	bool "LED IDE Disk Trigger"
-	depends on LEDS_TRIGGERS && BLK_DEV_IDEDISK
+	depends on LEDS_TRIGGERS && IDE_GD_ATA
 	help
 	  This allows LEDs to be controlled by IDE disk activity.
 	  If unsure, say Y.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 488808891acb..89e53cfbc787 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -461,6 +461,23 @@ struct ide_acpi_drive_link;
 struct ide_acpi_hwif_link;
 #endif
 
+struct ide_drive_s;
+
+struct ide_disk_ops {
+	int		(*check)(struct ide_drive_s *, const char *);
+	int		(*get_capacity)(struct ide_drive_s *);
+	void		(*setup)(struct ide_drive_s *);
+	void		(*flush)(struct ide_drive_s *);
+	int		(*init_media)(struct ide_drive_s *, struct gendisk *);
+	int		(*set_doorlock)(struct ide_drive_s *, struct gendisk *,
+					int);
+	ide_startstop_t	(*do_request)(struct ide_drive_s *, struct request *,
+				      sector_t);
+	int		(*end_request)(struct ide_drive_s *, int, int);
+	int		(*ioctl)(struct ide_drive_s *, struct inode *,
+				 struct file *, unsigned int, unsigned long);
+};
+
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
@@ -594,6 +611,8 @@ struct ide_drive_s {
 #endif
 	struct hwif_s		*hwif;	/* actually (ide_hwif_t *) */
 
+	const struct ide_disk_ops *disk_ops;
+
 	unsigned long dev_flags;
 
 	unsigned long sleep;		/* sleep until this time */
-- 
cgit v1.2.3


From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 17 Oct 2008 09:59:47 +0200
Subject: NOHZ: unify the nohz function calls in irq_enter()

We have two separate nohz function calls in irq_enter() for no good
reason. Just call a single NOHZ function from irq_enter() and call
the bits in the tick code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/tick.h     |  7 +++----
 kernel/softirq.c         | 10 +++-------
 kernel/time/tick-sched.c | 13 ++++++++++++-
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 98921a3e1aa8..b6ec8189ac0c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
 extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern void tick_check_idle(int cpu);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,26 +108,23 @@ static inline void tick_init(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
 extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
-extern void tick_nohz_update_jiffies(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
-extern void tick_nohz_stop_idle(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
-static inline void tick_nohz_update_jiffies(void) { }
 static inline ktime_t tick_nohz_get_sleep_length(void)
 {
 	ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
 
 	return len;
 }
-static inline void tick_nohz_stop_idle(int cpu) { }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 37d67aa2d56f..d410014279e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -265,16 +265,12 @@ asmlinkage void do_softirq(void)
  */
 void irq_enter(void)
 {
-#ifdef CONFIG_NO_HZ
 	int cpu = smp_processor_id();
+
 	if (idle_cpu(cpu) && !in_interrupt())
-		tick_nohz_stop_idle(cpu);
-#endif
+		tick_check_idle(cpu);
+
 	__irq_enter();
-#ifdef CONFIG_NO_HZ
-	if (idle_cpu(cpu))
-		tick_nohz_update_jiffies();
-#endif
 }
 
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b711ffcb106c..fdcf3f93bb8d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void)
 	touch_softlockup_watchdog();
 }
 
-void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
@@ -558,6 +558,17 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 
 #endif /* NO_HZ */
 
+/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_check_idle(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	tick_nohz_stop_idle(cpu);
+	tick_nohz_update_jiffies();
+#endif
+}
+
 /*
  * High resolution timer specific code
  */
-- 
cgit v1.2.3


From 504e518953a330c8d44a95bdd65a5c9f50f1012e Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 16 Oct 2008 14:15:16 +1100
Subject: Make nfs_file_cred more robust.

As not all files have an associated open_context (e.g. device special
files), it is safest to test for the existence of the open context
before de-referencing it.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c      | 6 ++++--
 include/linux/nfs_fs.h | 8 ++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c910413eaeca..83e700a2b0c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		struct nfs_open_context *ctx;
 
 		ctx = nfs_file_open_context(sattr->ia_file);
-		cred = ctx->cred;
-		state = ctx->state;
+		if (ctx) {
+			cred = ctx->cred;
+			state = ctx->state;
+		}
 	}
 
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ac8d0233b05c..4eaa8347a0d9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -367,8 +367,12 @@ static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
-	if (file != NULL)
-		return nfs_file_open_context(file)->cred;
+	if (file != NULL) {
+		struct nfs_open_context *ctx =
+			nfs_file_open_context(file);
+		if (ctx)
+			return ctx->cred;
+	}
 	return NULL;
 }
 
-- 
cgit v1.2.3


From aaf7ea20000436df3cbb397ccb734ad1e2e5164d Mon Sep 17 00:00:00 2001
From: Mike Rapoport <mike@compulab.co.il>
Date: Wed, 15 Oct 2008 08:38:49 +0200
Subject: [MTD] [NAND] GPIO NAND flash driver

The patch adds support for NAND flashes connected to GPIOs.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/Kconfig      |   6 +
 drivers/mtd/nand/Makefile     |   1 +
 drivers/mtd/nand/gpio.c       | 375 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/nand-gpio.h |  19 +++
 4 files changed, 401 insertions(+)
 create mode 100644 drivers/mtd/nand/gpio.c
 create mode 100644 include/linux/mtd/nand-gpio.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 89b4d39386ab..b9eed9925462 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -56,6 +56,12 @@ config MTD_NAND_H1900
 	help
 	  This enables the driver for the iPAQ h1900 flash.
 
+config MTD_NAND_GPIO
+	tristate "GPIO NAND Flash driver"
+	depends on GENERIC_GPIO
+	help
+	  This enables a GPIO based NAND flash driver.
+
 config MTD_NAND_SPIA
 	tristate "NAND Flash device on SPIA board"
 	depends on ARCH_P720T
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index 9bfeca324b32..b661586afbfc 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_MTD_NAND_NANDSIM)		+= nandsim.o
 obj-$(CONFIG_MTD_NAND_CS553X)		+= cs553x_nand.o
 obj-$(CONFIG_MTD_NAND_NDFC)		+= ndfc.o
 obj-$(CONFIG_MTD_NAND_ATMEL)		+= atmel_nand.o
+obj-$(CONFIG_MTD_NAND_GPIO)		+= gpio.o
 obj-$(CONFIG_MTD_NAND_CM_X270)		+= cmx270_nand.o
 obj-$(CONFIG_MTD_NAND_BASLER_EXCITE)	+= excite_nandflash.o
 obj-$(CONFIG_MTD_NAND_PXA3xx)		+= pxa3xx_nand.o
diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c
new file mode 100644
index 000000000000..8f902e75aa85
--- /dev/null
+++ b/drivers/mtd/nand/gpio.c
@@ -0,0 +1,375 @@
+/*
+ * drivers/mtd/nand/gpio.c
+ *
+ * Updated, and converted to generic GPIO based driver by Russell King.
+ *
+ * Written by Ben Dooks <ben@simtec.co.uk>
+ *   Based on 2.4 version by Mark Whittaker
+ *
+ * © 2004 Simtec Electronics
+ *
+ * Device driver for NAND connected via GPIO
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/nand-gpio.h>
+
+struct gpiomtd {
+	void __iomem		*io_sync;
+	struct mtd_info		mtd_info;
+	struct nand_chip	nand_chip;
+	struct gpio_nand_platdata plat;
+};
+
+#define gpio_nand_getpriv(x) container_of(x, struct gpiomtd, mtd_info)
+
+
+#ifdef CONFIG_ARM
+/* gpio_nand_dosync()
+ *
+ * Make sure the GPIO state changes occur in-order with writes to NAND
+ * memory region.
+ * Needed on PXA due to bus-reordering within the SoC itself (see the section
+ * on I/O ordering in the PXA manual, section 2.3, p35).
+ */
+static void gpio_nand_dosync(struct gpiomtd *gpiomtd)
+{
+	unsigned long tmp;
+
+	if (gpiomtd->io_sync) {
+		/*
+		 * Linux memory barriers don't cater for what's required here.
+		 * What's required is what's here - a read from a separate
+		 * region with a dependency on that read.
+		 */
+		tmp = readl(gpiomtd->io_sync);
+		asm volatile("mov %1, %0\n" : "=r" (tmp) : "r" (tmp));
+	}
+}
+#else
+static inline void gpio_nand_dosync(struct gpiomtd *gpiomtd) {}
+#endif
+
+static void gpio_nand_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
+{
+	struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd);
+
+	gpio_nand_dosync(gpiomtd);
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		gpio_set_value(gpiomtd->plat.gpio_nce, !(ctrl & NAND_NCE));
+		gpio_set_value(gpiomtd->plat.gpio_cle, !!(ctrl & NAND_CLE));
+		gpio_set_value(gpiomtd->plat.gpio_ale, !!(ctrl & NAND_ALE));
+		gpio_nand_dosync(gpiomtd);
+	}
+	if (cmd == NAND_CMD_NONE)
+		return;
+
+	writeb(cmd, gpiomtd->nand_chip.IO_ADDR_W);
+	gpio_nand_dosync(gpiomtd);
+}
+
+static void gpio_nand_writebuf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	writesb(this->IO_ADDR_W, buf, len);
+}
+
+static void gpio_nand_readbuf(struct mtd_info *mtd, u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	readsb(this->IO_ADDR_R, buf, len);
+}
+
+static int gpio_nand_verifybuf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+	unsigned char read, *p = (unsigned char *) buf;
+	int i, err = 0;
+
+	for (i = 0; i < len; i++) {
+		read = readb(this->IO_ADDR_R);
+		if (read != p[i]) {
+			pr_debug("%s: err at %d (read %04x vs %04x)\n",
+			       __func__, i, read, p[i]);
+			err = -EFAULT;
+		}
+	}
+	return err;
+}
+
+static void gpio_nand_writebuf16(struct mtd_info *mtd, const u_char *buf,
+				 int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	if (IS_ALIGNED((unsigned long)buf, 2)) {
+		writesw(this->IO_ADDR_W, buf, len>>1);
+	} else {
+		int i;
+		unsigned short *ptr = (unsigned short *)buf;
+
+		for (i = 0; i < len; i += 2, ptr++)
+			writew(*ptr, this->IO_ADDR_W);
+	}
+}
+
+static void gpio_nand_readbuf16(struct mtd_info *mtd, u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	if (IS_ALIGNED((unsigned long)buf, 2)) {
+		readsw(this->IO_ADDR_R, buf, len>>1);
+	} else {
+		int i;
+		unsigned short *ptr = (unsigned short *)buf;
+
+		for (i = 0; i < len; i += 2, ptr++)
+			*ptr = readw(this->IO_ADDR_R);
+	}
+}
+
+static int gpio_nand_verifybuf16(struct mtd_info *mtd, const u_char *buf,
+				 int len)
+{
+	struct nand_chip *this = mtd->priv;
+	unsigned short read, *p = (unsigned short *) buf;
+	int i, err = 0;
+	len >>= 1;
+
+	for (i = 0; i < len; i++) {
+		read = readw(this->IO_ADDR_R);
+		if (read != p[i]) {
+			pr_debug("%s: err at %d (read %04x vs %04x)\n",
+			       __func__, i, read, p[i]);
+			err = -EFAULT;
+		}
+	}
+	return err;
+}
+
+
+static int gpio_nand_devready(struct mtd_info *mtd)
+{
+	struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd);
+	return gpio_get_value(gpiomtd->plat.gpio_rdy);
+}
+
+static int __devexit gpio_nand_remove(struct platform_device *dev)
+{
+	struct gpiomtd *gpiomtd = platform_get_drvdata(dev);
+	struct resource *res;
+
+	nand_release(&gpiomtd->mtd_info);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 1);
+	iounmap(gpiomtd->io_sync);
+	if (res)
+		release_mem_region(res->start, res->end - res->start + 1);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	iounmap(gpiomtd->nand_chip.IO_ADDR_R);
+	release_mem_region(res->start, res->end - res->start + 1);
+
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_set_value(gpiomtd->plat.gpio_nwp, 0);
+	gpio_set_value(gpiomtd->plat.gpio_nce, 1);
+
+	gpio_free(gpiomtd->plat.gpio_cle);
+	gpio_free(gpiomtd->plat.gpio_ale);
+	gpio_free(gpiomtd->plat.gpio_nce);
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_free(gpiomtd->plat.gpio_nwp);
+	gpio_free(gpiomtd->plat.gpio_rdy);
+
+	kfree(gpiomtd);
+
+	return 0;
+}
+
+static void __iomem *request_and_remap(struct resource *res, size_t size,
+					const char *name, int *err)
+{
+	void __iomem *ptr;
+
+	if (!request_mem_region(res->start, res->end - res->start + 1, name)) {
+		*err = -EBUSY;
+		return NULL;
+	}
+
+	ptr = ioremap(res->start, size);
+	if (!ptr) {
+		release_mem_region(res->start, res->end - res->start + 1);
+		*err = -ENOMEM;
+	}
+	return ptr;
+}
+
+static int __devinit gpio_nand_probe(struct platform_device *dev)
+{
+	struct gpiomtd *gpiomtd;
+	struct nand_chip *this;
+	struct resource *res0, *res1;
+	int ret;
+
+	if (!dev->dev.platform_data)
+		return -EINVAL;
+
+	res0 = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res0)
+		return -EINVAL;
+
+	gpiomtd = kzalloc(sizeof(*gpiomtd), GFP_KERNEL);
+	if (gpiomtd == NULL) {
+		dev_err(&dev->dev, "failed to create NAND MTD\n");
+		return -ENOMEM;
+	}
+
+	this = &gpiomtd->nand_chip;
+	this->IO_ADDR_R = request_and_remap(res0, 2, "NAND", &ret);
+	if (!this->IO_ADDR_R) {
+		dev_err(&dev->dev, "unable to map NAND\n");
+		goto err_map;
+	}
+
+	res1 = platform_get_resource(dev, IORESOURCE_MEM, 1);
+	if (res1) {
+		gpiomtd->io_sync = request_and_remap(res1, 4, "NAND sync", &ret);
+		if (!gpiomtd->io_sync) {
+			dev_err(&dev->dev, "unable to map sync NAND\n");
+			goto err_sync;
+		}
+	}
+
+	memcpy(&gpiomtd->plat, dev->dev.platform_data, sizeof(gpiomtd->plat));
+
+	ret = gpio_request(gpiomtd->plat.gpio_nce, "NAND NCE");
+	if (ret)
+		goto err_nce;
+	gpio_direction_output(gpiomtd->plat.gpio_nce, 1);
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) {
+		ret = gpio_request(gpiomtd->plat.gpio_nwp, "NAND NWP");
+		if (ret)
+			goto err_nwp;
+		gpio_direction_output(gpiomtd->plat.gpio_nwp, 1);
+	}
+	ret = gpio_request(gpiomtd->plat.gpio_ale, "NAND ALE");
+	if (ret)
+		goto err_ale;
+	gpio_direction_output(gpiomtd->plat.gpio_ale, 0);
+	ret = gpio_request(gpiomtd->plat.gpio_cle, "NAND CLE");
+	if (ret)
+		goto err_cle;
+	gpio_direction_output(gpiomtd->plat.gpio_cle, 0);
+	ret = gpio_request(gpiomtd->plat.gpio_rdy, "NAND RDY");
+	if (ret)
+		goto err_rdy;
+	gpio_direction_input(gpiomtd->plat.gpio_rdy);
+
+
+	this->IO_ADDR_W  = this->IO_ADDR_R;
+	this->ecc.mode   = NAND_ECC_SOFT;
+	this->options    = gpiomtd->plat.options;
+	this->chip_delay = gpiomtd->plat.chip_delay;
+
+	/* install our routines */
+	this->cmd_ctrl   = gpio_nand_cmd_ctrl;
+	this->dev_ready  = gpio_nand_devready;
+
+	if (this->options & NAND_BUSWIDTH_16) {
+		this->read_buf   = gpio_nand_readbuf16;
+		this->write_buf  = gpio_nand_writebuf16;
+		this->verify_buf = gpio_nand_verifybuf16;
+	} else {
+		this->read_buf   = gpio_nand_readbuf;
+		this->write_buf  = gpio_nand_writebuf;
+		this->verify_buf = gpio_nand_verifybuf;
+	}
+
+	/* set the mtd private data for the nand driver */
+	gpiomtd->mtd_info.priv = this;
+	gpiomtd->mtd_info.owner = THIS_MODULE;
+
+	if (nand_scan(&gpiomtd->mtd_info, 1)) {
+		dev_err(&dev->dev, "no nand chips found?\n");
+		ret = -ENXIO;
+		goto err_wp;
+	}
+
+	if (gpiomtd->plat.adjust_parts)
+		gpiomtd->plat.adjust_parts(&gpiomtd->plat,
+					   gpiomtd->mtd_info.size);
+
+	add_mtd_partitions(&gpiomtd->mtd_info, gpiomtd->plat.parts,
+			   gpiomtd->plat.num_parts);
+	platform_set_drvdata(dev, gpiomtd);
+
+	return 0;
+
+err_wp:
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_set_value(gpiomtd->plat.gpio_nwp, 0);
+	gpio_free(gpiomtd->plat.gpio_rdy);
+err_rdy:
+	gpio_free(gpiomtd->plat.gpio_cle);
+err_cle:
+	gpio_free(gpiomtd->plat.gpio_ale);
+err_ale:
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_free(gpiomtd->plat.gpio_nwp);
+err_nwp:
+	gpio_free(gpiomtd->plat.gpio_nce);
+err_nce:
+	iounmap(gpiomtd->io_sync);
+	if (res1)
+		release_mem_region(res1->start, res1->end - res1->start + 1);
+err_sync:
+	iounmap(gpiomtd->nand_chip.IO_ADDR_R);
+	release_mem_region(res0->start, res0->end - res0->start + 1);
+err_map:
+	kfree(gpiomtd);
+	return ret;
+}
+
+static struct platform_driver gpio_nand_driver = {
+	.probe		= gpio_nand_probe,
+	.remove		= gpio_nand_remove,
+	.driver		= {
+		.name	= "gpio-nand",
+	},
+};
+
+static int __init gpio_nand_init(void)
+{
+	printk(KERN_INFO "GPIO NAND driver, © 2004 Simtec Electronics\n");
+
+	return platform_driver_register(&gpio_nand_driver);
+}
+
+static void __exit gpio_nand_exit(void)
+{
+	platform_driver_unregister(&gpio_nand_driver);
+}
+
+module_init(gpio_nand_init);
+module_exit(gpio_nand_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("GPIO NAND Driver");
diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h
new file mode 100644
index 000000000000..51534e50f7fc
--- /dev/null
+++ b/include/linux/mtd/nand-gpio.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_MTD_NAND_GPIO_H
+#define __LINUX_MTD_NAND_GPIO_H
+
+#include <linux/mtd/nand.h>
+
+struct gpio_nand_platdata {
+	int	gpio_nce;
+	int	gpio_nwp;
+	int	gpio_cle;
+	int	gpio_ale;
+	int	gpio_rdy;
+	void	(*adjust_parts)(struct gpio_nand_platdata *, size_t);
+	struct mtd_partition *parts;
+	unsigned int num_parts;
+	unsigned int options;
+	int	chip_delay;
+};
+
+#endif
-- 
cgit v1.2.3


From dd3a1db900f2a215a7d7dd71b836e149a6cf5fed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 18:20:58 +0200
Subject: genirq: improve include files

Move the irq_desc related iterators out of irq.h, into irqnr.h, also
available via interrupt.h.

This way non-genirq (and even non-hardirq) architectures get the
common definitions and iterators.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  1 +
 include/linux/irq.h       | 20 +-------------------
 include/linux/irqnr.h     | 24 ++++++++++++++++++++++++
 3 files changed, 26 insertions(+), 19 deletions(-)
 create mode 100644 include/linux/irqnr.h

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 58ff4e74b2f3..72fcfcff5637 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -8,6 +8,7 @@
 #include <linux/preempt.h>
 #include <linux/cpumask.h>
 #include <linux/irqreturn.h>
+#include <linux/irqnr.h>
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/irqflags.h>
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0618fb362cb4..d058c57be02d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -11,25 +11,6 @@
 
 #include <linux/smp.h>
 
-#ifndef CONFIG_GENERIC_HARDIRQS
-# define nr_irqs		NR_IRQS
-
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
-
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)			\
-	for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 );	\
-	     irq > 0; irq--, desc--)
-#endif
-
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
@@ -37,6 +18,7 @@ extern int nr_irqs;
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
 #include <linux/irqreturn.h>
+#include <linux/irqnr.h>
 #include <linux/errno.h>
 
 #include <asm/irq.h>
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
new file mode 100644
index 000000000000..3171ddc3b39d
--- /dev/null
+++ b/include/linux/irqnr.h
@@ -0,0 +1,24 @@
+#ifndef _LINUX_IRQNR_H
+#define _LINUX_IRQNR_H
+
+#ifndef CONFIG_GENERIC_HARDIRQS
+#include <asm/irq.h>
+# define nr_irqs		NR_IRQS
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#else
+extern int nr_irqs;
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+
+# define for_each_irq_desc_reverse(irq, desc)			\
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
+	     irq >= 0; irq--, desc--)	/* visits IRQ 0 too; irq must be signed */
+#endif
+
+#define for_each_irq_nr(irq)			\
+	for (irq = 0; irq < nr_irqs; irq++)
+
+#endif
-- 
cgit v1.2.3


From 5b6985ce8ec7127b4d60ad450b64ca8b82748a3b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Oct 2008 18:02:32 -0700
Subject: intel-iommu: IA64 support

The current Intel IOMMU code assumes that both host page size and Intel
IOMMU page size are 4KiB. The first patch supports variable page size.
This provides support for IA64 which has multiple page sizes.

This patch also adds some other code hooks for IA64 platform including
DMAR_OPERATION_TIMEOUT definition.

[dwmw2: some cleanup]
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/x86/kernel/pci-dma.c     |  16 ------
 drivers/pci/dmar.c            |  19 ++++---
 drivers/pci/intel-iommu.c     | 128 ++++++++++++++++++++++--------------------
 drivers/pci/quirks.c          |  14 +++++
 include/asm-x86/iommu.h       |   4 ++
 include/linux/dma_remapping.h |  27 +++++----
 include/linux/intel-iommu.h   |  39 +++++++------
 7 files changed, 131 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 192624820217..1972266e8ba5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -9,8 +9,6 @@
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
 
-static int forbid_dac __read_mostly;
-
 struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
@@ -293,17 +291,3 @@ void pci_iommu_shutdown(void)
 }
 /* Must execute after PCI subsystem */
 fs_initcall(pci_iommu_init);
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-		printk(KERN_INFO "PCI: VIA PCI bridge detected."
-				 "Disabling DAC.\n");
-		forbid_dac = 1;
-	}
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-#endif
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 44d6c7081b8f..b65173828bc2 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -277,14 +277,15 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 		drhd = (struct acpi_dmar_hardware_unit *)header;
 		printk (KERN_INFO PREFIX
 			"DRHD (flags: 0x%08x)base: 0x%016Lx\n",
-			drhd->flags, drhd->address);
+			drhd->flags, (unsigned long long)drhd->address);
 		break;
 	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
 		rmrr = (struct acpi_dmar_reserved_memory *)header;
 
 		printk (KERN_INFO PREFIX
 			"RMRR base: 0x%016Lx end: 0x%016Lx\n",
-			rmrr->base_address, rmrr->end_address);
+			(unsigned long long)rmrr->base_address,
+			(unsigned long long)rmrr->end_address);
 		break;
 	}
 }
@@ -304,7 +305,7 @@ parse_dmar_table(void)
 	if (!dmar)
 		return -ENODEV;
 
-	if (dmar->width < PAGE_SHIFT_4K - 1) {
+	if (dmar->width < PAGE_SHIFT - 1) {
 		printk(KERN_WARNING PREFIX "Invalid DMAR haw\n");
 		return -EINVAL;
 	}
@@ -493,7 +494,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 
 	iommu->seq_id = iommu_allocated++;
 
-	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
+	iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
 		goto error;
@@ -504,8 +505,8 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 	/* the registers might be more than one page */
 	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
 		cap_max_fault_reg_offset(iommu->cap));
-	map_size = PAGE_ALIGN_4K(map_size);
-	if (map_size > PAGE_SIZE_4K) {
+	map_size = VTD_PAGE_ALIGN(map_size);
+	if (map_size > VTD_PAGE_SIZE) {
 		iounmap(iommu->reg);
 		iommu->reg = ioremap(drhd->reg_base_addr, map_size);
 		if (!iommu->reg) {
@@ -516,8 +517,10 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 
 	ver = readl(iommu->reg + DMAR_VER_REG);
 	pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
-		drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
-		iommu->cap, iommu->ecap);
+		(unsigned long long)drhd->reg_base_addr,
+		DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
+		(unsigned long long)iommu->cap,
+		(unsigned long long)iommu->ecap);
 
 	spin_lock_init(&iommu->register_lock);
 
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 509470419130..2bf96babbc4f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -18,6 +18,7 @@
  * Author: Ashok Raj <ashok.raj@intel.com>
  * Author: Shaohua Li <shaohua.li@intel.com>
  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
  */
 
 #include <linux/init.h>
@@ -35,11 +36,13 @@
 #include <linux/timer.h>
 #include <linux/iova.h>
 #include <linux/intel-iommu.h>
-#include <asm/proto.h> /* force_iommu in this header in x86-64*/
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 #include "pci.h"
 
+#define ROOT_SIZE		VTD_PAGE_SIZE
+#define CONTEXT_SIZE		VTD_PAGE_SIZE
+
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
 
@@ -199,7 +202,7 @@ static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 			spin_unlock_irqrestore(&iommu->lock, flags);
 			return NULL;
 		}
-		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
+		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 		phy_addr = virt_to_phys((void *)context);
 		set_root_value(root, phy_addr);
 		set_root_present(root);
@@ -345,7 +348,7 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
 				return NULL;
 			}
 			__iommu_flush_cache(domain->iommu, tmp_page,
-					PAGE_SIZE_4K);
+					PAGE_SIZE);
 			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
 			/*
 			 * high level table always sets r/w, last level page
@@ -408,13 +411,13 @@ static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 	start &= (((u64)1) << addr_width) - 1;
 	end &= (((u64)1) << addr_width) - 1;
 	/* in case it's partial page */
-	start = PAGE_ALIGN_4K(start);
-	end &= PAGE_MASK_4K;
+	start = PAGE_ALIGN(start);
+	end &= PAGE_MASK;
 
 	/* we don't need lock here, nobody else touches the iova range */
 	while (start < end) {
 		dma_pte_clear_one(domain, start);
-		start += PAGE_SIZE_4K;
+		start += VTD_PAGE_SIZE;
 	}
 }
 
@@ -468,7 +471,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 	if (!root)
 		return -ENOMEM;
 
-	__iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
+	__iommu_flush_cache(iommu, root, ROOT_SIZE);
 
 	spin_lock_irqsave(&iommu->lock, flags);
 	iommu->root_entry = root;
@@ -634,7 +637,8 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
 		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
-			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
+			(unsigned long long)DMA_TLB_IIRG(type),
+			(unsigned long long)DMA_TLB_IAIG(val));
 	/* flush context entry will implictly flush write buffer */
 	return 0;
 }
@@ -644,7 +648,7 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 {
 	unsigned int mask;
 
-	BUG_ON(addr & (~PAGE_MASK_4K));
+	BUG_ON(addr & (~VTD_PAGE_MASK));
 	BUG_ON(pages == 0);
 
 	/* Fallback to domain selective flush if no PSI support */
@@ -798,7 +802,7 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
 }
 
 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
-		u8 fault_reason, u16 source_id, u64 addr)
+		u8 fault_reason, u16 source_id, unsigned long long addr)
 {
 	const char *reason;
 
@@ -1051,9 +1055,9 @@ static void dmar_init_reserved_ranges(void)
 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
 				continue;
 			addr = r->start;
-			addr &= PAGE_MASK_4K;
+			addr &= PAGE_MASK;
 			size = r->end - addr;
-			size = PAGE_ALIGN_4K(size);
+			size = PAGE_ALIGN(size);
 			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
 				IOVA_PFN(size + addr) - 1);
 			if (!iova)
@@ -1115,7 +1119,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
 	if (!domain->pgd)
 		return -ENOMEM;
-	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
+	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
 	return 0;
 }
 
@@ -1131,7 +1135,7 @@ static void domain_exit(struct dmar_domain *domain)
 	/* destroy iovas */
 	put_iova_domain(&domain->iovad);
 	end = DOMAIN_MAX_ADDR(domain->gaw);
-	end = end & (~PAGE_MASK_4K);
+	end = end & (~PAGE_MASK);
 
 	/* clear ptes */
 	dma_pte_clear_range(domain, 0, end);
@@ -1252,22 +1256,25 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
 	u64 start_pfn, end_pfn;
 	struct dma_pte *pte;
 	int index;
+	int addr_width = agaw_to_width(domain->agaw);
+
+	hpa &= (((u64)1) << addr_width) - 1;
 
 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
 		return -EINVAL;
-	iova &= PAGE_MASK_4K;
-	start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
-	end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
+	iova &= PAGE_MASK;
+	start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
+	end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
 	index = 0;
 	while (start_pfn < end_pfn) {
-		pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
+		pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
 		if (!pte)
 			return -ENOMEM;
 		/* We don't need lock here, nobody else
 		 * touches the iova range
 		 */
 		BUG_ON(dma_pte_addr(*pte));
-		dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
+		dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
 		dma_set_pte_prot(*pte, prot);
 		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
 		start_pfn++;
@@ -1445,11 +1452,13 @@ error:
 	return find_domain(pdev);
 }
 
-static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
+static int iommu_prepare_identity_map(struct pci_dev *pdev,
+				      unsigned long long start,
+				      unsigned long long end)
 {
 	struct dmar_domain *domain;
 	unsigned long size;
-	u64 base;
+	unsigned long long base;
 	int ret;
 
 	printk(KERN_INFO
@@ -1461,9 +1470,9 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
 		return -ENOMEM;
 
 	/* The address might not be aligned */
-	base = start & PAGE_MASK_4K;
+	base = start & PAGE_MASK;
 	size = end - base;
-	size = PAGE_ALIGN_4K(size);
+	size = PAGE_ALIGN(size);
 	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
 			IOVA_PFN(base + size) - 1)) {
 		printk(KERN_ERR "IOMMU: reserve iova failed\n");
@@ -1732,8 +1741,8 @@ error:
 static inline u64 aligned_size(u64 host_addr, size_t size)
 {
 	u64 addr;
-	addr = (host_addr & (~PAGE_MASK_4K)) + size;
-	return PAGE_ALIGN_4K(addr);
+	addr = (host_addr & (~PAGE_MASK)) + size;
+	return PAGE_ALIGN(addr);
 }
 
 struct iova *
@@ -1747,7 +1756,7 @@ iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
 		return NULL;
 
 	piova = alloc_iova(&domain->iovad,
-			size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
+			size >> PAGE_SHIFT, IOVA_PFN(end), 1);
 	return piova;
 }
 
@@ -1807,12 +1816,12 @@ get_valid_domain_for_dev(struct pci_dev *pdev)
 	return domain;
 }
 
-static dma_addr_t
+dma_addr_t
 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
 {
 	struct pci_dev *pdev = to_pci_dev(hwdev);
 	struct dmar_domain *domain;
-	unsigned long start_paddr;
+	phys_addr_t start_paddr;
 	struct iova *iova;
 	int prot = 0;
 	int ret;
@@ -1831,7 +1840,7 @@ intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
 	if (!iova)
 		goto error;
 
-	start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
+	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
 
 	/*
 	 * Check if DMAR supports zero-length reads on write only
@@ -1849,27 +1858,23 @@ intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
 	 * is not a big problem
 	 */
 	ret = domain_page_mapping(domain, start_paddr,
-		((u64)paddr) & PAGE_MASK_4K, size, prot);
+		((u64)paddr) & PAGE_MASK, size, prot);
 	if (ret)
 		goto error;
 
-	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
-		pci_name(pdev), size, (u64)paddr,
-		size, (u64)start_paddr, dir);
-
 	/* it's a non-present to present mapping */
 	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
-			start_paddr, size >> PAGE_SHIFT_4K, 1);
+			start_paddr, size >> VTD_PAGE_SHIFT, 1);
 	if (ret)
 		iommu_flush_write_buffer(domain->iommu);
 
-	return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
+	return start_paddr + ((u64)paddr & (~PAGE_MASK));
 
 error:
 	if (iova)
 		__free_iova(&domain->iovad, iova);
 	printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
-		pci_name(pdev), size, (u64)paddr, dir);
+		pci_name(pdev), size, (unsigned long long)paddr, dir);
 	return 0;
 }
 
@@ -1931,8 +1936,8 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova)
 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
-	size_t size, int dir)
+void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
+			int dir)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct dmar_domain *domain;
@@ -1948,11 +1953,11 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	if (!iova)
 		return;
 
-	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+	start_addr = iova->pfn_lo << PAGE_SHIFT;
 	size = aligned_size((u64)dev_addr, size);
 
 	pr_debug("Device %s unmapping: %lx@%llx\n",
-		pci_name(pdev), size, (u64)start_addr);
+		pci_name(pdev), size, (unsigned long long)start_addr);
 
 	/*  clear the whole page */
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
@@ -1960,7 +1965,7 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 	if (intel_iommu_strict) {
 		if (iommu_flush_iotlb_psi(domain->iommu,
-			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
 			iommu_flush_write_buffer(domain->iommu);
 		/* free iova */
 		__free_iova(&domain->iovad, iova);
@@ -1973,13 +1978,13 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	}
 }
 
-static void * intel_alloc_coherent(struct device *hwdev, size_t size,
-		       dma_addr_t *dma_handle, gfp_t flags)
+void *intel_alloc_coherent(struct device *hwdev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flags)
 {
 	void *vaddr;
 	int order;
 
-	size = PAGE_ALIGN_4K(size);
+	size = PAGE_ALIGN(size);
 	order = get_order(size);
 	flags &= ~(GFP_DMA | GFP_DMA32);
 
@@ -1995,12 +2000,12 @@ static void * intel_alloc_coherent(struct device *hwdev, size_t size,
 	return NULL;
 }
 
-static void intel_free_coherent(struct device *hwdev, size_t size,
-	void *vaddr, dma_addr_t dma_handle)
+void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+			 dma_addr_t dma_handle)
 {
 	int order;
 
-	size = PAGE_ALIGN_4K(size);
+	size = PAGE_ALIGN(size);
 	order = get_order(size);
 
 	intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
@@ -2008,8 +2013,9 @@ static void intel_free_coherent(struct device *hwdev, size_t size,
 }
 
 #define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
-static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
-	int nelems, int dir)
+
+void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
+		    int nelems, int dir)
 {
 	int i;
 	struct pci_dev *pdev = to_pci_dev(hwdev);
@@ -2033,7 +2039,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 		size += aligned_size((u64)addr, sg->length);
 	}
 
-	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+	start_addr = iova->pfn_lo << PAGE_SHIFT;
 
 	/*  clear the whole page */
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
@@ -2041,7 +2047,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
 
 	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
+			size >> VTD_PAGE_SHIFT, 0))
 		iommu_flush_write_buffer(domain->iommu);
 
 	/* free iova */
@@ -2062,8 +2068,8 @@ static int intel_nontranslate_map_sg(struct device *hddev,
 	return nelems;
 }
 
-static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
-				int nelems, int dir)
+int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
+		 int dir)
 {
 	void *addr;
 	int i;
@@ -2107,14 +2113,14 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
 		prot |= DMA_PTE_WRITE;
 
-	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+	start_addr = iova->pfn_lo << PAGE_SHIFT;
 	offset = 0;
 	for_each_sg(sglist, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
 		addr = (void *)virt_to_phys(addr);
 		size = aligned_size((u64)addr, sg->length);
 		ret = domain_page_mapping(domain, start_addr + offset,
-			((u64)addr) & PAGE_MASK_4K,
+			((u64)addr) & PAGE_MASK,
 			size, prot);
 		if (ret) {
 			/*  clear the page */
@@ -2128,14 +2134,14 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
 			return 0;
 		}
 		sg->dma_address = start_addr + offset +
-				((u64)addr & (~PAGE_MASK_4K));
+				((u64)addr & (~PAGE_MASK));
 		sg->dma_length = sg->length;
 		offset += size;
 	}
 
 	/* it's a non-present to present mapping */
 	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
-			start_addr, offset >> PAGE_SHIFT_4K, 1))
+			start_addr, offset >> VTD_PAGE_SHIFT, 1))
 		iommu_flush_write_buffer(domain->iommu);
 	return nelems;
 }
@@ -2175,7 +2181,6 @@ static inline int iommu_devinfo_cache_init(void)
 					 sizeof(struct device_domain_info),
 					 0,
 					 SLAB_HWCACHE_ALIGN,
-
 					 NULL);
 	if (!iommu_devinfo_cache) {
 		printk(KERN_ERR "Couldn't create devinfo cache\n");
@@ -2193,7 +2198,6 @@ static inline int iommu_iova_cache_init(void)
 					 sizeof(struct iova),
 					 0,
 					 SLAB_HWCACHE_ALIGN,
-
 					 NULL);
 	if (!iommu_iova_cache) {
 		printk(KERN_ERR "Couldn't create iova cache\n");
@@ -2322,7 +2326,7 @@ void intel_iommu_domain_exit(struct dmar_domain *domain)
 		return;
 
 	end = DOMAIN_MAX_ADDR(domain->gaw);
-	end = end & (~PAGE_MASK_4K);
+	end = end & (~VTD_PAGE_MASK);
 
 	/* clear ptes */
 	dma_pte_clear_range(domain, 0, end);
@@ -2418,6 +2422,6 @@ u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
 	if (pte)
 		pfn = dma_pte_addr(*pte);
 
-	return pfn >> PAGE_SHIFT_4K;
+	return pfn >> VTD_PAGE_SHIFT;
 }
 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index e872ac925b4b..832175d9ca25 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -35,6 +35,20 @@ static void __devinit quirk_mellanox_tavor(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR,quirk_mellanox_tavor);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE,quirk_mellanox_tavor);
 
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+int forbid_dac __read_mostly;
+EXPORT_SYMBOL(forbid_dac);
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+		dev_info(&dev->dev,
+			"VIA PCI bridge detected. Disabling DAC.\n");
+		forbid_dac = 1;
+	}
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+
 /* Deal with broken BIOS'es that neglect to enable passive release,
    which can cause problems in combination with the 82441FX/PPro MTRRs */
 static void quirk_passive_release(struct pci_dev *dev)
diff --git a/include/asm-x86/iommu.h b/include/asm-x86/iommu.h
index 961e746da977..2daaffcda52f 100644
--- a/include/asm-x86/iommu.h
+++ b/include/asm-x86/iommu.h
@@ -7,9 +7,13 @@ extern struct dma_mapping_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int dmar_disabled;
+extern int forbid_dac;
 
 extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
 
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+
 #ifdef CONFIG_GART_IOMMU
 extern int gart_iommu_aperture;
 extern int gart_iommu_aperture_allowed;
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index bff5c65f81dc..952df39c989d 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -2,15 +2,14 @@
 #define _DMA_REMAPPING_H
 
 /*
- * We need a fixed PAGE_SIZE of 4K irrespective of
- * arch PAGE_SIZE for IOMMU page tables.
+ * VT-d hardware uses 4KiB page size regardless of host page size.
  */
-#define PAGE_SHIFT_4K		(12)
-#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
-#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
-#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+#define VTD_PAGE_SHIFT		(12)
+#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
+#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
+#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
-#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
 #define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
 #define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
 
@@ -25,7 +24,7 @@ struct root_entry {
 	u64	val;
 	u64	rsvd1;
 };
-#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 static inline bool root_present(struct root_entry *root)
 {
 	return (root->val & 1);
@@ -36,7 +35,7 @@ static inline void set_root_present(struct root_entry *root)
 }
 static inline void set_root_value(struct root_entry *root, unsigned long value)
 {
-	root->val |= value & PAGE_MASK_4K;
+	root->val |= value & VTD_PAGE_MASK;
 }
 
 struct context_entry;
@@ -45,7 +44,7 @@ get_context_addr_from_root(struct root_entry *root)
 {
 	return (struct context_entry *)
 		(root_present(root)?phys_to_virt(
-		root->val & PAGE_MASK_4K):
+		root->val & VTD_PAGE_MASK) :
 		NULL);
 }
 
@@ -67,7 +66,7 @@ struct context_entry {
 #define context_present(c) ((c).lo & 1)
 #define context_fault_disable(c) (((c).lo >> 1) & 1)
 #define context_translation_type(c) (((c).lo >> 2) & 3)
-#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
 #define context_address_width(c) ((c).hi &  7)
 #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
 
@@ -81,7 +80,7 @@ struct context_entry {
 	} while (0)
 #define CONTEXT_TT_MULTI_LEVEL 0
 #define context_set_address_root(c, val) \
-	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
+	do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
 #define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
 #define context_set_domain_id(c, val) \
 	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
@@ -107,9 +106,9 @@ struct dma_pte {
 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
 #define dma_set_pte_prot(p, prot) \
 		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
-#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
 #define dma_set_pte_addr(p, addr) do {\
-		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+		(p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
 #define dma_pte_present(p) (((p).val & 3) != 0)
 
 struct intel_iommu;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index afb0d2a5b7cd..3d017cfd245b 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -29,6 +29,7 @@
 #include <linux/io.h>
 #include <linux/dma_remapping.h>
 #include <asm/cacheflush.h>
+#include <asm/iommu.h>
 
 /*
  * Intel IOMMU register specification per version 1.0 public spec.
@@ -202,22 +203,21 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define dma_frcd_type(d) ((d >> 30) & 1)
 #define dma_frcd_fault_reason(c) (c & 0xff)
 #define dma_frcd_source_id(c) (c & 0xffff)
-#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
-
-#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
-
-#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
-{\
-	cycles_t start_time = get_cycles();\
-	while (1) {\
-		sts = op (iommu->reg + offset);\
-		if (cond)\
-			break;\
+/* low 64 bit */
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
+do {									\
+	cycles_t start_time = get_cycles();				\
+	while (1) {							\
+		sts = op(iommu->reg + offset);				\
+		if (cond)						\
+			break;						\
 		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
-			panic("DMAR hardware is malfunctioning\n");\
-		cpu_relax();\
-	}\
-}
+			panic("DMAR hardware is malfunctioning\n");	\
+		cpu_relax();						\
+	}								\
+} while (0)
 
 #define QI_LENGTH	256	/* queue length */
 
@@ -244,7 +244,7 @@ enum {
 #define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
 #define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
 #define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
-#define QI_IOTLB_ADDR(addr)	(((u64)addr) & PAGE_MASK_4K)
+#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
 #define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
 #define QI_IOTLB_AM(am)		(((u8)am))
 
@@ -353,4 +353,11 @@ static inline int intel_iommu_found(void)
 }
 #endif /* CONFIG_DMAR */
 
+extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t);
+extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t);
+extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int);
+extern void intel_unmap_single(struct device *, dma_addr_t, size_t, int);
+extern int intel_map_sg(struct device *, struct scatterlist *, int, int);
+extern void intel_unmap_sg(struct device *, struct scatterlist *, int, int);
+
 #endif
-- 
cgit v1.2.3


From 1c1b6ffce5737d764cc474b9bd6677bb9a344094 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 24 Sep 2008 23:36:23 +0200
Subject: mfd: provide and use setup hook for tc6393xb

Instead of using bitfields for initial gpio setup,
provide generic setup/teardown hooks that can be used
to set the gpio states, register child devices, etc.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 arch/arm/mach-pxa/include/mach/tosa.h |  2 --
 arch/arm/mach-pxa/tosa.c              | 35 +++++++++++++++++++++++++++++------
 drivers/mfd/tc6393xb.c                | 21 ++++++++++++++-------
 include/linux/mfd/tc6393xb.h          |  4 ++--
 4 files changed, 45 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/include/mach/tosa.h b/arch/arm/mach-pxa/include/mach/tosa.h
index a72803f0461b..8bce6d8615b9 100644
--- a/arch/arm/mach-pxa/include/mach/tosa.h
+++ b/arch/arm/mach-pxa/include/mach/tosa.h
@@ -59,8 +59,6 @@
  * TC6393XB GPIOs
  */
 #define TOSA_TC6393XB_GPIO_BASE		(NR_BUILTIN_GPIO + 2 * 12)
-#define TOSA_TC6393XB_GPIO(i)		(TOSA_TC6393XB_GPIO_BASE + (i))
-#define TOSA_TC6393XB_GPIO_BIT(gpio)	(1 << (gpio - TOSA_TC6393XB_GPIO_BASE))
 
 #define TOSA_GPIO_TG_ON			(TOSA_TC6393XB_GPIO_BASE + 0)
 #define TOSA_GPIO_L_MUTE		(TOSA_TC6393XB_GPIO_BASE + 1)
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 130e37e4ebdd..fac846b0d070 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -706,16 +706,39 @@ static struct tmio_nand_data tosa_tc6393xb_nand_config = {
 	.badblock_pattern = &tosa_tc6393xb_nand_bbt,
 };
 
-static struct tc6393xb_platform_data tosa_tc6393xb_setup = {
+static int tosa_tc6393xb_setup(struct platform_device *dev)
+{
+	int rc;
+
+	rc = gpio_request(TOSA_GPIO_CARD_VCC_ON, "CARD_VCC_ON");
+	if (rc)
+		goto err_req;
+
+	rc = gpio_direction_output(TOSA_GPIO_CARD_VCC_ON, 1);
+	if (rc)
+		goto err_dir;
+
+	return rc;
+
+err_dir:
+	gpio_free(TOSA_GPIO_CARD_VCC_ON);
+err_req:
+	return rc;
+}
+
+static void tosa_tc6393xb_teardown(struct platform_device *dev)
+{
+	gpio_free(TOSA_GPIO_CARD_VCC_ON);
+}
+
+static struct tc6393xb_platform_data tosa_tc6393xb_data = {
 	.scr_pll2cr	= 0x0cc1,
 	.scr_gper	= 0x3300,
-	.scr_gpo_dsr	=
-		TOSA_TC6393XB_GPIO_BIT(TOSA_GPIO_CARD_VCC_ON),
-	.scr_gpo_doecr	=
-		TOSA_TC6393XB_GPIO_BIT(TOSA_GPIO_CARD_VCC_ON),
 
 	.irq_base	= IRQ_BOARD_START,
 	.gpio_base	= TOSA_TC6393XB_GPIO_BASE,
+	.setup		= tosa_tc6393xb_setup,
+	.teardown	= tosa_tc6393xb_teardown,
 
 	.enable		= tosa_tc6393xb_enable,
 	.disable	= tosa_tc6393xb_disable,
@@ -730,7 +753,7 @@ static struct platform_device tc6393xb_device = {
 	.name	= "tc6393xb",
 	.id	= -1,
 	.dev	= {
-		.platform_data	= &tosa_tc6393xb_setup,
+		.platform_data	= &tosa_tc6393xb_data,
 	},
 	.num_resources	= ARRAY_SIZE(tc6393xb_resources),
 	.resource	= tc6393xb_resources,
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index e4c1c788b5f8..83dc703f3767 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -460,13 +460,6 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 
 	tc6393xb->suspend_state.fer = 0;
 
-	for (i = 0; i < 3; i++) {
-		tc6393xb->suspend_state.gpo_dsr[i] =
-			(tcpd->scr_gpo_dsr >> (8 * i)) & 0xff;
-		tc6393xb->suspend_state.gpo_doecr[i] =
-			(tcpd->scr_gpo_doecr >> (8 * i)) & 0xff;
-	}
-
 	tc6393xb->suspend_state.ccr = SCR_CCR_UNK1 |
 					SCR_CCR_HCLK_48;
 
@@ -488,6 +481,12 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 
 	tc6393xb_attach_irq(dev);
 
+	if (tcpd->setup) {
+		ret = tcpd->setup(dev);
+		if (ret)
+			goto err_setup;
+	}
+
 	tc6393xb_cells[TC6393XB_CELL_NAND].driver_data = tcpd->nand_data;
 	tc6393xb_cells[TC6393XB_CELL_NAND].platform_data =
 		&tc6393xb_cells[TC6393XB_CELL_NAND];
@@ -506,6 +505,10 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 	if (!ret)
 		return 0;
 
+	if (tcpd->teardown)
+		tcpd->teardown(dev);
+
+err_setup:
 	tc6393xb_detach_irq(dev);
 
 err_gpio_add:
@@ -535,6 +538,10 @@ static int __devexit tc6393xb_remove(struct platform_device *dev)
 	int ret;
 
 	mfd_remove_devices(&dev->dev);
+
+	if (tcpd->teardown)
+		tcpd->teardown(dev);
+
 	tc6393xb_detach_irq(dev);
 
 	if (tc6393xb->gpio.base != -1) {
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index fec7b3f7a81f..1fa820646d98 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -21,8 +21,6 @@
 struct tc6393xb_platform_data {
 	u16	scr_pll2cr;	/* PLL2 Control */
 	u16	scr_gper;	/* GP Enable */
-	u32	scr_gpo_doecr;	/* GPO Data OE Control */
-	u32	scr_gpo_dsr;	/* GPO Data Set */
 
 	int	(*enable)(struct platform_device *dev);
 	int	(*disable)(struct platform_device *dev);
@@ -31,6 +29,8 @@ struct tc6393xb_platform_data {
 
 	int	irq_base;	/* base for subdevice irqs */
 	int	gpio_base;
+	int	(*setup)(struct platform_device *dev);
+	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
 };
-- 
cgit v1.2.3


From f98a0bd0e4b77b12e49ce01f4c9f04503931c291 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 24 Sep 2008 23:46:10 +0200
Subject: mfd: do tc6393xb state restore on resume only if requested

As requested by Ian, restore the chip state on resume only if the
platform data asks for it. Some platforms correctly preserve the state
of the chip across suspend/resume, but others (like tosa) incorrectly
power off the chip at suspend, so the driver supports restoring some
bits of the tc6393xb state (not all of it, merely enough to support
resume on tosa). With this patch, that restore code is disabled by default.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Acked-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 arch/arm/mach-pxa/tosa.c     |  2 ++
 drivers/mfd/tc6393xb.c       | 74 ++++++++++++++++++++------------------------
 include/linux/mfd/tc6393xb.h |  4 +++
 3 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index fac846b0d070..a6c4694359ca 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -746,6 +746,8 @@ static struct tc6393xb_platform_data tosa_tc6393xb_data = {
 	.resume		= tosa_tc6393xb_resume,
 
 	.nand_data	= &tosa_tc6393xb_nand_config,
+
+	.resume_restore = 1,
 };
 
 
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index 83dc703f3767..c3c64aeeb12a 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -369,41 +369,12 @@ static void tc6393xb_detach_irq(struct platform_device *dev)
 
 /*--------------------------------------------------------------------------*/
 
-static int tc6393xb_hw_init(struct platform_device *dev)
-{
-	struct tc6393xb_platform_data *tcpd = dev->dev.platform_data;
-	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
-	int i;
-
-	iowrite8(tc6393xb->suspend_state.fer,	tc6393xb->scr + SCR_FER);
-	iowrite16(tcpd->scr_pll2cr,		tc6393xb->scr + SCR_PLL2CR);
-	iowrite16(tc6393xb->suspend_state.ccr,	tc6393xb->scr + SCR_CCR);
-	iowrite16(SCR_MCR_RDY_OPENDRAIN | SCR_MCR_RDY_UNK | SCR_MCR_RDY_EN |
-		  SCR_MCR_INT_OPENDRAIN | SCR_MCR_INT_UNK | SCR_MCR_INT_EN |
-		  BIT(15),			tc6393xb->scr + SCR_MCR);
-	iowrite16(tcpd->scr_gper,		tc6393xb->scr + SCR_GPER);
-	iowrite8(0,				tc6393xb->scr + SCR_IRR);
-	iowrite8(0xbf,				tc6393xb->scr + SCR_IMR);
-
-	for (i = 0; i < 3; i++) {
-		iowrite8(tc6393xb->suspend_state.gpo_dsr[i],
-					tc6393xb->scr + SCR_GPO_DSR(i));
-		iowrite8(tc6393xb->suspend_state.gpo_doecr[i],
-					tc6393xb->scr + SCR_GPO_DOECR(i));
-		iowrite8(tc6393xb->suspend_state.gpi_bcr[i],
-					tc6393xb->scr + SCR_GPI_BCR(i));
-	}
-
-	return 0;
-}
-
 static int __devinit tc6393xb_probe(struct platform_device *dev)
 {
 	struct tc6393xb_platform_data *tcpd = dev->dev.platform_data;
 	struct tc6393xb *tc6393xb;
 	struct resource *iomem, *rscr;
 	int ret, temp;
-	int i;
 
 	iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
 	if (!iomem)
@@ -458,14 +429,16 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 	if (ret)
 		goto err_enable;
 
-	tc6393xb->suspend_state.fer = 0;
-
-	tc6393xb->suspend_state.ccr = SCR_CCR_UNK1 |
-					SCR_CCR_HCLK_48;
-
-	ret = tc6393xb_hw_init(dev);
-	if (ret)
-		goto err_hw_init;
+	iowrite8(0,				tc6393xb->scr + SCR_FER);
+	iowrite16(tcpd->scr_pll2cr,		tc6393xb->scr + SCR_PLL2CR);
+	iowrite16(SCR_CCR_UNK1 | SCR_CCR_HCLK_48,
+						tc6393xb->scr + SCR_CCR);
+	iowrite16(SCR_MCR_RDY_OPENDRAIN | SCR_MCR_RDY_UNK | SCR_MCR_RDY_EN |
+		  SCR_MCR_INT_OPENDRAIN | SCR_MCR_INT_UNK | SCR_MCR_INT_EN |
+		  BIT(15),			tc6393xb->scr + SCR_MCR);
+	iowrite16(tcpd->scr_gper,		tc6393xb->scr + SCR_GPER);
+	iowrite8(0,				tc6393xb->scr + SCR_IRR);
+	iowrite8(0xbf,				tc6393xb->scr + SCR_IMR);
 
 	printk(KERN_INFO "Toshiba tc6393xb revision %d at 0x%08lx, irq %d\n",
 			tmio_ioread8(tc6393xb->scr + SCR_REVID),
@@ -514,7 +487,6 @@ err_setup:
 err_gpio_add:
 	if (tc6393xb->gpio.base != -1)
 		temp = gpiochip_remove(&tc6393xb->gpio);
-err_hw_init:
 	tcpd->disable(dev);
 err_clk_enable:
 	clk_disable(tc6393xb->clk);
@@ -592,15 +564,37 @@ static int tc6393xb_resume(struct platform_device *dev)
 	struct tc6393xb_platform_data *tcpd = dev->dev.platform_data;
 	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
 	int ret;
+	int i;
 
 	clk_enable(tc6393xb->clk);
 
 	ret = tcpd->resume(dev);
-
 	if (ret)
 		return ret;
 
-	return tc6393xb_hw_init(dev);
+	if (!tcpd->resume_restore)
+		return 0;
+
+	iowrite8(tc6393xb->suspend_state.fer,	tc6393xb->scr + SCR_FER);
+	iowrite16(tcpd->scr_pll2cr,		tc6393xb->scr + SCR_PLL2CR);
+	iowrite16(tc6393xb->suspend_state.ccr,	tc6393xb->scr + SCR_CCR);
+	iowrite16(SCR_MCR_RDY_OPENDRAIN | SCR_MCR_RDY_UNK | SCR_MCR_RDY_EN |
+		  SCR_MCR_INT_OPENDRAIN | SCR_MCR_INT_UNK | SCR_MCR_INT_EN |
+		  BIT(15),			tc6393xb->scr + SCR_MCR);
+	iowrite16(tcpd->scr_gper,		tc6393xb->scr + SCR_GPER);
+	iowrite8(0,				tc6393xb->scr + SCR_IRR);
+	iowrite8(0xbf,				tc6393xb->scr + SCR_IMR);
+
+	for (i = 0; i < 3; i++) {
+		iowrite8(tc6393xb->suspend_state.gpo_dsr[i],
+					tc6393xb->scr + SCR_GPO_DSR(i));
+		iowrite8(tc6393xb->suspend_state.gpo_doecr[i],
+					tc6393xb->scr + SCR_GPO_DOECR(i));
+		iowrite8(tc6393xb->suspend_state.gpi_bcr[i],
+					tc6393xb->scr + SCR_GPI_BCR(i));
+	}
+
+	return 0;
 }
 #else
 #define tc6393xb_suspend NULL
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 1fa820646d98..3ce10ae0f397 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -33,6 +33,10 @@ struct tc6393xb_platform_data {
 	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
+
+	unsigned resume_restore : 1; /* make special actions
+					to preserve the state
+					on suspend/resume */
 };
 
 /*
-- 
cgit v1.2.3


From 51a55623565c6ca864f7cf19e87c2d4bde1c0c5e Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Fri, 3 Oct 2008 20:11:36 +0200
Subject: mfd: add OHCI cell to tc6393xb

Add information regarding OHCI cell of the tc6393xb

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Acked-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/tc6393xb.c       | 87 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/tc6393xb.h |  1 +
 2 files changed, 88 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index c3c64aeeb12a..6197db7d4859 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -113,6 +113,7 @@ struct tc6393xb {
 enum {
 	TC6393XB_CELL_NAND,
 	TC6393XB_CELL_MMC,
+	TC6393XB_CELL_OHCI,
 };
 
 /*--------------------------------------------------------------------------*/
@@ -170,6 +171,78 @@ static struct resource __devinitdata tc6393xb_mmc_resources[] = {
 	},
 };
 
+const static struct resource tc6393xb_ohci_resources[] = {
+	{
+		.start	= 0x3000,
+		.end	= 0x31ff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= 0x0300,
+		.end	= 0x03ff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= 0x010000,
+		.end	= 0x017fff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= 0x018000,
+		.end	= 0x01ffff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= IRQ_TC6393_OHCI,
+		.end	= IRQ_TC6393_OHCI,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static int tc6393xb_ohci_enable(struct platform_device *dev)
+{
+	struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent);
+	unsigned long flags;
+	u16 ccr;
+	u8 fer;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	ccr = tmio_ioread16(tc6393xb->scr + SCR_CCR);
+	ccr |= SCR_CCR_USBCK;
+	tmio_iowrite16(ccr, tc6393xb->scr + SCR_CCR);
+
+	fer = tmio_ioread8(tc6393xb->scr + SCR_FER);
+	fer |= SCR_FER_USBEN;
+	tmio_iowrite8(fer, tc6393xb->scr + SCR_FER);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+
+static int tc6393xb_ohci_disable(struct platform_device *dev)
+{
+	struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent);
+	unsigned long flags;
+	u16 ccr;
+	u8 fer;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	fer = tmio_ioread8(tc6393xb->scr + SCR_FER);
+	fer &= ~SCR_FER_USBEN;
+	tmio_iowrite8(fer, tc6393xb->scr + SCR_FER);
+
+	ccr = tmio_ioread16(tc6393xb->scr + SCR_CCR);
+	ccr &= ~SCR_CCR_USBCK;
+	tmio_iowrite16(ccr, tc6393xb->scr + SCR_CCR);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+
 static struct mfd_cell __devinitdata tc6393xb_cells[] = {
 	[TC6393XB_CELL_NAND] = {
 		.name = "tmio-nand",
@@ -182,6 +255,15 @@ static struct mfd_cell __devinitdata tc6393xb_cells[] = {
 		.num_resources = ARRAY_SIZE(tc6393xb_mmc_resources),
 		.resources = tc6393xb_mmc_resources,
 	},
+	[TC6393XB_CELL_OHCI] = {
+		.name = "tmio-ohci",
+		.num_resources = ARRAY_SIZE(tc6393xb_ohci_resources),
+		.resources = tc6393xb_ohci_resources,
+		.enable = tc6393xb_ohci_enable,
+		.suspend = tc6393xb_ohci_disable,
+		.resume = tc6393xb_ohci_enable,
+		.disable = tc6393xb_ohci_disable,
+	},
 };
 
 /*--------------------------------------------------------------------------*/
@@ -470,6 +552,11 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 	tc6393xb_cells[TC6393XB_CELL_MMC].data_size =
 		sizeof(tc6393xb_cells[TC6393XB_CELL_MMC]);
 
+	tc6393xb_cells[TC6393XB_CELL_OHCI].platform_data =
+		&tc6393xb_cells[TC6393XB_CELL_OHCI];
+	tc6393xb_cells[TC6393XB_CELL_OHCI].data_size =
+		sizeof(tc6393xb_cells[TC6393XB_CELL_OHCI]);
+
 
 	ret = mfd_add_devices(&dev->dev, dev->id,
 			tc6393xb_cells, ARRAY_SIZE(tc6393xb_cells),
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 3ce10ae0f397..4437736ebe19 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -44,6 +44,7 @@ struct tc6393xb_platform_data {
  */
 #define	IRQ_TC6393_NAND		0
 #define	IRQ_TC6393_MMC		1
+#define	IRQ_TC6393_OHCI		2
 
 #define	TC6393XB_NR_IRQS	8
 
-- 
cgit v1.2.3


From 9e78cfe53f3c2bc1b37870697c3cde1543fefa8b Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Sat, 4 Oct 2008 00:50:36 +0200
Subject: mfd: support tmiofb cell on tc6393xb

Add support for tmiofb cell found in tc6393xb chip.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/tc6393xb.c       | 114 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/tc6393xb.h |   8 +++
 2 files changed, 122 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index 6197db7d4859..f856e9463a9f 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -114,6 +114,7 @@ enum {
 	TC6393XB_CELL_NAND,
 	TC6393XB_CELL_MMC,
 	TC6393XB_CELL_OHCI,
+	TC6393XB_CELL_FB,
 };
 
 /*--------------------------------------------------------------------------*/
@@ -199,6 +200,29 @@ const static struct resource tc6393xb_ohci_resources[] = {
 	},
 };
 
+static struct resource __devinitdata tc6393xb_fb_resources[] = {
+	{
+		.start	= 0x5000,
+		.end	= 0x51ff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= 0x0500,
+		.end	= 0x05ff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= 0x100000,
+		.end	= 0x1fffff,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= IRQ_TC6393_FB,
+		.end	= IRQ_TC6393_FB,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
 static int tc6393xb_ohci_enable(struct platform_device *dev)
 {
 	struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent);
@@ -243,6 +267,81 @@ static int tc6393xb_ohci_disable(struct platform_device *dev)
 	return 0;
 }
 
+static int tc6393xb_fb_enable(struct platform_device *dev)
+{
+	struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent);
+	unsigned long flags;
+	u16 ccr;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	ccr = tmio_ioread16(tc6393xb->scr + SCR_CCR);
+	ccr &= ~SCR_CCR_MCLK_MASK;
+	ccr |= SCR_CCR_MCLK_48;
+	tmio_iowrite16(ccr, tc6393xb->scr + SCR_CCR);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+
+static int tc6393xb_fb_disable(struct platform_device *dev)
+{
+	struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent);
+	unsigned long flags;
+	u16 ccr;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	ccr = tmio_ioread16(tc6393xb->scr + SCR_CCR);
+	ccr &= ~SCR_CCR_MCLK_MASK;
+	ccr |= SCR_CCR_MCLK_OFF;
+	tmio_iowrite16(ccr, tc6393xb->scr + SCR_CCR);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+
+int tc6393xb_lcd_set_power(struct platform_device *fb, bool on)
+{
+	struct platform_device *dev = to_platform_device(fb->dev.parent);
+	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
+	u8 fer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	fer = ioread8(tc6393xb->scr + SCR_FER);
+	if (on)
+		fer |= SCR_FER_SLCDEN;
+	else
+		fer &= ~SCR_FER_SLCDEN;
+	iowrite8(fer, tc6393xb->scr + SCR_FER);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(tc6393xb_lcd_set_power);
+
+int tc6393xb_lcd_mode(struct platform_device *fb,
+					const struct fb_videomode *mode) {
+	struct platform_device *dev = to_platform_device(fb->dev.parent);
+	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tc6393xb->lock, flags);
+
+	iowrite16(mode->pixclock, tc6393xb->scr + SCR_PLL1CR + 0);
+	iowrite16(mode->pixclock >> 16, tc6393xb->scr + SCR_PLL1CR + 2);
+
+	spin_unlock_irqrestore(&tc6393xb->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(tc6393xb_lcd_mode);
+
 static struct mfd_cell __devinitdata tc6393xb_cells[] = {
 	[TC6393XB_CELL_NAND] = {
 		.name = "tmio-nand",
@@ -264,6 +363,15 @@ static struct mfd_cell __devinitdata tc6393xb_cells[] = {
 		.resume = tc6393xb_ohci_enable,
 		.disable = tc6393xb_ohci_disable,
 	},
+	[TC6393XB_CELL_FB] = {
+		.name = "tmio-fb",
+		.num_resources = ARRAY_SIZE(tc6393xb_fb_resources),
+		.resources = tc6393xb_fb_resources,
+		.enable = tc6393xb_fb_enable,
+		.suspend = tc6393xb_fb_disable,
+		.resume = tc6393xb_fb_enable,
+		.disable = tc6393xb_fb_disable,
+	},
 };
 
 /*--------------------------------------------------------------------------*/
@@ -547,6 +655,7 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 		&tc6393xb_cells[TC6393XB_CELL_NAND];
 	tc6393xb_cells[TC6393XB_CELL_NAND].data_size =
 		sizeof(tc6393xb_cells[TC6393XB_CELL_NAND]);
+
 	tc6393xb_cells[TC6393XB_CELL_MMC].platform_data =
 		&tc6393xb_cells[TC6393XB_CELL_MMC];
 	tc6393xb_cells[TC6393XB_CELL_MMC].data_size =
@@ -557,6 +666,11 @@ static int __devinit tc6393xb_probe(struct platform_device *dev)
 	tc6393xb_cells[TC6393XB_CELL_OHCI].data_size =
 		sizeof(tc6393xb_cells[TC6393XB_CELL_OHCI]);
 
+	tc6393xb_cells[TC6393XB_CELL_FB].driver_data = tcpd->fb_data;
+	tc6393xb_cells[TC6393XB_CELL_FB].platform_data =
+		&tc6393xb_cells[TC6393XB_CELL_FB];
+	tc6393xb_cells[TC6393XB_CELL_FB].data_size =
+		sizeof(tc6393xb_cells[TC6393XB_CELL_FB]);
 
 	ret = mfd_add_devices(&dev->dev, dev->id,
 			tc6393xb_cells, ARRAY_SIZE(tc6393xb_cells),
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 4437736ebe19..626e448205c5 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -17,6 +17,8 @@
 #ifndef MFD_TC6393XB_H
 #define MFD_TC6393XB_H
 
+#include <linux/fb.h>
+
 /* Also one should provide the CK3P6MI clock */
 struct tc6393xb_platform_data {
 	u16	scr_pll2cr;	/* PLL2 Control */
@@ -33,18 +35,24 @@ struct tc6393xb_platform_data {
 	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
+	struct tmio_fb_data	*fb_data;
 
 	unsigned resume_restore : 1; /* make special actions
 					to preserve the state
 					on suspend/resume */
 };
 
+extern int tc6393xb_lcd_mode(struct platform_device *fb,
+			     const struct fb_videomode *mode);
+extern int tc6393xb_lcd_set_power(struct platform_device *fb, bool on);
+
 /*
  * Relative to irq_base
  */
 #define	IRQ_TC6393_NAND		0
 #define	IRQ_TC6393_MMC		1
 #define	IRQ_TC6393_OHCI		2
+#define	IRQ_TC6393_FB		4
 
 #define	TC6393XB_NR_IRQS	8
 
-- 
cgit v1.2.3


From a603a7fa8717fb778bba91b5a879babf333dc6a3 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 15 Oct 2008 12:15:39 +0200
Subject: mfd: TWL4030 core driver

This patch adds the core of the TWL4030 driver, which supports
chips including the TPS65950.  These chips are multi-function; see

  http://focus.ti.com/docs/prod/folders/print/tps65950.html

Public specs are in the works.  For now, the block diagram on
the second page of the datasheet is fairly informative.

There are some known issues with this core code.  Most notably,
the IRQ dispatching needs simplification (to use more of genirq),
generalization (integrating support for secondary IRQ dispatch
as well as primary, and removing the build dependency on OMAP),
and then probably updating to leverage threaded IRQ support
(expected to arrive in mainline "soon").

Once the core is in mainline, drivers for other parts of this
chip can follow its lead and start swimming upstream too.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/Kconfig         |   14 +
 drivers/mfd/Makefile        |    2 +
 drivers/mfd/twl4030-core.c  | 1257 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/twl4030.h |  339 ++++++++++++
 4 files changed, 1612 insertions(+)
 create mode 100644 drivers/mfd/twl4030-core.c
 create mode 100644 include/linux/i2c/twl4030.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 5eff8ad834d6..cccda99328f3 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -59,6 +59,20 @@ config UCB1400_CORE
 	  To compile this driver as a module, choose M here: the
 	  module will be called ucb1400_core.
 
+config TWL4030_CORE
+	bool "Texas Instruments TWL4030/TPS659x0 Support"
+	depends on I2C=y && GENERIC_HARDIRQS && (ARCH_OMAP2 || ARCH_OMAP3)
+	help
+	  Say yes here if you have TWL4030 family chip on your board.
+	  This core driver provides register access and IRQ handling
+	  facilities, and registers devices for the various functions
+	  so that function-specific drivers can bind to them.
+
+	  These multi-function chips are found on many OMAP2 and OMAP3
+	  boards, providing power management, RTC, GPIO, keypad, a
+	  high speed USB OTG transceiver, an audio codec (on most
+	  versions) and many other features.
+
 config MFD_TMIO
 	bool
 	default n
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 759b1fe1c891..68e237b830ad 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -17,6 +17,8 @@ wm8350-objs			:= wm8350-core.o wm8350-regmap.o wm8350-gpio.o
 obj-$(CONFIG_MFD_WM8350)	+= wm8350.o
 obj-$(CONFIG_MFD_WM8350_I2C)	+= wm8350-i2c.o
 
+obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o
+
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
 obj-$(CONFIG_MCP)		+= mcp-core.o
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
new file mode 100644
index 000000000000..4af1624987c5
--- /dev/null
+++ b/drivers/mfd/twl4030-core.c
@@ -0,0 +1,1257 @@
+/*
+ * twl4030_core.c - driver for TWL4030/TPS659x0 PM and audio CODEC devices
+ *
+ * Copyright (C) 2005-2006 Texas Instruments, Inc.
+ *
+ * Modifications to defer interrupt handling to a kernel thread:
+ * Copyright (C) 2006 MontaVista Software, Inc.
+ *
+ * Based on tlv320aic23.c:
+ * Copyright (c) by Kai Svahn <kai.svahn@nokia.com>
+ *
+ * Code cleanup and modifications to IRQ handler.
+ * by syed khasim <x0khasim@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+
+#include <linux/i2c.h>
+#include <linux/i2c/twl4030.h>
+
+
+/*
+ * The TWL4030 "Triton 2" is one of a family of multi-function "Power
+ * Management and System Companion Device" chips originally designed for
+ * use in OMAP2 and OMAP 3 based systems.  Its control interfaces use I2C,
+ * often at around 3 Mbit/sec, including for interrupt handling.
+ *
+ * This driver core provides genirq support for the interrupts emitted
+ * by the various modules, and exports register access primitives.
+ *
+ * FIXME this driver currently requires use of the first interrupt line
+ * (and associated registers).
+ */
+
+#define DRIVER_NAME			"twl4030"
+
+#if defined(CONFIG_TWL4030_BCI_BATTERY) || \
+	defined(CONFIG_TWL4030_BCI_BATTERY_MODULE)
+#define twl_has_bci()		true
+#else
+#define twl_has_bci()		false
+#endif
+
+#if defined(CONFIG_KEYBOARD_TWL4030) || defined(CONFIG_KEYBOARD_TWL4030_MODULE)
+#define twl_has_keypad()	true
+#else
+#define twl_has_keypad()	false
+#endif
+
+#if defined(CONFIG_GPIO_TWL4030) || defined(CONFIG_GPIO_TWL4030_MODULE)
+#define twl_has_gpio()	true
+#else
+#define twl_has_gpio()	false
+#endif
+
+#if defined(CONFIG_TWL4030_MADC) || defined(CONFIG_TWL4030_MADC_MODULE)
+#define twl_has_madc()	true
+#else
+#define twl_has_madc()	false
+#endif
+
+#if defined(CONFIG_RTC_DRV_TWL4030) || defined(CONFIG_RTC_DRV_TWL4030_MODULE)
+#define twl_has_rtc()	true
+#else
+#define twl_has_rtc()	false
+#endif
+
+#if defined(CONFIG_TWL4030_USB) || defined(CONFIG_TWL4030_USB_MODULE)
+#define twl_has_usb()	true
+#else
+#define twl_has_usb()	false
+#endif
+
+static inline void activate_irq(int irq)
+{
+#ifdef CONFIG_ARM
+	/* ARM requires an extra step to clear IRQ_NOREQUEST, which it
+	 * sets on behalf of every irq_chip.  Also sets IRQ_NOPROBE.
+	 */
+	set_irq_flags(irq, IRQF_VALID);
+#else
+	/* same effect on other architectures */
+	set_irq_noprobe(irq);
+#endif
+}
+
+/* Primary Interrupt Handler on TWL4030 Registers */
+
+/* Register Definitions */
+
+#define REG_PIH_ISR_P1			(0x1)
+#define REG_PIH_ISR_P2			(0x2)
+#define REG_PIH_SIR			(0x3)
+
+/* Triton Core internal information (BEGIN) */
+
+/* Last - for index max*/
+#define TWL4030_MODULE_LAST		TWL4030_MODULE_SECURED_REG
+
+#define TWL4030_NUM_SLAVES		4
+
+
+/* Base Address defns for twl4030_map[] */
+
+/* subchip/slave 0 - USB ID */
+#define TWL4030_BASEADD_USB		0x0000
+
+/* subchip/slave 1 - AUD ID */
+#define TWL4030_BASEADD_AUDIO_VOICE	0x0000
+#define TWL4030_BASEADD_GPIO		0x0098
+#define TWL4030_BASEADD_INTBR		0x0085
+#define TWL4030_BASEADD_PIH		0x0080
+#define TWL4030_BASEADD_TEST		0x004C
+
+/* subchip/slave 2 - AUX ID */
+#define TWL4030_BASEADD_INTERRUPTS	0x00B9
+#define TWL4030_BASEADD_LED		0x00EE
+#define TWL4030_BASEADD_MADC		0x0000
+#define TWL4030_BASEADD_MAIN_CHARGE	0x0074
+#define TWL4030_BASEADD_PRECHARGE	0x00AA
+#define TWL4030_BASEADD_PWM0		0x00F8
+#define TWL4030_BASEADD_PWM1		0x00FB
+#define TWL4030_BASEADD_PWMA		0x00EF
+#define TWL4030_BASEADD_PWMB		0x00F1
+#define TWL4030_BASEADD_KEYPAD		0x00D2
+
+/* subchip/slave 3 - POWER ID */
+#define TWL4030_BASEADD_BACKUP		0x0014
+#define TWL4030_BASEADD_INT		0x002E
+#define TWL4030_BASEADD_PM_MASTER	0x0036
+#define TWL4030_BASEADD_PM_RECEIVER	0x005B
+#define TWL4030_BASEADD_RTC		0x001C
+#define TWL4030_BASEADD_SECURED_REG	0x0000
+
+/* Triton Core internal information (END) */
+
+
+/* Few power values */
+#define R_CFG_BOOT			0x05
+#define R_PROTECT_KEY			0x0E
+
+/* access control values for R_PROTECT_KEY */
+#define KEY_UNLOCK1			0xce
+#define KEY_UNLOCK2			0xec
+#define KEY_LOCK			0x00
+
+/* some fields in R_CFG_BOOT */
+#define HFCLK_FREQ_19p2_MHZ		(1 << 0)
+#define HFCLK_FREQ_26_MHZ		(2 << 0)
+#define HFCLK_FREQ_38p4_MHZ		(3 << 0)
+#define HIGH_PERF_SQ			(1 << 3)
+
+
+/*----------------------------------------------------------------------*/
+
+/**
+ * struct twl4030_mod_iregs - TWL module IMR/ISR regs to mask/clear at init
+ * @mod_no: TWL4030 module number (e.g., TWL4030_MODULE_GPIO)
+ * @sih_ctrl: address of module SIH_CTRL register
+ * @reg_cnt: number of IMR/ISR regs
+ * @imrs: pointer to array of TWL module interrupt mask register indices
+ * @isrs: pointer to array of TWL module interrupt status register indices
+ *
+ * Ties together TWL4030 modules and lists of IMR/ISR registers to mask/clear
+ * during twl_init_irq().
+ */
+struct twl4030_mod_iregs {
+	const u8 mod_no;
+	const u8 sih_ctrl;
+	const u8 reg_cnt;
+	const u8 *imrs;
+	const u8 *isrs;
+};
+
+/* TWL4030 INT module interrupt mask registers */
+static const u8 __initconst twl4030_int_imr_regs[] = {
+	TWL4030_INT_PWR_IMR1,
+	TWL4030_INT_PWR_IMR2,
+};
+
+/* TWL4030 INT module interrupt status registers */
+static const u8 __initconst twl4030_int_isr_regs[] = {
+	TWL4030_INT_PWR_ISR1,
+	TWL4030_INT_PWR_ISR2,
+};
+
+/* TWL4030 INTERRUPTS module interrupt mask registers */
+static const u8 __initconst twl4030_interrupts_imr_regs[] = {
+	TWL4030_INTERRUPTS_BCIIMR1A,
+	TWL4030_INTERRUPTS_BCIIMR1B,
+	TWL4030_INTERRUPTS_BCIIMR2A,
+	TWL4030_INTERRUPTS_BCIIMR2B,
+};
+
+/* TWL4030 INTERRUPTS module interrupt status registers */
+static const u8 __initconst twl4030_interrupts_isr_regs[] = {
+	TWL4030_INTERRUPTS_BCIISR1A,
+	TWL4030_INTERRUPTS_BCIISR1B,
+	TWL4030_INTERRUPTS_BCIISR2A,
+	TWL4030_INTERRUPTS_BCIISR2B,
+};
+
+/* TWL4030 MADC module interrupt mask registers */
+static const u8 __initconst twl4030_madc_imr_regs[] = {
+	TWL4030_MADC_IMR1,
+	TWL4030_MADC_IMR2,
+};
+
+/* TWL4030 MADC module interrupt status registers */
+static const u8 __initconst twl4030_madc_isr_regs[] = {
+	TWL4030_MADC_ISR1,
+	TWL4030_MADC_ISR2,
+};
+
+/* TWL4030 keypad module interrupt mask registers */
+static const u8 __initconst twl4030_keypad_imr_regs[] = {
+	TWL4030_KEYPAD_KEYP_IMR1,
+	TWL4030_KEYPAD_KEYP_IMR2,
+};
+
+/* TWL4030 keypad module interrupt status registers */
+static const u8 __initconst twl4030_keypad_isr_regs[] = {
+	TWL4030_KEYPAD_KEYP_ISR1,
+	TWL4030_KEYPAD_KEYP_ISR2,
+};
+
+/* TWL4030 GPIO module interrupt mask registers */
+static const u8 __initconst twl4030_gpio_imr_regs[] = {
+	REG_GPIO_IMR1A,
+	REG_GPIO_IMR1B,
+	REG_GPIO_IMR2A,
+	REG_GPIO_IMR2B,
+	REG_GPIO_IMR3A,
+	REG_GPIO_IMR3B,
+};
+
+/* TWL4030 GPIO module interrupt status registers */
+static const u8 __initconst twl4030_gpio_isr_regs[] = {
+	REG_GPIO_ISR1A,
+	REG_GPIO_ISR1B,
+	REG_GPIO_ISR2A,
+	REG_GPIO_ISR2B,
+	REG_GPIO_ISR3A,
+	REG_GPIO_ISR3B,
+};
+
+/* TWL4030 modules that have IMR/ISR registers that must be masked/cleared */
+static const struct twl4030_mod_iregs __initconst twl4030_mod_regs[] = {
+	{
+		.mod_no	  = TWL4030_MODULE_INT,
+		.sih_ctrl = TWL4030_INT_PWR_SIH_CTRL,
+		.reg_cnt  = ARRAY_SIZE(twl4030_int_imr_regs),
+		.imrs	  = twl4030_int_imr_regs,
+		.isrs	  = twl4030_int_isr_regs,
+	},
+	{
+		.mod_no	  = TWL4030_MODULE_INTERRUPTS,
+		.sih_ctrl = TWL4030_INTERRUPTS_BCISIHCTRL,
+		.reg_cnt  = ARRAY_SIZE(twl4030_interrupts_imr_regs),
+		.imrs	  = twl4030_interrupts_imr_regs,
+		.isrs	  = twl4030_interrupts_isr_regs,
+	},
+	{
+		.mod_no	  = TWL4030_MODULE_MADC,
+		.sih_ctrl = TWL4030_MADC_SIH_CTRL,
+		.reg_cnt  = ARRAY_SIZE(twl4030_madc_imr_regs),
+		.imrs	  = twl4030_madc_imr_regs,
+		.isrs	  = twl4030_madc_isr_regs,
+	},
+	{
+		.mod_no	  = TWL4030_MODULE_KEYPAD,
+		.sih_ctrl = TWL4030_KEYPAD_KEYP_SIH_CTRL,
+		.reg_cnt  = ARRAY_SIZE(twl4030_keypad_imr_regs),
+		.imrs	  = twl4030_keypad_imr_regs,
+		.isrs	  = twl4030_keypad_isr_regs,
+	},
+	{
+		.mod_no	  = TWL4030_MODULE_GPIO,
+		.sih_ctrl = REG_GPIO_SIH_CTRL,
+		.reg_cnt  = ARRAY_SIZE(twl4030_gpio_imr_regs),
+		.imrs	  = twl4030_gpio_imr_regs,
+		.isrs	  = twl4030_gpio_isr_regs,
+	},
+};
+
+/*----------------------------------------------------------------*/
+
+/* is driver active, bound to a chip? */
+static bool inuse;
+
+/* Per-slave state: one entry for each TWL4030 I2C slave address */
+struct twl4030_client {
+	struct i2c_client *client;
+	u8 address;
+
+	/* a read needs the most messages: 2 (register address, then data) */
+	struct i2c_msg xfer_msg[2];
+
+	/* serializes use of xfer_msg */
+	struct mutex xfer_lock;
+};
+
+static struct twl4030_client twl4030_modules[TWL4030_NUM_SLAVES];
+
+
+/* mapping the module id to slave id and base address */
+struct twl4030mapping {
+	unsigned char sid;	/* Slave ID */
+	unsigned char base;	/* base address */
+};
+
+static struct twl4030mapping twl4030_map[TWL4030_MODULE_LAST + 1] = {
+	/*
+	 * NOTE:  don't change this table without updating the
+	 * <linux/i2c/twl4030.h> defines for TWL4030_MODULE_*
+	 * so they continue to match the order in this table.
+	 */
+
+	{ 0, TWL4030_BASEADD_USB },
+
+	{ 1, TWL4030_BASEADD_AUDIO_VOICE },
+	{ 1, TWL4030_BASEADD_GPIO },
+	{ 1, TWL4030_BASEADD_INTBR },
+	{ 1, TWL4030_BASEADD_PIH },
+	{ 1, TWL4030_BASEADD_TEST },
+
+	{ 2, TWL4030_BASEADD_KEYPAD },
+	{ 2, TWL4030_BASEADD_MADC },
+	{ 2, TWL4030_BASEADD_INTERRUPTS },
+	{ 2, TWL4030_BASEADD_LED },
+	{ 2, TWL4030_BASEADD_MAIN_CHARGE },
+	{ 2, TWL4030_BASEADD_PRECHARGE },
+	{ 2, TWL4030_BASEADD_PWM0 },
+	{ 2, TWL4030_BASEADD_PWM1 },
+	{ 2, TWL4030_BASEADD_PWMA },
+	{ 2, TWL4030_BASEADD_PWMB },
+
+	{ 3, TWL4030_BASEADD_BACKUP },
+	{ 3, TWL4030_BASEADD_INT },
+	{ 3, TWL4030_BASEADD_PM_MASTER },
+	{ 3, TWL4030_BASEADD_PM_RECEIVER },
+	{ 3, TWL4030_BASEADD_RTC },
+	{ 3, TWL4030_BASEADD_SECURED_REG },
+};
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * TWL4030 doesn't have PIH mask, hence dummy function for mask
+ * and unmask of the (eight) interrupts reported at that level ...
+ * masking is only available from SIH (secondary) modules.
+ */
+
+static void twl4030_i2c_ackirq(unsigned int irq)
+{
+}
+
+static void twl4030_i2c_disableint(unsigned int irq)
+{
+}
+
+static void twl4030_i2c_enableint(unsigned int irq)
+{
+}
+
+static struct irq_chip twl4030_irq_chip = {
+	.name	= "twl4030",
+	.ack	= twl4030_i2c_ackirq,
+	.mask	= twl4030_i2c_disableint,
+	.unmask	= twl4030_i2c_enableint,
+};
+
+/*----------------------------------------------------------------------*/
+
+/* Exported Functions */
+
+/**
+ * twl4030_i2c_write - Writes a block of bytes to a TWL4030 register address
+ * @mod_no: module number
+ * @value: an array of num_bytes+1 containing data to write
+ * @reg: register address (just offset will do)
+ * @num_bytes: number of bytes to transfer
+ *
+ * IMPORTANT: 'value' must hold num_bytes+1 bytes.  Byte 0 is overwritten
+ * with the register address; the data to write starts at offset 1.
+ *
+ * Returns 0 on success, else a negative error code
+ */
+int twl4030_i2c_write(u8 mod_no, u8 *value, u8 reg, u8 num_bytes)
+{
+	int ret;
+	int sid;
+	struct twl4030_client *twl;
+	struct i2c_msg *msg;
+
+	if (unlikely(mod_no > TWL4030_MODULE_LAST)) {
+		pr_err("%s: invalid module number %d\n", DRIVER_NAME, mod_no);
+		return -EPERM;
+	}
+	sid = twl4030_map[mod_no].sid;
+	twl = &twl4030_modules[sid];
+
+	if (unlikely(!inuse)) {
+		pr_err("%s: client %d is not initialized\n", DRIVER_NAME, sid);
+		return -EPERM;
+	}
+	mutex_lock(&twl->xfer_lock);
+	/*
+	 * [MSG1]: a single write message carrying the register address
+	 * followed by the data bytes
+	 */
+	msg = &twl->xfer_msg[0];
+	msg->addr = twl->address;
+	msg->len = num_bytes + 1;
+	msg->flags = 0;
+	msg->buf = value;
+	/* overwrite the first byte of buffer with the register address */
+	*value = twl4030_map[mod_no].base + reg;
+	ret = i2c_transfer(twl->client->adapter, twl->xfer_msg, 1);
+	mutex_unlock(&twl->xfer_lock);
+
+	/* i2c_transfer() returns the message count; map success to 0 */
+	if (ret >= 0)
+		ret = 0;
+	return ret;
+}
+EXPORT_SYMBOL(twl4030_i2c_write);
+
+/**
+ * twl4030_i2c_read - Reads a block of bytes from a TWL4030 register address
+ * @mod_no: module number
+ * @value: an array of num_bytes containing data to be read
+ * @reg: register address (just offset will do)
+ * @num_bytes: number of bytes to transfer
+ *
+ * Returns 0 on success, else a negative error code
+ */
+int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, u8 num_bytes)
+{
+	int ret;
+	u8 val;
+	int sid;
+	struct twl4030_client *twl;
+	struct i2c_msg *msg;
+
+	if (unlikely(mod_no > TWL4030_MODULE_LAST)) {
+		pr_err("%s: invalid module number %d\n", DRIVER_NAME, mod_no);
+		return -EPERM;
+	}
+	sid = twl4030_map[mod_no].sid;
+	twl = &twl4030_modules[sid];
+
+	if (unlikely(!inuse)) {
+		pr_err("%s: client %d is not initialized\n", DRIVER_NAME, sid);
+		return -EPERM;
+	}
+	mutex_lock(&twl->xfer_lock);
+	/* [MSG1] write message: one byte holding the register address */
+	msg = &twl->xfer_msg[0];
+	msg->addr = twl->address;
+	msg->len = 1;
+	msg->flags = 0;	/* write: send the register address */
+	val = twl4030_map[mod_no].base + reg;
+	msg->buf = &val;
+	/* [MSG2] read message: fill the caller-supplied rx buffer */
+	msg = &twl->xfer_msg[1];
+	msg->addr = twl->address;
+	msg->flags = I2C_M_RD;	/* read back the register value(s) */
+	msg->len = num_bytes;	/* only n bytes */
+	msg->buf = value;
+	ret = i2c_transfer(twl->client->adapter, twl->xfer_msg, 2);
+	mutex_unlock(&twl->xfer_lock);
+
+	/* i2c_transfer() returns the message count; map success to 0 */
+	if (ret >= 0)
+		ret = 0;
+	return ret;
+}
+EXPORT_SYMBOL(twl4030_i2c_read);
+
+/**
+ * twl4030_i2c_write_u8 - Writes a 8 bit register in TWL4030
+ * @mod_no: module number
+ * @value: the value to be written 8 bit
+ * @reg: register address (just offset will do)
+ *
+ * Returns result of operation - 0 is success
+ */
+int twl4030_i2c_write_u8(u8 mod_no, u8 value, u8 reg)
+{
+
+	/* 2 bytes offset 1 contains the data offset 0 is used by i2c_write */
+	u8 temp_buffer[2] = { 0 };
+	/* offset 1 contains the data */
+	temp_buffer[1] = value;
+	return twl4030_i2c_write(mod_no, temp_buffer, reg, 1);
+}
+EXPORT_SYMBOL(twl4030_i2c_write_u8);
+
+/**
+ * twl4030_i2c_read_u8 - Reads a 8 bit register from TWL4030
+ * @mod_no: module number
+ * @value: the value read 8 bit
+ * @reg: register address (just offset will do)
+ *
+ * Returns result of operation - 0 is success
+ */
+int twl4030_i2c_read_u8(u8 mod_no, u8 *value, u8 reg)
+{
+	return twl4030_i2c_read(mod_no, value, reg, 1);
+}
+EXPORT_SYMBOL(twl4030_i2c_read_u8);
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * do_twl4030_module_irq() is the desc->handle method for each of the twl4030
+ * module interrupts that doesn't chain to another irq_chip (GPIO, power, etc).
+ * It executes in kernel thread context.  On entry, cpu interrupts are disabled.
+ */
+static void do_twl4030_module_irq(unsigned int irq, irq_desc_t *desc)
+{
+	struct irqaction *action;
+	const unsigned int cpu = smp_processor_id();
+
+	/*
+	 * Earlier this was desc->triggered = 1;
+	 */
+	desc->status |= IRQ_LEVEL;
+
+	/*
+	 * The desc->handle method would normally call the desc->chip->ack
+	 * method here, but we won't bother since our ack method is NULL.
+	 */
+
+	if (!desc->depth) {
+		kstat_cpu(cpu).irqs[irq]++;
+
+		action = desc->action;
+		if (action) {
+			int ret;
+			int status = 0;
+			int retval = 0;
+
+			local_irq_enable();
+
+			do {
+				/* Call the ISR with cpu interrupts enabled */
+				ret = action->handler(irq, action->dev_id);
+				if (ret == IRQ_HANDLED)
+					status |= action->flags;
+				retval |= ret;
+				action = action->next;
+			} while (action);
+
+			if (status & IRQF_SAMPLE_RANDOM)
+				add_interrupt_randomness(irq);
+
+			local_irq_disable();
+
+			if (retval != IRQ_HANDLED)
+				printk(KERN_ERR "ISR for TWL4030 module"
+					" irq %d can't handle interrupt\n",
+					irq);
+
+			/*
+			 * Here is where we should call the unmask method, but
+			 * again we won't bother since it is NULL.
+			 */
+		} else
+			printk(KERN_CRIT "TWL4030 module irq %d has no ISR"
+					" but can't be masked!\n", irq);
+	} else
+		printk(KERN_CRIT "TWL4030 module irq %d is disabled but can't"
+				" be masked!\n", irq);
+}
+
+static unsigned twl4030_irq_base;
+
+static struct completion irq_event;
+
+/*
+ * This thread processes interrupts reported by the Primary Interrupt Handler.
+ */
+static int twl4030_irq_thread(void *data)
+{
+	long irq = (long)data;
+	irq_desc_t *desc = irq_desc + irq;
+	static unsigned i2c_errors;
+	const static unsigned max_i2c_errors = 100;
+
+	daemonize("twl4030-irq");
+	current->flags |= PF_NOFREEZE;
+
+	while (!kthread_should_stop()) {
+		int ret;
+		int module_irq;
+		u8 pih_isr;
+
+		/* Wait for IRQ, then read PIH irq status (also blocking) */
+		wait_for_completion_interruptible(&irq_event);
+
+		ret = twl4030_i2c_read_u8(TWL4030_MODULE_PIH, &pih_isr,
+					  REG_PIH_ISR_P1);
+		if (ret) {
+			pr_warning("%s: I2C error %d reading PIH ISR\n",
+					DRIVER_NAME, ret);
+			if (++i2c_errors >= max_i2c_errors) {
+				printk(KERN_ERR "Maximum I2C error count"
+						" exceeded.  Terminating %s.\n",
+						__func__);
+				break;
+			}
+			complete(&irq_event);
+			continue;
+		}
+
+		/* these handlers deal with the relevant SIH irq status */
+		local_irq_disable();
+		for (module_irq = twl4030_irq_base;
+				pih_isr;
+				pih_isr >>= 1, module_irq++) {
+			if (pih_isr & 0x1) {
+				irq_desc_t *d = irq_desc + module_irq;
+
+				d->handle_irq(module_irq, d);
+			}
+		}
+		local_irq_enable();
+
+		desc->chip->unmask(irq);
+	}
+
+	return 0;
+}
+
+/*
+ * do_twl4030_irq() is the desc->handle method for the twl4030 interrupt.
+ * This is a chained interrupt, so there is no desc->action method for it.
+ * Now we need to query the interrupt controller in the twl4030 to determine
+ * which module is generating the interrupt request.  However, we can't do i2c
+ * transactions in interrupt context, so we must defer that work to a kernel
+ * thread.  All we do here is acknowledge and mask the interrupt and wakeup
+ * the kernel thread.
+ */
+static void do_twl4030_irq(unsigned int irq, irq_desc_t *desc)
+{
+	const unsigned int cpu = smp_processor_id();
+
+	/*
+	 * Earlier this was desc->triggered = 1;
+	 */
+	desc->status |= IRQ_LEVEL;
+
+	/*
+	 * Acknowledge, clear _AND_ disable the interrupt.
+	 */
+	desc->chip->ack(irq);
+
+	if (!desc->depth) {
+		kstat_cpu(cpu).irqs[irq]++;
+
+		complete(&irq_event);
+	}
+}
+
+/* Start the PIH demux thread for @irq; returns kthread_run()'s result. */
+static struct task_struct * __init start_twl4030_irq_thread(long irq)
+{
+	struct task_struct *thread;
+
+	init_completion(&irq_event);
+	thread = kthread_run(twl4030_irq_thread, (void *)irq,
+			     "twl4030 irq %ld", irq);
+	if (IS_ERR(thread))	/* kthread_run() reports failure via ERR_PTR */
+		pr_err("%s: could not create twl4030 irq %ld thread!\n",
+		       DRIVER_NAME, irq);
+	return thread;
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Add the bci/gpio/keypad/madc/rtc/usb child platform devices that
+ * twl_has_*() and @pdata select.  Returns 0, or a negative errno.
+ */
+static int add_children(struct twl4030_platform_data *pdata)
+{
+	struct platform_device	*pdev = NULL;
+	struct twl4030_client	*twl = NULL;
+	int			status = 0;
+
+	if (twl_has_bci() && pdata->bci) {
+		twl = &twl4030_modules[3];
+
+		pdev = platform_device_alloc("twl4030_bci", -1);
+		if (!pdev) {
+			pr_debug("%s: can't alloc bci dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+			goto err;
+		}
+
+		if (status == 0) {
+			pdev->dev.parent = &twl->client->dev;
+			status = platform_device_add_data(pdev, pdata->bci,
+					sizeof(*pdata->bci));
+			if (status < 0) {
+				platform_device_put(pdev); /* don't leak */
+				dev_dbg(&twl->client->dev,
+					"can't add bci data, %d\n", status);
+				goto err;
+			}
+		}
+
+		if (status == 0) {
+			struct resource r = {
+				.start = TWL4030_PWRIRQ_CHG_PRES,
+				.flags = IORESOURCE_IRQ,
+			};
+
+			status = platform_device_add_resources(pdev, &r, 1);
+		}
+
+		if (status == 0)
+			status = platform_device_add(pdev);
+
+		if (status < 0) {
+			platform_device_put(pdev);
+			dev_dbg(&twl->client->dev,
+					"can't create bci dev, %d\n", status);
+			goto err;
+		}
+	}
+
+	if (twl_has_gpio() && pdata->gpio) {
+		twl = &twl4030_modules[1];
+
+		pdev = platform_device_alloc("twl4030_gpio", -1);
+		if (!pdev) {
+			pr_debug("%s: can't alloc gpio dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+			goto err;
+		}
+
+		/* more driver model init */
+		if (status == 0) {
+			pdev->dev.parent = &twl->client->dev;
+			/* device_init_wakeup(&pdev->dev, 1); */
+
+			status = platform_device_add_data(pdev, pdata->gpio,
+					sizeof(*pdata->gpio));
+			if (status < 0) {
+				platform_device_put(pdev); /* don't leak */
+				dev_dbg(&twl->client->dev,
+					"can't add gpio data, %d\n", status);
+				goto err;
+			}
+		}
+
+		/* GPIO module IRQ */
+		if (status == 0) {
+			struct resource	r = {
+				.start = pdata->irq_base + 0,
+				.flags = IORESOURCE_IRQ,
+			};
+
+			status = platform_device_add_resources(pdev, &r, 1);
+		}
+
+		if (status == 0)
+			status = platform_device_add(pdev);
+
+		if (status < 0) {
+			platform_device_put(pdev);
+			dev_dbg(&twl->client->dev,
+					"can't create gpio dev, %d\n",
+					status);
+			goto err;
+		}
+	}
+
+	if (twl_has_keypad() && pdata->keypad) {
+		pdev = platform_device_alloc("twl4030_keypad", -1);
+		if (pdev) {
+			twl = &twl4030_modules[2];
+			pdev->dev.parent = &twl->client->dev;
+			device_init_wakeup(&pdev->dev, 1);
+			status = platform_device_add_data(pdev, pdata->keypad,
+					sizeof(*pdata->keypad));
+			if (status < 0) {
+				dev_dbg(&twl->client->dev,
+					"can't add keypad data, %d\n", status);
+				platform_device_put(pdev);
+				goto err;
+			}
+			status = platform_device_add(pdev);
+			if (status < 0) {
+				platform_device_put(pdev);
+				dev_dbg(&twl->client->dev,
+						"can't create keypad dev, %d\n",
+						status);
+				goto err;
+			}
+		} else {
+			pr_debug("%s: can't alloc keypad dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+			goto err;
+		}
+	}
+
+	if (twl_has_madc() && pdata->madc) {
+		pdev = platform_device_alloc("twl4030_madc", -1);
+		if (pdev) {
+			twl = &twl4030_modules[2];
+			pdev->dev.parent = &twl->client->dev;
+			device_init_wakeup(&pdev->dev, 1);
+			status = platform_device_add_data(pdev, pdata->madc,
+					sizeof(*pdata->madc));
+			if (status < 0) {
+				platform_device_put(pdev);
+				dev_dbg(&twl->client->dev,
+					"can't add madc data, %d\n", status);
+				goto err;
+			}
+			status = platform_device_add(pdev);
+			if (status < 0) {
+				platform_device_put(pdev);
+				dev_dbg(&twl->client->dev,
+						"can't create madc dev, %d\n",
+						status);
+				goto err;
+			}
+		} else {
+			pr_debug("%s: can't alloc madc dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+			goto err;
+		}
+	}
+
+	if (twl_has_rtc()) {
+		twl = &twl4030_modules[3];
+
+		pdev = platform_device_alloc("twl4030_rtc", -1);
+		if (!pdev) {
+			pr_debug("%s: can't alloc rtc dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+		} else {
+			pdev->dev.parent = &twl->client->dev;
+			device_init_wakeup(&pdev->dev, 1);
+		}
+
+		/*
+		 * REVISIT platform_data here currently might use of
+		 * "msecure" line ... but for now we just expect board
+		 * setup to tell the chip "we are secure" at all times.
+		 * Eventually, Linux might become more aware of such
+		 * HW security concerns, and "least privilege".
+		 */
+
+		/* RTC module IRQ */
+		if (status == 0) {
+			struct resource	r = {
+				/* REVISIT don't hard-wire this stuff */
+				.start = TWL4030_PWRIRQ_RTC,
+				.flags = IORESOURCE_IRQ,
+			};
+
+			status = platform_device_add_resources(pdev, &r, 1);
+		}
+
+		if (status == 0)
+			status = platform_device_add(pdev);
+
+		if (status < 0) {
+			platform_device_put(pdev);
+			dev_dbg(&twl->client->dev,
+					"can't create rtc dev, %d\n",
+					status);
+			goto err;
+		}
+	}
+
+	if (twl_has_usb() && pdata->usb) {
+		twl = &twl4030_modules[0];
+
+		pdev = platform_device_alloc("twl4030_usb", -1);
+		if (!pdev) {
+			pr_debug("%s: can't alloc usb dev\n", DRIVER_NAME);
+			status = -ENOMEM;
+			goto err;
+		}
+
+		if (status == 0) {
+			pdev->dev.parent = &twl->client->dev;
+			device_init_wakeup(&pdev->dev, 1);
+			status = platform_device_add_data(pdev, pdata->usb,
+					sizeof(*pdata->usb));
+			if (status < 0) {
+				platform_device_put(pdev);
+				dev_dbg(&twl->client->dev,
+					"can't add usb data, %d\n", status);
+				goto err;
+			}
+		}
+
+		if (status == 0) {
+			struct resource r = {
+				.start = TWL4030_PWRIRQ_USB_PRES,
+				.flags = IORESOURCE_IRQ,
+			};
+
+			status = platform_device_add_resources(pdev, &r, 1);
+		}
+
+		if (status == 0)
+			status = platform_device_add(pdev);
+
+		if (status < 0) {
+			platform_device_put(pdev);
+			dev_dbg(&twl->client->dev,
+					"can't create usb dev, %d\n",
+					status);
+		}
+	}
+
+err:
+	if (status)
+		pr_err("failed to add twl4030's children (status %d)\n", status);
+	return status;
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * These three functions initialize the on-chip clock framework,
+ * letting it generate the right frequencies for USB, MADC, and
+ * other purposes.
+ */
+static inline int __init protect_pm_master(void)
+{
+	int e = 0;
+
+	e = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, KEY_LOCK,
+			R_PROTECT_KEY);
+	return e;
+}
+
+static inline int __init unprotect_pm_master(void)
+{
+	int e = 0;
+
+	e |= twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, KEY_UNLOCK1,
+			R_PROTECT_KEY);
+	e |= twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, KEY_UNLOCK2,
+			R_PROTECT_KEY);
+	return e;
+}
+
+static void __init clocks_init(void)
+{
+	int e = 0;
+	struct clk *osc;
+	u32 rate;
+	u8 ctrl = HFCLK_FREQ_26_MHZ;
+
+#if defined(CONFIG_ARCH_OMAP2) || defined(CONFIG_ARCH_OMAP3)
+	if (cpu_is_omap2430())
+		osc = clk_get(NULL, "osc_ck");
+	else
+		osc = clk_get(NULL, "osc_sys_ck");
+#else
+	/* REVISIT for non-OMAP systems, pass the clock rate from
+	 * board init code, using platform_data.
+	 */
+	osc = ERR_PTR(-EIO);
+#endif
+	if (IS_ERR(osc)) {
+		printk(KERN_WARNING "Skipping twl4030 internal clock init and "
+				"using bootloader value (unknown osc rate)\n");
+		return;
+	}
+
+	rate = clk_get_rate(osc);
+	clk_put(osc);
+
+	switch (rate) {
+	case 19200000:
+		ctrl = HFCLK_FREQ_19p2_MHZ;
+		break;
+	case 26000000:
+		ctrl = HFCLK_FREQ_26_MHZ;
+		break;
+	case 38400000:
+		ctrl = HFCLK_FREQ_38p4_MHZ;
+		break;
+	}
+
+	ctrl |= HIGH_PERF_SQ;
+	e |= unprotect_pm_master();
+	/* effect->MADC+USB ck en */
+	e |= twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER, ctrl, R_CFG_BOOT);
+	e |= protect_pm_master();
+
+	if (e < 0)
+		pr_err("%s: clock init err [%d]\n", DRIVER_NAME, e);
+}
+
+/*----------------------------------------------------------------------*/
+
+/**
+ * twl4030_i2c_clear_isr - clear a TWL4030 SIH ISR reg via a read or a write
+ * @mod_no: TWL4030 module number
+ * @reg: register index to clear
+ * @cor: value of the <module>_SIH_CTRL.COR bit (1 or 0)
+ *
+ * Either reads (cor != 0) or writes 0xff (cor == 0) to a TWL4030 interrupt
+ * status register to ensure that any prior interrupts are cleared.
+ * Returns the status from the I2C read or write operation.
+ */
+static int __init twl4030_i2c_clear_isr(u8 mod_no, u8 reg, u8 cor)
+{
+	u8 tmp;
+
+	return (cor) ? twl4030_i2c_read_u8(mod_no, &tmp, reg) :
+		twl4030_i2c_write_u8(mod_no, 0xff, reg);
+}
+
+/**
+ * twl4030_read_cor_bit - are TWL module ISRs cleared by reads or writes?
+ * @mod_no: TWL4030 module number
+ * @reg: index of the module's SIH_CTRL register to read
+ *
+ * Returns 1 if the TWL4030 SIH interrupt status registers (ISRs) for
+ * the specified TWL module are cleared by reads, or 0 if cleared by
+ * writes.  A failed I2C read only triggers a WARN.
+ */
+static int twl4030_read_cor_bit(u8 mod_no, u8 reg)
+{
+	u8 tmp = 0;
+
+	WARN_ON(twl4030_i2c_read_u8(mod_no, &tmp, reg) < 0);
+
+	tmp &= TWL4030_SIH_CTRL_COR_MASK;
+	tmp >>= __ffs(TWL4030_SIH_CTRL_COR_MASK);
+
+	return tmp;
+}
+
+/**
+ * twl4030_mask_clear_intrs - mask and clear all TWL4030 interrupts
+ * @t: pointer to twl4030_mod_iregs array
+ * @t_sz: number of entries in @t, i.e. ARRAY_SIZE() of the table
+ *
+ * Write 0xff to every interrupt mask register (IMR) listed in @t and clear
+ * every listed interrupt status register (ISR).  No return value, but will
+ * WARN if any I2C operations fail.
+ */
+static void __init twl4030_mask_clear_intrs(const struct twl4030_mod_iregs *t,
+					    const u8 t_sz)
+{
+	int i, j;
+
+	/*
+	 * N.B. - further efficiency is possible here.  Eight I2C
+	 * operations on BCI and GPIO modules are avoidable if I2C
+	 * burst read/write transactions were implemented.  Would
+	 * probably save about 1ms of boot time and a small amount of
+	 * power.
+	 */
+	for (i = 0; i < t_sz; i++) {
+		const struct twl4030_mod_iregs tmr = t[i];
+		int cor;
+
+		/* Are ISRs cleared by reads or writes? */
+		cor = twl4030_read_cor_bit(tmr.mod_no, tmr.sih_ctrl);
+
+		for (j = 0; j < tmr.reg_cnt; j++) {
+
+			/* Mask interrupts at the TWL4030 */
+			WARN_ON(twl4030_i2c_write_u8(tmr.mod_no, 0xff,
+						     tmr.imrs[j]) < 0);
+
+			/* Clear TWL4030 ISRs */
+			WARN_ON(twl4030_i2c_clear_isr(tmr.mod_no,
+						      tmr.isrs[j], cor) < 0);
+		}
+	}
+}
+
+
+static void twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
+{
+	int	i;
+
+	/*
+	 * Mask and clear all TWL4030 interrupts since initially we do
+	 * not have any TWL4030 module interrupt handlers present
+	 */
+	twl4030_mask_clear_intrs(twl4030_mod_regs,
+				 ARRAY_SIZE(twl4030_mod_regs));
+
+	twl4030_irq_base = irq_base;
+
+	/* install an irq handler for each of the PIH modules */
+	for (i = irq_base; i < irq_end; i++) {
+		set_irq_chip_and_handler(i, &twl4030_irq_chip,
+				do_twl4030_module_irq);
+		activate_irq(i);
+	}
+
+	/* install an irq handler to demultiplex the TWL4030 interrupt */
+	set_irq_data(irq_num, start_twl4030_irq_thread(irq_num));
+	set_irq_chained_handler(irq_num, do_twl4030_irq);
+}
+
+/*----------------------------------------------------------------------*/
+
+static int twl4030_remove(struct i2c_client *client)
+{
+	unsigned i;
+
+	/* FIXME undo twl_init_irq() */
+	if (twl4030_irq_base) {
+		dev_err(&client->dev, "can't yet clean up IRQs?\n");
+		return -ENOSYS;
+	}
+
+	for (i = 0; i < TWL4030_NUM_SLAVES; i++) {
+		struct twl4030_client	*twl = &twl4030_modules[i];
+
+		if (twl->client && twl->client != client)
+			i2c_unregister_device(twl->client);
+		twl4030_modules[i].client = NULL;
+	}
+	inuse = false;
+	return 0;
+}
+
+/* NOTE:  this driver only handles a single twl4030/tps659x0 chip */
+static int
+twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id)
+{
+	int				status;
+	unsigned			i;
+	struct twl4030_platform_data	*pdata = client->dev.platform_data;
+
+	if (!pdata) {
+		dev_dbg(&client->dev, "no platform data?\n");
+		return -EINVAL;
+	}
+
+	if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C) == 0) {
+		dev_dbg(&client->dev, "can't talk I2C?\n");
+		return -EIO;
+	}
+
+	if (inuse || twl4030_irq_base) {
+		dev_dbg(&client->dev, "driver is already in use\n");
+		return -EBUSY;
+	}
+
+	for (i = 0; i < TWL4030_NUM_SLAVES; i++) {
+		struct twl4030_client	*twl = &twl4030_modules[i];
+
+		twl->address = client->addr + i;
+		if (i == 0)
+			twl->client = client;
+		else {
+			twl->client = i2c_new_dummy(client->adapter,
+					twl->address);
+			if (!twl->client) {
+				/* twl->client is NULL: log via the parent */
+				dev_err(&client->dev,
+					"can't attach client %d\n", i);
+				status = -ENOMEM;
+				goto fail;
+			}
+			strlcpy(twl->client->name, id->name,
+					sizeof(twl->client->name));
+		}
+		mutex_init(&twl->xfer_lock);
+	}
+	inuse = true;
+
+	/* setup clock framework */
+	clocks_init();
+
+	/* Maybe init the T2 Interrupt subsystem */
+	if (client->irq && pdata->irq_base
+			&& pdata->irq_end > pdata->irq_base) {
+		twl_init_irq(client->irq, pdata->irq_base, pdata->irq_end);
+		dev_info(&client->dev, "IRQ %d chains IRQs %d..%d\n",
+				client->irq, pdata->irq_base, pdata->irq_end - 1);
+	}
+
+	status = add_children(pdata);
+fail:
+	if (status < 0)
+		twl4030_remove(client);
+	return status;
+}
+
+static const struct i2c_device_id twl4030_ids[] = {
+	{ "twl4030", 0 },	/* "Triton 2" */
+	{ "tps65950", 0 },	/* catalog version of twl4030 */
+	{ "tps65930", 0 },	/* fewer LDOs and DACs; no charger */
+	{ "tps65920", 0 },	/* fewer LDOs; no codec or charger */
+	{ "twl5030", 0 },	/* T2 updated */
+	{ /* end of list */ },
+};
+MODULE_DEVICE_TABLE(i2c, twl4030_ids);
+
+/* One Client Driver , 4 Clients */
+static struct i2c_driver twl4030_driver = {
+	.driver.name	= DRIVER_NAME,
+	.id_table	= twl4030_ids,
+	.probe		= twl4030_probe,
+	.remove		= twl4030_remove,
+};
+
+static int __init twl4030_init(void)
+{
+	return i2c_add_driver(&twl4030_driver);
+}
+subsys_initcall(twl4030_init);
+
+static void __exit twl4030_exit(void)
+{
+	i2c_del_driver(&twl4030_driver);
+}
+module_exit(twl4030_exit);
+
+MODULE_AUTHOR("Texas Instruments, Inc.");
+MODULE_DESCRIPTION("I2C Core interface for TWL4030");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
new file mode 100644
index 000000000000..cdb453162a97
--- /dev/null
+++ b/include/linux/i2c/twl4030.h
@@ -0,0 +1,339 @@
+/*
+ * twl4030.h - header for TWL4030 PM and audio CODEC device
+ *
+ * Copyright (C) 2005-2006 Texas Instruments, Inc.
+ *
+ * Based on tlv320aic23.c:
+ * Copyright (c) by Kai Svahn <kai.svahn@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+#ifndef __TWL4030_H_
+#define __TWL4030_H_
+
+/*
+ * Using the twl4030 core we address registers using a pair
+ *	{ module id, relative register offset }
+ * which that core then maps to the relevant
+ *	{ i2c slave, absolute register address }
+ *
+ * The module IDs are meaningful only to the twl4030 core code,
+ * which uses them as array indices to look up the first register
+ * address each module uses within a given i2c slave.
+ */
+
+/* Slave 0 (i2c address 0x48) */
+#define TWL4030_MODULE_USB		0x00
+
+/* Slave 1 (i2c address 0x49) */
+#define TWL4030_MODULE_AUDIO_VOICE	0x01
+#define TWL4030_MODULE_GPIO		0x02
+#define TWL4030_MODULE_INTBR		0x03
+#define TWL4030_MODULE_PIH		0x04
+#define TWL4030_MODULE_TEST		0x05
+
+/* Slave 2 (i2c address 0x4a) */
+#define TWL4030_MODULE_KEYPAD		0x06
+#define TWL4030_MODULE_MADC		0x07
+#define TWL4030_MODULE_INTERRUPTS	0x08
+#define TWL4030_MODULE_LED		0x09
+#define TWL4030_MODULE_MAIN_CHARGE	0x0A
+#define TWL4030_MODULE_PRECHARGE	0x0B
+#define TWL4030_MODULE_PWM0		0x0C
+#define TWL4030_MODULE_PWM1		0x0D
+#define TWL4030_MODULE_PWMA		0x0E
+#define TWL4030_MODULE_PWMB		0x0F
+
+/* Slave 3 (i2c address 0x4b) */
+#define TWL4030_MODULE_BACKUP		0x10
+#define TWL4030_MODULE_INT		0x11
+#define TWL4030_MODULE_PM_MASTER	0x12
+#define TWL4030_MODULE_PM_RECEIVER	0x13
+#define TWL4030_MODULE_RTC		0x14
+#define TWL4030_MODULE_SECURED_REG	0x15
+
+/*
+ * Read and write single 8-bit registers
+ */
+int twl4030_i2c_write_u8(u8 mod_no, u8 val, u8 reg);
+int twl4030_i2c_read_u8(u8 mod_no, u8 *val, u8 reg);
+
+/*
+ * Read and write several 8-bit registers at once.
+ *
+ * IMPORTANT:  For twl4030_i2c_write(), allocate num_bytes + 1
+ * for the value, and populate your data starting at offset 1.
+ */
+int twl4030_i2c_write(u8 mod_no, u8 *value, u8 reg, u8 num_bytes);
+int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, u8 num_bytes);
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * NOTE:  at up to 1024 registers, this is a big chip.
+ *
+ * Avoid putting register declarations in this file, instead of into
+ * a driver-private file, unless some of the registers in a block
+ * need to be shared with other drivers.  One example is blocks that
+ * have Secondary IRQ Handler (SIH) registers.
+ */
+
+#define TWL4030_SIH_CTRL_EXCLEN_MASK	BIT(0)
+#define TWL4030_SIH_CTRL_PENDDIS_MASK	BIT(1)
+#define TWL4030_SIH_CTRL_COR_MASK	BIT(2)
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * GPIO Block Register offsets (use TWL4030_MODULE_GPIO)
+ */
+
+#define REG_GPIODATAIN1			0x0
+#define REG_GPIODATAIN2			0x1
+#define REG_GPIODATAIN3			0x2
+#define REG_GPIODATADIR1		0x3
+#define REG_GPIODATADIR2		0x4
+#define REG_GPIODATADIR3		0x5
+#define REG_GPIODATAOUT1		0x6
+#define REG_GPIODATAOUT2		0x7
+#define REG_GPIODATAOUT3		0x8
+#define REG_CLEARGPIODATAOUT1		0x9
+#define REG_CLEARGPIODATAOUT2		0xA
+#define REG_CLEARGPIODATAOUT3		0xB
+#define REG_SETGPIODATAOUT1		0xC
+#define REG_SETGPIODATAOUT2		0xD
+#define REG_SETGPIODATAOUT3		0xE
+#define REG_GPIO_DEBEN1			0xF
+#define REG_GPIO_DEBEN2			0x10
+#define REG_GPIO_DEBEN3			0x11
+#define REG_GPIO_CTRL			0x12
+#define REG_GPIOPUPDCTR1		0x13
+#define REG_GPIOPUPDCTR2		0x14
+#define REG_GPIOPUPDCTR3		0x15
+#define REG_GPIOPUPDCTR4		0x16
+#define REG_GPIOPUPDCTR5		0x17
+#define REG_GPIO_ISR1A			0x19
+#define REG_GPIO_ISR2A			0x1A
+#define REG_GPIO_ISR3A			0x1B
+#define REG_GPIO_IMR1A			0x1C
+#define REG_GPIO_IMR2A			0x1D
+#define REG_GPIO_IMR3A			0x1E
+#define REG_GPIO_ISR1B			0x1F
+#define REG_GPIO_ISR2B			0x20
+#define REG_GPIO_ISR3B			0x21
+#define REG_GPIO_IMR1B			0x22
+#define REG_GPIO_IMR2B			0x23
+#define REG_GPIO_IMR3B			0x24
+#define REG_GPIO_EDR1			0x28
+#define REG_GPIO_EDR2			0x29
+#define REG_GPIO_EDR3			0x2A
+#define REG_GPIO_EDR4			0x2B
+#define REG_GPIO_EDR5			0x2C
+#define REG_GPIO_SIH_CTRL		0x2D
+
+/* Up to 18 signals are available as GPIOs, when their
+ * pins are not assigned to another use (such as ULPI/USB).
+ */
+#define TWL4030_GPIO_MAX		18
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Keypad register offsets (use TWL4030_MODULE_KEYPAD)
+ * ... SIH/interrupt only
+ */
+
+#define TWL4030_KEYPAD_KEYP_ISR1	0x11
+#define TWL4030_KEYPAD_KEYP_IMR1	0x12
+#define TWL4030_KEYPAD_KEYP_ISR2	0x13
+#define TWL4030_KEYPAD_KEYP_IMR2	0x14
+#define TWL4030_KEYPAD_KEYP_SIR		0x15	/* test register */
+#define TWL4030_KEYPAD_KEYP_EDR		0x16
+#define TWL4030_KEYPAD_KEYP_SIH_CTRL	0x17
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Multichannel ADC register offsets (use TWL4030_MODULE_MADC)
+ * ... SIH/interrupt only
+ */
+
+#define TWL4030_MADC_ISR1		0x61
+#define TWL4030_MADC_IMR1		0x62
+#define TWL4030_MADC_ISR2		0x63
+#define TWL4030_MADC_IMR2		0x64
+#define TWL4030_MADC_SIR		0x65	/* test register */
+#define TWL4030_MADC_EDR		0x66
+#define TWL4030_MADC_SIH_CTRL		0x67
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Battery charger register offsets (use TWL4030_MODULE_INTERRUPTS)
+ */
+
+#define TWL4030_INTERRUPTS_BCIISR1A	0x0
+#define TWL4030_INTERRUPTS_BCIISR2A	0x1
+#define TWL4030_INTERRUPTS_BCIIMR1A	0x2
+#define TWL4030_INTERRUPTS_BCIIMR2A	0x3
+#define TWL4030_INTERRUPTS_BCIISR1B	0x4
+#define TWL4030_INTERRUPTS_BCIISR2B	0x5
+#define TWL4030_INTERRUPTS_BCIIMR1B	0x6
+#define TWL4030_INTERRUPTS_BCIIMR2B	0x7
+#define TWL4030_INTERRUPTS_BCISIR1	0x8	/* test register */
+#define TWL4030_INTERRUPTS_BCISIR2	0x9	/* test register */
+#define TWL4030_INTERRUPTS_BCIEDR1	0xa
+#define TWL4030_INTERRUPTS_BCIEDR2	0xb
+#define TWL4030_INTERRUPTS_BCIEDR3	0xc
+#define TWL4030_INTERRUPTS_BCISIHCTRL	0xd
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Power Interrupt block register offsets (use TWL4030_MODULE_INT)
+ */
+
+#define TWL4030_INT_PWR_ISR1		0x0
+#define TWL4030_INT_PWR_IMR1		0x1
+#define TWL4030_INT_PWR_ISR2		0x2
+#define TWL4030_INT_PWR_IMR2		0x3
+#define TWL4030_INT_PWR_SIR		0x4	/* test register */
+#define TWL4030_INT_PWR_EDR1		0x5
+#define TWL4030_INT_PWR_EDR2		0x6
+#define TWL4030_INT_PWR_SIH_CTRL	0x7
+
+/*----------------------------------------------------------------------*/
+
+struct twl4030_bci_platform_data {
+	int *battery_tmp_tbl;
+	unsigned int tblsize;
+};
+
+/* TWL4030_GPIO_MAX (18) GPIOs, with interrupts */
+struct twl4030_gpio_platform_data {
+	int		gpio_base;
+	unsigned	irq_base, irq_end;
+
+	/* For gpio-N, bit (1 << N) in "pullups" is set if that pullup
+	 * should be enabled.  Else, if that bit is set in "pulldowns",
+	 * that pulldown is enabled.  Don't waste power by letting any
+	 * digital inputs float...
+	 */
+	u32		pullups;
+	u32		pulldowns;
+
+	int		(*setup)(struct device *dev,
+				unsigned gpio, unsigned ngpio);
+	int		(*teardown)(struct device *dev,
+				unsigned gpio, unsigned ngpio);
+};
+
+struct twl4030_madc_platform_data {
+	int		irq_line;
+};
+
+struct twl4030_keypad_data {
+	int rows;
+	int cols;
+	int *keymap;
+	int irq;
+	unsigned int keymapsize;
+	unsigned int rep:1;
+};
+
+enum twl4030_usb_mode {
+	T2_USB_MODE_ULPI = 1,
+	T2_USB_MODE_CEA2011_3PIN = 2,
+};
+
+struct twl4030_usb_data {
+	enum twl4030_usb_mode	usb_mode;
+};
+
+struct twl4030_platform_data {
+	unsigned				irq_base, irq_end;
+	struct twl4030_bci_platform_data	*bci;
+	struct twl4030_gpio_platform_data	*gpio;
+	struct twl4030_madc_platform_data	*madc;
+	struct twl4030_keypad_data		*keypad;
+	struct twl4030_usb_data			*usb;
+
+	/* REVISIT more to come ... _nothing_ should be hard-wired */
+};
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * FIXME completely stop using TWL4030_IRQ_BASE ... instead, pass the
+ * IRQ data to subsidiary devices using platform device resources.
+ */
+
+/* IRQ information-need base */
+#include <mach/irqs.h>
+/* TWL4030 interrupts */
+
+/* #define TWL4030_MODIRQ_GPIO		(TWL4030_IRQ_BASE + 0) */
+#define TWL4030_MODIRQ_KEYPAD		(TWL4030_IRQ_BASE + 1)
+#define TWL4030_MODIRQ_BCI		(TWL4030_IRQ_BASE + 2)
+#define TWL4030_MODIRQ_MADC		(TWL4030_IRQ_BASE + 3)
+/* #define TWL4030_MODIRQ_USB		(TWL4030_IRQ_BASE + 4) */
+#define TWL4030_MODIRQ_PWR		(TWL4030_IRQ_BASE + 5)
+
+#define TWL4030_PWRIRQ_PWRBTN		(TWL4030_PWR_IRQ_BASE + 0)
+#define TWL4030_PWRIRQ_CHG_PRES		(TWL4030_PWR_IRQ_BASE + 1)
+#define TWL4030_PWRIRQ_USB_PRES		(TWL4030_PWR_IRQ_BASE + 2)
+#define TWL4030_PWRIRQ_RTC		(TWL4030_PWR_IRQ_BASE + 3)
+#define TWL4030_PWRIRQ_HOT_DIE		(TWL4030_PWR_IRQ_BASE + 4)
+#define TWL4030_PWRIRQ_PWROK_TIMEOUT	(TWL4030_PWR_IRQ_BASE + 5)
+#define TWL4030_PWRIRQ_MBCHG		(TWL4030_PWR_IRQ_BASE + 6)
+#define TWL4030_PWRIRQ_SC_DETECT	(TWL4030_PWR_IRQ_BASE + 7)
+
+/* The rest are currently unused */
+
+/* Offsets to Power Registers */
+#define TWL4030_VDAC_DEV_GRP		0x3B
+#define TWL4030_VDAC_DEDICATED		0x3E
+#define TWL4030_VAUX1_DEV_GRP		0x17
+#define TWL4030_VAUX1_DEDICATED		0x1A
+#define TWL4030_VAUX2_DEV_GRP		0x1B
+#define TWL4030_VAUX2_DEDICATED		0x1E
+#define TWL4030_VAUX3_DEV_GRP		0x1F
+#define TWL4030_VAUX3_DEDICATED		0x22
+
+/* TWL4030 GPIO interrupt definitions */
+
+#define TWL4030_GPIO_IRQ_NO(n)		(TWL4030_GPIO_IRQ_BASE + (n))
+#define TWL4030_GPIO_IS_ENABLE		1
+
+/*
+ * Exported TWL4030 GPIO APIs
+ *
+ * WARNING -- use standard GPIO and IRQ calls instead; these will vanish.
+ */
+int twl4030_get_gpio_datain(int gpio);
+int twl4030_request_gpio(int gpio);
+int twl4030_set_gpio_debounce(int gpio, int enable);
+int twl4030_free_gpio(int gpio);
+
+#if defined(CONFIG_TWL4030_BCI_BATTERY) || \
+	defined(CONFIG_TWL4030_BCI_BATTERY_MODULE)
+	extern int twl4030charger_usb_en(int enable);
+#else
+	static inline int twl4030charger_usb_en(int enable) { return 0; }
+#endif
+
+#endif /* End of __TWL4030_H_ */
-- 
cgit v1.2.3


From 26b8f5e1e2d1229c186d8e61d26513c43a058c5e Mon Sep 17 00:00:00 2001
From: Eric Miao <eric.miao@marvell.com>
Date: Wed, 15 Oct 2008 12:20:06 +0200
Subject: mfd: add base support for Dialog DA9030/DA9034 PMICs

DA9030 (a.k.a ARAVA) and DA9034 (a.k.a MICCO) are PMICs designed by
Dialog Semiconductor, usually found on PXA-based platforms. These
PMICs are I2C-based, multi-function devices, usually with LEDs, PWMs
for backlight, BUCKs and LDOs, ADCs and touchscreen controller (on
DA9034).

This is the base support for the I2C operations, event registration
and handling, sub-devices management.

Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: Eric Miao <eric.miao@marvell.com>
Signed-off-by: Liam Girdwood <lrg@kernel.org>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/da903x.c       | 563 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/da903x.h | 201 ++++++++++++++++
 2 files changed, 764 insertions(+)
 create mode 100644 drivers/mfd/da903x.c
 create mode 100644 include/linux/mfd/da903x.h

(limited to 'include/linux')

diff --git a/drivers/mfd/da903x.c b/drivers/mfd/da903x.c
new file mode 100644
index 000000000000..b57326ae464d
--- /dev/null
+++ b/drivers/mfd/da903x.c
@@ -0,0 +1,563 @@
+/*
+ * Base driver for Dialog Semiconductor DA9030/DA9034
+ *
+ * Copyright (C) 2008 Compulab, Ltd.
+ * 	Mike Rapoport <mike@compulab.co.il>
+ *
+ * Copyright (C) 2006-2008 Marvell International Ltd.
+ * 	Eric Miao <eric.miao@marvell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/i2c.h>
+#include <linux/mfd/da903x.h>
+
+/*
+ * DA9030 register addresses, used as the "reg" argument of the I2C
+ * accessors below.  EVENT_A..C and IRQ_MASK_A..C are consecutive so that
+ * each group can be moved with one 3-byte block transfer.
+ */
+#define DA9030_CHIP_ID		0x00
+#define DA9030_EVENT_A		0x01
+#define DA9030_EVENT_B		0x02
+#define DA9030_EVENT_C		0x03
+#define DA9030_STATUS		0x04
+#define DA9030_IRQ_MASK_A	0x05
+#define DA9030_IRQ_MASK_B	0x06
+#define DA9030_IRQ_MASK_C	0x07
+#define DA9030_SYS_CTRL_A	0x08
+#define DA9030_SYS_CTRL_B	0x09
+#define DA9030_FAULT_LOG	0x0a
+
+/*
+ * DA9034 register addresses.  This chip has a fourth event/mask register
+ * (D) and two status registers; each group is again consecutive and is
+ * accessed with a single block transfer by the code below.
+ */
+#define DA9034_CHIP_ID		0x00
+#define DA9034_EVENT_A		0x01
+#define DA9034_EVENT_B		0x02
+#define DA9034_EVENT_C		0x03
+#define DA9034_EVENT_D		0x04
+#define DA9034_STATUS_A		0x05
+#define DA9034_STATUS_B		0x06
+#define DA9034_IRQ_MASK_A	0x07
+#define DA9034_IRQ_MASK_B	0x08
+#define DA9034_IRQ_MASK_C	0x09
+#define DA9034_IRQ_MASK_D	0x0a
+#define DA9034_SYS_CTRL_A	0x0b
+#define DA9034_SYS_CTRL_B	0x0c
+#define DA9034_FAULT_LOG	0x0d
+
+struct da903x_chip;
+
+/*
+ * Chip-specific operations.  One instance per supported chip lives in
+ * da903x_ops[]; the implementations in this file return 0 on success or a
+ * negative error code from the underlying I2C transfer.
+ */
+struct da903x_chip_ops {
+	/* one-time hardware setup, called from probe */
+	int	(*init_chip)(struct da903x_chip *);
+	/* enable delivery of the given event bits */
+	int	(*unmask_events)(struct da903x_chip *, unsigned int events);
+	/* disable delivery of the given event bits */
+	int	(*mask_events)(struct da903x_chip *, unsigned int events);
+	/* read the event registers into one word, EVENT_A in bits 0-7 */
+	int	(*read_events)(struct da903x_chip *, unsigned int *events);
+	/* read the status register(s) into one word, first register in bits 0-7 */
+	int	(*read_status)(struct da903x_chip *, unsigned int *status);
+};
+
+/* Per-device driver state, allocated in probe and stored as I2C clientdata. */
+struct da903x_chip {
+	struct i2c_client	*client;	/* I2C client used for register access */
+	struct device		*dev;		/* &client->dev, parent of the sub-devices */
+	struct da903x_chip_ops	*ops;		/* entry of da903x_ops[] picked by the id table */
+
+	int			type;		/* not referenced by the code in this file */
+	uint32_t		events_mask;	/* cached IRQ mask; a set bit means "masked" */
+
+	struct mutex		lock;		/* serialises read-modify-write register updates */
+	struct work_struct	irq_work;	/* bottom half queued by the interrupt handler */
+
+	/* called from irq_work with the pending, unmasked event bits */
+	struct blocking_notifier_head notifier_list;
+};
+
+/*
+ * Read the byte at register @reg over SMBus into *@val.
+ *
+ * Returns 0 on success.  On failure the negative value from the SMBus call
+ * is returned, an error is logged and *@val is left untouched.
+ */
+static inline int __da903x_read(struct i2c_client *client,
+				int reg, uint8_t *val)
+{
+	int byte = i2c_smbus_read_byte_data(client, reg);
+
+	if (byte >= 0) {
+		*val = (uint8_t)byte;
+		return 0;
+	}
+
+	dev_err(&client->dev, "failed reading at 0x%02x\n", reg);
+	return byte;
+}
+
+/*
+ * Block-read @len bytes starting at register @reg into @val.
+ *
+ * Any non-negative result of the I2C block read is reported as 0; a
+ * negative result is logged and passed back to the caller.
+ */
+static inline int __da903x_reads(struct i2c_client *client, int reg,
+				 int len, uint8_t *val)
+{
+	int rc = i2c_smbus_read_i2c_block_data(client, reg, len, val);
+
+	if (rc >= 0)
+		return 0;
+
+	dev_err(&client->dev, "failed reading from 0x%02x\n", reg);
+	return rc;
+}
+
+/*
+ * Write the byte @val to register @reg over SMBus.
+ *
+ * Returns 0 on success; a negative result of the SMBus call is logged and
+ * returned unchanged.
+ */
+static inline int __da903x_write(struct i2c_client *client,
+				 int reg, uint8_t val)
+{
+	int rc = i2c_smbus_write_byte_data(client, reg, val);
+
+	if (rc >= 0)
+		return 0;
+
+	dev_err(&client->dev, "failed writing 0x%02x to 0x%02x\n",
+			val, reg);
+	return rc;
+}
+
+/*
+ * Block-write @len bytes from @val starting at register @reg.
+ *
+ * Returns 0 on success; a negative result of the I2C block write is logged
+ * and returned unchanged.
+ */
+static inline int __da903x_writes(struct i2c_client *client, int reg,
+				  int len, uint8_t *val)
+{
+	int rc = i2c_smbus_write_i2c_block_data(client, reg, len, val);
+
+	if (rc >= 0)
+		return 0;
+
+	dev_err(&client->dev, "failed writings to 0x%02x\n", reg);
+	return rc;
+}
+
+/*
+ * Unmask @events on the chip and add @nb to the chip's notifier chain.
+ *
+ * @dev is the device whose drvdata is the struct da903x_chip.  The result
+ * of the unmask operation is not checked; the return value is that of
+ * blocking_notifier_chain_register().
+ */
+int da903x_register_notifier(struct device *dev, struct notifier_block *nb,
+				unsigned int events)
+{
+	struct da903x_chip *pmic = dev_get_drvdata(dev);
+	struct da903x_chip_ops *ops = pmic->ops;
+
+	ops->unmask_events(pmic, events);
+
+	return blocking_notifier_chain_register(&pmic->notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(da903x_register_notifier);
+
+/*
+ * Mask @events on the chip and remove @nb from the chip's notifier chain.
+ *
+ * The events are masked unconditionally, whether or not another notifier
+ * registered for the same bits.  The result of the mask operation is not
+ * checked; the return value is that of
+ * blocking_notifier_chain_unregister().
+ */
+int da903x_unregister_notifier(struct device *dev, struct notifier_block *nb,
+				unsigned int events)
+{
+	struct da903x_chip *pmic = dev_get_drvdata(dev);
+	struct da903x_chip_ops *ops = pmic->ops;
+
+	ops->mask_events(pmic, events);
+
+	return blocking_notifier_chain_unregister(&pmic->notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(da903x_unregister_notifier);
+
+/*
+ * Write @val to chip register @reg.  @dev must be the I2C client's device,
+ * as it is converted with to_i2c_client().  Returns 0 or a negative error.
+ */
+int da903x_write(struct device *dev, int reg, uint8_t val)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+
+	return __da903x_write(i2c, reg, val);
+}
+EXPORT_SYMBOL_GPL(da903x_write);
+
+/*
+ * Read chip register @reg into *@val.  @dev must be the I2C client's
+ * device, as it is converted with to_i2c_client().  Returns 0 or a negative
+ * error, in which case *@val is not written.
+ */
+int da903x_read(struct device *dev, int reg, uint8_t *val)
+{
+	struct i2c_client *i2c = to_i2c_client(dev);
+
+	return __da903x_read(i2c, reg, val);
+}
+EXPORT_SYMBOL_GPL(da903x_read);
+
+/*
+ * Set the bits of @bit_mask in register @reg.
+ *
+ * The read-modify-write runs under chip->lock.  The register is written
+ * only when none of the requested bits is set yet.  Returns 0 on success or
+ * the error of the failing I2C access.
+ */
+int da903x_set_bits(struct device *dev, int reg, uint8_t bit_mask)
+{
+	struct da903x_chip *chip = dev_get_drvdata(dev);
+	uint8_t cur;
+	int rc;
+
+	mutex_lock(&chip->lock);
+
+	rc = __da903x_read(chip->client, reg, &cur);
+	if (!rc && !(cur & bit_mask))
+		rc = __da903x_write(chip->client, reg, cur | bit_mask);
+
+	mutex_unlock(&chip->lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(da903x_set_bits);
+
+/*
+ * Clear the bits of @bit_mask in register @reg.
+ *
+ * The read-modify-write runs under chip->lock.  The register is written
+ * only when at least one of the requested bits is currently set.  Returns 0
+ * on success or the error of the failing I2C access.
+ */
+int da903x_clr_bits(struct device *dev, int reg, uint8_t bit_mask)
+{
+	struct da903x_chip *chip = dev_get_drvdata(dev);
+	uint8_t cur;
+	int rc;
+
+	mutex_lock(&chip->lock);
+
+	rc = __da903x_read(chip->client, reg, &cur);
+	if (!rc && (cur & bit_mask))
+		rc = __da903x_write(chip->client, reg, cur & ~bit_mask);
+
+	mutex_unlock(&chip->lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(da903x_clr_bits);
+
+/*
+ * Replace the field selected by @mask in register @reg with @val.
+ *
+ * The read-modify-write runs under chip->lock.  The register is written
+ * only when the masked field differs from @val.  @val is OR-ed in as given,
+ * so it is expected to lie within @mask.  Returns 0 on success or the error
+ * of the failing I2C access.
+ */
+int da903x_update(struct device *dev, int reg, uint8_t val, uint8_t mask)
+{
+	struct da903x_chip *chip = dev_get_drvdata(dev);
+	uint8_t cur;
+	int rc;
+
+	mutex_lock(&chip->lock);
+
+	rc = __da903x_read(chip->client, reg, &cur);
+	if (!rc && (cur & mask) != val)
+		rc = __da903x_write(chip->client, reg, (cur & ~mask) | val);
+
+	mutex_unlock(&chip->lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(da903x_update);
+
+/*
+ * Return 1 when every bit of @sbits is set in the chip's status word,
+ * 0 otherwise.
+ *
+ * The result of the status read is not checked; after a failed read the
+ * comparison is made against whatever the callback left in the
+ * zero-initialised status word.
+ */
+int da903x_query_status(struct device *dev, unsigned int sbits)
+{
+	struct da903x_chip *chip = dev_get_drvdata(dev);
+	unsigned int cur_status = 0;
+
+	chip->ops->read_status(chip, &cur_status);
+
+	/* no requested bit may be missing from the current status */
+	return (sbits & ~cur_status) == 0;
+}
+EXPORT_SYMBOL(da903x_query_status);
+
+/*
+ * DA9030 one-time setup: read the chip ID, write 0xE8 to SYS_CTRL_A and
+ * log the ID.  Returns 0 or the error of the first failing I2C access.
+ */
+static int __devinit da9030_init_chip(struct da903x_chip *chip)
+{
+	struct i2c_client *i2c = chip->client;
+	uint8_t id;
+	int rc;
+
+	rc = __da903x_read(i2c, DA9030_CHIP_ID, &id);
+	if (!rc)
+		rc = __da903x_write(i2c, DA9030_SYS_CTRL_A, 0xE8);
+	if (rc)
+		return rc;
+
+	dev_info(chip->dev, "DA9030 (CHIP ID: 0x%02x) detected\n", id);
+	return 0;
+}
+
+/*
+ * Enable delivery of @events on the DA9030: clear their bits in the cached
+ * mask and write the low 24 bits of the mask to IRQ_MASK_A..C, least
+ * significant byte first.
+ */
+static int da9030_unmask_events(struct da903x_chip *chip, unsigned int events)
+{
+	uint8_t regs[3];
+	int i;
+
+	chip->events_mask &= ~events;
+
+	for (i = 0; i < 3; i++)
+		regs[i] = (chip->events_mask >> (8 * i)) & 0xff;
+
+	return __da903x_writes(chip->client, DA9030_IRQ_MASK_A, 3, regs);
+}
+
+/*
+ * Disable delivery of @events on the DA9030.
+ *
+ * A set bit in events_mask means "masked", so masking must OR the bits in.
+ * The low 24 bits of the updated mask are then written to IRQ_MASK_A..C,
+ * least significant byte first.
+ *
+ * Returns 0 on success or a negative error code from the I2C transfer.
+ */
+static int da9030_mask_events(struct da903x_chip *chip, unsigned int events)
+{
+	uint8_t v[3];
+
+	chip->events_mask |= events;
+
+	v[0] = (chip->events_mask & 0xff);
+	v[1] = (chip->events_mask >> 8) & 0xff;
+	v[2] = (chip->events_mask >> 16) & 0xff;
+
+	return __da903x_writes(chip->client, DA9030_IRQ_MASK_A, 3, v);
+}
+
+/*
+ * Read EVENT_A..C of the DA9030 and pack them into *@events with EVENT_A in
+ * bits 0-7, EVENT_B in bits 8-15 and EVENT_C in bits 16-23.  On failure the
+ * error is returned and *@events is left untouched.
+ */
+static int da9030_read_events(struct da903x_chip *chip, unsigned int *events)
+{
+	uint8_t ev[3] = {0, 0, 0};
+	int rc = __da903x_reads(chip->client, DA9030_EVENT_A, 3, ev);
+
+	if (rc < 0)
+		return rc;
+
+	*events = ev[0] | (ev[1] << 8) | (ev[2] << 16);
+	return 0;
+}
+
+/*
+ * Read the single DA9030 status register into *@status.
+ *
+ * The byte is read into a local uint8_t and then widened.  Writing through
+ * a (uint8_t *) cast of @status would store only one byte of the word, and
+ * on big-endian machines it would be the most significant one.
+ *
+ * Returns 0 on success; on failure the error is returned and *@status is
+ * left untouched.
+ */
+static int da9030_read_status(struct da903x_chip *chip, unsigned int *status)
+{
+	uint8_t val;
+	int ret;
+
+	ret = __da903x_read(chip->client, DA9030_STATUS, &val);
+	if (ret)
+		return ret;
+
+	*status = val;
+	return 0;
+}
+
+/*
+ * DA9034 one-time setup: read the chip ID, then program a fixed sequence of
+ * registers.
+ *
+ * Only the chip ID read and the first SYS_CTRL_A write are checked; the
+ * results of the remaining writes are ignored.  Returns 0 or the error of
+ * one of the two checked accesses.
+ */
+static int da9034_init_chip(struct da903x_chip *chip)
+{
+	uint8_t chip_id;
+	int err;
+
+	err = __da903x_read(chip->client, DA9034_CHIP_ID, &chip_id);
+	if (err)
+		return err;
+
+	err = __da903x_write(chip->client, DA9034_SYS_CTRL_A, 0xE8);
+	if (err)
+		return err;
+
+	/* avoid SRAM power off during sleep */
+	__da903x_write(chip->client, 0x10, 0x07);
+	__da903x_write(chip->client, 0x11, 0xff);
+	__da903x_write(chip->client, 0x12, 0xff);
+
+	/* Enable the ONKEY power down functionality */
+	/* note: SYS_CTRL_A, set to 0xE8 above, is rewritten with 0x60 here */
+	__da903x_write(chip->client, DA9034_SYS_CTRL_B, 0x20);
+	__da903x_write(chip->client, DA9034_SYS_CTRL_A, 0x60);
+
+	/* workaround to make LEDs work */
+	__da903x_write(chip->client, 0x90, 0x01);
+	__da903x_write(chip->client, 0xB0, 0x08);
+
+	/* make ADTV1 and SDTV1 effective */
+	__da903x_write(chip->client, 0x20, 0x00);
+
+	dev_info(chip->dev, "DA9034 (CHIP ID: 0x%02x) detected\n", chip_id);
+	return 0;
+}
+
+/*
+ * Enable delivery of @events on the DA9034: clear their bits in the cached
+ * mask and write all 32 bits of the mask to IRQ_MASK_A..D, least
+ * significant byte first.
+ */
+static int da9034_unmask_events(struct da903x_chip *chip, unsigned int events)
+{
+	uint8_t regs[4];
+	int i;
+
+	chip->events_mask &= ~events;
+
+	for (i = 0; i < 4; i++)
+		regs[i] = (chip->events_mask >> (8 * i)) & 0xff;
+
+	return __da903x_writes(chip->client, DA9034_IRQ_MASK_A, 4, regs);
+}
+
+/*
+ * Disable delivery of @events on the DA9034: set their bits in the cached
+ * mask and write all 32 bits of the mask to IRQ_MASK_A..D, least
+ * significant byte first.
+ */
+static int da9034_mask_events(struct da903x_chip *chip, unsigned int events)
+{
+	uint8_t regs[4];
+	int i;
+
+	chip->events_mask |= events;
+
+	for (i = 0; i < 4; i++)
+		regs[i] = (chip->events_mask >> (8 * i)) & 0xff;
+
+	return __da903x_writes(chip->client, DA9034_IRQ_MASK_A, 4, regs);
+}
+
+/*
+ * Read EVENT_A..D of the DA9034 and pack them into *@events with EVENT_A in
+ * bits 0-7 up to EVENT_D in bits 24-31.  On failure the error is returned
+ * and *@events is left untouched.
+ */
+static int da9034_read_events(struct da903x_chip *chip, unsigned int *events)
+{
+	uint8_t ev[4] = {0, 0, 0, 0};
+	int rc = __da903x_reads(chip->client, DA9034_EVENT_A, 4, ev);
+
+	if (rc < 0)
+		return rc;
+
+	*events = ev[0] | (ev[1] << 8) | (ev[2] << 16) | (ev[3] << 24);
+	return 0;
+}
+
+/*
+ * Read STATUS_A and STATUS_B of the DA9034 and pack them into *@status with
+ * STATUS_A in bits 0-7 and STATUS_B in bits 8-15.  On failure the error is
+ * returned and *@status is left untouched.
+ */
+static int da9034_read_status(struct da903x_chip *chip, unsigned int *status)
+{
+	uint8_t st[2] = {0, 0};
+	int rc = __da903x_reads(chip->client, DA9034_STATUS_A, 2, st);
+
+	if (rc)
+		return rc;
+
+	*status = st[0] | (st[1] << 8);
+	return 0;
+}
+
+/*
+ * Bottom half of the chip interrupt.
+ *
+ * Repeatedly reads the event registers and hands the unmasked events to the
+ * notifier chain.  The loop ends when a read fails or no unmasked event is
+ * pending.  The interrupt line, disabled by the hard handler, is re-enabled
+ * on the way out.
+ */
+static void da903x_irq_work(struct work_struct *work)
+{
+	struct da903x_chip *chip =
+		container_of(work, struct da903x_chip, irq_work);
+	unsigned int pending = 0;
+
+	while (chip->ops->read_events(chip, &pending) == 0) {
+		pending &= ~chip->events_mask;
+		if (!pending)
+			break;
+
+		blocking_notifier_call_chain(&chip->notifier_list,
+					     pending, NULL);
+	}
+
+	enable_irq(chip->client->irq);
+}
+
+/*
+ * Hard interrupt handler.
+ *
+ * Servicing the chip needs I2C transfers, which cannot be done from
+ * interrupt context.  The line is therefore disabled without waiting and
+ * the real work is deferred to da903x_irq_work(), which re-enables it.
+ *
+ * The return type is irqreturn_t so that the function matches the
+ * irq_handler_t prototype expected by request_irq().
+ */
+static irqreturn_t da903x_irq_handler(int irq, void *data)
+{
+	struct da903x_chip *chip = data;
+
+	disable_irq_nosync(irq);
+	(void)schedule_work(&chip->irq_work);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Per-chip operation tables.  The array index must match the driver_data
+ * value of the corresponding da903x_id_table entry, because probe selects
+ * the table with da903x_ops[id->driver_data].
+ */
+static struct da903x_chip_ops da903x_ops[] = {
+	/* DA9030 */
+	[0] = {
+		.init_chip	= da9030_init_chip,
+		.unmask_events	= da9030_unmask_events,
+		.mask_events	= da9030_mask_events,
+		.read_events	= da9030_read_events,
+		.read_status	= da9030_read_status,
+	},
+	/* DA9034 */
+	[1] = {
+		.init_chip	= da9034_init_chip,
+		.unmask_events	= da9034_unmask_events,
+		.mask_events	= da9034_mask_events,
+		.read_events	= da9034_read_events,
+		.read_status	= da9034_read_status,
+	}
+};
+
+/*
+ * I2C device names handled by this driver.  The second field (driver_data)
+ * is used by probe as the index into da903x_ops[].
+ */
+static const struct i2c_device_id da903x_id_table[] = {
+	{ "da9030", 0 },
+	{ "da9034", 1 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, da903x_id_table);
+
+/*
+ * device_for_each_child() callback: unregister one platform sub-device.
+ * Always returns 0 so that the iteration visits every child.
+ *
+ * Not marked __devexit: it is reached from the __devinit error path of
+ * da903x_add_subdevs() via da903x_remove_subdevs(), so it must not live in
+ * a section that can be discarded independently of the init code.
+ */
+static int __remove_subdev(struct device *dev, void *unused)
+{
+	platform_device_unregister(to_platform_device(dev));
+	return 0;
+}
+
+/*
+ * Unregister every child device of the chip.  Used on driver removal and
+ * when da903x_add_subdevs() fails half way, hence no __devexit annotation.
+ * Returns the result of device_for_each_child().
+ */
+static int da903x_remove_subdevs(struct da903x_chip *chip)
+{
+	return device_for_each_child(chip->dev, NULL, __remove_subdev);
+}
+
+/*
+ * Create one platform device per entry of pdata->subdevs, with the chip's
+ * device as parent and the entry's platform_data attached.
+ *
+ * A NULL @pdata is treated as "no sub-devices".  Returns 0 on success.  On
+ * failure the sub-devices registered so far are removed again and a
+ * negative error code is returned: -ENOMEM when a device cannot be
+ * allocated, otherwise the result of platform_device_add().
+ */
+static int __devinit da903x_add_subdevs(struct da903x_chip *chip,
+					struct da903x_platform_data *pdata)
+{
+	struct da903x_subdev_info *subdev;
+	struct platform_device *pdev;
+	int i, ret = 0;
+
+	if (pdata == NULL)
+		return 0;
+
+	for (i = 0; i < pdata->num_subdevs; i++) {
+		subdev = &pdata->subdevs[i];
+
+		pdev = platform_device_alloc(subdev->name, subdev->id);
+		if (pdev == NULL) {
+			ret = -ENOMEM;
+			goto failed;
+		}
+
+		pdev->dev.parent = chip->dev;
+		pdev->dev.platform_data = subdev->platform_data;
+
+		ret = platform_device_add(pdev);
+		if (ret) {
+			/*
+			 * The device was never registered, so dropping the
+			 * allocation reference is what frees it.  The
+			 * platform data belongs to the caller; detach it
+			 * first so the release path cannot free it.
+			 * NOTE(review): confirm the release callback frees
+			 * dev.platform_data on this kernel version.
+			 */
+			pdev->dev.platform_data = NULL;
+			platform_device_put(pdev);
+			goto failed;
+		}
+	}
+	return 0;
+
+failed:
+	da903x_remove_subdevs(chip);
+	return ret;
+}
+
+/*
+ * Bind to a DA9030/DA9034 I2C client: allocate the per-chip state, run the
+ * chip-specific initialisation, mask and clear all events, install the
+ * interrupt handler and finally create the platform sub-devices listed in
+ * the platform data.
+ *
+ * Returns 0 on success or a negative error code.  On failure the IRQ (if
+ * already requested) and the chip structure are released again.
+ */
+static int __devinit da903x_probe(struct i2c_client *client,
+				  const struct i2c_device_id *id)
+{
+	/*
+	 * NOTE(review): pdata is handed to da903x_add_subdevs() without a
+	 * NULL check here - confirm boards always provide platform data.
+	 */
+	struct da903x_platform_data *pdata = client->dev.platform_data;
+	struct da903x_chip *chip;
+	unsigned int tmp;
+	int ret;
+
+	chip = kzalloc(sizeof(struct da903x_chip), GFP_KERNEL);
+	if (chip == NULL)
+		return -ENOMEM;
+
+	chip->client = client;
+	chip->dev = &client->dev;
+	/* driver_data is the da903x_ops[] index stored in da903x_id_table */
+	chip->ops = &da903x_ops[id->driver_data];
+
+	mutex_init(&chip->lock);
+	INIT_WORK(&chip->irq_work, da903x_irq_work);
+	BLOCKING_INIT_NOTIFIER_HEAD(&chip->notifier_list);
+
+	/* exported helpers find the chip again through the device's drvdata */
+	i2c_set_clientdata(client, chip);
+
+	ret = chip->ops->init_chip(chip);
+	if (ret)
+		goto out_free_chip;
+
+	/* mask and clear all IRQs */
+	/* the results of both calls are ignored; tmp is only a dummy target */
+	chip->events_mask = 0xffffffff;
+	chip->ops->mask_events(chip, chip->events_mask);
+	chip->ops->read_events(chip, &tmp);
+
+	ret = request_irq(client->irq, da903x_irq_handler,
+			IRQF_DISABLED | IRQF_TRIGGER_FALLING,
+			"da903x", chip);
+	if (ret) {
+		dev_err(&client->dev, "failed to request irq %d\n",
+				client->irq);
+		goto out_free_chip;
+	}
+
+	/* children are added last, once the chip and its IRQ are ready */
+	ret = da903x_add_subdevs(chip, pdata);
+	if (ret)
+		goto out_free_irq;
+
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+out_free_irq:
+	free_irq(client->irq, chip);
+out_free_chip:
+	i2c_set_clientdata(client, NULL);
+	kfree(chip);
+	return ret;
+}
+
+/*
+ * Unbind from the I2C client and undo everything probe set up.
+ *
+ * The sub-devices go first because they are the users of the chip.  The
+ * IRQ requested in probe is then released so that the handler can no longer
+ * queue irq_work, and any queued or running instance of the work is
+ * cancelled before the chip structure it dereferences is freed.
+ *
+ * Always returns 0.
+ */
+static int __devexit da903x_remove(struct i2c_client *client)
+{
+	struct da903x_chip *chip = i2c_get_clientdata(client);
+
+	da903x_remove_subdevs(chip);
+
+	free_irq(client->irq, chip);
+	cancel_work_sync(&chip->irq_work);
+
+	i2c_set_clientdata(client, NULL);
+	kfree(chip);
+	return 0;
+}
+
+/*
+ * I2C driver description: matches the names in da903x_id_table and routes
+ * bind/unbind to da903x_probe()/da903x_remove().
+ */
+static struct i2c_driver da903x_driver = {
+	.driver	= {
+		.name	= "da903x",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= da903x_probe,
+	.remove		= __devexit_p(da903x_remove),
+	.id_table	= da903x_id_table,
+};
+
+/* Module entry point: register the I2C driver and return the result. */
+static int __init da903x_init(void)
+{
+	return i2c_add_driver(&da903x_driver);
+}
+module_init(da903x_init);
+
+/* Module exit point: unregister the I2C driver. */
+static void __exit da903x_exit(void)
+{
+	i2c_del_driver(&da903x_driver);
+}
+module_exit(da903x_exit);
+
+/* Module metadata; the author list is one comma-separated string. */
+MODULE_DESCRIPTION("PMIC Driver for Dialog Semiconductor DA9034");
+MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>, "
+	      "Mike Rapoport <mike@compulab.co.il>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/da903x.h b/include/linux/mfd/da903x.h
new file mode 100644
index 000000000000..cad314c12439
--- /dev/null
+++ b/include/linux/mfd/da903x.h
@@ -0,0 +1,201 @@
+#ifndef __LINUX_PMIC_DA903X_H
+#define __LINUX_PMIC_DA903X_H
+
+/*
+ * Unified sub device IDs for DA9030/DA9034
+ *
+ * Both chips share one numbering: the DA9030 IDs come first and the DA9034
+ * IDs continue right after DA9030_ID_LDO_INT.
+ * NOTE(review): presumably these are the values for da903x_subdev_info.id -
+ * confirm against the board files and sub-device drivers.
+ */
+enum {
+	DA9030_ID_LED_1,
+	DA9030_ID_LED_2,
+	DA9030_ID_LED_3,
+	DA9030_ID_LED_4,
+	DA9030_ID_LED_PC,
+	DA9030_ID_VIBRA,
+	DA9030_ID_WLED,
+	DA9030_ID_BUCK1,
+	DA9030_ID_BUCK2,
+	DA9030_ID_LDO1,
+	DA9030_ID_LDO2,
+	DA9030_ID_LDO3,
+	DA9030_ID_LDO4,
+	DA9030_ID_LDO5,
+	DA9030_ID_LDO6,
+	DA9030_ID_LDO7,
+	DA9030_ID_LDO8,
+	DA9030_ID_LDO9,
+	DA9030_ID_LDO10,
+	DA9030_ID_LDO11,
+	DA9030_ID_LDO12,
+	DA9030_ID_LDO13,
+	DA9030_ID_LDO14,
+	DA9030_ID_LDO15,
+	DA9030_ID_LDO16,
+	DA9030_ID_LDO17,
+	DA9030_ID_LDO18,
+	DA9030_ID_LDO19,
+	DA9030_ID_LDO_INT,	/* LDO Internal */
+
+	DA9034_ID_LED_1,
+	DA9034_ID_LED_2,
+	DA9034_ID_VIBRA,
+	DA9034_ID_WLED,
+	DA9034_ID_TOUCH,
+
+	DA9034_ID_BUCK1,
+	DA9034_ID_BUCK2,
+	DA9034_ID_LDO1,
+	DA9034_ID_LDO2,
+	DA9034_ID_LDO3,
+	DA9034_ID_LDO4,
+	DA9034_ID_LDO5,
+	DA9034_ID_LDO6,
+	DA9034_ID_LDO7,
+	DA9034_ID_LDO8,
+	DA9034_ID_LDO9,
+	DA9034_ID_LDO10,
+	DA9034_ID_LDO11,
+	DA9034_ID_LDO12,
+	DA9034_ID_LDO13,
+	DA9034_ID_LDO14,
+	DA9034_ID_LDO15,
+};
+
+/*
+ * DA9030/DA9034 LEDs sub-devices uses generic "struct led_info"
+ * as the platform_data
+ */
+
+/*
+ * DA9030 flags for "struct led_info".
+ * The blink rate selector sits in bit 5 and the duty cycle selector in
+ * bits 3-4.
+ */
+#define DA9030_LED_RATE_ON	(0 << 5)
+#define DA9030_LED_RATE_052S	(1 << 5)
+#define DA9030_LED_DUTY_1_16	(0 << 3)
+#define DA9030_LED_DUTY_1_8	(1 << 3)
+#define DA9030_LED_DUTY_1_4	(2 << 3)
+#define DA9030_LED_DUTY_1_2	(3 << 3)
+
+/* DA9030 vibrator flags: mode in bit 1, frequency in bits 2-3, duty in bits 4-5 */
+#define DA9030_VIBRA_MODE_1P3V	(0 << 1)
+#define DA9030_VIBRA_MODE_2P7V	(1 << 1)
+#define DA9030_VIBRA_FREQ_1HZ	(0 << 2)
+#define DA9030_VIBRA_FREQ_2HZ	(1 << 2)
+#define DA9030_VIBRA_FREQ_4HZ	(2 << 2)
+#define DA9030_VIBRA_FREQ_8HZ	(3 << 2)
+#define DA9030_VIBRA_DUTY_ON	(0 << 4)
+#define DA9030_VIBRA_DUTY_75P	(1 << 4)
+#define DA9030_VIBRA_DUTY_50P	(2 << 4)
+#define DA9030_VIBRA_DUTY_25P	(3 << 4)
+
+/* DA9034 flags for "struct led_info" */
+#define DA9034_LED_RAMP		(1 << 7)
+
+/*
+ * DA9034 touch screen platform data
+ * NOTE(review): the consumer of this structure is not part of this file;
+ * the field descriptions below go by the field names only.
+ */
+struct da9034_touch_pdata {
+	int	interval_ms;	/* sampling interval while pen down */
+	int	x_inverted;	/* presumably non-zero to mirror the X axis */
+	int	y_inverted;	/* presumably non-zero to mirror the Y axis */
+};
+
+/*
+ * Description of one sub-device.  The base driver creates a platform device
+ * for every entry, as a child of the PMIC's I2C device.
+ */
+struct da903x_subdev_info {
+	int		id;		/* id passed to platform_device_alloc() */
+	const char	*name;		/* platform device name */
+	void		*platform_data;	/* becomes the child's dev.platform_data */
+};
+
+/* Platform data of the PMIC's I2C device: the list of sub-devices to create. */
+struct da903x_platform_data {
+	int num_subdevs;			/* number of entries in subdevs[] */
+	struct da903x_subdev_info *subdevs;	/* array of sub-device descriptions */
+};
+
+/*
+ * bit definitions for DA9030 events
+ *
+ * These are positions in the event word used by the notifier interface
+ * below.  The base driver builds that word from the event registers with
+ * EVENT_A in bits 0-7, EVENT_B in bits 8-15 and EVENT_C in bits 16-23.
+ */
+#define DA9030_EVENT_ONKEY		(1 << 0)
+#define	DA9030_EVENT_PWREN		(1 << 1)
+#define	DA9030_EVENT_EXTON		(1 << 2)
+#define	DA9030_EVENT_CHDET		(1 << 3)
+#define	DA9030_EVENT_TBAT		(1 << 4)
+#define	DA9030_EVENT_VBATMON		(1 << 5)
+#define	DA9030_EVENT_VBATMON_TXON	(1 << 6)
+#define	DA9030_EVENT_CHIOVER		(1 << 7)
+#define	DA9030_EVENT_TCTO		(1 << 8)
+#define	DA9030_EVENT_CCTO		(1 << 9)
+#define	DA9030_EVENT_ADC_READY		(1 << 10)
+#define	DA9030_EVENT_VBUS_4P4		(1 << 11)
+#define	DA9030_EVENT_VBUS_4P0		(1 << 12)
+#define	DA9030_EVENT_SESS_VALID		(1 << 13)
+#define	DA9030_EVENT_SRP_DETECT		(1 << 14)
+#define	DA9030_EVENT_WATCHDOG		(1 << 15)
+#define	DA9030_EVENT_LDO15		(1 << 16)
+#define	DA9030_EVENT_LDO16		(1 << 17)
+#define	DA9030_EVENT_LDO17		(1 << 18)
+#define	DA9030_EVENT_LDO18		(1 << 19)
+#define	DA9030_EVENT_LDO19		(1 << 20)
+#define	DA9030_EVENT_BUCK2		(1 << 21)
+
+/*
+ * bit definitions for DA9034 events
+ *
+ * Same layout, extended by EVENT_D in bits 24-31.  Bits 1 and 24 have no
+ * definition here.
+ */
+#define DA9034_EVENT_ONKEY		(1 << 0)
+#define DA9034_EVENT_EXTON		(1 << 2)
+#define DA9034_EVENT_CHDET		(1 << 3)
+#define DA9034_EVENT_TBAT		(1 << 4)
+#define DA9034_EVENT_VBATMON		(1 << 5)
+#define DA9034_EVENT_REV_IOVER		(1 << 6)
+#define DA9034_EVENT_CH_IOVER		(1 << 7)
+#define DA9034_EVENT_CH_TCTO		(1 << 8)
+#define DA9034_EVENT_CH_CCTO		(1 << 9)
+#define DA9034_EVENT_USB_DEV		(1 << 10)
+#define DA9034_EVENT_OTGCP_IOVER	(1 << 11)
+#define DA9034_EVENT_VBUS_4P55		(1 << 12)
+#define DA9034_EVENT_VBUS_3P8		(1 << 13)
+#define DA9034_EVENT_SESS_1P8		(1 << 14)
+#define DA9034_EVENT_SRP_READY		(1 << 15)
+#define DA9034_EVENT_ADC_MAN		(1 << 16)
+#define DA9034_EVENT_ADC_AUTO4		(1 << 17)
+#define DA9034_EVENT_ADC_AUTO5		(1 << 18)
+#define DA9034_EVENT_ADC_AUTO6		(1 << 19)
+#define DA9034_EVENT_PEN_DOWN		(1 << 20)
+#define DA9034_EVENT_TSI_READY		(1 << 21)
+#define DA9034_EVENT_UART_TX		(1 << 22)
+#define DA9034_EVENT_UART_RX		(1 << 23)
+#define DA9034_EVENT_HEADSET		(1 << 25)
+#define DA9034_EVENT_HOOKSWITCH		(1 << 26)
+#define DA9034_EVENT_WATCHDOG		(1 << 27)
+
+/*
+ * Event notification interface.
+ *
+ * @dev is the PMIC device (the one whose drvdata is the chip state), @events
+ * a mask of the *_EVENT_* bits above.  Registering unmasks @events and adds
+ * @nb to the chip's notifier chain; unregistering masks @events and removes
+ * @nb.  The chain is called with the pending, unmasked event bits as the
+ * notifier action and a NULL data pointer.  Both functions return the
+ * result of the notifier chain operation.
+ */
+extern int da903x_register_notifier(struct device *dev,
+		struct notifier_block *nb, unsigned int events);
+extern int da903x_unregister_notifier(struct device *dev,
+		struct notifier_block *nb, unsigned int events);
+
+/*
+ * Status Query Interface
+ *
+ * Bit positions in the status word tested by da903x_query_status().  The
+ * DA9030 has a single status register (bits 0-7).
+ */
+#define DA9030_STATUS_ONKEY		(1 << 0)
+#define DA9030_STATUS_PWREN1		(1 << 1)
+#define DA9030_STATUS_EXTON		(1 << 2)
+#define DA9030_STATUS_CHDET		(1 << 3)
+#define DA9030_STATUS_TBAT		(1 << 4)
+#define DA9030_STATUS_VBATMON		(1 << 5)
+#define DA9030_STATUS_VBATMON_TXON	(1 << 6)
+#define DA9030_STATUS_MCLKDET		(1 << 7)
+
+/* DA9034: STATUS_A is in bits 0-7, STATUS_B in bits 8-15 */
+#define DA9034_STATUS_ONKEY		(1 << 0)
+#define DA9034_STATUS_EXTON		(1 << 2)
+#define DA9034_STATUS_CHDET		(1 << 3)
+#define DA9034_STATUS_TBAT		(1 << 4)
+#define DA9034_STATUS_VBATMON		(1 << 5)
+#define DA9034_STATUS_PEN_DOWN		(1 << 6)
+#define DA9034_STATUS_MCLKDET		(1 << 7)
+#define DA9034_STATUS_USB_DEV		(1 << 8)
+#define DA9034_STATUS_HEADSET		(1 << 9)
+#define DA9034_STATUS_HOOKSWITCH	(1 << 10)
+#define DA9034_STATUS_REMCON		(1 << 11)
+#define DA9034_STATUS_VBUS_VALID_4P55	(1 << 12)
+#define DA9034_STATUS_VBUS_VALID_3P8	(1 << 13)
+#define DA9034_STATUS_SESS_VALID_1P8	(1 << 14)
+#define DA9034_STATUS_SRP_READY		(1 << 15)
+
+/*
+ * Returns 1 when all bits given in @status are currently set in the chip's
+ * status word, 0 otherwise.
+ */
+extern int da903x_query_status(struct device *dev, unsigned int status);
+
+
+/* NOTE: the functions below are not intended for use outside
+ * of the DA9034 sub-device drivers
+ *
+ * All of them return 0 on success or a negative error code.
+ * da903x_update() replaces the register bits selected by @mask with @val;
+ * da903x_set_bits()/da903x_clr_bits() set/clear the bits of @bit_mask.
+ */
+extern int da903x_write(struct device *dev, int reg, uint8_t val);
+extern int da903x_read(struct device *dev, int reg, uint8_t *val);
+extern int da903x_update(struct device *dev, int reg, uint8_t val, uint8_t mask);
+extern int da903x_set_bits(struct device *dev, int reg, uint8_t bit_mask);
+extern int da903x_clr_bits(struct device *dev, int reg, uint8_t bit_mask);
+#endif /* __LINUX_PMIC_DA903X_H */
-- 
cgit v1.2.3


From 7acb706ca97fce84bda4a902a33de2f3dae10260 Mon Sep 17 00:00:00 2001
From: Ian Molton <spyro@f2s.com>
Date: Thu, 9 Oct 2008 20:06:09 +0200
Subject: mfd: update TMIO drivers to use the clock API

This patch updates the remaining two TMIO drivers to use the clock API
rather than callback hooks into platform code.

Signed-off-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/t7l66xb.c        | 40 +++++++++++++++++++++++++++++--------
 drivers/mfd/tc6387xb.c       | 47 +++++++++++++++++++++++++++-----------------
 include/linux/mfd/t7l66xb.h  |  2 --
 include/linux/mfd/tc6387xb.h |  3 ---
 4 files changed, 61 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c
index 49a0fffc02af..9f7024c0f8ec 100644
--- a/drivers/mfd/t7l66xb.c
+++ b/drivers/mfd/t7l66xb.c
@@ -24,8 +24,10 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/err.h>
 #include <linux/io.h>
 #include <linux/irq.h>
+#include <linux/clk.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/tmio.h>
@@ -56,6 +58,8 @@ struct t7l66xb {
 	spinlock_t		lock;
 
 	struct resource		rscr;
+	struct clk		*clk48m;
+	struct clk		*clk32k;
 	int			irq;
 	int			irq_base;
 };
@@ -65,13 +69,11 @@ struct t7l66xb {
 static int t7l66xb_mmc_enable(struct platform_device *mmc)
 {
 	struct platform_device *dev = to_platform_device(mmc->dev.parent);
-	struct t7l66xb_platform_data   *pdata = dev->dev.platform_data;
 	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
 	unsigned long flags;
 	u8 dev_ctl;
 
-	if (pdata->enable_clk32k)
-		pdata->enable_clk32k(dev);
+	clk_enable(t7l66xb->clk32k);
 
 	spin_lock_irqsave(&t7l66xb->lock, flags);
 
@@ -87,7 +89,6 @@ static int t7l66xb_mmc_enable(struct platform_device *mmc)
 static int t7l66xb_mmc_disable(struct platform_device *mmc)
 {
 	struct platform_device *dev = to_platform_device(mmc->dev.parent);
-	struct t7l66xb_platform_data   *pdata = dev->dev.platform_data;
 	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
 	unsigned long flags;
 	u8 dev_ctl;
@@ -100,8 +101,7 @@ static int t7l66xb_mmc_disable(struct platform_device *mmc)
 
 	spin_unlock_irqrestore(&t7l66xb->lock, flags);
 
-	if (pdata->disable_clk32k)
-		pdata->disable_clk32k(dev);
+	clk_disable(t7l66xb->clk32k);
 
 	return 0;
 }
@@ -258,18 +258,22 @@ static void t7l66xb_detach_irq(struct platform_device *dev)
 #ifdef CONFIG_PM
 static int t7l66xb_suspend(struct platform_device *dev, pm_message_t state)
 {
+	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
 	struct t7l66xb_platform_data *pdata = dev->dev.platform_data;
 
 	if (pdata && pdata->suspend)
 		pdata->suspend(dev);
+	clk_disable(t7l66xb->clk48m);
 
 	return 0;
 }
 
 static int t7l66xb_resume(struct platform_device *dev)
 {
+	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
 	struct t7l66xb_platform_data *pdata = dev->dev.platform_data;
 
+	clk_enable(t7l66xb->clk48m);
 	if (pdata && pdata->resume)
 		pdata->resume(dev);
 
@@ -309,6 +313,19 @@ static int t7l66xb_probe(struct platform_device *dev)
 
 	t7l66xb->irq_base = pdata->irq_base;
 
+	t7l66xb->clk32k = clk_get(&dev->dev, "CLK_CK32K");
+	if (IS_ERR(t7l66xb->clk32k)) {
+		ret = PTR_ERR(t7l66xb->clk32k);
+		goto err_clk32k_get;
+	}
+
+	t7l66xb->clk48m = clk_get(&dev->dev, "CLK_CK48M");
+	if (IS_ERR(t7l66xb->clk48m)) {
+		ret = PTR_ERR(t7l66xb->clk48m);
+		clk_put(t7l66xb->clk32k);
+		goto err_clk48m_get;
+	}
+
 	rscr = &t7l66xb->rscr;
 	rscr->name = "t7l66xb-core";
 	rscr->start = iomem->start;
@@ -325,6 +342,8 @@ static int t7l66xb_probe(struct platform_device *dev)
 		goto err_ioremap;
 	}
 
+	clk_enable(t7l66xb->clk48m);
+
 	if (pdata && pdata->enable)
 		pdata->enable(dev);
 
@@ -359,9 +378,13 @@ static int t7l66xb_probe(struct platform_device *dev)
 	iounmap(t7l66xb->scr);
 err_ioremap:
 	release_resource(&t7l66xb->rscr);
-err_noirq:
 err_request_scr:
 	kfree(t7l66xb);
+	clk_put(t7l66xb->clk48m);
+err_clk48m_get:
+	clk_put(t7l66xb->clk32k);
+err_clk32k_get:
+err_noirq:
 	return ret;
 }
 
@@ -372,7 +395,8 @@ static int t7l66xb_remove(struct platform_device *dev)
 	int ret;
 
 	ret = pdata->disable(dev);
-
+	clk_disable(t7l66xb->clk48m);
+	clk_put(t7l66xb->clk48m);
 	t7l66xb_detach_irq(dev);
 	iounmap(t7l66xb->scr);
 	release_resource(&t7l66xb->rscr);
diff --git a/drivers/mfd/tc6387xb.c b/drivers/mfd/tc6387xb.c
index a22b21ac6cf8..43222c12fec1 100644
--- a/drivers/mfd/tc6387xb.c
+++ b/drivers/mfd/tc6387xb.c
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/tmio.h>
@@ -24,18 +25,22 @@ enum {
 #ifdef CONFIG_PM
 static int tc6387xb_suspend(struct platform_device *dev, pm_message_t state)
 {
-	struct tc6387xb_platform_data *pdata = platform_get_drvdata(dev);
+	struct clk *clk32k = platform_get_drvdata(dev);
+	struct tc6387xb_platform_data *pdata = dev->dev.platform_data;
 
 	if (pdata && pdata->suspend)
 		pdata->suspend(dev);
+	clk_disable(clk32k);
 
 	return 0;
 }
 
 static int tc6387xb_resume(struct platform_device *dev)
 {
-	struct tc6387xb_platform_data *pdata = platform_get_drvdata(dev);
+	struct clk *clk32k = platform_get_drvdata(dev);
+	struct tc6387xb_platform_data *pdata = dev->dev.platform_data;
 
+	clk_enable(clk32k);
 	if (pdata && pdata->resume)
 		pdata->resume(dev);
 
@@ -51,10 +56,9 @@ static int tc6387xb_resume(struct platform_device *dev)
 static int tc6387xb_mmc_enable(struct platform_device *mmc)
 {
 	struct platform_device *dev      = to_platform_device(mmc->dev.parent);
-	struct tc6387xb_platform_data *tc6387xb = dev->dev.platform_data;
+	struct clk *clk32k = platform_get_drvdata(dev);
 
-	if (tc6387xb->enable_clk32k)
-		tc6387xb->enable_clk32k(dev);
+	clk_enable(clk32k);
 
 	return 0;
 }
@@ -62,10 +66,9 @@ static int tc6387xb_mmc_enable(struct platform_device *mmc)
 static int tc6387xb_mmc_disable(struct platform_device *mmc)
 {
 	struct platform_device *dev      = to_platform_device(mmc->dev.parent);
-	struct tc6387xb_platform_data *tc6387xb = dev->dev.platform_data;
+	struct clk *clk32k = platform_get_drvdata(dev);
 
-	if (tc6387xb->disable_clk32k)
-		tc6387xb->disable_clk32k(dev);
+	clk_disable(clk32k);
 
 	return 0;
 }
@@ -102,14 +105,14 @@ static struct mfd_cell tc6387xb_cells[] = {
 
 static int tc6387xb_probe(struct platform_device *dev)
 {
-	struct tc6387xb_platform_data *data = platform_get_drvdata(dev);
+	struct tc6387xb_platform_data *pdata = dev->dev.platform_data;
 	struct resource *iomem;
+	struct clk *clk32k;
 	int irq, ret;
 
 	iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
 	if (!iomem) {
-		ret = -EINVAL;
-		goto err_resource;
+		return -EINVAL;
 	}
 
 	ret  = platform_get_irq(dev, 0);
@@ -118,8 +121,15 @@ static int tc6387xb_probe(struct platform_device *dev)
 	else
 		goto err_resource;
 
-	if (data && data->enable)
-		data->enable(dev);
+	clk32k = clk_get(&dev->dev, "CLK_CK32K");
+	if (IS_ERR(clk32k)) {
+		ret = PTR_ERR(clk32k);
+		goto err_resource;
+	}
+	platform_set_drvdata(dev, clk32k);
+
+	if (pdata && pdata->enable)
+		pdata->enable(dev);
 
 	printk(KERN_INFO "Toshiba tc6387xb initialised\n");
 
@@ -134,18 +144,19 @@ static int tc6387xb_probe(struct platform_device *dev)
 	if (!ret)
 		return 0;
 
+	clk_put(clk32k);
 err_resource:
 	return ret;
 }
 
 static int tc6387xb_remove(struct platform_device *dev)
 {
-	struct tc6387xb_platform_data *data = platform_get_drvdata(dev);
-
-	if (data && data->disable)
-		data->disable(dev);
+	struct clk *clk32k = platform_get_drvdata(dev);
 
-	/* FIXME - free the resources! */
+	mfd_remove_devices(&dev->dev);
+	clk_disable(clk32k);
+	clk_put(clk32k);
+	platform_set_drvdata(dev, NULL);
 
 	return 0;
 }
diff --git a/include/linux/mfd/t7l66xb.h b/include/linux/mfd/t7l66xb.h
index e83c7f2036f9..b4629818aea5 100644
--- a/include/linux/mfd/t7l66xb.h
+++ b/include/linux/mfd/t7l66xb.h
@@ -15,8 +15,6 @@
 #include <linux/mfd/tmio.h>
 
 struct t7l66xb_platform_data {
-	int (*enable_clk32k)(struct platform_device *dev);
-	void (*disable_clk32k)(struct platform_device *dev);
 	int (*enable)(struct platform_device *dev);
 	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
diff --git a/include/linux/mfd/tc6387xb.h b/include/linux/mfd/tc6387xb.h
index fa06e0610b8e..b4888209494a 100644
--- a/include/linux/mfd/tc6387xb.h
+++ b/include/linux/mfd/tc6387xb.h
@@ -11,9 +11,6 @@
 #define MFD_TC6387XB_H
 
 struct tc6387xb_platform_data {
-	int (*enable_clk32k)(struct platform_device *dev);
-	void (*disable_clk32k)(struct platform_device *dev);
-
 	int (*enable)(struct platform_device *dev);
 	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
-- 
cgit v1.2.3


From ffda12a17a324103e9900fa1035309811eecbfe5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 17 Oct 2008 19:27:02 +0200
Subject: sched: optimize group load balancer

I noticed that tg_shares_up() unconditionally takes rq-locks for all cpus
in the sched_domain. This hurts.

We need the rq-locks whenever we change the weight of the per-cpu group sched
entities. To alleviate this a little, only change the weight when the new
weight is at least shares_thresh away from the old value.

This avoids the rq-lock for the top level entries, since those will never
be re-weighted, and fuzzes the lower level entries a little to gain performance
in semi-stable situations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        | 45 +++++++++++++++++++++++++--------------------
 kernel/sysctl.c       | 10 ++++++++++
 3 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eda6ad735dc..4f59c8e8597d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,6 +1621,7 @@ extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_shares_thresh;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index c530b84c7f80..11ca39017835 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -817,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
+/*
+ * Inject some fuzziness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-			  unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *
 	 */
 	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+	if (abs(shares - tg->se[cpu]->load.weight) >
+			sysctl_sched_shares_thresh) {
+		struct rq *rq = cpu_rq(cpu);
+		unsigned long flags;
 
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+		spin_lock_irqsave(&rq->lock, flags);
+		/*
+		 * record the actual number of shares, not the boosted amount.
+		 */
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+		tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-	__set_se_shares(tg->se[cpu], shares);
+		__set_se_shares(tg->se[cpu], shares);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 /*
@@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!rq_weight)
 		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, i, shares, rq_weight);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
+	for_each_cpu_mask(i, sd->span)
+		update_group_shares_cpu(tg, i, shares, rq_weight);
 
 	return 0;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 617d41e4d6a0..3d804f41e649 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -274,6 +274,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_shares_thresh",
+		.data		= &sysctl_sched_shares_thresh,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
-- 
cgit v1.2.3


From 592aa999d6a272856c9bfbdaac0cfba1bb37c24c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 20 Oct 2008 16:38:19 +0200
Subject: hrtimers: add missing docbook comments to struct hrtimer

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index cb25c1cc2352..58bca8e9bae1 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -103,9 +103,14 @@ enum hrtimer_cb_mode {
 /**
  * struct hrtimer - the basic hrtimer structure
  * @node:	red black tree node for time ordered insertion
- * @expires:	the absolute expiry time in the hrtimers internal
+ * @_expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
- *		which the timer is based.
+ *		which the timer is based. Is setup by adding
+ *		slack to the _softexpires value. For non range timers
+ *		identical to _softexpires.
+ * @_softexpires: the absolute earliest expiry time of the hrtimer.
+ *		The time which was given as expiry time when the timer
+ *		was armed.
  * @function:	timer expiry callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  * @state:	state information (See bit values above)
-- 
cgit v1.2.3


From 62695a84eb8f2e718bf4dfb21700afaa7a08e0ea Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:09 -0700
Subject: vmscan: move isolate_lru_page() to vmscan.c

On large memory systems, the VM can spend way too much time scanning
through pages that it cannot (or should not) evict from memory.  Not only
does it use up CPU time, but it also provokes lock contention and can
leave large systems under memory pressure in a catatonic state.

This patch series improves VM scalability by:

1) putting filesystem backed, swap backed and unevictable pages
   onto their own LRUs, so the system only scans the pages that it
   can/should evict from memory

2) switching to two handed clock replacement for the anonymous LRUs,
   so the number of pages that need to be scanned when the system
   starts swapping is bound to a reasonable number

3) keeping unevictable pages off the LRU completely, so the
   VM does not waste CPU time scanning them. ramfs, ramdisk,
   SHM_LOCKED shared memory segments and mlock()ed VMA pages
   are kept on the unevictable list.

This patch:

isolate_lru_page() logically belongs in vmscan.c rather than migrate.c.

It is tough, because we don't need that function without memory migration
so there is a valid argument to have it in migrate.c.  However a
subsequent patch needs to make use of it in the core mm, so we can happily
move it to vmscan.c.

Also, make the function a little more generic by not requiring that it
adds an isolated page to a given list.  Callers can do that.

	Note that we now have '__isolate_lru_page()', that does
	something quite different, visible outside of vmscan.c
	for use with memory controller.  Methinks we need to
	rationalize these names/purposes.	--lts

[akpm@linux-foundation.org: fix mm/memory_hotplug.c build]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  3 ---
 mm/internal.h           |  2 ++
 mm/memory_hotplug.c     |  3 ++-
 mm/mempolicy.c          |  9 +++++++--
 mm/migrate.c            | 34 +++-------------------------------
 mm/vmscan.c             | 45 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 59 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 03aea612d284..3f34005068d4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,7 +7,6 @@
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
-extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -21,8 +20,6 @@ extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
 #else
-static inline int isolate_lru_page(struct page *p, struct list_head *list)
-					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private) { return -ENOSYS; }
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f7416972..4e8e78b978b5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,8 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+extern int isolate_lru_page(struct page *page);
+
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c299d083d8e2..3b4975815141 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -658,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * We can skip free pages. And we can only deal with pages on
 		 * LRU.
 		 */
-		ret = isolate_lru_page(page, &source);
+		ret = isolate_lru_page(page);
 		if (!ret) { /* Success */
+			list_add_tail(&page->lru, &source);
 			move_pages--;
 		} else {
 			/* Becasue we don't have big zone->lock. we should
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83369058ec13..71b47491487d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
-		isolate_lru_page(page, pagelist);
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (!isolate_lru_page(page)) {
+			list_add_tail(&page->lru, pagelist);
+		}
+	}
 }
 
 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
diff --git a/mm/migrate.c b/mm/migrate.c
index 2a80136b23bb..da73742e52a5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,36 +36,6 @@
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
-/*
- * Isolate one page from the LRU lists. If successful put it onto
- * the indicated list with elevated page count.
- *
- * Result:
- *  -EBUSY: page not on LRU list
- *  0: page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page, struct list_head *pagelist)
-{
-	int ret = -EBUSY;
-
-	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
-
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page) && get_page_unless_zero(page)) {
-			ret = 0;
-			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
-			list_add_tail(&page->lru, pagelist);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-	return ret;
-}
-
 /*
  * migrate_prep() needs to be called before we start compiling a list of pages
  * to be migrated using isolate_lru_page().
@@ -914,7 +884,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
 				!migrate_all)
 			goto put_and_set;
 
-		err = isolate_lru_page(page, &pagelist);
+		err = isolate_lru_page(page);
+		if (!err)
+			list_add_tail(&page->lru, &pagelist);
 put_and_set:
 		/*
 		 * Either remove the duplicate refcount from
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..1fd4912a596c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -844,6 +844,51 @@ static unsigned long clear_active_flags(struct list_head *page_list)
 	return nr_active;
 }
 
+/**
+ * isolate_lru_page - tries to isolate a page from its LRU list
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list, clears PageLRU and adjusts the
+ * vmstat statistic corresponding to whatever LRU list the page was on.
+ *
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ *
+ * The returned page will have PageLRU() cleared.  If it was found on
+ * the active list, it will have PageActive set.  That flag may need
+ * to be cleared by the caller before letting the page go.
+ *
+ * The vmstat statistic corresponding to the list on which the page was
+ * found will be decremented.
+ *
+ * Restrictions:
+ * (1) Must be called with an elevated refcount on the page. This is a
+ *     fundamentnal difference from isolate_lru_pages (which is called
+ *     without a stable reference).
+ * (2) the lru_lock must not be held.
+ * (3) interrupts must be enabled.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int ret = -EBUSY;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page) && get_page_unless_zero(page)) {
+			ret = 0;
+			ClearPageLRU(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+	return ret;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
-- 
cgit v1.2.3


From b69408e88bd86b98feb7b9a38fd865e1ddb29827 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Sat, 18 Oct 2008 20:26:14 -0700
Subject: vmscan: Use an indexed array for LRU variables

Currently we are defining explicit variables for the inactive and active
list.  An indexed array can be more generic and avoid repeating similar
code in several places in the reclaim code.

We are saving a few bytes in terms of code size:

Before:

   text    data     bss     dec     hex filename
4097753  573120 4092484 8763357  85b7dd vmlinux

After:

   text    data     bss     dec     hex filename
4097729  573120 4092484 8763333  85b7c5 vmlinux

Having an easy way to add new lru lists may ease future work on the
reclaim code.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  17 ++-----
 include/linux/mm_inline.h  |  49 +++++++++++++-----
 include/linux/mmzone.h     |  26 +++++++---
 mm/memcontrol.c            | 115 ++++++++++++++++---------------------------
 mm/page_alloc.c            |   9 ++--
 mm/swap.c                  |   2 +-
 mm/vmscan.c                | 120 +++++++++++++++++++++------------------------
 mm/vmstat.c                |   3 +-
 8 files changed, 171 insertions(+), 170 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fdf3967e1397..a6ac0d491fe6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,10 +69,8 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
 extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 							int priority);
 
-extern long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
-extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
+extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline void page_reset_bad_cgroup(struct page *page)
@@ -159,14 +157,9 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 {
 }
 
-static inline long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	return 0;
-}
-
-static inline long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
+static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem,
+					struct zone *zone, int priority,
+					enum lru_list lru)
 {
 	return 0;
 }
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 895bc4e93039..2704729777ef 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,40 +1,67 @@
+static inline void
+add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+	list_add(&page->lru, &zone->lru[l].list);
+	__inc_zone_state(zone, NR_LRU_BASE + l);
+}
+
+static inline void
+del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+	list_del(&page->lru);
+	__dec_zone_state(zone, NR_LRU_BASE + l);
+}
+
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
-	list_add(&page->lru, &zone->active_list);
-	__inc_zone_state(zone, NR_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE);
 }
 
 static inline void
 add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
-	list_add(&page->lru, &zone->inactive_list);
-	__inc_zone_state(zone, NR_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE);
 }
 
 static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
-	list_del(&page->lru);
-	__dec_zone_state(zone, NR_ACTIVE);
+	del_page_from_lru_list(zone, page, LRU_ACTIVE);
 }
 
 static inline void
 del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
-	list_del(&page->lru);
-	__dec_zone_state(zone, NR_INACTIVE);
+	del_page_from_lru_list(zone, page, LRU_INACTIVE);
 }
 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+	enum lru_list l = LRU_INACTIVE;
+
 	list_del(&page->lru);
 	if (PageActive(page)) {
 		__ClearPageActive(page);
-		__dec_zone_state(zone, NR_ACTIVE);
-	} else {
-		__dec_zone_state(zone, NR_INACTIVE);
+		l = LRU_ACTIVE;
 	}
+	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
+/**
+ * page_lru - which LRU list should a page be on?
+ * @page: the page to test
+ *
+ * Returns the LRU list a page should be on, as an index
+ * into the array of LRU lists.
+ */
+static inline enum lru_list page_lru(struct page *page)
+{
+	enum lru_list lru = LRU_BASE;
+
+	if (PageActive(page))
+		lru += LRU_ACTIVE;
+
+	return lru;
+}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 428328a05fa1..156e18f3919b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -81,8 +81,9 @@ struct zone_padding {
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
-	NR_INACTIVE,
-	NR_ACTIVE,
+	NR_LRU_BASE,
+	NR_INACTIVE = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE,	/*  "     "     "   "       "         */
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
@@ -107,6 +108,19 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
+enum lru_list {
+	LRU_BASE,
+	LRU_INACTIVE=LRU_BASE,	/* must match order of NR_[IN]ACTIVE */
+	LRU_ACTIVE,		/*  "     "     "   "       "        */
+	NR_LRU_LISTS };
+
+#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
+
+static inline int is_active_lru(enum lru_list l)
+{
+	return (l == LRU_ACTIVE);
+}
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
@@ -251,10 +265,10 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long		nr_scan_active;
-	unsigned long		nr_scan_inactive;
+	struct {
+		struct list_head list;
+		unsigned long nr_scan;
+	} lru[NR_LRU_LISTS];
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36896f3eb7f5..c0cbd7790c51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,6 +32,7 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
 
 #include <asm/uaccess.h>
 
@@ -85,22 +86,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 /*
  * per-zone information in memory controller.
  */
-
-enum mem_cgroup_zstat_index {
-	MEM_CGROUP_ZSTAT_ACTIVE,
-	MEM_CGROUP_ZSTAT_INACTIVE,
-
-	NR_MEM_CGROUP_ZSTAT,
-};
-
 struct mem_cgroup_per_zone {
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
 	spinlock_t		lru_lock;
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long count[NR_MEM_CGROUP_ZSTAT];
+	struct list_head	lists[NR_LRU_LISTS];
+	unsigned long		count[NR_LRU_LISTS];
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -227,7 +219,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
 }
 
 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
-					enum mem_cgroup_zstat_index idx)
+					enum lru_list idx)
 {
 	int nid, zid;
 	struct mem_cgroup_per_zone *mz;
@@ -297,11 +289,9 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
 	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = !!from;
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
 	list_del(&pc->lru);
@@ -310,37 +300,35 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 				struct page_cgroup *pc)
 {
-	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_INACTIVE;
+
+	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+		lru += LRU_ACTIVE;
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_add(&pc->lru, &mz->lists[lru]);
 
-	if (!to) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		list_add(&pc->lru, &mz->inactive_list);
-	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		list_add(&pc->lru, &mz->active_list);
-	}
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
 }
 
 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+	int lru = LRU_INACTIVE;
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+		lru += LRU_ACTIVE;
 
-	if (active) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+
+	if (active)
 		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->active_list);
-	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
+	else
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->inactive_list);
-	}
+
+	lru = !!active;
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_move(&pc->lru, &mz->lists[lru]);
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -412,8 +400,8 @@ long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
 {
 	unsigned long active, inactive;
 	/* active and inactive are the number of pages. 'long' is ok.*/
-	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
-	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
+	active = mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE);
+	inactive = mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE);
 	return (long) (active / (inactive + 1));
 }
 
@@ -444,28 +432,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  * (see include/linux/mmzone.h)
  */
 
-long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				   struct zone *zone, int priority)
+long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru)
 {
-	long nr_active;
+	long nr_pages;
 	int nid = zone->zone_pgdat->node_id;
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 
-	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
-	return (nr_active >> priority);
-}
-
-long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	long nr_inactive;
-	int nid = zone->zone_pgdat->node_id;
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 
-	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (nr_inactive >> priority);
+	return (nr_pages >> priority);
 }
 
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -484,14 +461,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
+	int lru = !!active;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
-	if (active)
-		src = &mz->active_list;
-	else
-		src = &mz->inactive_list;
-
+	src = &mz->lists[lru];
 
 	spin_lock(&mz->lru_lock);
 	scan = 0;
@@ -863,7 +837,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 #define FORCE_UNCHARGE_BATCH	(128)
 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			    struct mem_cgroup_per_zone *mz,
-			    int active)
+			    enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct page *page;
@@ -871,10 +845,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	unsigned long flags;
 	struct list_head *list;
 
-	if (active)
-		list = &mz->active_list;
-	else
-		list = &mz->inactive_list;
+	list = &mz->lists[lru];
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	while (!list_empty(list)) {
@@ -922,11 +893,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 		for_each_node_state(node, N_POSSIBLE)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
+				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				/* drop all page_cgroup in active_list */
-				mem_cgroup_force_empty_list(mem, mz, 1);
-				/* drop all page_cgroup in inactive_list */
-				mem_cgroup_force_empty_list(mem, mz, 0);
+				for_each_lru(l)
+					mem_cgroup_force_empty_list(mem, mz, l);
 			}
 	}
 	ret = 0;
@@ -1015,9 +985,9 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 		unsigned long active, inactive;
 
 		inactive = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_INACTIVE);
+						LRU_INACTIVE);
 		active = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_ACTIVE);
+						LRU_ACTIVE);
 		cb->fill(cb, "active", (active) * PAGE_SIZE);
 		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
 	}
@@ -1062,6 +1032,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
+	enum lru_list l;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -1082,9 +1053,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		INIT_LIST_HEAD(&mz->active_list);
-		INIT_LIST_HEAD(&mz->inactive_list);
 		spin_lock_init(&mz->lru_lock);
+		for_each_lru(l)
+			INIT_LIST_HEAD(&mz->lists[l]);
 	}
 	return 0;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9eb9eb928285..ee7a96ef40dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3414,6 +3414,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
+		enum lru_list l;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3465,10 +3466,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->prev_priority = DEF_PRIORITY;
 
 		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
+		for_each_lru(l) {
+			INIT_LIST_HEAD(&zone->lru[l].list);
+			zone->lru[l].nr_scan = 0;
+		}
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
diff --git a/mm/swap.c b/mm/swap.c
index 9e0cb3118079..82c2b3a76f94 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -117,7 +117,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			spin_lock(&zone->lru_lock);
 		}
 		if (PageLRU(page) && !PageActive(page)) {
-			list_move_tail(&page->lru, &zone->inactive_list);
+			list_move_tail(&page->lru, &zone->lru[LRU_INACTIVE].list);
 			pgmoved++;
 		}
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1fd4912a596c..46fdaa546b8d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -819,10 +819,10 @@ static unsigned long isolate_pages_global(unsigned long nr,
 					int active)
 {
 	if (active)
-		return isolate_lru_pages(nr, &z->active_list, dst,
+		return isolate_lru_pages(nr, &z->lru[LRU_ACTIVE].list, dst,
 						scanned, order, mode);
 	else
-		return isolate_lru_pages(nr, &z->inactive_list, dst,
+		return isolate_lru_pages(nr, &z->lru[LRU_INACTIVE].list, dst,
 						scanned, order, mode);
 }
 
@@ -973,10 +973,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			add_page_to_lru_list(zone, page, page_lru(page));
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -1144,8 +1141,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	int pgdeactivate = 0;
 	unsigned long pgscanned;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
-	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
-	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
+	LIST_HEAD(l_active);
+	LIST_HEAD(l_inactive);
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
@@ -1194,7 +1191,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
-		list_move(&page->lru, &zone->inactive_list);
+		list_move(&page->lru, &zone->lru[LRU_INACTIVE].list);
 		mem_cgroup_move_lists(page, false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
@@ -1224,7 +1221,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
 
-		list_move(&page->lru, &zone->active_list);
+		list_move(&page->lru, &zone->lru[LRU_ACTIVE].list);
 		mem_cgroup_move_lists(page, true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
@@ -1244,65 +1241,64 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	pagevec_release(&pvec);
 }
 
+static unsigned long shrink_list(enum lru_list l, unsigned long nr_to_scan,
+	struct zone *zone, struct scan_control *sc, int priority)
+{
+	if (l == LRU_ACTIVE) {
+		shrink_active_list(nr_to_scan, zone, sc, priority);
+		return 0;
+	}
+	return shrink_inactive_list(nr_to_scan, zone, sc);
+}
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static unsigned long shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
-	unsigned long nr_active;
-	unsigned long nr_inactive;
+	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	enum lru_list l;
 
 	if (scan_global_lru(sc)) {
 		/*
 		 * Add one to nr_to_scan just to make sure that the kernel
 		 * will slowly sift through the active list.
 		 */
-		zone->nr_scan_active +=
-			(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-		nr_active = zone->nr_scan_active;
-		zone->nr_scan_inactive +=
-			(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-		nr_inactive = zone->nr_scan_inactive;
-		if (nr_inactive >= sc->swap_cluster_max)
-			zone->nr_scan_inactive = 0;
-		else
-			nr_inactive = 0;
-
-		if (nr_active >= sc->swap_cluster_max)
-			zone->nr_scan_active = 0;
-		else
-			nr_active = 0;
+		for_each_lru(l) {
+			zone->lru[l].nr_scan += (zone_page_state(zone,
+					NR_LRU_BASE + l)  >> priority) + 1;
+			nr[l] = zone->lru[l].nr_scan;
+			if (nr[l] >= sc->swap_cluster_max)
+				zone->lru[l].nr_scan = 0;
+			else
+				nr[l] = 0;
+		}
 	} else {
 		/*
 		 * This reclaim occurs not because zone memory shortage but
 		 * because memory controller hits its limit.
 		 * Then, don't modify zone reclaim related data.
 		 */
-		nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
-					zone, priority);
+		nr[LRU_ACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
+					zone, priority, LRU_ACTIVE);
 
-		nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
-					zone, priority);
+		nr[LRU_INACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
+					zone, priority, LRU_INACTIVE);
 	}
 
-
-	while (nr_active || nr_inactive) {
-		if (nr_active) {
-			nr_to_scan = min(nr_active,
+	while (nr[LRU_ACTIVE] || nr[LRU_INACTIVE]) {
+		for_each_lru(l) {
+			if (nr[l]) {
+				nr_to_scan = min(nr[l],
 					(unsigned long)sc->swap_cluster_max);
-			nr_active -= nr_to_scan;
-			shrink_active_list(nr_to_scan, zone, sc, priority);
-		}
+				nr[l] -= nr_to_scan;
 
-		if (nr_inactive) {
-			nr_to_scan = min(nr_inactive,
-					(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
-								sc);
+				nr_reclaimed += shrink_list(l, nr_to_scan,
+							zone, sc, priority);
+			}
 		}
 	}
 
@@ -1819,6 +1815,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 {
 	struct zone *zone;
 	unsigned long nr_to_scan, ret = 0;
+	enum lru_list l;
 
 	for_each_zone(zone) {
 
@@ -1828,28 +1825,25 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
-		/* For pass = 0 we don't shrink the active list */
-		if (pass > 0) {
-			zone->nr_scan_active +=
-				(zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
-			if (zone->nr_scan_active >= nr_pages || pass > 3) {
-				zone->nr_scan_active = 0;
+		for_each_lru(l) {
+			/* For pass = 0 we don't shrink the active list */
+			if (pass == 0 && l == LRU_ACTIVE)
+				continue;
+
+			zone->lru[l].nr_scan +=
+				(zone_page_state(zone, NR_LRU_BASE + l)
+								>> prio) + 1;
+			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+				zone->lru[l].nr_scan = 0;
 				nr_to_scan = min(nr_pages,
-					zone_page_state(zone, NR_ACTIVE));
-				shrink_active_list(nr_to_scan, zone, sc, prio);
+					zone_page_state(zone,
+							NR_LRU_BASE + l));
+				ret += shrink_list(l, nr_to_scan, zone,
+								sc, prio);
+				if (ret >= nr_pages)
+					return ret;
 			}
 		}
-
-		zone->nr_scan_inactive +=
-			(zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
-		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
-			zone->nr_scan_inactive = 0;
-			nr_to_scan = min(nr_pages,
-				zone_page_state(zone, NR_INACTIVE));
-			ret += shrink_inactive_list(nr_to_scan, zone, sc);
-			if (ret >= nr_pages)
-				return ret;
-		}
 	}
 
 	return ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d7826af2fb07..52c0335c1b71 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -696,7 +696,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   zone->pages_low,
 		   zone->pages_high,
 		   zone->pages_scanned,
-		   zone->nr_scan_active, zone->nr_scan_inactive,
+		   zone->lru[LRU_ACTIVE].nr_scan,
+		   zone->lru[LRU_INACTIVE].nr_scan,
 		   zone->spanned_pages,
 		   zone->present_pages);
 
-- 
cgit v1.2.3


From f04e9ebbe4909f9a41efd55149bc353299f4e83b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:26:19 -0700
Subject: swap: use an array for the LRU pagevecs

Turn the pagevecs into an array just like the LRUs.  This significantly
cleans up the source code and reduces the size of the kernel by about 13kB
after all the LRU lists have been created further down in the split VM
patch series.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 13 ++++++--
 include/linux/swap.h    | 18 +++++++++--
 mm/migrate.c            | 11 +------
 mm/swap.c               | 79 +++++++++++++++++--------------------------------
 4 files changed, 55 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 8eb7fa76c1d0..6b8f11bcc948 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -23,8 +23,7 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_release_nonlru(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
-void __pagevec_lru_add_active(struct pagevec *pvec);
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
@@ -81,6 +80,16 @@ static inline void pagevec_free(struct pagevec *pvec)
 		__pagevec_free(pvec);
 }
 
+static inline void __pagevec_lru_add(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE);
+}
+
+static inline void __pagevec_lru_add_active(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE);
+}
+
 static inline void pagevec_lru_add(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f169a4e4..fcc169610d09 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -171,8 +171,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void lru_cache_add(struct page *);
-extern void lru_cache_add_active(struct page *);
+extern void __lru_cache_add(struct page *, enum lru_list lru);
+extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -180,6 +180,20 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+static inline void lru_cache_add(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE);
+}
+
+static inline void lru_cache_add_active(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE);
+}
+
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
diff --git a/mm/migrate.c b/mm/migrate.c
index da73742e52a5..ad15b5ef2599 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -55,16 +55,7 @@ int migrate_prep(void)
 
 static inline void move_to_lru(struct page *page)
 {
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
+	lru_cache_add_lru(page, page_lru(page));
 	put_page(page);
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 82c2b3a76f94..e3045040dc3e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,8 +34,7 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
+static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 
 /*
@@ -186,28 +185,29 @@ void mark_page_accessed(struct page *page)
 
 EXPORT_SYMBOL(mark_page_accessed);
 
-/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
- */
-void lru_cache_add(struct page *page)
+void __lru_cache_add(struct page *page, enum lru_list lru)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add(pvec);
+		____pagevec_lru_add(pvec, lru);
 	put_cpu_var(lru_add_pvecs);
 }
 
-void lru_cache_add_active(struct page *page)
+/**
+ * lru_cache_add_lru - add a page to a page list
+ * @page: the page to be added to the LRU.
+ * @lru: the LRU list to which the page is added.
+ */
+void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+	if (PageActive(page)) {
+		ClearPageActive(page);
+	}
 
-	page_cache_get(page);
-	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
+	VM_BUG_ON(PageLRU(page) || PageActive(page));
+	__lru_cache_add(page, lru);
 }
 
 /*
@@ -217,15 +217,15 @@ void lru_cache_add_active(struct page *page)
  */
 static void drain_cpu_pagevecs(int cpu)
 {
+	struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
 	struct pagevec *pvec;
+	int lru;
 
-	pvec = &per_cpu(lru_add_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
-
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
+	for_each_lru(lru) {
+		pvec = &pvecs[lru - LRU_BASE];
+		if (pagevec_count(pvec))
+			____pagevec_lru_add(pvec, lru);
+	}
 
 	pvec = &per_cpu(lru_rotate_pvecs, cpu);
 	if (pagevec_count(pvec)) {
@@ -380,7 +380,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
  */
-void __pagevec_lru_add(struct pagevec *pvec)
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
 	int i;
 	struct zone *zone = NULL;
@@ -397,7 +397,9 @@ void __pagevec_lru_add(struct pagevec *pvec)
 		}
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		add_page_to_inactive_list(zone, page);
+		if (is_active_lru(lru))
+			SetPageActive(page);
+		add_page_to_lru_list(zone, page, lru);
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
@@ -405,34 +407,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-EXPORT_SYMBOL(__pagevec_lru_add);
-
-void __pagevec_lru_add_active(struct pagevec *pvec)
-{
-	int i;
-	struct zone *zone = NULL;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
-
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(PageActive(page));
-		SetPageActive(page);
-		add_page_to_active_list(zone, page);
-	}
-	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
-	release_pages(pvec->pages, pvec->nr, pvec->cold);
-	pagevec_reinit(pvec);
-}
+EXPORT_SYMBOL(____pagevec_lru_add);
 
 /*
  * Try to drop buffers from the pages in a pagevec
-- 
cgit v1.2.3


From 68a22394c286a2daf06ee8d65d8835f738faefa5 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:23 -0700
Subject: vmscan: free swap space on swap-in/activation

If vm_swap_full() is true (swap space is more than 50% full), the system
will free swap space at swapin time.  With this patch, the system will also
free the swap space in the pageout code, when we decide that the page is not
a candidate for swapout (and is just wasting swap space).

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  1 +
 include/linux/swap.h    |  6 ++++++
 mm/swap.c               | 24 ++++++++++++++++++++++++
 mm/swapfile.c           | 25 ++++++++++++++++++++++---
 mm/vmscan.c             |  7 +++++++
 5 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 6b8f11bcc948..fea3a982ee55 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -25,6 +25,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
+void pagevec_swap_free(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index fcc169610d09..833be56ad835 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -265,6 +265,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int remove_exclusive_swap_page_ref(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
@@ -353,6 +354,11 @@ static inline int remove_exclusive_swap_page(struct page *p)
 	return 0;
 }
 
+static inline int remove_exclusive_swap_page_ref(struct page *page)
+{
+	return 0;
+}
+
 static inline swp_entry_t get_swap_page(void)
 {
 	swp_entry_t entry;
diff --git a/mm/swap.c b/mm/swap.c
index e3045040dc3e..88a394872677 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -427,6 +427,30 @@ void pagevec_strip(struct pagevec *pvec)
 	}
 }
 
+/**
+ * pagevec_swap_free - try to free swap space from the pages in a pagevec
+ * @pvec: pagevec with swapcache pages to free the swap space of
+ *
+ * The caller needs to hold an extra reference to each page and
+ * not hold the page lock on the pages.  This function uses a
+ * trylock on the page lock so it may not always free the swap
+ * space associated with a page.
+ */
+void pagevec_swap_free(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		if (PageSwapCache(page) && trylock_page(page)) {
+			if (PageSwapCache(page))
+				remove_exclusive_swap_page_ref(page);
+			unlock_page(page);
+		}
+	}
+}
+
 /**
  * pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting pages are placed
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e330f2998fa..2a97fafa3d89 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
  * Work out if there are any other processes sharing this
  * swap cache page. Free it if you can. Return success.
  */
-int remove_exclusive_swap_page(struct page *page)
+static int remove_exclusive_swap_page_count(struct page *page, int count)
 {
 	int retval;
 	struct swap_info_struct * p;
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_count(page) != 2) /* 2: us + cache */
+	if (page_count(page) != count) /* us + cache + ptes */
 		return 0;
 
 	entry.val = page_private(page);
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
 		spin_lock_irq(&swapper_space.tree_lock);
-		if ((page_count(page) == 2) && !PageWriteback(page)) {
+		if ((page_count(page) == count) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
@@ -387,6 +387,25 @@ int remove_exclusive_swap_page(struct page *page)
 	return retval;
 }
 
+/*
+ * Most of the time the page should have two references: one for the
+ * process and one for the swap cache.
+ */
+int remove_exclusive_swap_page(struct page *page)
+{
+	return remove_exclusive_swap_page_count(page, 2);
+}
+
+/*
+ * The pageout code holds an extra reference to the page.  The expected
+ * reference count is therefore 2 (the swap cache plus the pageout
+ * reference) plus 1 for each process that maps the page.
+ */
+int remove_exclusive_swap_page_ref(struct page *page)
+{
+	return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
+}
+
 /*
  * Free the swap entry like above, but also try to
  * free the page cache entry if it is the last user.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 46fdaa546b8d..e656035d3406 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -647,6 +647,9 @@ free_it:
 		continue;
 
 activate_locked:
+		/* Not a candidate for swapping, so reclaim swap space. */
+		if (PageSwapCache(page) && vm_swap_full())
+			remove_exclusive_swap_page_ref(page);
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
@@ -1228,6 +1231,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
+			if (vm_swap_full())
+				pagevec_swap_free(&pvec);
 			__pagevec_release(&pvec);
 			spin_lock_irq(&zone->lru_lock);
 		}
@@ -1237,6 +1242,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
+	if (vm_swap_full())
+		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
 }
-- 
cgit v1.2.3


From b2e185384f534781fd22f5ce170b2ad26f97df70 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:30 -0700
Subject: define page_file_cache() function

Define page_is_file_cache() function to answer the question:
	is page backed by a file?

Originally part of Rik van Riel's split-lru patch.  Extracted to make
available for other, independent reclaim patches.

Moved inline function to linux/mm_inline.h where it will be needed by
subsequent "split LRU" and "noreclaim" patches.

Unfortunately this needs to use a page flag, since the PG_swapbacked state
needs to be preserved all the way to the point where the page is last
removed from the LRU.  Trying to derive the status from other info in the
page resulted in wrong VM statistics in earlier split VM patchsets.

The total number of page flags in use on a 32 bit machine after this patch
is 19.

[akpm@linux-foundation.org: fix up out-of-order merge fallout]
[hugh@veritas.com: splitlru: shmem_getpage SetPageSwapBacked sooner]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h  | 27 +++++++++++++++++++++++++++
 include/linux/page-flags.h |  8 ++++++--
 mm/memory.c                |  3 +++
 mm/migrate.c               |  2 ++
 mm/page_alloc.c            |  2 ++
 mm/shmem.c                 |  1 +
 mm/swap_state.c            |  3 +++
 7 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2704729777ef..96e970485b6c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,3 +1,28 @@
+#ifndef LINUX_MM_INLINE_H
+#define LINUX_MM_INLINE_H
+
+/**
+ * page_is_file_cache - should the page be on a file LRU or anon LRU?
+ * @page: the page to test
+ *
+ * Returns !0 if @page is page cache page backed by a regular filesystem,
+ * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
+ * Used by functions that manipulate the LRU lists, to sort a page
+ * onto the right LRU list.
+ *
+ * We would like to get this info without a page flag, but the state
+ * needs to survive until the page is last deleted from the LRU, which
+ * could be as far down as __page_cache_release.
+ */
+static inline int page_is_file_cache(struct page *page)
+{
+	if (PageSwapBacked(page))
+		return 0;
+
+	/* The page is page cache backed by a normal filesystem. */
+	return 1;
+}
+
 static inline void
 add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
@@ -65,3 +90,5 @@ static inline enum lru_list page_lru(struct page *page)
 
 	return lru;
 }
+
+#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c74d3e875314..57b688cfb5e2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -93,6 +93,7 @@ enum pageflags {
 	PG_mappedtodisk,	/* Has blocks allocated on-disk */
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
+	PG_swapbacked,		/* Page is backed by RAM/swap */
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -176,6 +177,7 @@ PAGEFLAG(SavePinned, savepinned);			/* Xen */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
 	__SETPAGEFLAG(Private, private)
+PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobPage, slob_page)
 __PAGEFLAG(SlobFree, slob_free)
@@ -334,7 +336,8 @@ static inline void __ClearPageTail(struct page *page)
  * Flags checked in bad_page().  Pages on the free list should not have
  * these flags set.  It they are, there is a problem.
  */
-#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | 1 << PG_reclaim | 1 << PG_dirty)
+#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \
+		1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked)
 
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
@@ -347,7 +350,8 @@ static inline void __ClearPageTail(struct page *page)
  * Pages being prepped should not have these flags set.  It they are, there
  * is a problem.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | 1 << PG_reserved | 1 << PG_dirty)
+#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \
+		1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked)
 
 #endif /* !__GENERATING_BOUNDS_H */
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/memory.c b/mm/memory.c
index 1002f473f497..7512933dcc10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1888,6 +1888,7 @@ gotten:
 		ptep_clear_flush_notify(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
+		SetPageSwapBacked(new_page);
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
@@ -2382,6 +2383,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
+	SetPageSwapBacked(page);
 	lru_cache_add_active(page);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
@@ -2523,6 +2525,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
                         inc_mm_counter(mm, anon_rss);
+			SetPageSwapBacked(page);
                         lru_cache_add_active(page);
                         page_add_new_anon_rmap(page, vma, address);
 		} else {
diff --git a/mm/migrate.c b/mm/migrate.c
index ad15b5ef2599..c07327487111 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -572,6 +572,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 	/* Prepare mapping for the new page.*/
 	newpage->index = page->index;
 	newpage->mapping = page->mapping;
+	if (PageSwapBacked(page))
+		SetPageSwapBacked(newpage);
 
 	mapping = page_mapping(page);
 	if (!mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ee7a96ef40dc..2099904d6cc4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -462,6 +462,8 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
+	if (PageSwapBacked(page))
+		__ClearPageSwapBacked(page);
 	/*
 	 * For now, we report if PG_reserved was found set, but do not
 	 * clear it, and do not free the page.  But we shall soon need
diff --git a/mm/shmem.c b/mm/shmem.c
index d87958a5f03e..fd421ed703ed 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1367,6 +1367,7 @@ repeat:
 				error = -ENOMEM;
 				goto failed;
 			}
+			SetPageSwapBacked(filepage);
 
 			/* Precharge page while we can wait, compensate after */
 			error = mem_cgroup_cache_charge(filepage, current->mm,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 797c3831cbec..7a3ece0b5a3b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageSwapCache(page));
 	BUG_ON(PagePrivate(page));
+	BUG_ON(!PageSwapBacked(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
 		page_cache_get(page);
@@ -303,6 +304,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
 		set_page_locked(new_page);
+		SetPageSwapBacked(new_page);
 		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
 		if (likely(!err)) {
 			/*
@@ -312,6 +314,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
+		ClearPageSwapBacked(new_page);
 		clear_page_locked(new_page);
 		swap_free(entry);
 	} while (err != -ENOMEM);
-- 
cgit v1.2.3


From 4f98a2fee8acdb4ac84545df98cccecfd130f8db Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:32 -0700
Subject: vmscan: split LRU lists into anon & file sets

Split the LRU lists in two, one set for pages that are backed by real file
systems ("file") and one for pages that are backed by memory and swap
("anon").  The latter includes tmpfs.

The advantage of doing this is that the VM will not have to scan over lots
of anonymous pages (which we generally do not want to swap out), just to
find the page cache pages that it should evict.

This patch has the infrastructure and a basic policy to balance how much
we scan the anon lists and how much we scan the file lists.  The big
policy changes are in separate patches.

[lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
[kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
[kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
[hugh@veritas.com: memcg swapbacked pages active]
[hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
[akpm@linux-foundation.org: fix /proc/vmstat units]
[nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
[kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
[kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c         |  56 +++---
 fs/cifs/file.c              |   4 +-
 fs/nfs/dir.c                |   2 +-
 fs/ntfs/file.c              |   4 +-
 fs/proc/proc_misc.c         |  77 ++++----
 fs/ramfs/file-nommu.c       |   4 +-
 include/linux/backing-dev.h |  13 ++
 include/linux/memcontrol.h  |   2 +-
 include/linux/mm_inline.h   |  50 ++++--
 include/linux/mmzone.h      |  47 ++++-
 include/linux/pagevec.h     |  29 ++-
 include/linux/swap.h        |  20 ++-
 include/linux/vmstat.h      |  10 ++
 mm/filemap.c                |  22 ++-
 mm/hugetlb.c                |  10 +-
 mm/memcontrol.c             |  88 ++++++----
 mm/memory.c                 |   6 +-
 mm/page-writeback.c         |   8 +-
 mm/page_alloc.c             |  25 ++-
 mm/readahead.c              |   2 +-
 mm/shmem.c                  |   2 +-
 mm/swap.c                   |  14 +-
 mm/swap_state.c             |   4 +-
 mm/vmscan.c                 | 416 +++++++++++++++++++++++---------------------
 mm/vmstat.c                 |  14 +-
 25 files changed, 562 insertions(+), 367 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5116b78c6325..fc7e9bf0cdbc 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -61,34 +61,44 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 	si_meminfo_node(&i, nid);
 
 	n = sprintf(buf, "\n"
-		       "Node %d MemTotal:     %8lu kB\n"
-		       "Node %d MemFree:      %8lu kB\n"
-		       "Node %d MemUsed:      %8lu kB\n"
-		       "Node %d Active:       %8lu kB\n"
-		       "Node %d Inactive:     %8lu kB\n"
+		       "Node %d MemTotal:       %8lu kB\n"
+		       "Node %d MemFree:        %8lu kB\n"
+		       "Node %d MemUsed:        %8lu kB\n"
+		       "Node %d Active:         %8lu kB\n"
+		       "Node %d Inactive:       %8lu kB\n"
+		       "Node %d Active(anon):   %8lu kB\n"
+		       "Node %d Inactive(anon): %8lu kB\n"
+		       "Node %d Active(file):   %8lu kB\n"
+		       "Node %d Inactive(file): %8lu kB\n"
 #ifdef CONFIG_HIGHMEM
-		       "Node %d HighTotal:    %8lu kB\n"
-		       "Node %d HighFree:     %8lu kB\n"
-		       "Node %d LowTotal:     %8lu kB\n"
-		       "Node %d LowFree:      %8lu kB\n"
+		       "Node %d HighTotal:      %8lu kB\n"
+		       "Node %d HighFree:       %8lu kB\n"
+		       "Node %d LowTotal:       %8lu kB\n"
+		       "Node %d LowFree:        %8lu kB\n"
 #endif
-		       "Node %d Dirty:        %8lu kB\n"
-		       "Node %d Writeback:    %8lu kB\n"
-		       "Node %d FilePages:    %8lu kB\n"
-		       "Node %d Mapped:       %8lu kB\n"
-		       "Node %d AnonPages:    %8lu kB\n"
-		       "Node %d PageTables:   %8lu kB\n"
-		       "Node %d NFS_Unstable: %8lu kB\n"
-		       "Node %d Bounce:       %8lu kB\n"
-		       "Node %d WritebackTmp: %8lu kB\n"
-		       "Node %d Slab:         %8lu kB\n"
-		       "Node %d SReclaimable: %8lu kB\n"
-		       "Node %d SUnreclaim:   %8lu kB\n",
+		       "Node %d Dirty:          %8lu kB\n"
+		       "Node %d Writeback:      %8lu kB\n"
+		       "Node %d FilePages:      %8lu kB\n"
+		       "Node %d Mapped:         %8lu kB\n"
+		       "Node %d AnonPages:      %8lu kB\n"
+		       "Node %d PageTables:     %8lu kB\n"
+		       "Node %d NFS_Unstable:   %8lu kB\n"
+		       "Node %d Bounce:         %8lu kB\n"
+		       "Node %d WritebackTmp:   %8lu kB\n"
+		       "Node %d Slab:           %8lu kB\n"
+		       "Node %d SReclaimable:   %8lu kB\n"
+		       "Node %d SUnreclaim:     %8lu kB\n",
 		       nid, K(i.totalram),
 		       nid, K(i.freeram),
 		       nid, K(i.totalram - i.freeram),
-		       nid, K(node_page_state(nid, NR_ACTIVE)),
-		       nid, K(node_page_state(nid, NR_INACTIVE)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_ANON) +
+				node_page_state(nid, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_ANON) +
+				node_page_state(nid, NR_INACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_ANON)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_ANON)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
 #ifdef CONFIG_HIGHMEM
 		       nid, K(i.totalhigh),
 		       nid, K(i.freehigh),
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c4a8a0605125..62d8bd8f14c0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add(plru_pvec);
+			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2ab70d46ecbc..efdba2e802d7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
 		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add(&lru_pvec);
+		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d020866d4232..3140a4429af1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			pages[nr] = *cached_page;
 			page_cache_get(*cached_page);
 			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add(lru_pvec);
+				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -2084,7 +2084,7 @@ err_out:
 						OSYNC_METADATA|OSYNC_DATA);
 		}
   	}
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 59ea42e1ef03..b8edb2860557 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 	unsigned long allowed;
 	struct vmalloc_info vmi;
 	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
 
 /*
  * display in kilobytes.
@@ -154,51 +156,62 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 
 	get_vmalloc_info(&vmi);
 
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
 	/*
 	 * Tagged format, for easy grepping and expansion.
 	 */
 	len = sprintf(page,
-		"MemTotal:     %8lu kB\n"
-		"MemFree:      %8lu kB\n"
-		"Buffers:      %8lu kB\n"
-		"Cached:       %8lu kB\n"
-		"SwapCached:   %8lu kB\n"
-		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
 #ifdef CONFIG_HIGHMEM
-		"HighTotal:    %8lu kB\n"
-		"HighFree:     %8lu kB\n"
-		"LowTotal:     %8lu kB\n"
-		"LowFree:      %8lu kB\n"
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
 #endif
-		"SwapTotal:    %8lu kB\n"
-		"SwapFree:     %8lu kB\n"
-		"Dirty:        %8lu kB\n"
-		"Writeback:    %8lu kB\n"
-		"AnonPages:    %8lu kB\n"
-		"Mapped:       %8lu kB\n"
-		"Slab:         %8lu kB\n"
-		"SReclaimable: %8lu kB\n"
-		"SUnreclaim:   %8lu kB\n"
-		"PageTables:   %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
 #ifdef CONFIG_QUICKLIST
-		"Quicklists:   %8lu kB\n"
+		"Quicklists:     %8lu kB\n"
 #endif
-		"NFS_Unstable: %8lu kB\n"
-		"Bounce:       %8lu kB\n"
-		"WritebackTmp: %8lu kB\n"
-		"CommitLimit:  %8lu kB\n"
-		"Committed_AS: %8lu kB\n"
-		"VmallocTotal: %8lu kB\n"
-		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages),
-		K(global_page_state(NR_ACTIVE)),
-		K(global_page_state(NR_INACTIVE)),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5145cb9125af..76acdbc34611 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 			goto add_error;
 
 		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add(&lru_pvec);
+			__pagevec_lru_add_file(&lru_pvec);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0a24d5550eb3..bee52abb8a4d 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -175,6 +175,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_READ_MAP:       Can be mapped for reading
  * BDI_CAP_WRITE_MAP:      Can be mapped for writing
  * BDI_CAP_EXEC_MAP:       Can be mapped for execution
+ *
+ * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
@@ -184,6 +186,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_WRITE_MAP	0x00000020
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
+#define BDI_CAP_SWAP_BACKED	0x00000100
 
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -248,6 +251,11 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
 				      BDI_CAP_NO_WRITEBACK));
 }
 
+static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -258,4 +266,9 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_swap_backed(struct address_space *mapping)
+{
+	return bdi_cap_swap_backed(mapping->backing_dev_info);
+}
+
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a6ac0d491fe6..8d8f05c1515a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -44,7 +44,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active);
+					int active, int file);
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 96e970485b6c..2eb599465d56 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -5,7 +5,7 @@
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
  * @page: the page to test
  *
- * Returns !0 if @page is page cache page backed by a regular filesystem,
+ * Returns LRU_FILE if @page is page cache page backed by a regular filesystem,
  * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
  * Used by functions that manipulate the LRU lists, to sort a page
  * onto the right LRU list.
@@ -20,7 +20,7 @@ static inline int page_is_file_cache(struct page *page)
 		return 0;
 
 	/* The page is page cache backed by a normal filesystem. */
-	return 1;
+	return LRU_FILE;
 }
 
 static inline void
@@ -38,39 +38,64 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 }
 
 static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
+add_page_to_inactive_anon_list(struct zone *zone, struct page *page)
 {
-	add_page_to_lru_list(zone, page, LRU_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON);
 }
 
 static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
+add_page_to_active_anon_list(struct zone *zone, struct page *page)
 {
-	add_page_to_lru_list(zone, page, LRU_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON);
 }
 
 static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
+add_page_to_inactive_file_list(struct zone *zone, struct page *page)
 {
-	del_page_from_lru_list(zone, page, LRU_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE);
 }
 
 static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
+add_page_to_active_file_list(struct zone *zone, struct page *page)
 {
-	del_page_from_lru_list(zone, page, LRU_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE);
+}
+
+static inline void
+del_page_from_inactive_anon_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON);
+}
+
+static inline void
+del_page_from_active_anon_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON);
+}
+
+static inline void
+del_page_from_inactive_file_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
+}
+
+static inline void
+del_page_from_active_file_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_ACTIVE_FILE);
 }
 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
-	enum lru_list l = LRU_INACTIVE;
+	enum lru_list l = LRU_BASE;
 
 	list_del(&page->lru);
 	if (PageActive(page)) {
 		__ClearPageActive(page);
-		l = LRU_ACTIVE;
+		l += LRU_ACTIVE;
 	}
+	l += page_is_file_cache(page);
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
@@ -87,6 +112,7 @@ static inline enum lru_list page_lru(struct page *page)
 
 	if (PageActive(page))
 		lru += LRU_ACTIVE;
+	lru += page_is_file_cache(page);
 
 	return lru;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 156e18f3919b..59a4c8fd6ebd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -82,21 +82,23 @@ enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
 	NR_LRU_BASE,
-	NR_INACTIVE = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-	NR_ACTIVE,	/*  "     "     "   "       "         */
+	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
+	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
+	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
-	/* Second 128 byte cacheline */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
+	/* Second 128 byte cacheline */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
@@ -108,17 +110,36 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
+/*
+ * We do arithmetic on the LRU lists in various places in the code,
+ * so it is important to keep the active lists LRU_ACTIVE higher in
+ * the array than the corresponding inactive lists, and to keep
+ * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
+ *
+ * This has to be kept in sync with the statistics in zone_stat_item
+ * above and the descriptions in vmstat_text in mm/vmstat.c
+ */
+#define LRU_BASE 0
+#define LRU_ACTIVE 1
+#define LRU_FILE 2
+
 enum lru_list {
-	LRU_BASE,
-	LRU_INACTIVE=LRU_BASE,	/* must match order of NR_[IN]ACTIVE */
-	LRU_ACTIVE,		/*  "     "     "   "       "        */
+	LRU_INACTIVE_ANON = LRU_BASE,
+	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
+	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
+	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
 	NR_LRU_LISTS };
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
+static inline int is_file_lru(enum lru_list l)
+{
+	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
+}
+
 static inline int is_active_lru(enum lru_list l)
 {
-	return (l == LRU_ACTIVE);
+	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
 }
 
 struct per_cpu_pages {
@@ -269,6 +290,18 @@ struct zone {
 		struct list_head list;
 		unsigned long nr_scan;
 	} lru[NR_LRU_LISTS];
+
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are referenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index fea3a982ee55..5fc96a4e760f 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -81,20 +81,37 @@ static inline void pagevec_free(struct pagevec *pvec)
 		__pagevec_free(pvec);
 }
 
-static inline void __pagevec_lru_add(struct pagevec *pvec)
+static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
 {
-	____pagevec_lru_add(pvec, LRU_INACTIVE);
+	____pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
 }
 
-static inline void __pagevec_lru_add_active(struct pagevec *pvec)
+static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
 {
-	____pagevec_lru_add(pvec, LRU_ACTIVE);
+	____pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
 }
 
-static inline void pagevec_lru_add(struct pagevec *pvec)
+static inline void __pagevec_lru_add_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
+}
+
+static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
+}
+
+
+static inline void pagevec_lru_add_file(struct pagevec *pvec)
+{
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_file(pvec);
+}
+
+static inline void pagevec_lru_add_anon(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
+		__pagevec_lru_add_anon(pvec);
 }
 
 #endif /* _LINUX_PAGEVEC_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 833be56ad835..7d09d79997a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -184,14 +184,24 @@ extern void swap_setup(void);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-static inline void lru_cache_add(struct page *page)
+static inline void lru_cache_add_anon(struct page *page)
 {
-	__lru_cache_add(page, LRU_INACTIVE);
+	__lru_cache_add(page, LRU_INACTIVE_ANON);
 }
 
-static inline void lru_cache_add_active(struct page *page)
+static inline void lru_cache_add_active_anon(struct page *page)
 {
-	__lru_cache_add(page, LRU_ACTIVE);
+	__lru_cache_add(page, LRU_ACTIVE_ANON);
+}
+
+static inline void lru_cache_add_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE_FILE);
+}
+
+static inline void lru_cache_add_active_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE_FILE);
 }
 
 /* linux/mm/vmscan.c */
@@ -199,7 +209,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 							gfp_t gfp_mask);
-extern int __isolate_lru_page(struct page *page, int mode);
+extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 58334d439516..ff5179f2b153 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -159,6 +159,16 @@ static inline unsigned long zone_page_state(struct zone *zone,
 	return x;
 }
 
+extern unsigned long global_lru_pages(void);
+
+static inline unsigned long zone_lru_pages(struct zone *zone)
+{
+	return (zone_page_state(zone, NR_ACTIVE_ANON)
+		+ zone_page_state(zone, NR_ACTIVE_FILE)
+		+ zone_page_state(zone, NR_INACTIVE_ANON)
+		+ zone_page_state(zone, NR_INACTIVE_FILE));
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Determine the per node value of a stat item. This function
diff --git a/mm/filemap.c b/mm/filemap.c
index 903bf316912a..a1ddd2557af2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 38633864a93e..2fc7fddd9b1f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf)
 {
 	struct hstate *h = &default_hstate;
 	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"HugePages_Rsvd:  %5lu\n"
-			"HugePages_Surp:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
+			"HugePages_Total:   %5lu\n"
+			"HugePages_Free:    %5lu\n"
+			"HugePages_Rsvd:    %5lu\n"
+			"HugePages_Surp:    %5lu\n"
+			"Hugepagesize:   %8lu kB\n",
 			h->nr_huge_pages,
 			h->free_huge_pages,
 			h->resv_huge_pages,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0cbd7790c51..27e9e75f4eab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -162,6 +162,7 @@ struct page_cgroup {
 };
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
+#define PAGE_CGROUP_FLAG_FILE	(0x4)	/* page is file system backed */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -177,6 +178,7 @@ enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 };
 
 /*
@@ -288,8 +290,12 @@ static void unlock_page_cgroup(struct page *page)
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
-	int lru = !!from;
+	int lru = LRU_BASE;
+
+	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+		lru += LRU_ACTIVE;
+	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+		lru += LRU_FILE;
 
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
@@ -300,10 +306,12 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 				struct page_cgroup *pc)
 {
-	int lru = LRU_INACTIVE;
+	int lru = LRU_BASE;
 
 	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
 		lru += LRU_ACTIVE;
+	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+		lru += LRU_FILE;
 
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_add(&pc->lru, &mz->lists[lru]);
@@ -314,10 +322,9 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 {
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
-	int lru = LRU_INACTIVE;
-
-	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
-		lru += LRU_ACTIVE;
+	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int file = pc->flags & PAGE_CGROUP_FLAG_FILE;
+	int lru = LRU_FILE * !!file + !!from;
 
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
@@ -326,7 +333,7 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 	else
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
 
-	lru = !!active;
+	lru = LRU_FILE * !!file + !!active;
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_move(&pc->lru, &mz->lists[lru]);
 }
@@ -390,21 +397,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 	return (int)((rss * 100L) / total);
 }
 
-/*
- * This function is called from vmscan.c. In page reclaiming loop. balance
- * between active and inactive list is calculated. For memory controller
- * page reclaiming, we should use using mem_cgroup's imbalance rather than
- * zone's global lru imbalance.
- */
-long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
-{
-	unsigned long active, inactive;
-	/* active and inactive are the number of pages. 'long' is ok.*/
-	active = mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE);
-	inactive = mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE);
-	return (long) (active / (inactive + 1));
-}
-
 /*
  * prev_priority control...this will be used in memory reclaim path.
  */
@@ -450,7 +442,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -461,7 +453,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
-	int lru = !!active;
+	int lru = LRU_FILE * !!file + !!active;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -477,6 +469,9 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		if (unlikely(!PageLRU(page)))
 			continue;
 
+		/*
+		 * TODO: play better with lumpy reclaim, grabbing anything.
+		 */
 		if (PageActive(page) && !active) {
 			__mem_cgroup_move_lists(pc, true);
 			continue;
@@ -489,7 +484,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		scan++;
 		list_move(&pc->lru, &pc_list);
 
-		if (__isolate_lru_page(page, mode) == 0) {
+		if (__isolate_lru_page(page, mode, file) == 0) {
 			list_move(&page->lru, dst);
 			nr_taken++;
 		}
@@ -575,10 +570,16 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
 	 */
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
 		pc->flags = PAGE_CGROUP_FLAG_CACHE;
-	else
+		if (page_is_file_cache(page))
+			pc->flags |= PAGE_CGROUP_FLAG_FILE;
+		else
+			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
+	} else if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
 		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
+	else /* MEM_CGROUP_CHARGE_TYPE_SHMEM */
+		pc->flags = PAGE_CGROUP_FLAG_CACHE | PAGE_CGROUP_FLAG_ACTIVE;
 
 	lock_page_cgroup(page);
 	if (unlikely(page_get_page_cgroup(page))) {
@@ -737,8 +738,12 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 	if (pc) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
-		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
-			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		if (pc->flags & PAGE_CGROUP_FLAG_CACHE) {
+			if (page_is_file_cache(page))
+				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+			else
+				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+		}
 	}
 	unlock_page_cgroup(page);
 	if (mem) {
@@ -982,14 +987,21 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	}
 	/* showing # of active pages */
 	{
-		unsigned long active, inactive;
-
-		inactive = mem_cgroup_get_all_zonestat(mem_cont,
-						LRU_INACTIVE);
-		active = mem_cgroup_get_all_zonestat(mem_cont,
-						LRU_ACTIVE);
-		cb->fill(cb, "active", (active) * PAGE_SIZE);
-		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
+		unsigned long active_anon, inactive_anon;
+		unsigned long active_file, inactive_file;
+
+		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_ANON);
+		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_ANON);
+		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_FILE);
+		active_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_FILE);
+		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
+		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
+		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
+		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
 	}
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 7512933dcc10..71cdefd1ef14 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1889,7 +1889,7 @@ gotten:
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		SetPageSwapBacked(new_page);
-		lru_cache_add_active(new_page);
+		lru_cache_add_active_anon(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
 		if (old_page) {
@@ -2384,7 +2384,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto release;
 	inc_mm_counter(mm, anon_rss);
 	SetPageSwapBacked(page);
-	lru_cache_add_active(page);
+	lru_cache_add_active_anon(page);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2526,7 +2526,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon) {
                         inc_mm_counter(mm, anon_rss);
 			SetPageSwapBacked(page);
-                        lru_cache_add_active(page);
+                        lru_cache_add_active_anon(page);
                         page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b40f6d5f8fe9..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 		struct zone *z =
 			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
-		x += zone_page_state(z, NR_FREE_PAGES)
-			+ zone_page_state(z, NR_INACTIVE)
-			+ zone_page_state(z, NR_ACTIVE);
+		x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
 	}
 	/*
 	 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
-	x = global_page_state(NR_FREE_PAGES)
-		+ global_page_state(NR_INACTIVE)
-		+ global_page_state(NR_ACTIVE);
+	x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2099904d6cc4..740a16a32c22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1864,10 +1864,13 @@ void show_free_areas(void)
 		}
 	}
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+	printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
+		" inactive_file:%lu dirty:%lu writeback:%lu unstable:%lu\n"
 		" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
-		global_page_state(NR_ACTIVE),
-		global_page_state(NR_INACTIVE),
+		global_page_state(NR_ACTIVE_ANON),
+		global_page_state(NR_ACTIVE_FILE),
+		global_page_state(NR_INACTIVE_ANON),
+		global_page_state(NR_INACTIVE_FILE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
@@ -1890,8 +1893,10 @@ void show_free_areas(void)
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
-			" active:%lukB"
-			" inactive:%lukB"
+			" active_anon:%lukB"
+			" inactive_anon:%lukB"
+			" active_file:%lukB"
+			" inactive_file:%lukB"
 			" present:%lukB"
 			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
@@ -1901,8 +1906,10 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
-			K(zone_page_state(zone, NR_ACTIVE)),
-			K(zone_page_state(zone, NR_INACTIVE)),
+			K(zone_page_state(zone, NR_ACTIVE_ANON)),
+			K(zone_page_state(zone, NR_INACTIVE_ANON)),
+			K(zone_page_state(zone, NR_ACTIVE_FILE)),
+			K(zone_page_state(zone, NR_INACTIVE_FILE)),
 			K(zone->present_pages),
 			zone->pages_scanned,
 			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3472,6 +3479,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 			INIT_LIST_HEAD(&zone->lru[l].list);
 			zone->lru[l].nr_scan = 0;
 		}
+		zone->recent_rotated[0] = 0;
+		zone->recent_rotated[1] = 0;
+		zone->recent_scanned[0] = 0;
+		zone->recent_scanned[1] = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
diff --git a/mm/readahead.c b/mm/readahead.c
index 6cbd9a72fde2..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  */
 unsigned long max_sane_readahead(unsigned long nr)
 {
-	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
+	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
 		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index fd421ed703ed..fc2ccf79a776 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
 
 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= default_unplug_io_fn,
 };
 
diff --git a/mm/swap.c b/mm/swap.c
index 88a394872677..0b1974a08974 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -116,7 +116,8 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			spin_lock(&zone->lru_lock);
 		}
 		if (PageLRU(page) && !PageActive(page)) {
-			list_move_tail(&page->lru, &zone->lru[LRU_INACTIVE].list);
+			int lru = page_is_file_cache(page);
+			list_move_tail(&page->lru, &zone->lru[lru].list);
 			pgmoved++;
 		}
 	}
@@ -157,11 +158,18 @@ void activate_page(struct page *page)
 
 	spin_lock_irq(&zone->lru_lock);
 	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(zone, page);
+		int file = page_is_file_cache(page);
+		int lru = LRU_BASE + file;
+		del_page_from_lru_list(zone, page, lru);
+
 		SetPageActive(page);
-		add_page_to_active_list(zone, page);
+		lru += LRU_ACTIVE;
+		add_page_to_lru_list(zone, page, lru);
 		__count_vm_event(PGACTIVATE);
 		mem_cgroup_move_lists(page, true);
+
+		zone->recent_rotated[!!file]++;
+		zone->recent_scanned[!!file]++;
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7a3ece0b5a3b..ea62084ed402 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
@@ -310,7 +310,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_active(new_page);
+			lru_cache_add_active_anon(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e656035d3406..d10d2f9a33f3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,7 +78,7 @@ struct scan_control {
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
 			struct zone *z, struct mem_cgroup *mem_cont,
-			int active);
+			int active, int file);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -680,7 +680,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode, int file)
 {
 	int ret = -EINVAL;
 
@@ -696,6 +696,9 @@ int __isolate_lru_page(struct page *page, int mode)
 	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
 		return ret;
 
+	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -726,12 +729,13 @@ int __isolate_lru_page(struct page *page, int mode)
  * @scanned:	The number of pages that were scanned.
  * @order:	The caller's attempted allocation order
  * @mode:	One of the LRU isolation modes
+ * @file:	True [1] if isolating file [!anon] pages
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode)
+		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long scan;
@@ -748,7 +752,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		VM_BUG_ON(!PageLRU(page));
 
-		switch (__isolate_lru_page(page, mode)) {
+		switch (__isolate_lru_page(page, mode, file)) {
 		case 0:
 			list_move(&page->lru, dst);
 			nr_taken++;
@@ -791,10 +795,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				break;
 
 			cursor_page = pfn_to_page(pfn);
+
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode)) {
+			switch (__isolate_lru_page(cursor_page, mode, file)) {
 			case 0:
 				list_move(&cursor_page->lru, dst);
 				nr_taken++;
@@ -819,30 +824,37 @@ static unsigned long isolate_pages_global(unsigned long nr,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
+	int lru = LRU_BASE;
 	if (active)
-		return isolate_lru_pages(nr, &z->lru[LRU_ACTIVE].list, dst,
-						scanned, order, mode);
-	else
-		return isolate_lru_pages(nr, &z->lru[LRU_INACTIVE].list, dst,
-						scanned, order, mode);
+		lru += LRU_ACTIVE;
+	if (file)
+		lru += LRU_FILE;
+	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
+								mode, !!file);
 }
 
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
  */
-static unsigned long clear_active_flags(struct list_head *page_list)
+static unsigned long clear_active_flags(struct list_head *page_list,
+					unsigned int *count)
 {
 	int nr_active = 0;
+	int lru;
 	struct page *page;
 
-	list_for_each_entry(page, page_list, lru)
+	list_for_each_entry(page, page_list, lru) {
+		lru = page_is_file_cache(page);
 		if (PageActive(page)) {
+			lru += LRU_ACTIVE;
 			ClearPageActive(page);
 			nr_active++;
 		}
+		count[lru]++;
+	}
 
 	return nr_active;
 }
@@ -880,12 +892,12 @@ int isolate_lru_page(struct page *page)
 
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page) && get_page_unless_zero(page)) {
+			int lru = LRU_BASE;
 			ret = 0;
 			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
+
+			lru += page_is_file_cache(page) + !!PageActive(page);
+			del_page_from_lru_list(zone, page, lru);
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
@@ -897,7 +909,7 @@ int isolate_lru_page(struct page *page)
  * of reclaimed pages
  */
 static unsigned long shrink_inactive_list(unsigned long max_scan,
-				struct zone *zone, struct scan_control *sc)
+			struct zone *zone, struct scan_control *sc, int file)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -914,20 +926,32 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_scan;
 		unsigned long nr_freed;
 		unsigned long nr_active;
+		unsigned int count[NR_LRU_LISTS] = { 0, };
+		int mode = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
-			     &page_list, &nr_scan, sc->order,
-			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-					     ISOLATE_BOTH : ISOLATE_INACTIVE,
-				zone, sc->mem_cgroup, 0);
-		nr_active = clear_active_flags(&page_list);
+			     &page_list, &nr_scan, sc->order, mode,
+				zone, sc->mem_cgroup, 0, file);
+		nr_active = clear_active_flags(&page_list, count);
 		__count_vm_events(PGDEACTIVATE, nr_active);
 
-		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
-		__mod_zone_page_state(zone, NR_INACTIVE,
-						-(nr_taken - nr_active));
-		if (scan_global_lru(sc))
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+						-count[LRU_ACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+						-count[LRU_INACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+						-count[LRU_ACTIVE_ANON]);
+		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+						-count[LRU_INACTIVE_ANON]);
+
+		if (scan_global_lru(sc)) {
 			zone->pages_scanned += nr_scan;
+			zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+			zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+			zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+			zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+		}
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
@@ -947,7 +971,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			 * The attempt at page out may have made some
 			 * of the pages active, mark them inactive again.
 			 */
-			nr_active = clear_active_flags(&page_list);
+			nr_active = clear_active_flags(&page_list, count);
 			count_vm_events(PGDEACTIVATE, nr_active);
 
 			nr_freed += shrink_page_list(&page_list, sc,
@@ -977,6 +1001,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			SetPageLRU(page);
 			list_del(&page->lru);
 			add_page_to_lru_list(zone, page, page_lru(page));
+			if (PageActive(page) && scan_global_lru(sc)) {
+				int file = !!page_is_file_cache(page);
+				zone->recent_rotated[file]++;
+			}
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -1007,115 +1035,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 
 static inline int zone_is_near_oom(struct zone *zone)
 {
-	return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
-				+ zone_page_state(zone, NR_INACTIVE))*3;
-}
-
-/*
- * Determine we should try to reclaim mapped pages.
- * This is called only when sc->mem_cgroup is NULL.
- */
-static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
-				int priority)
-{
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
-	long imbalance;
-	int reclaim_mapped = 0;
-	int prev_priority;
-
-	if (scan_global_lru(sc) && zone_is_near_oom(zone))
-		return 1;
-	/*
-	 * `distress' is a measure of how much trouble we're having
-	 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	if (scan_global_lru(sc))
-		prev_priority = zone->prev_priority;
-	else
-		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
-
-	distress = 100 >> min(prev_priority, priority);
-
-	/*
-	 * The point of this algorithm is to decide when to start
-	 * reclaiming mapped memory instead of just pagecache.  Work out
-	 * how much memory
-	 * is mapped.
-	 */
-	if (scan_global_lru(sc))
-		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-				global_page_state(NR_ANON_PAGES)) * 100) /
-					vm_total_pages;
-	else
-		mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The
-	 * mapped ratio is downgraded - just because there's a lot of
-	 * mapped memory doesn't necessarily mean that page reclaim
-	 * isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start
-	 * going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm
-	 * altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-	/*
-	 * If there's huge imbalance between active and inactive
-	 * (think active 100 times larger than inactive) we should
-	 * become more permissive, or the system will take too much
-	 * cpu before it start swapping during memory pressure.
-	 * Distress is about avoiding early-oom, this is about
-	 * making swappiness graceful despite setting it to low
-	 * values.
-	 *
-	 * Avoid div by zero with nr_inactive+1, and max resulting
-	 * value is vm_total_pages.
-	 */
-	if (scan_global_lru(sc)) {
-		imbalance  = zone_page_state(zone, NR_ACTIVE);
-		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-	} else
-		imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
-
-	/*
-	 * Reduce the effect of imbalance if swappiness is low,
-	 * this means for a swappiness very low, the imbalance
-	 * must be much higher than 100 for this logic to make
-	 * the difference.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= (vm_swappiness + 1);
-	imbalance /= 100;
-
-	/*
-	 * If not much of the ram is mapped, makes the imbalance
-	 * less relevant, it's high priority we refill the inactive
-	 * list with mapped pages only in presence of high ratio of
-	 * mapped pages.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= mapped_ratio;
-	imbalance /= 100;
-
-	/* apply imbalance feedback to swap_tendency */
-	swap_tendency += imbalance;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped
-	 * memory onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
-	return reclaim_mapped;
+	return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
 }
 
 /*
@@ -1138,7 +1058,7 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
 
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-				struct scan_control *sc, int priority)
+			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
 	int pgdeactivate = 0;
@@ -1148,43 +1068,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	LIST_HEAD(l_inactive);
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-
-	if (sc->may_swap)
-		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
+	enum lru_list lru;
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 					ISOLATE_ACTIVE, zone,
-					sc->mem_cgroup, 1);
+					sc->mem_cgroup, 1, file);
 	/*
 	 * zone->pages_scanned is used for detect zone's oom
 	 * mem_cgroup remembers nr_scan by itself.
 	 */
-	if (scan_global_lru(sc))
+	if (scan_global_lru(sc)) {
 		zone->pages_scanned += pgscanned;
+		zone->recent_scanned[!!file] += pgmoved;
+	}
 
-	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
+	if (file)
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
+	else
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
-		if (page_mapped(page)) {
-			if (!reclaim_mapped ||
-			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->mem_cgroup)) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-		}
 		list_add(&page->lru, &l_inactive);
 	}
 
+	/*
+	 * Now put the pages back on the appropriate [file or anon] inactive
+	 * and active lists.
+	 */
 	pagevec_init(&pvec, 1);
 	pgmoved = 0;
+	lru = LRU_BASE + file * LRU_FILE;
 	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
@@ -1194,11 +1113,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
-		list_move(&page->lru, &zone->lru[LRU_INACTIVE].list);
+		list_move(&page->lru, &zone->lru[lru].list);
 		mem_cgroup_move_lists(page, false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -1208,7 +1127,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -1217,6 +1136,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 
 	pgmoved = 0;
+	lru = LRU_ACTIVE + file * LRU_FILE;
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
@@ -1224,11 +1144,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
 
-		list_move(&page->lru, &zone->lru[LRU_ACTIVE].list);
+		list_move(&page->lru, &zone->lru[lru].list);
 		mem_cgroup_move_lists(page, true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
+			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			if (vm_swap_full())
@@ -1237,7 +1157,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+	zone->recent_rotated[!!file] += pgmoved;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1248,16 +1169,103 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	pagevec_release(&pvec);
 }
 
-static unsigned long shrink_list(enum lru_list l, unsigned long nr_to_scan,
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	struct zone *zone, struct scan_control *sc, int priority)
 {
-	if (l == LRU_ACTIVE) {
-		shrink_active_list(nr_to_scan, zone, sc, priority);
+	int file = is_file_lru(lru);
+
+	if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
+		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
-	return shrink_inactive_list(nr_to_scan, zone, sc);
+	return shrink_inactive_list(nr_to_scan, zone, sc, file);
+}
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned.  The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the scanned pages that we rotated back
+ * onto the active list instead of evicting.
+ *
+ * percent[0] specifies how much pressure to put on ram/swap backed
+ * memory, while percent[1] determines pressure on the file LRUs.
+ */
+static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
+					unsigned long *percent)
+{
+	unsigned long anon, file, free;
+	unsigned long anon_prio, file_prio;
+	unsigned long ap, fp;
+
+	anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
+		zone_page_state(zone, NR_INACTIVE_ANON);
+	file  = zone_page_state(zone, NR_ACTIVE_FILE) +
+		zone_page_state(zone, NR_INACTIVE_FILE);
+	free  = zone_page_state(zone, NR_FREE_PAGES);
+
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (nr_swap_pages <= 0) {
+		percent[0] = 0;
+		percent[1] = 100;
+		return;
+	}
+
+	/* If we have very few page cache pages, force-scan anon pages. */
+	if (unlikely(file + free <= zone->pages_high)) {
+		percent[0] = 100;
+		percent[1] = 0;
+		return;
+	}
+
+	/*
+	 * OK, so we have swap space and a fair amount of page cache
+	 * pages.  We use the recently rotated / recently scanned
+	 * ratios to determine how valuable each cache is.
+	 *
+	 * Because workloads change over time (and to avoid overflow)
+	 * we keep these statistics as a floating average, which ends
+	 * up weighing recent references more than old ones.
+	 *
+	 * anon in [0], file in [1]
+	 */
+	if (unlikely(zone->recent_scanned[0] > anon / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[0] /= 2;
+		zone->recent_rotated[0] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	if (unlikely(zone->recent_scanned[1] > file / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[1] /= 2;
+		zone->recent_rotated[1] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
+	 * The amount of pressure on anon vs file pages is inversely
+	 * proportional to the fraction of recently scanned pages on
+	 * each list that were rotated, weighted by its IO cost.
+	 */
+	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
+	ap /= zone->recent_rotated[0] + 1;
+
+	fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
+	fp /= zone->recent_rotated[1] + 1;
+
+	/* Normalize to percentages */
+	percent[0] = 100 * ap / (ap + fp + 1);
+	percent[1] = 100 - percent[0];
 }
 
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -1267,36 +1275,43 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 
-	if (scan_global_lru(sc)) {
-		/*
-		 * Add one to nr_to_scan just to make sure that the kernel
-		 * will slowly sift through the active list.
-		 */
-		for_each_lru(l) {
-			zone->lru[l].nr_scan += (zone_page_state(zone,
-					NR_LRU_BASE + l)  >> priority) + 1;
+	get_scan_ratio(zone, sc, percent);
+
+	for_each_lru(l) {
+		if (scan_global_lru(sc)) {
+			int file = is_file_lru(l);
+			int scan;
+			/*
+			 * Add one to nr_to_scan just to make sure that the
+			 * kernel will slowly sift through each list.
+			 */
+			scan = zone_page_state(zone, NR_LRU_BASE + l);
+			if (priority) {
+				scan >>= priority;
+				scan = (scan * percent[file]) / 100;
+			}
+			zone->lru[l].nr_scan += scan + 1;
 			nr[l] = zone->lru[l].nr_scan;
 			if (nr[l] >= sc->swap_cluster_max)
 				zone->lru[l].nr_scan = 0;
 			else
 				nr[l] = 0;
+		} else {
+			/*
+			 * This reclaim occurs not because zone memory shortage
+			 * but because memory controller hits its limit.
+			 * Don't modify zone reclaim related data.
+			 */
+			nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
+								priority, l);
 		}
-	} else {
-		/*
-		 * This reclaim occurs not because zone memory shortage but
-		 * because memory controller hits its limit.
-		 * Then, don't modify zone reclaim related data.
-		 */
-		nr[LRU_ACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
-					zone, priority, LRU_ACTIVE);
-
-		nr[LRU_INACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
-					zone, priority, LRU_INACTIVE);
 	}
 
-	while (nr[LRU_ACTIVE] || nr[LRU_INACTIVE]) {
+	while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
+			nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
 		for_each_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
@@ -1369,7 +1384,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
 
 	return nr_reclaimed;
 }
- 
+
 /*
  * This is the main entry point to direct page reclaim.
  *
@@ -1412,8 +1427,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 	}
 
@@ -1615,8 +1629,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 
 		/*
@@ -1660,8 +1673,7 @@ loop_again:
 			if (zone_is_all_unreclaimable(zone))
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				(zone_page_state(zone, NR_ACTIVE)
-				+ zone_page_state(zone, NR_INACTIVE)) * 6)
+						(zone_lru_pages(zone) * 6))
 					zone_set_flag(zone,
 						      ZONE_ALL_UNRECLAIMABLE);
 			/*
@@ -1715,7 +1727,7 @@ out:
 
 /*
  * The background pageout daemon, started as a kernel thread
- * from the init process. 
+ * from the init process.
  *
  * This basically trickles out pages so that we have _some_
  * free memory available even if there is no other activity
@@ -1809,6 +1821,14 @@ void wakeup_kswapd(struct zone *zone, int order)
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+unsigned long global_lru_pages(void)
+{
+	return global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_FILE);
+}
+
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
@@ -1834,7 +1854,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 
 		for_each_lru(l) {
 			/* For pass = 0 we don't shrink the active list */
-			if (pass == 0 && l == LRU_ACTIVE)
+			if (pass == 0 &&
+				(l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 				continue;
 
 			zone->lru[l].nr_scan +=
@@ -1856,11 +1877,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 	return ret;
 }
 
-static unsigned long count_lru_pages(void)
-{
-	return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
-}
-
 /*
  * Try to free `nr_pages' of memory, system-wide, and return the number of
  * freed pages.
@@ -1886,7 +1902,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 
-	lru_pages = count_lru_pages();
+	lru_pages = global_lru_pages();
 	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
@@ -1929,7 +1945,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1946,7 +1962,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 52c0335c1b71..27400b7da7c4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -619,8 +619,10 @@ const struct seq_operations pagetypeinfo_op = {
 static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_free_pages",
-	"nr_inactive",
-	"nr_active",
+	"nr_inactive_anon",
+	"nr_active_anon",
+	"nr_inactive_file",
+	"nr_active_file",
 	"nr_anon_pages",
 	"nr_mapped",
 	"nr_file_pages",
@@ -688,7 +690,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
-		   "\n        scanned  %lu (a: %lu i: %lu)"
+		   "\n        scanned  %lu (aa: %lu ia: %lu af: %lu if: %lu)"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
@@ -696,8 +698,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   zone->pages_low,
 		   zone->pages_high,
 		   zone->pages_scanned,
-		   zone->lru[LRU_ACTIVE].nr_scan,
-		   zone->lru[LRU_INACTIVE].nr_scan,
+		   zone->lru[LRU_ACTIVE_ANON].nr_scan,
+		   zone->lru[LRU_INACTIVE_ANON].nr_scan,
+		   zone->lru[LRU_ACTIVE_FILE].nr_scan,
+		   zone->lru[LRU_INACTIVE_FILE].nr_scan,
 		   zone->spanned_pages,
 		   zone->present_pages);
 
-- 
cgit v1.2.3


From 556adecba110bf5f1db6c6b56416cfab5bcab698 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:34 -0700
Subject: vmscan: second chance replacement for anonymous pages

We avoid evicting and scanning anonymous pages for the most part, but
under some workloads we can end up with most of memory filled with
anonymous pages.  At that point, we suddenly need to clear the referenced
bits on all of memory, which can take ages on very large memory systems.

We can reduce the maximum number of pages that need to be scanned by not
taking the referenced state into account when deactivating an anonymous
page.  After all, every anonymous page starts out referenced, so why
check?

If an anonymous page gets referenced again before it reaches the end of
the inactive list, we move it back to the active list.

To keep the maximum amount of necessary work reasonable, we scale the
active to inactive ratio with the size of memory, using the formula
active:inactive ratio = sqrt(memory in GB * 10).

Kswapd CPU use now seems to scale by the amount of pageout bandwidth,
instead of by the amount of memory present in the system.

[kamezawa.hiroyu@jp.fujitsu.com: fix OOM with memcg]
[kamezawa.hiroyu@jp.fujitsu.com: memcg: lru scan fix]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 19 +++++++++++++++++++
 include/linux/mmzone.h    |  6 ++++++
 mm/page_alloc.c           | 41 +++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c               | 38 ++++++++++++++++++++++++++++++++++----
 mm/vmstat.c               |  6 ++++--
 5 files changed, 104 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2eb599465d56..f451fedd1e75 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -117,4 +117,23 @@ static inline enum lru_list page_lru(struct page *page)
 	return lru;
 }
 
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @zone: zone to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ */
+static inline int inactive_anon_is_low(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_ANON);
+	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+	if (inactive * zone->inactive_ratio < active)
+		return 1;
+
+	return 0;
+}
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 59a4c8fd6ebd..9c5111f49a32 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -323,6 +323,12 @@ struct zone {
 	 */
 	int prev_priority;
 
+	/*
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU.  Maintained by the pageout code.
+	 */
+	unsigned int inactive_ratio;
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 740a16a32c22..79c0981b1d32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4263,6 +4263,46 @@ void setup_per_zone_pages_min(void)
 	calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_inactive_ratio - set each zone's inactive_ratio at init.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+void setup_per_zone_inactive_ratio(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		unsigned int gb, ratio;
+
+		/* Zone size in gigabytes */
+		gb = zone->present_pages >> (30 - PAGE_SHIFT);
+		ratio = int_sqrt(10 * gb);
+		if (!ratio)
+			ratio = 1;
+
+		zone->inactive_ratio = ratio;
+	}
+}
+
 /*
  * Initialise min_free_kbytes.
  *
@@ -4300,6 +4340,7 @@ static int __init init_per_zone_pages_min(void)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
 	setup_per_zone_lowmem_reserve();
+	setup_per_zone_inactive_ratio();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d10d2f9a33f3..c82ee9a33cfc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1090,6 +1090,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
+	pgmoved = 0;
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1097,6 +1098,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		list_add(&page->lru, &l_inactive);
 	}
 
+	/*
+	 * Count the referenced pages as rotated, even when they are moved
+	 * to the inactive list.  This helps balance scan pressure between
+	 * file and anonymous pages in get_scan_ratio.
+	 */
+	zone->recent_rotated[!!file] += pgmoved;
+
 	/*
 	 * Now put the pages back on the appropriate [file or anon] inactive
 	 * and active lists.
@@ -1158,7 +1166,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-	zone->recent_rotated[!!file] += pgmoved;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1174,7 +1181,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 {
 	int file = is_file_lru(lru);
 
-	if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
+	if (lru == LRU_ACTIVE_FILE) {
+		shrink_active_list(nr_to_scan, zone, sc, priority, file);
+		return 0;
+	}
+
+	if (lru == LRU_ACTIVE_ANON &&
+	    (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
 		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
@@ -1310,8 +1323,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 		}
 	}
 
-	while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
-			nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
+	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+					nr[LRU_INACTIVE_FILE]) {
 		for_each_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
@@ -1324,6 +1337,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 		}
 	}
 
+	/*
+	 * Even if we did not try to evict anon pages at all, we want to
+	 * rebalance the anon lru active/inactive ratio.
+	 */
+	if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+	else if (!scan_global_lru(sc))
+		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+
 	throttle_vm_writeout(sc->gfp_mask);
 	return nr_reclaimed;
 }
@@ -1617,6 +1639,14 @@ loop_again:
 			    priority != DEF_PRIORITY)
 				continue;
 
+			/*
+			 * Do some background aging of the anon list, to give
+			 * pages a chance to be referenced before reclaiming.
+			 */
+			if (inactive_anon_is_low(zone))
+				shrink_active_list(SWAP_CLUSTER_MAX, zone,
+							&sc, priority, 0);
+
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       0, 0)) {
 				end_zone = i;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 27400b7da7c4..4380b0dba6d9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -738,10 +738,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
-		   "\n  start_pfn:         %lu",
+		   "\n  start_pfn:         %lu"
+		   "\n  inactive_ratio:    %u",
 			   zone_is_all_unreclaimable(zone),
 		   zone->prev_priority,
-		   zone->zone_start_pfn);
+		   zone->zone_start_pfn,
+		   zone->inactive_ratio);
 	seq_putc(m, '\n');
 }
 
-- 
cgit v1.2.3


From 8a7a8544a4f6554ec2d8048ac9f9672f442db5a2 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:37 -0700
Subject: pageflag helpers for configed-out flags

Define proper false/noop inline functions for noreclaim page flags when
!defined(CONFIG_UNEVICTABLE_LRU)

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 57b688cfb5e2..3d31616dcd23 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -162,6 +162,18 @@ static inline int Page##uname(struct page *page) 			\
 #define TESTSCFLAG(uname, lname)					\
 	TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
 
+#define SETPAGEFLAG_NOOP(uname)						\
+static inline void SetPage##uname(struct page *page) {  }
+
+#define CLEARPAGEFLAG_NOOP(uname)					\
+static inline void ClearPage##uname(struct page *page) {  }
+
+#define __CLEARPAGEFLAG_NOOP(uname)					\
+static inline void __ClearPage##uname(struct page *page) {  }
+
+#define TESTCLEARFLAG_FALSE(uname)					\
+static inline int TestClearPage##uname(struct page *page) { return 0; }
+
 struct page;	/* forward declaration */
 
 TESTPAGEFLAG(Locked, locked)
-- 
cgit v1.2.3


From 894bc310419ac95f4fa4142dc364401a7e607f65 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:39 -0700
Subject: Unevictable LRU Infrastructure

When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages.  Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.

This patch adds infrastructure to manage pages excluded from reclaim--i.e.,
hidden from vmscan.  It is based on a patch by Larry Woodman of Red Hat,
reworked to maintain "unevictable" pages on a separate per-zone LRU list,
to "hide" them from vmscan.

Kosaki Motohiro added support for the memory controller unevictable
LRU list.

Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.

The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.

A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable.  Subsequent patches will add the various
!evictable tests.  We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.

To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference.  If the page was placed on the unevictable list but
has since become evictable, putback_lru_page() will isolate it again and
redo the 'putback', thus moving the page to the appropriate evictable list.
This way, we avoid "stranding" evictable pages on the unevictable list.

[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |   2 +-
 include/linux/mm_inline.h  |  23 ++++---
 include/linux/mmzone.h     |  24 +++++++-
 include/linux/page-flags.h |  22 ++++++-
 include/linux/pagevec.h    |   1 -
 include/linux/swap.h       |  12 ++++
 mm/Kconfig                 |  11 ++++
 mm/internal.h              |  26 ++++++++
 mm/memcontrol.c            |  73 +++++++++++++---------
 mm/mempolicy.c             |   2 +-
 mm/migrate.c               |  31 +++++-----
 mm/swap.c                  |  42 +++++++++++--
 mm/vmscan.c                | 149 +++++++++++++++++++++++++++++++++++++++++----
 13 files changed, 345 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8d8f05c1515a..ee1b2fcb4410 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -34,9 +34,9 @@ extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask);
+extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern void mem_cgroup_move_lists(struct page *page, bool active);
 extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
 
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f451fedd1e75..67d7697fd019 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -91,11 +91,16 @@ del_page_from_lru(struct zone *zone, struct page *page)
 	enum lru_list l = LRU_BASE;
 
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		__ClearPageActive(page);
-		l += LRU_ACTIVE;
+	if (PageUnevictable(page)) {
+		__ClearPageUnevictable(page);
+		l = LRU_UNEVICTABLE;
+	} else {
+		if (PageActive(page)) {
+			__ClearPageActive(page);
+			l += LRU_ACTIVE;
+		}
+		l += page_is_file_cache(page);
 	}
-	l += page_is_file_cache(page);
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
@@ -110,9 +115,13 @@ static inline enum lru_list page_lru(struct page *page)
 {
 	enum lru_list lru = LRU_BASE;
 
-	if (PageActive(page))
-		lru += LRU_ACTIVE;
-	lru += page_is_file_cache(page);
+	if (PageUnevictable(page))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageActive(page))
+			lru += LRU_ACTIVE;
+		lru += page_is_file_cache(page);
+	}
 
 	return lru;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9c5111f49a32..d1f60d5fe2ea 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -86,6 +86,11 @@ enum zone_stat_item {
 	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
 	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+#else
+	NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
@@ -128,10 +133,18 @@ enum lru_list {
 	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
 	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
 	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
-	NR_LRU_LISTS };
+#ifdef CONFIG_UNEVICTABLE_LRU
+	LRU_UNEVICTABLE,
+#else
+	LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
+	NR_LRU_LISTS
+};
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
+#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++)
+
 static inline int is_file_lru(enum lru_list l)
 {
 	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
@@ -142,6 +155,15 @@ static inline int is_active_lru(enum lru_list l)
 	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
 }
 
+static inline int is_unevictable_lru(enum lru_list l)
+{
+#ifdef CONFIG_UNEVICTABLE_LRU
+	return (l == LRU_UNEVICTABLE);
+#else
+	return 0;
+#endif
+}
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3d31616dcd23..ec1a1baad348 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -94,6 +94,9 @@ enum pageflags {
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
 	PG_swapbacked,		/* Page is backed by RAM/swap */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	PG_unevictable,		/* Page is "unevictable"  */
+#endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -182,6 +185,7 @@ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+	TESTCLEARFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
@@ -225,6 +229,15 @@ PAGEFLAG(SwapCache, swapcache)
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
+	TESTCLEARFLAG(Unevictable, unevictable)
+#else
+PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
+	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
+	__CLEARPAGEFLAG_NOOP(Unevictable)
+#endif
+
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 PAGEFLAG(Uncached, uncached)
 #else
@@ -340,9 +353,16 @@ static inline void __ClearPageTail(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define __PG_UNEVICTABLE (1 << PG_unevictable)
+#else
+#define __PG_UNEVICTABLE 0
+#endif
+
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
 			 1 << PG_buddy | 1 << PG_writeback | \
-			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active)
+			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
+			 __PG_UNEVICTABLE)
 
 /*
  * Flags checked in bad_page().  Pages on the free list should not have
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5fc96a4e760f..e90a2cb02915 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -101,7 +101,6 @@ static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
 	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
 }
 
-
 static inline void pagevec_lru_add_file(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7d09d79997a4..a2113044d20a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -180,6 +180,8 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+extern void add_page_to_unevictable_list(struct page *page);
+
 /**
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
@@ -228,6 +230,16 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+extern int page_evictable(struct page *page, struct vm_area_struct *vma);
+#else
+static inline int page_evictable(struct page *page,
+						struct vm_area_struct *vma)
+{
+	return 1;
+}
+#endif
+
 extern int kswapd_run(int nid);
 
 #ifdef CONFIG_MMU
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a501a4de95c..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -209,5 +209,16 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
+config UNEVICTABLE_LRU
+	bool "Add LRU list to track non-evictable pages"
+	default y
+	depends on MMU
+	help
+	  Keeps unevictable pages off of the active and inactive pageout
+	  lists, so kswapd will not waste CPU time or have its balancing
+	  algorithms thrown off by scanning these pages.  Selecting this
+	  will use one page flag and increase the code size a little,
+	  say Y unless you know what you are doing.
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/internal.h b/mm/internal.h
index 4e8e78b978b5..3db17b2a1ac6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,8 +39,15 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
 extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
 
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -54,6 +61,25 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageUnevictable(old))
+		SetPageUnevictable(new);
+}
+#else
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 27e9e75f4eab..82c065e7551e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -160,9 +160,10 @@ struct page_cgroup {
 	struct mem_cgroup *mem_cgroup;
 	int flags;
 };
-#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE	(0x4)	/* page is file system backed */
+#define PAGE_CGROUP_FLAG_CACHE	   (0x1)	/* charged as cache */
+#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
+#define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
+#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictable */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -292,10 +293,14 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
-	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
-		lru += LRU_ACTIVE;
-	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
-		lru += LRU_FILE;
+	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+			lru += LRU_ACTIVE;
+		if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+			lru += LRU_FILE;
+	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
@@ -308,10 +313,14 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
-	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
-		lru += LRU_ACTIVE;
-	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
-		lru += LRU_FILE;
+	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+			lru += LRU_ACTIVE;
+		if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+			lru += LRU_FILE;
+	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_add(&pc->lru, &mz->lists[lru]);
@@ -319,21 +328,31 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 {
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
-	int file = pc->flags & PAGE_CGROUP_FLAG_FILE;
-	int lru = LRU_FILE * !!file + !!from;
+	int active    = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int file      = pc->flags & PAGE_CGROUP_FLAG_FILE;
+	int unevictable = pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE;
+	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+				(LRU_FILE * !!file + !!active);
 
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (lru == from)
+		return;
 
-	if (active)
-		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-	else
+	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+
+	if (is_unevictable_lru(lru)) {
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags |= PAGE_CGROUP_FLAG_UNEVICTABLE;
+	} else {
+		if (is_active_lru(lru))
+			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
+		else
+			pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags &= ~PAGE_CGROUP_FLAG_UNEVICTABLE;
+	}
 
-	lru = LRU_FILE * !!file + !!active;
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_move(&pc->lru, &mz->lists[lru]);
 }
@@ -351,7 +370,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page *page, bool active)
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
@@ -374,7 +393,7 @@ void mem_cgroup_move_lists(struct page *page, bool active)
 	if (pc) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, active);
+		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
 	unlock_page_cgroup(page);
@@ -472,12 +491,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		/*
 		 * TODO: play better with lumpy reclaim, grabbing anything.
 		 */
-		if (PageActive(page) && !active) {
-			__mem_cgroup_move_lists(pc, true);
-			continue;
-		}
-		if (!PageActive(page) && active) {
-			__mem_cgroup_move_lists(pc, false);
+		if (PageUnevictable(page) ||
+		    (PageActive(page) && !active) ||
+		    (!PageActive(page) && active)) {
+			__mem_cgroup_move_lists(pc, page_lru(page));
 			continue;
 		}
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71b47491487d..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2202,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
 	if (PageSwapCache(page))
 		md->swapcache++;
 
-	if (PageActive(page))
+	if (PageActive(page) || PageUnevictable(page))
 		md->active++;
 
 	if (PageWriteback(page))
diff --git a/mm/migrate.c b/mm/migrate.c
index c07327487111..b10237d8b459 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -53,14 +53,9 @@ int migrate_prep(void)
 	return 0;
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	lru_cache_add_lru(page, page_lru(page));
-	put_page(page);
-}
-
 /*
- * Add isolated pages on the list back to the LRU.
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking evictable pages back onto unevictable list.
  *
  * returns the number of pages put back.
  */
@@ -72,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
-		move_to_lru(page);
+		putback_lru_page(page);
 		count++;
 	}
 	return count;
@@ -354,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		SetPageReferenced(newpage);
 	if (PageUptodate(page))
 		SetPageUptodate(newpage);
-	if (PageActive(page))
+	if (TestClearPageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
 		SetPageActive(newpage);
+	} else
+		unevictable_migrate_page(newpage, page);
 	if (PageChecked(page))
 		SetPageChecked(newpage);
 	if (PageMappedToDisk(page))
@@ -376,7 +374,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 #ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
 #endif
-	ClearPageActive(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
@@ -555,6 +552,10 @@ static int fallback_migrate_page(struct address_space *mapping,
  *
  * The new page will have replaced the old page if this function
  * is successful.
+ *
+ * Return value:
+ *   < 0 - error code
+ *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page)
 {
@@ -617,9 +618,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!newpage)
 		return -ENOMEM;
 
-	if (page_count(page) == 1)
+	if (page_count(page) == 1) {
 		/* page was freed from under us. So we are done. */
 		goto move_newpage;
+	}
 
 	charge = mem_cgroup_prepare_migration(page, newpage);
 	if (charge == -ENOMEM) {
@@ -693,7 +695,6 @@ rcu_unlock:
 		rcu_read_unlock();
 
 unlock:
-
 	unlock_page(page);
 
 	if (rc != -EAGAIN) {
@@ -704,17 +705,19 @@ unlock:
  		 * restored.
  		 */
  		list_del(&page->lru);
- 		move_to_lru(page);
+		putback_lru_page(page);
 	}
 
 move_newpage:
 	if (!charge)
 		mem_cgroup_end_migration(newpage);
+
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
 	 */
-	move_to_lru(newpage);
+	putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
diff --git a/mm/swap.c b/mm/swap.c
index 0b1974a08974..fee6b973f143 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -115,7 +115,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock(&zone->lru_lock);
 		}
-		if (PageLRU(page) && !PageActive(page)) {
+		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 			int lru = page_is_file_cache(page);
 			list_move_tail(&page->lru, &zone->lru[lru].list);
 			pgmoved++;
@@ -136,7 +136,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 void  rotate_reclaimable_page(struct page *page)
 {
 	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
-	    PageLRU(page)) {
+	    !PageUnevictable(page) && PageLRU(page)) {
 		struct pagevec *pvec;
 		unsigned long flags;
 
@@ -157,7 +157,7 @@ void activate_page(struct page *page)
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
+	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		int file = page_is_file_cache(page);
 		int lru = LRU_BASE + file;
 		del_page_from_lru_list(zone, page, lru);
@@ -166,7 +166,7 @@ void activate_page(struct page *page)
 		lru += LRU_ACTIVE;
 		add_page_to_lru_list(zone, page, lru);
 		__count_vm_event(PGACTIVATE);
-		mem_cgroup_move_lists(page, true);
+		mem_cgroup_move_lists(page, lru);
 
 		zone->recent_rotated[!!file]++;
 		zone->recent_scanned[!!file]++;
@@ -183,7 +183,8 @@ void activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+	if (!PageActive(page) && !PageUnevictable(page) &&
+			PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
@@ -211,13 +212,38 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
 	if (PageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
 		ClearPageActive(page);
+	} else if (PageUnevictable(page)) {
+		VM_BUG_ON(PageActive(page));
+		ClearPageUnevictable(page);
 	}
 
-	VM_BUG_ON(PageLRU(page) || PageActive(page));
+	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
 	__lru_cache_add(page, lru);
 }
 
+/**
+ * add_page_to_unevictable_list - add a page to the unevictable list
+ * @page:  the page to be added to the unevictable list
+ *
+ * Add page directly to its zone's unevictable list.  To avoid races with
+ * tasks that might be making the page evictable, through eg. munlock,
+ * munmap or exit, while it's not on the lru, we want to add the page
+ * while it's locked or otherwise "invisible" to other tasks.  This is
+ * difficult to do when using the pagevec cache, so bypass that.
+ */
+void add_page_to_unevictable_list(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	SetPageUnevictable(page);
+	SetPageLRU(page);
+	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+	spin_unlock_irq(&zone->lru_lock);
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -316,6 +342,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
+
 			if (pagezone != zone) {
 				if (zone)
 					spin_unlock_irqrestore(&zone->lru_lock,
@@ -392,6 +419,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
 	int i;
 	struct zone *zone = NULL;
+	VM_BUG_ON(is_unevictable_lru(lru));
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
@@ -403,6 +431,8 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(PageUnevictable(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		if (is_active_lru(lru))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8347b677e74..154b9b608da6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,6 +470,79 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 	return 0;
 }
 
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+#ifdef CONFIG_UNEVICTABLE_LRU
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	int active = !!TestClearPageActive(page);
+
+	VM_BUG_ON(PageLRU(page));
+
+redo:
+	ClearPageUnevictable(page);
+
+	if (page_evictable(page, NULL)) {
+		/*
+		 * For evictable pages, we can use the cache.
+		 * In event of a race, worst case is we end up with an
+		 * unevictable page on [in]active list.
+		 * We know how to handle that.
+		 */
+		lru = active + page_is_file_cache(page);
+		lru_cache_add_lru(page, lru);
+	} else {
+		/*
+		 * Put unevictable pages directly on zone's unevictable
+		 * list.
+		 */
+		lru = LRU_UNEVICTABLE;
+		add_page_to_unevictable_list(page);
+	}
+	mem_cgroup_move_lists(page, lru);
+
+	/*
+	 * The page's status can change while we move it between LRU lists.
+	 * If an evictable page is left on the unevictable list, it will never
+	 * be freed.  To avoid that, check again after adding it to the list.
+	 */
+	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+		if (!isolate_lru_page(page)) {
+			put_page(page);
+			goto redo;
+		}
+		/* This means someone else dropped this page from the LRU,
+		 * so it will be freed or put back on the LRU again.  There
+		 * is nothing to do here.
+		 */
+	}
+
+	put_page(page);		/* drop ref from isolate */
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	VM_BUG_ON(PageLRU(page));
+
+	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
+	lru_cache_add_lru(page, lru);
+	mem_cgroup_move_lists(page, lru);
+	put_page(page);
+}
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -503,6 +576,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
+		if (unlikely(!page_evictable(page, NULL))) {
+			unlock_page(page);
+			putback_lru_page(page);
+			continue;
+		}
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
@@ -602,7 +681,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * possible for a page to have PageDirty set, but it is actually
 		 * clean (all its buffers are clean).  This happens if the
 		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping. 
+		 * will do this, as well as the blockdev mapping.
 		 * try_to_release_page() will discover that cleanness and will
 		 * drop the buffers and mark the page clean - it can be freed.
 		 *
@@ -650,6 +729,7 @@ activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
 			remove_exclusive_swap_page_ref(page);
+		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
@@ -699,6 +779,14 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
 		return ret;
 
+	/*
+	 * When this function is being called for lumpy reclaim, we
+	 * initially look into all LRU pages, active, inactive and
+	 * unevictable; only give shrink_page_list evictable pages.
+	 */
+	if (PageUnevictable(page))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -810,7 +898,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				/* else it is being freed elsewhere */
 				list_move(&cursor_page->lru, src);
 			default:
-				break;
+				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -870,8 +958,9 @@ static unsigned long clear_active_flags(struct list_head *page_list,
  * Returns -EBUSY if the page was not on an LRU list.
  *
  * The returned page will have PageLRU() cleared.  If it was found on
- * the active list, it will have PageActive set.  That flag may need
- * to be cleared by the caller before letting the page go.
+ * the active list, it will have PageActive set.  If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
  *
  * The vmstat statistic corresponding to the list on which the page was
  * found will be decremented.
@@ -892,11 +981,10 @@ int isolate_lru_page(struct page *page)
 
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page) && get_page_unless_zero(page)) {
-			int lru = LRU_BASE;
+			int lru = page_lru(page);
 			ret = 0;
 			ClearPageLRU(page);
 
-			lru += page_is_file_cache(page) + !!PageActive(page);
 			del_page_from_lru_list(zone, page, lru);
 		}
 		spin_unlock_irq(&zone->lru_lock);
@@ -1008,11 +1096,20 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * Put back any unfreeable pages.
 		 */
 		while (!list_empty(&page_list)) {
+			int lru;
 			page = lru_to_page(&page_list);
 			VM_BUG_ON(PageLRU(page));
-			SetPageLRU(page);
 			list_del(&page->lru);
-			add_page_to_lru_list(zone, page, page_lru(page));
+			if (unlikely(!page_evictable(page, NULL))) {
+				spin_unlock_irq(&zone->lru_lock);
+				putback_lru_page(page);
+				spin_lock_irq(&zone->lru_lock);
+				continue;
+			}
+			SetPageLRU(page);
+			lru = page_lru(page);
+			add_page_to_lru_list(zone, page, lru);
+			mem_cgroup_move_lists(page, lru);
 			if (PageActive(page) && scan_global_lru(sc)) {
 				int file = !!page_is_file_cache(page);
 				zone->recent_rotated[file]++;
@@ -1107,6 +1204,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
+		if (unlikely(!page_evictable(page, NULL))) {
+			putback_lru_page(page);
+			continue;
+		}
+
 		/* page_referenced clears PageReferenced */
 		if (page_mapping_inuse(page) &&
 		    page_referenced(page, 0, sc->mem_cgroup))
@@ -1140,7 +1242,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->lru[lru].list);
-		mem_cgroup_move_lists(page, false);
+		mem_cgroup_move_lists(page, lru);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1286,7 +1388,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 
 	get_scan_ratio(zone, sc, percent);
 
-	for_each_lru(l) {
+	for_each_evictable_lru(l) {
 		if (scan_global_lru(sc)) {
 			int file = is_file_lru(l);
 			int scan;
@@ -1318,7 +1420,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
-		for_each_lru(l) {
+		for_each_evictable_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
 					(unsigned long)sc->swap_cluster_max);
@@ -1875,8 +1977,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
-		for_each_lru(l) {
-			/* For pass = 0 we don't shrink the active list */
+		for_each_evictable_lru(l) {
+			/* For pass = 0, we don't shrink the active list */
 			if (pass == 0 &&
 				(l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 				continue;
@@ -2213,3 +2315,24 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return ret;
 }
 #endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * TODO - later patches
+ */
+int page_evictable(struct page *page, struct vm_area_struct *vma)
+{
+
+	/* TODO:  test page [!]evictable conditions */
+
+	return 1;
+}
+#endif
-- 
cgit v1.2.3


From bbfd28eee9fbd73e780b19beb3dc562befbb94fa Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:40 -0700
Subject: unevictable lru: add event counting with statistics

Fix to unevictable-lru-page-statistics.patch

Add unevictable lru infrastructure vm events to the statistics patch.
Rename the "NORECL_" and "noreclaim_" symbols and text strings to
"UNEVICTABLE_" and "unevictable_", respectively.

Currently, both the infrastructure and the mlocked pages event are
added by a single patch later in the series.  This makes it difficult
to add or rework the incremental patches.  The events actually "belong"
with the stats, so pull them up to here.

Also, restore the event counting to putback_lru_page().  This was removed
from previous patch in series where it was "misplaced".  The actual events
weren't defined that early.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 5 +++++
 mm/vmscan.c            | 6 ++++++
 mm/vmstat.c            | 5 +++++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ff5179f2b153..135840cd7feb 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -40,6 +40,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
+#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
+		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
+		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 154b9b608da6..2804d23e2da7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -484,6 +484,7 @@ void putback_lru_page(struct page *page)
 {
 	int lru;
 	int active = !!TestClearPageActive(page);
+	int was_unevictable = PageUnevictable(page);
 
 	VM_BUG_ON(PageLRU(page));
 
@@ -525,6 +526,11 @@ redo:
 		 */
 	}
 
+	if (was_unevictable && lru != LRU_UNEVICTABLE)
+		count_vm_event(UNEVICTABLE_PGRESCUED);
+	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
+		count_vm_event(UNEVICTABLE_PGCULLED);
+
 	put_page(page);		/* drop ref from isolate */
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4380b0dba6d9..6cb08cdd4f03 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -677,6 +677,11 @@ static const char * const vmstat_text[] = {
 	"htlb_buddy_alloc_success",
 	"htlb_buddy_alloc_fail",
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	"unevictable_pgs_culled",
+	"unevictable_pgs_scanned",
+	"unevictable_pgs_rescued",
+#endif
 #endif
 };
 
-- 
cgit v1.2.3


From ba9ddf49391645e6bb93219131a40446538a5e76 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:42 -0700
Subject: Ramfs and Ram Disk pages are unevictable

Christoph Lameter pointed out that ram disk pages also clutter the LRU
lists.  When vmscan finds them dirty and tries to clean them, the ram disk
writeback function just redirties the page so that it goes back onto the
active list.  Round and round she goes...

With the ram disk driver [rd.c] replaced by the newer 'brd.c', this is no
longer the case, as ram disk pages are no longer maintained on the lru.
[This makes them unmigratable for defrag or memory hot remove, but that
can be addressed by a separate patch series.] However, the ramfs pages
behave like ram disk pages used to, so:

Define new address_space flag [shares address_space flags member with
mapping's gfp mask] to indicate that the address space contains all
unevictable pages.  This will provide for efficient testing of ramfs pages
in page_evictable().

Also provide wrapper functions to set/test the unevictable state to
minimize #ifdefs in ramfs driver and any other users of this facility.

Set the unevictable state on address_space structures for new ramfs
inodes.  Test the unevictable state in page_evictable() to cull
unevictable pages.

These changes depend on [CONFIG_]UNEVICTABLE_LRU.

[riel@redhat.com: undo the brd.c part]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Debugged-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/inode.c        |  1 +
 include/linux/pagemap.h | 22 ++++++++++++++++++++++
 mm/vmscan.c             |  5 +++++
 3 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b13123424e49..f031d1c925f0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5da31c12101c..09164d2c5c27 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -32,6 +32,28 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
 	}
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define AS_UNEVICTABLE	(__GFP_BITS_SHIFT + 2)	/* e.g., ramdisk, SHM_LOCK */
+
+static inline void mapping_set_unevictable(struct address_space *mapping)
+{
+	set_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	if (mapping && (mapping->flags & AS_UNEVICTABLE))
+		return 1;
+	return 0;
+}
+#else
+static inline void mapping_set_unevictable(struct address_space *mapping) { }
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	return 0;
+}
+#endif
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2804d23e2da7..9babfbc1ddc8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2332,11 +2332,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
  * lists vs unevictable list.
  *
  * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ *
  * TODO - later patches
  */
 int page_evictable(struct page *page, struct vm_area_struct *vma)
 {
 
+	if (mapping_unevictable(page_mapping(page)))
+		return 0;
+
 	/* TODO:  test page [!]evictable conditions */
 
 	return 1;
-- 
cgit v1.2.3


From 89e004ea55abe201b29e2d6e35124101f1288ef7 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:43 -0700
Subject: SHM_LOCKED pages are unevictable

Shmem segments locked into memory via shmctl(SHM_LOCK) should not be
kept on the normal LRU, since scanning them is a waste of time and might
throw off kswapd's balancing algorithms.  Place them on the unevictable
LRU list instead.

Use the AS_UNEVICTABLE flag to mark address_space of SHM_LOCKed shared
memory regions as unevictable.  Then these pages will be culled off the
normal LRU lists during vmscan.

Add new wrapper function to clear the mapping's unevictable state when/if
shared memory segment is munlocked.

Add 'scan_mapping_unevictable_pages()' to mm/vmscan.c to scan all pages in
the shmem segment's mapping [struct address_space] for evictability now
that they're no longer locked.  If they are evictable, move them to the
appropriate zone lru list.

Changes depend on [CONFIG_]UNEVICTABLE_LRU.

[kosaki.motohiro@jp.fujitsu.com: revert shm change]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Kosaki Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h      |  4 +--
 include/linux/pagemap.h | 12 +++++--
 include/linux/swap.h    |  4 +++
 ipc/shm.c               |  4 +++
 mm/shmem.c              |  4 +++
 mm/vmscan.c             | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 112 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c61ba10768ea..40236290e2ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -700,10 +700,10 @@ static inline int page_mapped(struct page *page)
 extern void show_free_areas(void);
 
 #ifdef CONFIG_SHMEM
-int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 #else
 static inline int shmem_lock(struct file *file, int lock,
-			     struct user_struct *user)
+			    struct user_struct *user)
 {
 	return 0;
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09164d2c5c27..4b6c4d8d26b8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -40,14 +40,20 @@ static inline void mapping_set_unevictable(struct address_space *mapping)
 	set_bit(AS_UNEVICTABLE, &mapping->flags);
 }
 
+static inline void mapping_clear_unevictable(struct address_space *mapping)
+{
+	clear_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
 static inline int mapping_unevictable(struct address_space *mapping)
 {
-	if (mapping && (mapping->flags & AS_UNEVICTABLE))
-		return 1;
-	return 0;
+	if (likely(mapping))
+		return test_bit(AS_UNEVICTABLE, &mapping->flags);
+	return !!mapping;
 }
 #else
 static inline void mapping_set_unevictable(struct address_space *mapping) { }
+static inline void mapping_clear_unevictable(struct address_space *mapping) { }
 static inline int mapping_unevictable(struct address_space *mapping)
 {
 	return 0;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2113044d20a..7edb4cbc29f9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -232,12 +232,16 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
+extern void scan_mapping_unevictable_pages(struct address_space *);
 #else
 static inline int page_evictable(struct page *page,
 						struct vm_area_struct *vma)
 {
 	return 1;
 }
+static inline void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+}
 #endif
 
 extern int kswapd_run(int nid);
diff --git a/ipc/shm.c b/ipc/shm.c
index e77ec698cf40..0add3fa5f547 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -737,6 +737,10 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 	case SHM_LOCK:
 	case SHM_UNLOCK:
 	{
+		struct file *uninitialized_var(shm_file);
+
+		lru_add_drain_all();  /* drain pagevecs to lru lists */
+
 		shp = shm_lock_check(ns, shmid);
 		if (IS_ERR(shp)) {
 			err = PTR_ERR(shp);
diff --git a/mm/shmem.c b/mm/shmem.c
index fc2ccf79a776..d38d7e61fcd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1477,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 		if (!user_shm_lock(inode->i_size, user))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
+		mapping_set_unevictable(file->f_mapping);
 	}
 	if (!lock && (info->flags & VM_LOCKED) && user) {
 		user_shm_unlock(inode->i_size, user);
 		info->flags &= ~VM_LOCKED;
+		mapping_clear_unevictable(file->f_mapping);
+		scan_mapping_unevictable_pages(file->f_mapping);
 	}
 	retval = 0;
+
 out_nomem:
 	spin_unlock(&info->lock);
 	return retval;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9babfbc1ddc8..dfb342e0db9b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2346,4 +2346,93 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 
 	return 1;
 }
+
+/**
+ * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
+ * @page: page to check evictability and move to appropriate lru list
+ * @zone: zone page is in
+ *
+ * Checks a page for evictability and moves the page to the appropriate
+ * zone lru list.
+ *
+ * Restrictions: zone->lru_lock must be held, page must be on LRU and must
+ * have PageUnevictable set.
+ */
+static void check_move_unevictable_page(struct page *page, struct zone *zone)
+{
+	VM_BUG_ON(PageActive(page));
+
+retry:
+	ClearPageUnevictable(page);
+	if (page_evictable(page, NULL)) {
+		enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
+		__dec_zone_state(zone, NR_UNEVICTABLE);
+		list_move(&page->lru, &zone->lru[l].list);
+		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
+		__count_vm_event(UNEVICTABLE_PGRESCUED);
+	} else {
+		/*
+		 * rotate unevictable list
+		 */
+		SetPageUnevictable(page);
+		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+		if (page_evictable(page, NULL))
+			goto retry;
+	}
+}
+
+/**
+ * scan_mapping_unevictable_pages - scan an address space for evictable pages
+ * @mapping: struct address_space to scan for evictable pages
+ *
+ * Scan all pages in mapping.  Check unevictable pages for
+ * evictability and move them to the appropriate zone lru list.
+ */
+void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+	pgoff_t next = 0;
+	pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
+			 PAGE_CACHE_SHIFT;
+	struct zone *zone;
+	struct pagevec pvec;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	pagevec_init(&pvec, 0);
+	while (next < end &&
+		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		int i;
+		int pg_scanned = 0;
+
+		zone = NULL;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+			struct zone *pagezone = page_zone(page);
+
+			pg_scanned++;
+			if (page_index > next)
+				next = page_index;
+			next++;
+
+			if (pagezone != zone) {
+				if (zone)
+					spin_unlock_irq(&zone->lru_lock);
+				zone = pagezone;
+				spin_lock_irq(&zone->lru_lock);
+			}
+
+			if (PageLRU(page) && PageUnevictable(page))
+				check_move_unevictable_page(page, zone);
+		}
+		if (zone)
+			spin_unlock_irq(&zone->lru_lock);
+		pagevec_release(&pvec);
+
+		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
+	}
+
+}
 #endif
-- 
cgit v1.2.3


From b291f000393f5a0b679012b39d79fbc85c018233 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:44 -0700
Subject: mlock: mlocked pages are unevictable

Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.

This is achieved through various strategies:

1) add yet another page flag--PG_mlocked--to indicate that
   the page is locked for efficient testing in vmscan and,
   optionally, fault path.  This allows early culling of
   unevictable pages, preventing them from getting to
   page_referenced()/try_to_unmap().  Also allows separate
   accounting of mlock'd pages, as Nick's original patch
   did.

   Note:  Nick's original mlock patch used a PG_mlocked
   flag.  I had removed this in favor of the PG_unevictable
   flag + an mlock_count [new page struct member].  I
   restored the PG_mlocked flag to eliminate the new
   count field.

2) add the mlock/unevictable infrastructure to mm/mlock.c,
   with internal APIs in mm/internal.h.  This is a rework
   of Nick's original patch to these files, taking into
   account that mlocked pages are now kept on unevictable
   LRU list.

3) update vmscan.c:page_evictable() to check PageMlocked()
   and, if vma passed in, the vm_flags.  Note that the vma
   will only be passed in for new pages in the fault path;
   and then only if the "cull unevictable pages in fault
   path" patch is included.

4) add try_to_munlock() to rmap.c to walk a page's rmap and
   ClearPageMlocked() if no other vmas have it mlocked.
   Reuses as much of try_to_unmap() as possible.  This
   effectively replaces the use of one of the lru list links
   as an mlock count.  If this mechanism lets pages in mlocked
   vmas leak through w/o PG_mlocked set [I don't know that it
   does], we should catch them later in try_to_unmap().  One
   hopes this will be rare, as it will be relatively expensive.

Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>

splitlru: introduce __get_user_pages():

  New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
  because the current get_user_pages() can't grab PROT_NONE pages and
  therefore PROT_NONE pages can't be munlocked.

[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h         |   5 +
 include/linux/page-flags.h |  19 ++-
 include/linux/rmap.h       |  14 ++
 mm/internal.h              |  71 ++++++++
 mm/memory.c                |  56 ++++++-
 mm/migrate.c               |   2 +
 mm/mlock.c                 | 394 ++++++++++++++++++++++++++++++++++++++++++---
 mm/mmap.c                  |   2 -
 mm/nommu.c                 |  44 +++--
 mm/page_alloc.c            |   6 +-
 mm/rmap.c                  | 257 ++++++++++++++++++++++++-----
 mm/swap.c                  |   2 +-
 mm/vmscan.c                |  36 +++--
 13 files changed, 817 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 40236290e2ae..ffee2f743418 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -131,6 +131,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+/*
+ * special vmas that are non-mergable, non-mlock()able
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec1a1baad348..b12f93a3c345 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -96,6 +96,7 @@ enum pageflags {
 	PG_swapbacked,		/* Page is backed by RAM/swap */
 #ifdef CONFIG_UNEVICTABLE_LRU
 	PG_unevictable,		/* Page is "unevictable"  */
+	PG_mlocked,		/* Page is vma mlocked */
 #endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
@@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache)
 #ifdef CONFIG_UNEVICTABLE_LRU
 PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
 	TESTCLEARFLAG(Unevictable, unevictable)
+
+#define MLOCK_PAGES 1
+PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
+	TESTSCFLAG(Mlocked, mlocked)
+
 #else
+
+#define MLOCK_PAGES 0
+PAGEFLAG_FALSE(Mlocked)
+	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
+
 PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
 	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
 	__CLEARPAGEFLAG_NOOP(Unevictable)
@@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page)
 #endif /* !PAGEFLAGS_EXTENDED */
 
 #ifdef CONFIG_UNEVICTABLE_LRU
-#define __PG_UNEVICTABLE (1 << PG_unevictable)
+#define __PG_UNEVICTABLE	(1 << PG_unevictable)
+#define __PG_MLOCKED		(1 << PG_mlocked)
 #else
-#define __PG_UNEVICTABLE 0
+#define __PG_UNEVICTABLE	0
+#define __PG_MLOCKED		0
 #endif
 
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
 			 1 << PG_buddy | 1 << PG_writeback | \
 			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
-			 __PG_UNEVICTABLE)
+			 __PG_UNEVICTABLE | __PG_MLOCKED)
 
 /*
  * Flags checked in bad_page().  Pages on the free list should not have
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fed6f5e0b411..955667e6a52d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  */
 int page_mkclean(struct page *);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * called in munlock()/munmap() path to check for other vmas holding
+ * the page mlocked.
+ */
+int try_to_munlock(struct page *);
+#else
+static inline int try_to_munlock(struct page *page)
+{
+	return 0;	/* a.k.a. SWAP_SUCCESS */
+}
+#endif
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
@@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page)
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
diff --git a/mm/internal.h b/mm/internal.h
index 3db17b2a1ac6..4ebf0bef9a39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -61,6 +61,10 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+extern int mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+extern void munlock_vma_pages_all(struct vm_area_struct *vma);
+
 #ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * unevictable_migrate_page() called only from migrate_page_copy() to
@@ -79,6 +83,65 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
 }
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Called only in fault path via page_evictable() for a new page
+ * to determine if it's being mapped into a LOCKED vma.
+ * If so, mark page as mlocked.
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+	VM_BUG_ON(PageLRU(page));
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+		return 0;
+
+	SetPageMlocked(page);
+	return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked().  This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void __clear_page_mlock(struct page *page);
+static inline void clear_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page)))
+		__clear_page_mlock(page);
+}
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+	if (TestClearPageMlocked(page))
+		SetPageMlocked(newpage);
+}
+
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+{
+	return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
 
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
@@ -148,4 +211,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
+#define GUP_FLAGS_WRITE                  0x1
+#define GUP_FLAGS_FORCE                  0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		     struct page **pages, struct vm_area_struct **vmas);
+
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 71cdefd1ef14..9fef7272fb9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,8 @@
 
 #include "internal.h"
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
+
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags;
+	unsigned int vm_flags = 0;
+	int write = !!(flags & GUP_FLAGS_WRITE);
+	int force = !!(flags & GUP_FLAGS_FORCE);
+	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 
 	if (len <= 0)
 		return 0;
@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pud_t *pud;
 			pmd_t *pmd;
 			pte_t *pte;
-			if (write) /* user gate pages are read-only */
+
+			/* user gate pages are read-only */
+			if (!ignore && write)
 				return i ? : -EFAULT;
 			if (pg > TASK_SIZE)
 				pgd = pgd_offset_k(pg);
@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
-				|| !(vm_flags & vma->vm_flags))
+		if (!vma ||
+		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+		    (!ignore && !(vm_flags & vma->vm_flags)))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	} while (len);
 	return i;
 }
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int flags = 0;
+
+	if (write)
+		flags |= GUP_FLAGS_WRITE;
+	if (force)
+		flags |= GUP_FLAGS_FORCE;
+
+	return __get_user_pages(tsk, mm,
+				start, len, flags,
+				pages, vmas);
+}
+
 EXPORT_SYMBOL(get_user_pages);
 
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1858,6 +1885,15 @@ gotten:
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (!new_page)
 		goto oom;
+	/*
+	 * Don't let another task, with possibly unlocked vma,
+	 * keep the mlocked page.
+	 */
+	if (vma->vm_flags & VM_LOCKED) {
+		lock_page(old_page);	/* for LRU manipulation */
+		clear_page_mlock(old_page);
+		unlock_page(old_page);
+	}
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
@@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_add_anon_rmap(page, vma, address);
 
 	swap_free(entry);
-	if (vm_swap_full())
+	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		remove_exclusive_swap_page(page);
 	unlock_page(page);
 
@@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_OOM;
 				goto out;
 			}
+			/*
+			 * Don't let another task, with possibly unlocked vma,
+			 * keep the mlocked page.
+			 */
+			if (vma->vm_flags & VM_LOCKED)
+				clear_page_mlock(vmf.page);
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
diff --git a/mm/migrate.c b/mm/migrate.c
index b10237d8b459..6802a7a3dfec 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		__set_page_dirty_nobuffers(newpage);
  	}
 
+	mlock_migrate_page(newpage, page);
+
 #ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
 #endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..8746fe3f9730 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+
+#include "internal.h"
 
 int can_do_mlock(void)
 {
@@ -23,17 +31,360 @@ int can_do_mlock(void)
 }
 EXPORT_SYMBOL(can_do_mlock);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ *  LRU accounting for clear_page_mlock()
+ */
+void __clear_page_mlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+
+	if (!page->mapping) {	/* truncated ? */
+		return;
+	}
+
+	if (!isolate_lru_page(page)) {
+		putback_lru_page(page);
+	} else {
+		/*
+		 * Page not on the LRU yet.  Flush all pagevecs and retry.
+		 */
+		lru_add_drain_all();
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+	}
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
+		putback_lru_page(page);
+}
+
+/*
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ *
+ * Note:  unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * [in try_to_munlock()] and then attempt to isolate the page.  We must
+ * isolate the page to keep others from messing with its unevictable
+ * and mlocked state while trying to munlock.  However, we pre-clear the
+ * mlocked state anyway as we might lose the isolation race and we might
+ * not get another chance to clear PageMlocked.  If we successfully
+ * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * mapping the page, it will restore the PageMlocked state, unless the page
+ * is mapped in a non-linear vma.  So, we go ahead and SetPageMlocked(),
+ * perhaps redundantly.
+ * If we lose the isolation race, and the page is mapped by other VM_LOCKED
+ * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
+ * either of which will restore the PageMlocked state by calling
+ * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ */
+static void munlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
+		try_to_munlock(page);
+		putback_lru_page(page);
+	}
+}
+
+/*
+ * mlock a range of pages in the vma.
+ *
+ * This takes care of making the pages present too.
+ *
+ * vma->vm_mm->mmap_sem must be held for write.
+ */
+static int __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	struct page *pages[16]; /* 16 gives a reasonable batch */
+	int write = !!(vma->vm_flags & VM_WRITE);
+	int nr_pages = (end - start) / PAGE_SIZE;
+	int ret;
+
+	VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
+	VM_BUG_ON(start < vma->vm_start || end > vma->vm_end);
+	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+
+	lru_add_drain_all();	/* push cached pages to LRU */
+
+	while (nr_pages > 0) {
+		int i;
+
+		cond_resched();
+
+		/*
+		 * get_user_pages makes pages present if we are
+		 * setting mlock, and this extra reference count will
+		 * disable migration of this page.  However, page may
+		 * still be truncated out from under us.
+		 */
+		ret = get_user_pages(current, mm, addr,
+				min_t(int, nr_pages, ARRAY_SIZE(pages)),
+				write, 0, pages, NULL);
+		/*
+		 * This can happen for, e.g., VM_NONLINEAR regions before
+		 * a page has been allocated and mapped at a given offset,
+		 * or for addresses that map beyond end of a file.
+		 * We'll mlock the pages if/when they get faulted in.
+		 */
+		if (ret < 0)
+			break;
+		if (ret == 0) {
+			/*
+			 * We know the vma is there, so the only time
+			 * we cannot get a single page should be an
+			 * error (ret < 0) case.
+			 */
+			WARN_ON(1);
+			break;
+		}
+
+		lru_add_drain();	/* push cached pages to LRU */
+
+		for (i = 0; i < ret; i++) {
+			struct page *page = pages[i];
+
+			lock_page(page);
+			/*
+			 * Because we lock page here and migration is blocked
+			 * by the elevated reference, we need only check for
+			 * page truncation (file-cache only).
+			 */
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+			put_page(page);		/* ref from get_user_pages() */
+
+			/*
+			 * here we assume that get_user_pages() has given us
+			 * a list of virtually contiguous pages.
+			 */
+			addr += PAGE_SIZE;	/* for next get_user_pages() */
+			nr_pages--;
+		}
+	}
+
+	lru_add_drain_all();	/* to update stats */
+
+	return 0;	/* count entire vma as locked_vm */
+}
+
+/*
+ * private structure for munlock page table walk
+ */
+struct munlock_page_walk {
+	struct vm_area_struct *vma;
+	pmd_t                 *pmd; /* for migration_entry_wait() */
+};
+
+/*
+ * munlock normal pages for present ptes
+ */
+static int __munlock_pte_handler(pte_t *ptep, unsigned long addr,
+				   unsigned long end, struct mm_walk *walk)
+{
+	struct munlock_page_walk *mpw = walk->private;
+	swp_entry_t entry;
+	struct page *page;
+	pte_t pte;
+
+retry:
+	pte = *ptep;
+	/*
+	 * If it's a swap pte, we might be racing with page migration.
+	 */
+	if (unlikely(!pte_present(pte))) {
+		if (!is_swap_pte(pte))
+			goto out;
+		entry = pte_to_swp_entry(pte);
+		if (is_migration_entry(entry)) {
+			migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr);
+			goto retry;
+		}
+		goto out;
+	}
+
+	page = vm_normal_page(mpw->vma, addr, pte);
+	if (!page)
+		goto out;
+
+	lock_page(page);
+	if (!page->mapping) {
+		unlock_page(page);
+		goto retry;
+	}
+	munlock_vma_page(page);
+	unlock_page(page);
+
+out:
+	return 0;
+}
+
+/*
+ * Save pmd for pte handler for waiting on migration entries
+ */
+static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr,
+				 unsigned long end, struct mm_walk *walk)
+{
+	struct munlock_page_walk *mpw = walk->private;
+
+	mpw->pmd = pmd;
+	return 0;
+}
+
+
+/*
+ * munlock a range of pages in the vma using standard page table walk.
+ *
+ * vma->vm_mm->mmap_sem must be held for write.
+ */
+static void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			      unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct munlock_page_walk mpw = {
+		.vma = vma,
+	};
+	struct mm_walk munlock_page_walk = {
+		.pmd_entry = __munlock_pmd_handler,
+		.pte_entry = __munlock_pte_handler,
+		.private = &mpw,
+		.mm = mm,
+	};
+
+	VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
+	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+	VM_BUG_ON(start < vma->vm_start);
+	VM_BUG_ON(end > vma->vm_end);
+
+	lru_add_drain_all();	/* push cached pages to LRU */
+	walk_page_range(start, end, &munlock_page_walk);
+	lru_add_drain_all();	/* to update stats */
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Just make pages present if VM_LOCKED.  No-op if unlocking.
+ */
+static int __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		make_pages_present(start, end);
+	return 0;
+}
+
+/*
+ * munlock a range of pages in the vma -- no-op.
+ */
+static void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			      unsigned long start, unsigned long end)
+{
+}
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * mlock all pages in this vma range.  For mmap()/mremap()/...
+ */
+int mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	int nr_pages = (end - start) / PAGE_SIZE;
+	BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
+	/*
+	 * filter unlockable vmas
+	 */
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		goto no_mlock;
+
+	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current)))
+		return __mlock_vma_pages_range(vma, start, end);
+
+	/*
+	 * User mapped kernel pages or huge pages:
+	 * make these pages present to populate the ptes, but
+	 * fall thru' to reset VM_LOCKED--no need to unlock, and
+	 * return nr_pages so these don't get counted against task's
+	 * locked limit.  huge pages are already counted against
+	 * locked vm limit.
+	 */
+	make_pages_present(start, end);
+
+no_mlock:
+	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
+	return nr_pages;		/* pages NOT mlocked */
+}
+
+
+/*
+ * munlock all pages in vma.   For munmap() and exit().
+ */
+void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+	vma->vm_flags &= ~VM_LOCKED;
+	__munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+/*
+ * mlock_fixup  - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op.  However, for some special vmas, we go ahead and
+ * populate the ptes via make_pages_present().
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int pages;
+	int nr_pages;
 	int ret = 0;
-
-	if (newflags == vma->vm_flags) {
-		*prev = vma;
-		goto out;
+	int lock = newflags & VM_LOCKED;
+
+	if (newflags == vma->vm_flags ||
+			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;	/* don't set VM_LOCKED,  don't count */
+
+	if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current)) {
+		if (lock)
+			make_pages_present(start, end);
+		goto out;	/* don't set VM_LOCKED,  don't count */
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +395,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		goto success;
 	}
 
-	*prev = vma;
-
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
@@ -59,25 +408,32 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	}
 
 success:
+	/*
+	 * Keep track of amount of locked VM.
+	 */
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (!lock)
+		nr_pages = -nr_pages;
+	mm->locked_vm += nr_pages;
+
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
-	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
 	vma->vm_flags = newflags;
 
-	/*
-	 * Keep track of amount of locked VM.
-	 */
-	pages = (end - start) >> PAGE_SHIFT;
-	if (newflags & VM_LOCKED) {
-		pages = -pages;
-		if (!(newflags & VM_IO))
-			ret = make_pages_present(start, end);
-	}
+	if (lock) {
+		ret = __mlock_vma_pages_range(vma, start, end);
+		if (ret > 0) {
+			mm->locked_vm -= ret;
+			ret = 0;
+		}
+	} else
+		__munlock_vma_pages_range(vma, start, end);
 
-	mm->locked_vm -= pages;
 out:
+	*prev = vma;
 	return ret;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..7bdfd2661f17 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -662,8 +662,6 @@ again:			remove_next = 1 + (end > next->vm_end);
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
-
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
 	return PAGE_SIZE << compound_order(page);
 }
 
-/*
- * get a list of pages in an address range belonging to the specified process
- * and indicate the VMA that covers each page
- * - this is potentially dodgy as we may end incrementing the page count of a
- *   slab page or a secondary page from a compound page
- * - don't permit access to VMAs that don't support it, such as I/O mappings
- */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-	unsigned long start, int len, int write, int force,
-	struct page **pages, struct vm_area_struct **vmas)
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		struct page **pages, struct vm_area_struct **vmas)
 {
 	struct vm_area_struct *vma;
 	unsigned long vm_flags;
 	int i;
+	int write = !!(flags & GUP_FLAGS_WRITE);
+	int force = !!(flags & GUP_FLAGS_FORCE);
+	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 
 	/* calculate required read or write permissions.
 	 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		/* protect what we can, including chardevs */
 		if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
-		    !(vm_flags & vma->vm_flags))
+		    (!ignore && !(vm_flags & vma->vm_flags)))
 			goto finish_or_fault;
 
 		if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 finish_or_fault:
 	return i ? : -EFAULT;
 }
+
+
+/*
+ * get a list of pages in an address range belonging to the specified process
+ * and indicate the VMA that covers each page
+ * - this is potentially dodgy as we may end incrementing the page count of a
+ *   slab page or a secondary page from a compound page
+ * - don't permit access to VMAs that don't support it, such as I/O mappings
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+	unsigned long start, int len, int write, int force,
+	struct page **pages, struct vm_area_struct **vmas)
+{
+	int flags = 0;
+
+	if (write)
+		flags |= GUP_FLAGS_WRITE;
+	if (force)
+		flags |= GUP_FLAGS_FORCE;
+
+	return __get_user_pages(tsk, mm,
+				start, len, flags,
+				pages, vmas);
+}
 EXPORT_SYMBOL(get_user_pages);
 
 DEFINE_RWLOCK(vmlist_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4125230a1b2c..5886586fde6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -616,7 +616,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
 			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
+			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
+#ifdef CONFIG_UNEVICTABLE_LRU
+			| 1 << PG_mlocked
+#endif
+			);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b16c6d..7e60df99018e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,6 +53,8 @@
 
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 struct kmem_cache *anon_vma_cachep;
 
 /**
@@ -290,6 +292,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	return NULL;
 }
 
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA.  Only
+ * valid for normal file or anonymous VMAs.
+ */
+static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)		/* out of vma range */
+		return 0;
+	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
+	if (!pte)			/* the page is not in this mm */
+		return 0;
+	pte_unmap_unlock(pte, ptl);
+
+	return 1;
+}
+
 /*
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -311,10 +339,17 @@ static int page_referenced_one(struct page *page,
 	if (!pte)
 		goto out;
 
+	/*
+	 * Don't want to elevate referenced for mlocked page that gets this far,
+	 * in order that it progresses to try_to_unmap and is moved to the
+	 * unevictable list.
+	 */
 	if (vma->vm_flags & VM_LOCKED) {
-		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young_notify(vma, address, pte))
+		goto out_unmap;
+	}
+
+	if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
@@ -323,6 +358,7 @@ static int page_referenced_one(struct page *page,
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
+out_unmap:
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
 out:
@@ -412,11 +448,6 @@ static int page_referenced_file(struct page *page,
 		 */
 		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
 			continue;
-		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
-				  == (VM_LOCKED|VM_MAYSHARE)) {
-			referenced++;
-			break;
-		}
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
@@ -739,11 +770,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young_notify(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
-	}
+	if (!migration) {
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
+		if (ptep_clear_flush_young_notify(vma, address, pte)) {
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
+  	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
@@ -824,12 +860,17 @@ out:
  * For very sparsely populated VMAs this is a little inefficient - chances are
  * there there won't be many ptes located within the scan cluster.  In this case
  * maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
+ * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
+ * rather than unmapping them.  If we encounter the "check_page" that vmscan is
+ * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
  */
 #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
 
-static void try_to_unmap_cluster(unsigned long cursor,
-	unsigned int *mapcount, struct vm_area_struct *vma)
+static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
+		struct vm_area_struct *vma, struct page *check_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -841,6 +882,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
+	int ret = SWAP_AGAIN;
+	int locked_vma = 0;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -851,15 +894,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		return;
+		return ret;
 
 	pud = pud_offset(pgd, address);
 	if (!pud_present(*pud))
-		return;
+		return ret;
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
-		return;
+		return ret;
+
+	/*
+	 * MLOCK_PAGES => feature is configured.
+	 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
+	 * keep the sem while scanning the cluster for mlocking pages.
+	 */
+	if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
+		locked_vma = (vma->vm_flags & VM_LOCKED);
+		if (!locked_vma)
+			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
+	}
 
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -872,6 +926,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
+		if (locked_vma) {
+			mlock_vma_page(page);   /* no-op if already mlocked */
+			if (page == check_page)
+				ret = SWAP_MLOCK;
+			continue;	/* don't unmap */
+		}
+
 		if (ptep_clear_flush_young_notify(vma, address, pte))
 			continue;
 
@@ -893,39 +954,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	if (locked_vma)
+		up_read(&vma->vm_mm->mmap_sem);
+	return ret;
 }
 
-static int try_to_unmap_anon(struct page *page, int migration)
+/*
+ * common handling for pages mapped in VM_LOCKED vmas
+ */
+static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
+{
+	int mlocked = 0;
+
+	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+		if (vma->vm_flags & VM_LOCKED) {
+			mlock_vma_page(page);
+			mlocked++;	/* really mlocked the page */
+		}
+		up_read(&vma->vm_mm->mmap_sem);
+	}
+	return mlocked;
+}
+
+/**
+ * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
+ * rmap method
+ * @page: the page to unmap/unlock
+ * @unlock:  request for unlock rather than unmap [unlikely]
+ * @migration:  unmapping for migration - ignored if @unlock
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * anonymous pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * VM_LOCKED.
+ */
+static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned int mlocked = 0;
 	int ret = SWAP_AGAIN;
 
+	if (MLOCK_PAGES && unlikely(unlock))
+		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
+
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
 		return ret;
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
-			break;
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!((vma->vm_flags & VM_LOCKED) &&
+			      page_mapped_in_vma(page, vma)))
+				continue;  /* must visit all unlocked vmas */
+			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
+		} else {
+			ret = try_to_unmap_one(page, vma, migration);
+			if (ret == SWAP_FAIL || !page_mapped(page))
+				break;
+		}
+		if (ret == SWAP_MLOCK) {
+			mlocked = try_to_mlock_page(page, vma);
+			if (mlocked)
+				break;	/* stop if actually mlocked page */
+		}
 	}
 
 	page_unlock_anon_vma(anon_vma);
+
+	if (mlocked)
+		ret = SWAP_MLOCK;	/* actually mlocked the page */
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+
 	return ret;
 }
 
 /**
- * try_to_unmap_file - unmap file page using the object-based rmap method
- * @page: the page to unmap
- * @migration: migration flag
+ * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
+ * @page: the page to unmap/unlock
+ * @unlock:  request for unlock rather than unmap [unlikely]
+ * @migration:  unmapping for migration - ignored if @unlock
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
  *
- * This function is only called from try_to_unmap for object-based pages.
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * object-based pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * VM_LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int migration)
+static int try_to_unmap_file(struct page *page, int unlock, int migration)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -936,20 +1062,44 @@ static int try_to_unmap_file(struct page *page, int migration)
 	unsigned long max_nl_cursor = 0;
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
+	unsigned int mlocked = 0;
+
+	if (MLOCK_PAGES && unlikely(unlock))
+		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
-			goto out;
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!(vma->vm_flags & VM_LOCKED))
+				continue;	/* must visit all vmas */
+			ret = SWAP_MLOCK;
+		} else {
+			ret = try_to_unmap_one(page, vma, migration);
+			if (ret == SWAP_FAIL || !page_mapped(page))
+				goto out;
+		}
+		if (ret == SWAP_MLOCK) {
+			mlocked = try_to_mlock_page(page, vma);
+			if (mlocked)
+				break;  /* stop if actually mlocked page */
+		}
 	}
 
+	if (mlocked)
+		goto out;
+
 	if (list_empty(&mapping->i_mmap_nonlinear))
 		goto out;
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if ((vma->vm_flags & VM_LOCKED) && !migration)
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!(vma->vm_flags & VM_LOCKED))
+				continue;	/* must visit all vmas */
+			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
+			goto out;		/* no need to look further */
+		}
+		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -959,7 +1109,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 			max_nl_size = cursor;
 	}
 
-	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
+	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
 		ret = SWAP_FAIL;
 		goto out;
 	}
@@ -983,12 +1133,16 @@ static int try_to_unmap_file(struct page *page, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if ((vma->vm_flags & VM_LOCKED) && !migration)
+			if (!MLOCK_PAGES && !migration &&
+			    (vma->vm_flags & VM_LOCKED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
-				try_to_unmap_cluster(cursor, &mapcount, vma);
+				ret = try_to_unmap_cluster(cursor, &mapcount,
+								vma, page);
+				if (ret == SWAP_MLOCK)
+					mlocked = 2;	/* to return below */
 				cursor += CLUSTER_SIZE;
 				vma->vm_private_data = (void *) cursor;
 				if ((int)mapcount <= 0)
@@ -1009,6 +1163,10 @@ static int try_to_unmap_file(struct page *page, int migration)
 		vma->vm_private_data = NULL;
 out:
 	spin_unlock(&mapping->i_mmap_lock);
+	if (mlocked)
+		ret = SWAP_MLOCK;	/* actually mlocked the page */
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 	return ret;
 }
 
@@ -1024,6 +1182,7 @@ out:
  * SWAP_SUCCESS	- we succeeded in removing all mappings
  * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
+ * SWAP_MLOCK	- page is mlocked.
  */
 int try_to_unmap(struct page *page, int migration)
 {
@@ -1032,12 +1191,36 @@ int try_to_unmap(struct page *page, int migration)
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, migration);
+		ret = try_to_unmap_anon(page, 0, migration);
 	else
-		ret = try_to_unmap_file(page, migration);
-
-	if (!page_mapped(page))
+		ret = try_to_unmap_file(page, 0, migration);
+	if (ret != SWAP_MLOCK && !page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from munlock code.  Checks all of the VMAs mapping the page
+ * to make sure nobody else has this page mlocked. The page will be
+ * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ *
+ * Return values are:
+ *
+ * SWAP_SUCCESS	- no vma's holding page mlocked.
+ * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_MLOCK	- page is now mlocked.
+ */
+int try_to_munlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+
+	if (PageAnon(page))
+		return try_to_unmap_anon(page, 1, 0);
+	else
+		return try_to_unmap_file(page, 1, 0);
+}
+#endif
diff --git a/mm/swap.c b/mm/swap.c
index fee6b973f143..bc58c1369dd6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -278,7 +278,7 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dfb342e0db9b..e5aaaad159ef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -582,11 +582,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
-		if (unlikely(!page_evictable(page, NULL))) {
-			unlock_page(page);
-			putback_lru_page(page);
-			continue;
-		}
+		if (unlikely(!page_evictable(page, NULL)))
+			goto cull_mlocked;
 
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
@@ -624,9 +621,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page))
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			switch (try_to_munlock(page)) {
+			case SWAP_FAIL:		/* shouldn't happen */
+			case SWAP_AGAIN:
+				goto keep_locked;
+			case SWAP_MLOCK:
+				goto cull_mlocked;
+			case SWAP_SUCCESS:
+				; /* fall thru'; add to swap cache */
+			}
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
+		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
@@ -641,6 +648,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto cull_mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -731,6 +740,11 @@ free_it:
 		}
 		continue;
 
+cull_mlocked:
+		unlock_page(page);
+		putback_lru_page(page);
+		continue;
+
 activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
@@ -742,7 +756,7 @@ keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
-		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
@@ -2329,12 +2343,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
  * @vma: the VMA in which the page is or will be mapped, may be NULL
  *
  * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
+ * lists vs unevictable list.  The vma argument is !NULL when called from the
+ * fault path to determine how to instantiate a new page.
  *
  * Reasons page might not be evictable:
  * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
  *
- * TODO - later patches
  */
 int page_evictable(struct page *page, struct vm_area_struct *vma)
 {
@@ -2342,7 +2357,8 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	if (mapping_unevictable(page_mapping(page)))
 		return 0;
 
-	/* TODO:  test page [!]evictable conditions */
+	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+		return 0;
 
 	return 1;
 }
-- 
cgit v1.2.3


From 5344b7e648980cc2ca613ec03a56a8222ff48820 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:51 -0700
Subject: vmstat: mlocked pages statistics

Add NR_MLOCK zone page state, which provides a (conservative) count of
mlocked pages (actually, the number of mlocked pages moved off the LRU).

Reworked by lts to fit in with the modified mlock page support in the
Reclaim Scalability series.

[kosaki.motohiro@jp.fujitsu.com: fix incorrect Mlocked field of /proc/meminfo]
[lee.schermerhorn@hp.com: mlocked-pages: add event counting with statistics]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  4 +++-
 fs/proc/proc_misc.c    |  2 ++
 include/linux/mmzone.h |  2 ++
 include/linux/vmstat.h |  4 ++++
 mm/internal.h          | 16 +++++++++++++---
 mm/mlock.c             | 41 ++++++++++++++++++++++++++++++++++++-----
 mm/vmstat.c            |  5 +++++
 7 files changed, 65 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 11a9a05cf554..fb45d88a2446 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -71,7 +71,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       "Node %d Active(file):   %8lu kB\n"
 		       "Node %d Inactive(file): %8lu kB\n"
 #ifdef CONFIG_UNEVICTABLE_LRU
-		       "Node %d Noreclaim:      %8lu kB\n"
+		       "Node %d Unevictable:    %8lu kB\n"
+		       "Node %d Mlocked:        %8lu kB\n"
 #endif
 #ifdef CONFIG_HIGHMEM
 		       "Node %d HighTotal:      %8lu kB\n"
@@ -104,6 +105,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 		       nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
 #ifdef CONFIG_UNEVICTABLE_LRU
 		       nid, K(node_page_state(nid, NR_UNEVICTABLE)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 #endif
 #ifdef CONFIG_HIGHMEM
 		       nid, K(i.totalhigh),
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 6dd60eaea997..61b25f4eabe6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -176,6 +176,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Inactive(file): %8lu kB\n"
 #ifdef CONFIG_UNEVICTABLE_LRU
 		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
 #endif
 #ifdef CONFIG_HIGHMEM
 		"HighTotal:      %8lu kB\n"
@@ -217,6 +218,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(pages[LRU_INACTIVE_FILE]),
 #ifdef CONFIG_UNEVICTABLE_LRU
 		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
 #endif
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d1f60d5fe2ea..da2d053a95f1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -88,8 +88,10 @@ enum zone_stat_item {
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
 #ifdef CONFIG_UNEVICTABLE_LRU
 	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 #else
 	NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+	NR_MLOCK = NR_ACTIVE_FILE,
 #endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 135840cd7feb..05b805020be2 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -45,6 +45,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
 		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
+		UNEVICTABLE_PGMLOCKED,
+		UNEVICTABLE_PGMUNLOCKED,
+		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
+		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/internal.h b/mm/internal.h
index 48e32f790571..1cfbf2e2bc9e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -101,7 +101,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
 	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
 		return 0;
 
-	SetPageMlocked(page);
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
 	return 1;
 }
 
@@ -128,12 +131,19 @@ static inline void clear_page_mlock(struct page *page)
 
 /*
  * mlock_migrate_page - called only from migrate_page_copy() to
- * migrate the Mlocked page flag
+ * migrate the Mlocked page flag; update statistics.
  */
 static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 {
-	if (TestClearPageMlocked(page))
+	if (TestClearPageMlocked(page)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
 		SetPageMlocked(newpage);
+		__inc_zone_page_state(newpage, NR_MLOCK);
+		local_irq_restore(flags);
+	}
 }
 
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 8b478350a2a1..bce1b22c36c2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -60,6 +60,8 @@ void __clear_page_mlock(struct page *page)
 		return;
 	}
 
+	dec_zone_page_state(page, NR_MLOCK);
+	count_vm_event(UNEVICTABLE_PGCLEARED);
 	if (!isolate_lru_page(page)) {
 		putback_lru_page(page);
 	} else {
@@ -69,6 +71,9 @@ void __clear_page_mlock(struct page *page)
 		lru_add_drain_all();
 		if (!isolate_lru_page(page))
 			putback_lru_page(page);
+		else if (PageUnevictable(page))
+			count_vm_event(UNEVICTABLE_PGSTRANDED);
+
 	}
 }
 
@@ -80,8 +85,12 @@ void mlock_vma_page(struct page *page)
 {
 	BUG_ON(!PageLocked(page));
 
-	if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
-		putback_lru_page(page);
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+	}
 }
 
 /*
@@ -106,9 +115,31 @@ static void munlock_vma_page(struct page *page)
 {
 	BUG_ON(!PageLocked(page));
 
-	if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
-		try_to_munlock(page);
-		putback_lru_page(page);
+	if (TestClearPageMlocked(page)) {
+		dec_zone_page_state(page, NR_MLOCK);
+		if (!isolate_lru_page(page)) {
+			int ret = try_to_munlock(page);
+			/*
+			 * did try_to_unlock() succeed or punt?
+			 */
+			if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+			putback_lru_page(page);
+		} else {
+			/*
+			 * We lost the race.  let try_to_unmap() deal
+			 * with it.  At least we get the page state and
+			 * mlock stats right.  However, page is still on
+			 * the noreclaim list.  We'll fix that up when
+			 * the page is eventually freed or we scan the
+			 * noreclaim list.
+			 */
+			if (PageUnevictable(page))
+				count_vm_event(UNEVICTABLE_PGSTRANDED);
+			else
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		}
 	}
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6db2f6319313..9e28abc0a0b9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -625,6 +625,7 @@ static const char * const vmstat_text[] = {
 	"nr_active_file",
 #ifdef CONFIG_UNEVICTABLE_LRU
 	"nr_unevictable",
+	"nr_mlock",
 #endif
 	"nr_anon_pages",
 	"nr_mapped",
@@ -684,6 +685,10 @@ static const char * const vmstat_text[] = {
 	"unevictable_pgs_culled",
 	"unevictable_pgs_scanned",
 	"unevictable_pgs_rescued",
+	"unevictable_pgs_mlocked",
+	"unevictable_pgs_munlocked",
+	"unevictable_pgs_cleared",
+	"unevictable_pgs_stranded",
 #endif
 #endif
 };
-- 
cgit v1.2.3


From 64d6519dda3905dfb94d3f93c07c5f263f41813f Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:52 -0700
Subject: swap: cull unevictable pages in fault path

In the fault paths that install new anonymous pages, check whether the
page is evictable or not using lru_cache_add_active_or_unevictable().  If
the page is evictable, just add it to the active lru list [via the pagevec
cache], else add it to the unevictable list.

This "proactive" culling in the fault path mimics the handling of mlocked
pages in Nick Piggin's series to keep mlocked pages off the lru lists.

Notes:

1) This patch is optional--e.g., if one is concerned about the
   additional test in the fault path.  We can defer the moving of
   nonreclaimable pages until when vmscan [shrink_*_list()]
   encounters them.  Vmscan will only need to handle such pages
   once, but if there are a lot of them it could impact system
   performance.

2) The 'vma' argument to page_evictable() is required to notice that
   we're faulting a page into an mlock()ed vma w/o having to scan the
   page's rmap in the fault path.   Culling mlock()ed anon pages is
   currently the only reason for this patch.

3) We can't cull swap pages in read_swap_cache_async() because the
   vma argument doesn't necessarily correspond to the swap cache
   offset passed in by swapin_readahead().  This could [did!] result
   in mlocking pages in non-VM_LOCKED vmas if [when] we tried to
   cull in this path.

4) Move set_pte_at() to after where we add page to lru to keep it
   hidden from other tasks that might walk the page table.
   We already do it in this order in do_anonymous_page().  And,
   these are COW'd anon pages.  Is this safe?

[riel@redhat.com: undo an overzealous code cleanup]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 ++
 mm/memory.c          | 18 ++++++++++--------
 mm/swap.c            | 21 +++++++++++++++++++++
 3 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7edb4cbc29f9..07eda69412fb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -173,6 +173,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void lru_cache_add_active_or_unevictable(struct page *,
+					struct vm_area_struct *);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
diff --git a/mm/memory.c b/mm/memory.c
index 9fef7272fb9e..450127f4c582 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1922,12 +1922,13 @@ gotten:
 		 * thread doing COW.
 		 */
 		ptep_clear_flush_notify(vma, address, page_table);
-		set_pte_at(mm, address, page_table, entry);
-		update_mmu_cache(vma, address, entry);
 		SetPageSwapBacked(new_page);
-		lru_cache_add_active_anon(new_page);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		page_add_new_anon_rmap(new_page, vma, address);
 
+//TODO:  is this safe?  do_anonymous_page() does it this way.
+		set_pte_at(mm, address, page_table, entry);
+		update_mmu_cache(vma, address, entry);
 		if (old_page) {
 			/*
 			 * Only after switching the pte to the new page may
@@ -2420,7 +2421,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto release;
 	inc_mm_counter(mm, anon_rss);
 	SetPageSwapBacked(page);
-	lru_cache_add_active_anon(page);
+	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2564,12 +2565,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
-                        inc_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, anon_rss);
 			SetPageSwapBacked(page);
-                        lru_cache_add_active_anon(page);
-                        page_add_new_anon_rmap(page, vma, address);
+			lru_cache_add_active_or_unevictable(page, vma);
+			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
@@ -2578,6 +2578,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				get_page(dirty_page);
 			}
 		}
+//TODO:  is this safe?  do_anonymous_page() does it this way.
+		set_pte_at(mm, address, page_table, entry);
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
diff --git a/mm/swap.c b/mm/swap.c
index bc58c1369dd6..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,8 @@
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
 
+#include "internal.h"
+
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
@@ -244,6 +246,25 @@ void add_page_to_unevictable_list(struct page *page)
 	spin_unlock_irq(&zone->lru_lock);
 }
 
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma:   vma in which page is mapped for determining reclaimability
+ *
+ * place @page on active or unevictable LRU list, depending on
+ * page_evictable().  Note that if the page is not evictable,
+ * it goes directly back onto it's zone's unevictable list.  It does
+ * NOT use a per cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+					struct vm_area_struct *vma)
+{
+	if (page_evictable(page, vma))
+		lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
+	else
+		add_page_to_unevictable_list(page);
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
-- 
cgit v1.2.3


From af936a1606246a10c145feac3770f6287f483f02 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:53 -0700
Subject: vmscan: unevictable LRU scan sysctl

This patch adds a function to scan individual or all zones' unevictable
lists and move any pages that have become evictable onto the respective
zone's inactive list, where shrink_inactive_list() will deal with them.

Adds sysctl to scan all nodes, and per node attributes to individual
nodes' zones.

Kosaki: If evictable pages are found on the unevictable LRU when
/proc/sys/vm/scan_unevictable_pages is written, print the filename and
file offset of these pages.

[akpm@linux-foundation.org: fix one CONFIG_MMU=n build error]
[kosaki.motohiro@jp.fujitsu.com: adapt vmscan-unevictable-lru-scan-sysctl.patch to new sysfs API]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c  |   5 ++
 include/linux/rmap.h |   3 +
 include/linux/swap.h |  15 +++++
 kernel/sysctl.c      |  10 ++++
 mm/rmap.c            |   4 +-
 mm/vmscan.c          | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 201 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index fb45d88a2446..f5207090885a 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -13,6 +13,7 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/device.h>
+#include <linux/swap.h>
 
 static struct sysdev_class node_class = {
 	.name = "node",
@@ -191,6 +192,8 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+
+		scan_unevictable_register_node(node);
 	}
 	return error;
 }
@@ -210,6 +213,8 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
 
+	scan_unevictable_unregister_node(node);
+
 	sysdev_unregister(&node->sysdev);
 }
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 955667e6a52d..1da48db8db09 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -75,6 +75,9 @@ void anon_vma_unlink(struct vm_area_struct *);
 void anon_vma_link(struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
 
+extern struct anon_vma *page_lock_anon_vma(struct page *page);
+extern void page_unlock_anon_vma(struct anon_vma *anon_vma);
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 07eda69412fb..a3af95b2cb6d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
+#include <linux/node.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -235,15 +236,29 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 #ifdef CONFIG_UNEVICTABLE_LRU
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
+
+extern unsigned long scan_unevictable_pages;
+extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+extern int scan_unevictable_register_node(struct node *node);
+extern void scan_unevictable_unregister_node(struct node *node);
 #else
 static inline int page_evictable(struct page *page,
 						struct vm_area_struct *vma)
 {
 	return 1;
 }
+
 static inline void scan_mapping_unevictable_pages(struct address_space *mapping)
 {
 }
+
+static inline int scan_unevictable_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void scan_unevictable_unregister_node(struct node *node) { }
 #endif
 
 extern int kswapd_run(int nid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 617d41e4d6a0..b3cc73931d1f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -833,6 +833,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler   = &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "scan_unevictable_pages",
+		.data		= &scan_unevictable_pages,
+		.maxlen		= sizeof(scan_unevictable_pages),
+		.mode		= 0644,
+		.proc_handler	= &scan_unevictable_handler,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/rmap.c b/mm/rmap.c
index 7e60df99018e..7e90bebbeb6c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -181,7 +181,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -201,7 +201,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5aaaad159ef..ca64e3e0c518 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
+#include <linux/sysctl.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2363,6 +2364,39 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	return 1;
 }
 
+static void show_page_path(struct page *page)
+{
+	char buf[256];
+	if (page_is_file_cache(page)) {
+		struct address_space *mapping = page->mapping;
+		struct dentry *dentry;
+		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+		spin_lock(&mapping->i_mmap_lock);
+		dentry = d_find_alias(mapping->host);
+		printk(KERN_INFO "rescued: %s %lu\n",
+		       dentry_path(dentry, buf, 256), pgoff);
+		spin_unlock(&mapping->i_mmap_lock);
+	} else {
+#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
+		struct anon_vma *anon_vma;
+		struct vm_area_struct *vma;
+
+		anon_vma = page_lock_anon_vma(page);
+		if (!anon_vma)
+			return;
+
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			printk(KERN_INFO "rescued: anon %s\n",
+			       vma->vm_mm->owner->comm);
+			break;
+		}
+		page_unlock_anon_vma(anon_vma);
+#endif
+	}
+}
+
+
 /**
  * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
  * @page: page to check evictability and move to appropriate lru list
@@ -2382,6 +2416,9 @@ retry:
 	ClearPageUnevictable(page);
 	if (page_evictable(page, NULL)) {
 		enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
+
+		show_page_path(page);
+
 		__dec_zone_state(zone, NR_UNEVICTABLE);
 		list_move(&page->lru, &zone->lru[l].list);
 		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
@@ -2451,4 +2488,133 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
 	}
 
 }
+
+/**
+ * scan_zone_unevictable_pages - check unevictable list for evictable pages
+ * @zone - zone of which to scan the unevictable list
+ *
+ * Scan @zone's unevictable LRU lists to check for pages that have become
+ * evictable.  Move those that have to @zone's inactive list where they
+ * become candidates for reclaim, unless shrink_inactive_zone() decides
+ * to reactivate them.  Pages that are still unevictable are rotated
+ * back onto @zone's unevictable list.
+ */
+#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
+void scan_zone_unevictable_pages(struct zone *zone)
+{
+	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
+	unsigned long scan;
+	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
+
+	while (nr_to_scan > 0) {
+		unsigned long batch_size = min(nr_to_scan,
+						SCAN_UNEVICTABLE_BATCH_SIZE);
+
+		spin_lock_irq(&zone->lru_lock);
+		for (scan = 0;  scan < batch_size; scan++) {
+			struct page *page = lru_to_page(l_unevictable);
+
+			if (!trylock_page(page))
+				continue;
+
+			prefetchw_prev_lru_page(page, l_unevictable, flags);
+
+			if (likely(PageLRU(page) && PageUnevictable(page)))
+				check_move_unevictable_page(page, zone);
+
+			unlock_page(page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+
+		nr_to_scan -= batch_size;
+	}
+}
+
+
+/**
+ * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
+ *
+ * A really big hammer:  scan all zones' unevictable LRU lists to check for
+ * pages that have become evictable.  Move those back to the zones'
+ * inactive list where they become candidates for reclaim.
+ * This occurs when, e.g., we have unswappable pages on the unevictable lists,
+ * and we add swap to the system.  As such, it runs in the context of a task
+ * that has possibly/probably made some previously unevictable pages
+ * evictable.
+ */
+void scan_all_zones_unevictable_pages(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		scan_zone_unevictable_pages(zone);
+	}
+}
+
+/*
+ * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
+ * all nodes' unevictable lists for evictable pages
+ */
+unsigned long scan_unevictable_pages;
+
+int scan_unevictable_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	if (write && *(unsigned long *)table->data)
+		scan_all_zones_unevictable_pages();
+
+	scan_unevictable_pages = 0;
+	return 0;
+}
+
+/*
+ * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
+ * a specified node's per zone unevictable lists for evictable pages.
+ */
+
+static ssize_t read_scan_unevictable_node(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
+					  char *buf)
+{
+	return sprintf(buf, "0\n");	/* always zero; should fit... */
+}
+
+static ssize_t write_scan_unevictable_node(struct sys_device *dev,
+					   struct sysdev_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
+	struct zone *zone;
+	unsigned long res;
+	unsigned long req = strict_strtoul(buf, 10, &res);
+
+	if (!req)
+		return 1;	/* zero is no-op */
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+		scan_zone_unevictable_pages(zone);
+	}
+	return 1;
+}
+
+
+static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+			read_scan_unevictable_node,
+			write_scan_unevictable_node);
+
+int scan_unevictable_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+
+void scan_unevictable_unregister_node(struct node *node)
+{
+	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+
 #endif
-- 
cgit v1.2.3


From 985737cf2ea096ea946aed82c7484d40defc71a8 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:53 -0700
Subject: mlock: count attempts to free mlocked page

Allow free of mlock()ed pages.  This shouldn't happen, but during
development, it occasionally did.

This patch allows us to survive that condition, while keeping the
statistics and events correct for debug.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  1 +
 mm/internal.h          | 17 +++++++++++++++++
 mm/page_alloc.c        |  1 +
 mm/vmstat.c            |  1 +
 4 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 05b805020be2..9cd3ab0f554d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -49,6 +49,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		UNEVICTABLE_PGMUNLOCKED,
 		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
 		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
+		UNEVICTABLE_MLOCKFREED,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/internal.h b/mm/internal.h
index 1cfbf2e2bc9e..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -146,6 +146,22 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 	}
 }
 
+/*
+ * free_page_mlock() -- clean up attempts to free and mlocked() page.
+ * Page should not be on lru, so no need to fix that up.
+ * free_pages_check() will verify...
+ */
+static inline void free_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page))) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		__count_vm_event(UNEVICTABLE_MLOCKFREED);
+		local_irq_restore(flags);
+	}
+}
 
 #else /* CONFIG_UNEVICTABLE_LRU */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
@@ -155,6 +171,7 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+static inline void free_page_mlock(struct page *page) { }
 
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5886586fde6c..cfbadad75d1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -454,6 +454,7 @@ static inline void __free_one_page(struct page *page,
 
 static inline int free_pages_check(struct page *page)
 {
+	free_page_mlock(page);
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
 		(page_get_page_cgroup(page) != NULL) |
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9e28abc0a0b9..9343227c5c60 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -689,6 +689,7 @@ static const char * const vmstat_text[] = {
 	"unevictable_pgs_munlocked",
 	"unevictable_pgs_cleared",
 	"unevictable_pgs_stranded",
+	"unevictable_pgs_mlockfreed",
 #endif
 #endif
 };
-- 
cgit v1.2.3


From 902d2e8ae0de29f483840ba1134af27343b9564d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:26:54 -0700
Subject: vmscan: kill unused lru functions

Several LRU manipulation functions are no longer used, so they can be
removed.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 48 -----------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 67d7697fd019..c948350c378e 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -37,54 +37,6 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
-static inline void
-add_page_to_inactive_anon_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON);
-}
-
-static inline void
-add_page_to_active_anon_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON);
-}
-
-static inline void
-add_page_to_inactive_file_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
-static inline void
-add_page_to_active_file_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE);
-}
-
-static inline void
-del_page_from_inactive_anon_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON);
-}
-
-static inline void
-del_page_from_active_anon_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON);
-}
-
-static inline void
-del_page_from_inactive_file_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
-static inline void
-del_page_from_active_file_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
-- 
cgit v1.2.3


From f45840b5c128445da70e7ec33adc47b4a12bdaf4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:57 -0700
Subject: mm: pagecache insertion fewer atomics

Setting and clearing the page locked bit when inserting a page into
swapcache / pagecache while it has no other references can use non-atomic
page flag operations because no other CPU may be operating on it at this time.

This saves one atomic operation when inserting a page into pagecache.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 14 +++++++-------
 mm/swap_state.c         |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b6c4d8d26b8..7334b2b6c4c6 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -299,14 +299,14 @@ extern int __lock_page_killable(struct page *page);
 extern void __lock_page_nosync(struct page *page);
 extern void unlock_page(struct page *page);
 
-static inline void set_page_locked(struct page *page)
+static inline void __set_page_locked(struct page *page)
 {
-	set_bit(PG_locked, &page->flags);
+	__set_bit(PG_locked, &page->flags);
 }
 
-static inline void clear_page_locked(struct page *page)
+static inline void __clear_page_locked(struct page *page)
 {
-	clear_bit(PG_locked, &page->flags);
+	__clear_bit(PG_locked, &page->flags);
 }
 
 static inline int trylock_page(struct page *page)
@@ -438,17 +438,17 @@ extern void __remove_from_page_cache(struct page *page);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run set_page_locked() against it.
+ * the page is new, so we can just run __set_page_locked() against it.
  */
 static inline int add_to_page_cache(struct page *page,
 		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
 {
 	int error;
 
-	set_page_locked(page);
+	__set_page_locked(page);
 	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
 	if (unlikely(error))
-		clear_page_locked(page);
+		__clear_page_locked(page);
 	return error;
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 43cda7b4b808..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -303,7 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * re-using the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
-		set_page_locked(new_page);
+		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
 		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
 		if (likely(!err)) {
@@ -315,7 +315,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			return new_page;
 		}
 		ClearPageSwapBacked(new_page);
-		clear_page_locked(new_page);
+		__clear_page_locked(new_page);
 		swap_free(entry);
 	} while (err != -ENOMEM);
 
-- 
cgit v1.2.3


From 8413ac9d8c9a1366a4f57880723126cd24e5a5c3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:59 -0700
Subject: mm: page lock use lock bitops

trylock_page, unlock_page open and close a critical section. Hence,
we can use the lock bitops to get the desired memory ordering.

Also, mark trylock as likely to succeed (and remove the annotation from
callers).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  2 +-
 mm/filemap.c            | 13 +++++--------
 mm/swapfile.c           |  2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7334b2b6c4c6..709742be02f0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -311,7 +311,7 @@ static inline void __clear_page_locked(struct page *page)
 
 static inline int trylock_page(struct page *page)
 {
-	return !test_and_set_bit(PG_locked, &page->flags);
+	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index a1ddd2557af2..e1b23fda48de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -573,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
-	smp_mb__after_clear_bit(); 
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2a97fafa3d89..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -422,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
-			if (page && unlikely(!trylock_page(page))) {
+			if (page && !trylock_page(page)) {
 				page_cache_release(page);
 				page = NULL;
 			}
-- 
cgit v1.2.3


From 51b07fc3c5c830bb49c80fc5eac041e1f66a72e7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:27:00 -0700
Subject: fs: buffer lock use lock bitops

trylock_buffer and unlock_buffer open and close a critical section.
Hence, we can use the lock bitops to get the desired memory ordering.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                 | 3 +--
 include/linux/buffer_head.h | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index ac78d4c19b3b..6569fda5cfed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer);
 
 void unlock_buffer(struct buffer_head *bh)
 {
-	smp_mb__before_clear_bit();
-	clear_buffer_locked(bh);
+	clear_bit_unlock(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index eadaab44015f..3ce64b90118c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -322,7 +322,7 @@ static inline void wait_on_buffer(struct buffer_head *bh)
 
 static inline int trylock_buffer(struct buffer_head *bh)
 {
-	return likely(!test_and_set_bit(BH_Lock, &bh->b_state));
+	return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
 }
 
 static inline void lock_buffer(struct buffer_head *bh)
-- 
cgit v1.2.3


From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:27:03 -0700
Subject: mm: rewrite vmap layer

Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and
provide a fast, scalable percpu frontend for small vmaps (requires a
slightly different API, though).

The biggest problem with vmap is actually vunmap.  Presently this requires
a global kernel TLB flush, which on most architectures is a broadcast IPI
to all CPUs to flush the cache.  This is all done under a global lock.  As
the number of CPUs increases, so will the number of vunmaps a scaled
workload will want to perform, and so will the cost of a global TLB flush.
 This gives terrible quadratic scalability characteristics.

Another problem is that the entire vmap subsystem works under a single
lock.  It is a rwlock, but it is actually taken for write in all the fast
paths, and the read locking would likely never be run concurrently anyway,
so it's just pointless.

This is a rewrite of vmap subsystem to solve those problems.  The existing
vmalloc API is implemented on top of the rewritten subsystem.

The TLB flushing problem is solved by using lazy TLB unmapping.  vmap
addresses do not have to be flushed immediately when they are vunmapped,
because the kernel will not reuse them again (would be a use-after-free)
until they are reallocated.  So the addresses aren't allocated again until
a subsequent TLB flush.  A single TLB flush then can flush multiple
vunmaps from each CPU.

XEN and PAT and such do not like deferred TLB flushing because they can't
always handle multiple aliasing virtual addresses to a physical address.
They now call vm_unmap_aliases() in order to flush any deferred mappings.
That call is very expensive (well, actually not a lot more expensive than
a single vunmap under the old scheme), however it should be OK if not
called too often.

The virtual memory extent information is stored in an rbtree rather than a
linked list to improve the algorithmic scalability.

There is a per-CPU allocator for small vmaps, which amortizes or avoids
global locking.

To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces
must be used in place of vmap and vunmap.  Vmalloc does not use these
interfaces at the moment, so it will not be quite so scalable (although it
will use lazy TLB flushing).

As a quick test of performance, I ran a test that loops in the kernel,
linearly mapping then touching then unmapping 4 pages.  Different numbers
of tests were run in parallel on a 4 core, 2 socket opteron.  Results are
in nanoseconds per map+touch+unmap.

threads           vanilla         vmap rewrite
1                 14700           2900
2                 33600           3000
4                 49500           2800
8                 70631           2900

So with 8 cores, the rewritten version is already 25x faster.

In a slightly more realistic test (although with an older and less
scalable version of the patch), I ripped the not-very-good vunmap batching
code out of XFS, and implemented the large buffer mapping with vm_map_ram
and vm_unmap_ram...  along with a couple of other tricks, I was able to
speed up a large directory workload by 20x on a 64 CPU system.  I believe
vmap/vunmap is actually sped up a lot more than 20x on such a system, but
I'm running into other locks now.  vmap is pretty well blown off the
profiles.

Before:
1352059 total                                      0.1401
798784 _write_lock                              8320.6667 <- vmlist_lock
529313 default_idle                             1181.5022
 15242 smp_call_function                         15.8771  <- vmap tlb flushing
  2472 __get_vm_area_node                         1.9312  <- vmap
  1762 remove_vm_area                             4.5885  <- vunmap
   316 map_vm_area                                0.2297  <- vmap
   312 kfree                                      0.1950
   300 _spin_lock                                 3.1250
   252 sn_send_IPI_phys                           0.4375  <- tlb flushing
   238 vmap                                       0.8264  <- vmap
   216 find_lock_page                             0.5192
   196 find_next_bit                              0.3603
   136 sn2_send_IPI                               0.2024
   130 pio_phys_write_mmr                         2.0312
   118 unmap_kernel_range                         0.1229

After:
 78406 total                                      0.0081
 40053 default_idle                              89.4040
 33576 ia64_spinlock_contention                 349.7500
  1650 _spin_lock                                17.1875
   319 __reg_op                                   0.5538
   281 _atomic_dec_and_lock                       1.0977
   153 mutex_unlock                               1.5938
   123 iget_locked                                0.1671
   117 xfs_dir_lookup                             0.1662
   117 dput                                       0.1406
   114 xfs_iget_core                              0.0268
    92 xfs_da_hashname                            0.1917
    75 d_alloc                                    0.0670
    68 vmap_page_range                            0.0462 <- vmap
    58 kmem_cache_alloc                           0.0604
    57 memset                                     0.0540
    52 rb_next                                    0.1625
    50 __copy_user                                0.0208
    49 bitmap_find_free_region                    0.2188 <- vmap
    46 ia64_sn_udelay                             0.1106
    45 find_inode_fast                            0.1406
    42 memcmp                                     0.2188
    42 finish_task_switch                         0.1094
    42 __d_lookup                                 0.0410
    40 radix_tree_lookup_slot                     0.1250
    37 _spin_unlock_irqrestore                    0.3854
    36 xfs_bmapi                                  0.0050
    36 kmem_cache_free                            0.0256
    35 xfs_vn_getattr                             0.0322
    34 radix_tree_lookup                          0.1062
    33 __link_path_walk                           0.0035
    31 xfs_da_do_buf                              0.0091
    30 _xfs_buf_find                              0.0204
    28 find_get_page                              0.0875
    27 xfs_iread                                  0.0241
    27 __strncpy_from_user                        0.2812
    26 _xfs_buf_initialize                        0.0406
    24 _xfs_buf_lookup_pages                      0.0179
    24 vunmap_page_range                          0.0250 <- vunmap
    23 find_lock_page                             0.0799
    22 vm_map_ram                                 0.0087 <- vmap
    20 kfree                                      0.0125
    19 put_page                                   0.0330
    18 __kmalloc                                  0.0176
    17 xfs_da_node_lookup_int                     0.0086
    17 _read_lock                                 0.0885
    17 page_waitqueue                             0.0664

vmap has gone from being in the top 5 on the profiles and flushing the crap
out of all TLBs, to using less than 1% of kernel time.

[akpm@linux-foundation.org: cleanups, section fix]
[akpm@linux-foundation.org: fix build on alpha]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pageattr.c   |   2 +
 arch/x86/xen/enlighten.c |   1 +
 arch/x86/xen/mmu.c       |   1 +
 include/linux/vmalloc.h  |  15 +-
 init/main.c              |   2 +
 mm/vmalloc.c             | 975 ++++++++++++++++++++++++++++++++++++++++-------
 6 files changed, 862 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a9ec89c3fbca..407d8784f669 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	/* Must avoid aliasing mappings in the highmem code */
 	kmap_flush_unused();
 
+	vm_unmap_aliases();
+
 	cpa.vaddr = addr;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0013a729b41d..b61534c7a4c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
+			vm_unmap_aliases();
 	}
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ae173f6edd8b..d4d52f5a1cf7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		kmap_flush_unused();
+		vm_unmap_aliases();
 		xen_mc_batch();
 	}
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 328eb4022727..4c28c4d564e2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,7 @@
 #define _LINUX_VMALLOC_H
 
 #include <linux/spinlock.h>
+#include <linux/init.h>
 #include <asm/page.h>		/* pgprot_t */
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
@@ -23,7 +24,6 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #endif
 
 struct vm_struct {
-	/* keep next,addr,size together to speedup lookups */
 	struct vm_struct	*next;
 	void			*addr;
 	unsigned long		size;
@@ -37,6 +37,19 @@ struct vm_struct {
 /*
  *	Highlevel APIs for driver use
  */
+extern void vm_unmap_ram(const void *mem, unsigned int count);
+extern void *vm_map_ram(struct page **pages, unsigned int count,
+				int node, pgprot_t prot);
+extern void vm_unmap_aliases(void);
+
+#ifdef CONFIG_MMU
+extern void __init vmalloc_init(void);
+#else
+static inline void vmalloc_init(void)
+{
+}
+#endif
+
 extern void *vmalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
diff --git a/init/main.c b/init/main.c
index 27f6bf6108e9..4371d11721f6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -27,6 +27,7 @@
 #include <linux/gfp.h>
 #include <linux/percpu.h>
 #include <linux/kmod.h>
+#include <linux/vmalloc.h>
 #include <linux/kernel_stat.h>
 #include <linux/start_kernel.h>
 #include <linux/security.h>
@@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	vmalloc_init();
 	vfs_caches_init_early();
 	cpuset_init_early();
 	mem_init();
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bba06c41fc59..712ae47af0bf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,6 +8,7 @@
  *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
+#include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/highmem.h>
@@ -18,16 +19,17 @@
 #include <linux/debugobjects.h>
 #include <linux/vmalloc.h>
 #include <linux/kallsyms.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
 
+#include <asm/atomic.h>
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 
 
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
-
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
-			    int node, void *caller);
+/*** Page table manipulation functions ***/
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 {
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
-						unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
-						unsigned long end)
+static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
 	} while (pud++, addr = next, addr != end);
 }
 
-void unmap_kernel_range(unsigned long addr, unsigned long size)
+static void vunmap_page_range(unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start = addr;
-	unsigned long end = addr + size;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 			continue;
 		vunmap_pud_range(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
-	flush_tlb_kernel_range(start, end);
-}
-
-static void unmap_vm_area(struct vm_struct *area)
-{
-	unmap_kernel_range((unsigned long)area->addr, area->size);
 }
 
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pte_t *pte;
 
+	/*
+	 * nr is a running index into the array which helps higher level
+	 * callers keep track of where we're up to.
+	 */
+
 	pte = pte_alloc_kernel(pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
-		struct page *page = **pages;
-		WARN_ON(!pte_none(*pte));
-		if (!page)
+		struct page *page = pages[*nr];
+
+		if (WARN_ON(!pte_none(*pte)))
+			return -EBUSY;
+		if (WARN_ON(!page))
 			return -ENOMEM;
 		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
-		(*pages)++;
+		(*nr)++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	return 0;
 }
 
-static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (vmap_pte_range(pmd, addr, next, prot, pages))
+		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -141,44 +140,49 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (vmap_pmd_range(pud, addr, next, prot, pages))
+		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+/*
+ * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
+ * will have pfns corresponding to the "pages" array.
+ *
+ * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ */
+static int vmap_page_range(unsigned long addr, unsigned long end,
+				pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long addr = (unsigned long) area->addr;
-	unsigned long end = addr + area->size - PAGE_SIZE;
-	int err;
+	int err = 0;
+	int nr = 0;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_pud_range(pgd, addr, next, prot, pages);
+		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	flush_cache_vmap((unsigned long) area->addr, end);
-	return err;
+	flush_cache_vmap(addr, end);
+
+	if (unlikely(err))
+		return err;
+	return nr;
 }
-EXPORT_SYMBOL_GPL(map_vm_area);
 
 /*
- * Map a vmalloc()-space virtual address to the physical page.
+ * Walk a vmap address to the struct page it maps.
  */
 struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
 	pgd_t *pgd = pgd_offset_k(addr);
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
 
 	/*
 	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -188,10 +192,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 			!is_module_address(addr));
 
 	if (!pgd_none(*pgd)) {
-		pud = pud_offset(pgd, addr);
+		pud_t *pud = pud_offset(pgd, addr);
 		if (!pud_none(*pud)) {
-			pmd = pmd_offset(pud, addr);
+			pmd_t *pmd = pmd_offset(pud, addr);
 			if (!pmd_none(*pmd)) {
+				pte_t *ptep, pte;
+
 				ptep = pte_offset_map(pmd, addr);
 				pte = *ptep;
 				if (pte_present(pte))
@@ -213,13 +219,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-static struct vm_struct *
-__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
-		unsigned long end, int node, gfp_t gfp_mask, void *caller)
+
+/*** Global kva allocator ***/
+
+#define VM_LAZY_FREE	0x01
+#define VM_LAZY_FREEING	0x02
+#define VM_VM_AREA	0x04
+
+struct vmap_area {
+	unsigned long va_start;
+	unsigned long va_end;
+	unsigned long flags;
+	struct rb_node rb_node;		/* address sorted rbtree */
+	struct list_head list;		/* address sorted list */
+	struct list_head purge_list;	/* "lazy purge" list */
+	void *private;
+	struct rcu_head rcu_head;
+};
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+static struct rb_root vmap_area_root = RB_ROOT;
+static LIST_HEAD(vmap_area_list);
+
+static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
-	struct vm_struct **p, *tmp, *area;
-	unsigned long align = 1;
+	struct rb_node *n = vmap_area_root.rb_node;
+
+	while (n) {
+		struct vmap_area *va;
+
+		va = rb_entry(n, struct vmap_area, rb_node);
+		if (addr < va->va_start)
+			n = n->rb_left;
+		else if (addr > va->va_start)
+			n = n->rb_right;
+		else
+			return va;
+	}
+
+	return NULL;
+}
+
+static void __insert_vmap_area(struct vmap_area *va)
+{
+	struct rb_node **p = &vmap_area_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node *tmp;
+
+	while (*p) {
+		struct vmap_area *tmp;
+
+		parent = *p;
+		tmp = rb_entry(parent, struct vmap_area, rb_node);
+		if (va->va_start < tmp->va_end)
+			p = &(*p)->rb_left;
+		else if (va->va_end > tmp->va_start)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&va->rb_node, parent, p);
+	rb_insert_color(&va->rb_node, &vmap_area_root);
+
+	/* address-sort this list so it is usable like the vmlist */
+	tmp = rb_prev(&va->rb_node);
+	if (tmp) {
+		struct vmap_area *prev;
+		prev = rb_entry(tmp, struct vmap_area, rb_node);
+		list_add_rcu(&va->list, &prev->list);
+	} else
+		list_add_rcu(&va->list, &vmap_area_list);
+}
+
+static void purge_vmap_area_lazy(void);
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+				unsigned long align,
+				unsigned long vstart, unsigned long vend,
+				int node, gfp_t gfp_mask)
+{
+	struct vmap_area *va;
+	struct rb_node *n;
+	unsigned long addr;
+	int purged = 0;
+
+	BUG_ON(size & ~PAGE_MASK);
+
+	addr = ALIGN(vstart, align);
+
+	va = kmalloc_node(sizeof(struct vmap_area),
+			gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+retry:
+	spin_lock(&vmap_area_lock);
+	/* XXX: could have a last_hole cache */
+	n = vmap_area_root.rb_node;
+	if (n) {
+		struct vmap_area *first = NULL;
+
+		do {
+			struct vmap_area *tmp;
+			tmp = rb_entry(n, struct vmap_area, rb_node);
+			if (tmp->va_end >= addr) {
+				if (!first && tmp->va_start < addr + size)
+					first = tmp;
+				n = n->rb_left;
+			} else {
+				first = tmp;
+				n = n->rb_right;
+			}
+		} while (n);
+
+		if (!first)
+			goto found;
+
+		if (first->va_end < addr) {
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+
+		while (addr + size >= first->va_start && addr + size <= vend) {
+			addr = ALIGN(first->va_end + PAGE_SIZE, align);
+
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+	}
+found:
+	if (addr + size > vend) {
+		spin_unlock(&vmap_area_lock);
+		if (!purged) {
+			purge_vmap_area_lazy();
+			purged = 1;
+			goto retry;
+		}
+		if (printk_ratelimit())
+			printk(KERN_WARNING "vmap allocation failed: "
+				 "use vmalloc=<size> to increase size.\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	BUG_ON(addr & (align-1));
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->flags = 0;
+	__insert_vmap_area(va);
+	spin_unlock(&vmap_area_lock);
+
+	return va;
+}
+
+static void rcu_free_va(struct rcu_head *head)
+{
+	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
+
+	kfree(va);
+}
+
+static void __free_vmap_area(struct vmap_area *va)
+{
+	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+	rb_erase(&va->rb_node, &vmap_area_root);
+	RB_CLEAR_NODE(&va->rb_node);
+	list_del_rcu(&va->list);
+
+	call_rcu(&va->rcu_head, rcu_free_va);
+}
+
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+	spin_lock(&vmap_area_lock);
+	__free_vmap_area(va);
+	spin_unlock(&vmap_area_lock);
+}
+
+/*
+ * Clear the pagetable entries of a given vmap_area
+ */
+static void unmap_vmap_area(struct vmap_area *va)
+{
+	vunmap_page_range(va->va_start, va->va_end);
+}
+
+/*
+ * lazy_max_pages is the maximum amount of virtual address space we gather up
+ * before attempting to purge with a TLB flush.
+ *
+ * There is a tradeoff here: a larger number will cover more kernel page tables
+ * and take slightly longer to purge, but it will linearly reduce the number of
+ * global TLB flushes that must be performed. It would seem natural to scale
+ * this number up linearly with the number of CPUs (because vmapping activity
+ * could also scale linearly with the number of CPUs), however it is likely
+ * that in practice, workloads might be constrained in other ways that mean
+ * vmap activity will not scale linearly with CPUs. Also, I want to be
+ * conservative and not introduce a big latency on huge systems, so go with
+ * a less aggressive log scale. It will still be an improvement over the old
+ * code, and it will be simple to change the scale factor if we find that it
+ * becomes a problem on bigger systems.
+ */
+static unsigned long lazy_max_pages(void)
+{
+	unsigned int log;
+
+	log = fls(num_online_cpus());
+
+	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+}
+
+static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+
+/*
+ * Purges all lazily-freed vmap areas.
+ *
+ * If sync is 0 then don't purge if there is already a purge in progress.
+ * If force_flush is 1, then flush kernel TLBs between *start and *end even
+ * if we found no lazy vmap areas to unmap (callers can use this to optimise
+ * their own TLB flushing).
+ * Returns with *start = min(*start, lowest purged address)
+ *              *end = max(*end, highest purged address)
+ */
+static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+					int sync, int force_flush)
+{
+	static DEFINE_SPINLOCK(purge_lock);
+	LIST_HEAD(valist);
+	struct vmap_area *va;
+	int nr = 0;
+
+	/*
+	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
+	 * should not expect such behaviour. This just simplifies locking for
+	 * the case that isn't actually used at the moment anyway.
+	 */
+	if (!sync && !force_flush) {
+		if (!spin_trylock(&purge_lock))
+			return;
+	} else
+		spin_lock(&purge_lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(va, &vmap_area_list, list) {
+		if (va->flags & VM_LAZY_FREE) {
+			if (va->va_start < *start)
+				*start = va->va_start;
+			if (va->va_end > *end)
+				*end = va->va_end;
+			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+			unmap_vmap_area(va);
+			list_add_tail(&va->purge_list, &valist);
+			va->flags |= VM_LAZY_FREEING;
+			va->flags &= ~VM_LAZY_FREE;
+		}
+	}
+	rcu_read_unlock();
+
+	if (nr) {
+		BUG_ON(nr > atomic_read(&vmap_lazy_nr));
+		atomic_sub(nr, &vmap_lazy_nr);
+	}
+
+	if (nr || force_flush)
+		flush_tlb_kernel_range(*start, *end);
+
+	if (nr) {
+		spin_lock(&vmap_area_lock);
+		list_for_each_entry(va, &valist, purge_list)
+			__free_vmap_area(va);
+		spin_unlock(&vmap_area_lock);
+	}
+	spin_unlock(&purge_lock);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas.
+ */
+static void purge_vmap_area_lazy(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+
+	__purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+	va->flags |= VM_LAZY_FREE;
+	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
+	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+		purge_vmap_area_lazy();
+}
+
+static struct vmap_area *find_vmap_area(unsigned long addr)
+{
+	struct vmap_area *va;
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area(addr);
+	spin_unlock(&vmap_area_lock);
+
+	return va;
+}
+
+static void free_unmap_vmap_area_addr(unsigned long addr)
+{
+	struct vmap_area *va;
+
+	va = find_vmap_area(addr);
+	BUG_ON(!va);
+	free_unmap_vmap_area(va);
+}
+
+
+/*** Per cpu kva allocator ***/
+
+/*
+ * vmap space is limited especially on 32 bit architectures. Ensure there is
+ * room for at least 16 percpu vmap blocks per CPU.
+ */
+/*
+ * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
+ * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
+ * instead (we just need a rough idea)
+ */
+#if BITS_PER_LONG == 32
+#define VMALLOC_SPACE		(128UL*1024*1024)
+#else
+#define VMALLOC_SPACE		(128UL*1024*1024*1024)
+#endif
+
+#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
+#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 64-bit: 256K with 4K pages */
+#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
+#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
+#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
+#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
+#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
+					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
+						VMALLOC_PAGES / NR_CPUS / 16))
+
+#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
+
+struct vmap_block_queue {
+	spinlock_t lock;
+	struct list_head free;
+	struct list_head dirty;
+	unsigned int nr_dirty;
+};
+
+struct vmap_block {
+	spinlock_t lock;
+	struct vmap_area *va;
+	struct vmap_block_queue *vbq;
+	unsigned long free, dirty;
+	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
+	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+	union {
+		struct {
+			struct list_head free_list;
+			struct list_head dirty_list;
+		};
+		struct rcu_head rcu_head;
+	};
+};
+
+/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
+static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
+
+/*
+ * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * in the free path. Could get rid of this if we change the API to return a
+ * "cookie" from alloc, to be passed to free. But no big deal yet.
+ */
+static DEFINE_SPINLOCK(vmap_block_tree_lock);
+static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+
+/*
+ * We should probably have a fallback mechanism to allocate virtual memory
+ * out of partially filled vmap blocks. However vmap block sizing should be
+ * fairly reasonable according to the vmalloc size, so it shouldn't be a
+ * big problem.
+ */
+
+static unsigned long addr_to_vb_idx(unsigned long addr)
+{
+	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
+	addr /= VMAP_BLOCK_SIZE;
+	return addr;
+}
+
+static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+{
+	struct vmap_block_queue *vbq;
+	struct vmap_block *vb;
+	struct vmap_area *va;
+	unsigned long vb_idx;
+	int node, err;
+
+	node = numa_node_id();
+
+	vb = kmalloc_node(sizeof(struct vmap_block),
+			gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!vb))
+		return ERR_PTR(-ENOMEM);
+
+	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
+					VMALLOC_START, VMALLOC_END,
+					node, gfp_mask);
+	if (unlikely(IS_ERR(va))) {
+		kfree(vb);
+		return ERR_PTR(PTR_ERR(va));
+	}
+
+	err = radix_tree_preload(gfp_mask);
+	if (unlikely(err)) {
+		kfree(vb);
+		free_vmap_area(va);
+		return ERR_PTR(err);
+	}
+
+	spin_lock_init(&vb->lock);
+	vb->va = va;
+	vb->free = VMAP_BBMAP_BITS;
+	vb->dirty = 0;
+	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
+	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+	INIT_LIST_HEAD(&vb->free_list);
+	INIT_LIST_HEAD(&vb->dirty_list);
+
+	vb_idx = addr_to_vb_idx(va->va_start);
+	spin_lock(&vmap_block_tree_lock);
+	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
+	spin_unlock(&vmap_block_tree_lock);
+	BUG_ON(err);
+	radix_tree_preload_end();
+	/* Make the new block allocatable from this CPU's free list. */
+	vbq = &get_cpu_var(vmap_block_queue);
+	vb->vbq = vbq;
+	spin_lock(&vbq->lock);
+	list_add(&vb->free_list, &vbq->free);
+	spin_unlock(&vbq->lock);
+	put_cpu_var(vmap_block_queue);
+
+	return vb;
+}
+
+static void rcu_free_vb(struct rcu_head *head)
+{
+	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
+
+	kfree(vb);
+}
+
+static void free_vmap_block(struct vmap_block *vb)
+{
+	struct vmap_block *tmp;
+	unsigned long vb_idx;
+
+	spin_lock(&vb->vbq->lock);
+	if (!list_empty(&vb->free_list))
+		list_del(&vb->free_list);
+	if (!list_empty(&vb->dirty_list))
+		list_del(&vb->dirty_list);
+	spin_unlock(&vb->vbq->lock);
+
+	vb_idx = addr_to_vb_idx(vb->va->va_start);
+	spin_lock(&vmap_block_tree_lock);
+	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
+	spin_unlock(&vmap_block_tree_lock);
+	BUG_ON(tmp != vb);
+
+	free_unmap_vmap_area(vb->va);
+	call_rcu(&vb->rcu_head, rcu_free_vb);
+}
+
+static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
+{
+	struct vmap_block_queue *vbq;
+	struct vmap_block *vb;
+	unsigned long addr = 0;
+	unsigned int order;
+
+	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+	order = get_order(size);
+
+again:
+	rcu_read_lock();
+	vbq = &get_cpu_var(vmap_block_queue);
+	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+		int i;
+
+		spin_lock(&vb->lock);
+		i = bitmap_find_free_region(vb->alloc_map,
+						VMAP_BBMAP_BITS, order);
+
+		if (i >= 0) {
+			addr = vb->va->va_start + (i << PAGE_SHIFT);
+			BUG_ON(addr_to_vb_idx(addr) !=
+					addr_to_vb_idx(vb->va->va_start));
+			vb->free -= 1UL << order;
+			if (vb->free == 0) {
+				spin_lock(&vbq->lock);
+				list_del_init(&vb->free_list);
+				spin_unlock(&vbq->lock);
+			}
+			spin_unlock(&vb->lock);
+			break;
+		}
+		spin_unlock(&vb->lock);
+	}
+	put_cpu_var(vmap_block_queue);
+	rcu_read_unlock();
+	/* No block on the free list had room: add a fresh one and retry. */
+	if (!addr) {
+		vb = new_vmap_block(gfp_mask);
+		if (IS_ERR(vb))
+			return vb;
+		goto again;
+	}
+
+	return (void *)addr;
+}
+
+static void vb_free(const void *addr, unsigned long size)
+{
+	unsigned long offset;
+	unsigned long vb_idx;
+	unsigned int order;
+	struct vmap_block *vb;
+
+	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+	order = get_order(size);
+
+	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+
+	vb_idx = addr_to_vb_idx((unsigned long)addr);
+	rcu_read_lock();
+	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
+	rcu_read_unlock();
+	BUG_ON(!vb);
+
+	spin_lock(&vb->lock);
+	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+	if (!vb->dirty) {
+		spin_lock(&vb->vbq->lock);
+		list_add(&vb->dirty_list, &vb->vbq->dirty);
+		spin_unlock(&vb->vbq->lock);
+	}
+	vb->dirty += 1UL << order;
+	if (vb->dirty == VMAP_BBMAP_BITS) {
+		BUG_ON(vb->free || !list_empty(&vb->free_list));
+		spin_unlock(&vb->lock);
+		free_vmap_block(vb);
+	} else
+		spin_unlock(&vb->lock);
+}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+	int cpu;
+	int flush = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+		struct vmap_block *vb;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+			int i;
+
+			spin_lock(&vb->lock);
+			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
+			while (i < VMAP_BBMAP_BITS) {
+				unsigned long s, e;
+				int j;
+				j = find_next_zero_bit(vb->dirty_map,
+					VMAP_BBMAP_BITS, i);
+
+				s = vb->va->va_start + (i << PAGE_SHIFT);
+				e = vb->va->va_start + (j << PAGE_SHIFT);
+				vunmap_page_range(s, e);
+				flush = 1;
+
+				if (s < start)
+					start = s;
+				if (e > end)
+					end = e;
+
+				i = j;
+				i = find_next_bit(vb->dirty_map,
+							VMAP_BBMAP_BITS, i);
+			}
+			spin_unlock(&vb->lock);
+		}
+		rcu_read_unlock();
+	}
+
+	__purge_vmap_area_lazy(&start, &end, 1, flush);
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+/**
+ * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
+ * @mem: the pointer returned by vm_map_ram
+ * @count: the count passed to that vm_map_ram call (cannot unmap partial)
+ */
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+	unsigned long size = (unsigned long)count << PAGE_SHIFT;
+	unsigned long addr = (unsigned long)mem;
+
+	BUG_ON(!addr);
+	BUG_ON(addr < VMALLOC_START);
+	BUG_ON(addr > VMALLOC_END);
+	BUG_ON(addr & (PAGE_SIZE-1));
+
+	debug_check_no_locks_freed(mem, size);
+
+	if (likely(count <= VMAP_MAX_ALLOC))
+		vb_free(mem, size);
+	else
+		free_unmap_vmap_area_addr(addr);
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+/**
+ * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
+ * @pages: an array of pointers to the pages to be mapped
+ * @count: number of pages
+ * @node: prefer to allocate data structures on this node
+ * @prot: memory protection to use. PAGE_KERNEL for regular RAM
+ * @returns: a pointer to the address that has been mapped, or NULL on failure
+ */
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+	unsigned long size = (unsigned long)count << PAGE_SHIFT;
 	unsigned long addr;
+	void *mem;
+
+	if (likely(count <= VMAP_MAX_ALLOC)) {
+		mem = vb_alloc(size, GFP_KERNEL);
+		if (IS_ERR(mem))
+			return NULL;
+		addr = (unsigned long)mem;
+	} else {
+		struct vmap_area *va;
+		va = alloc_vmap_area(size, PAGE_SIZE,
+				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		if (IS_ERR(va))
+			return NULL;
+
+		addr = va->va_start;
+		mem = (void *)addr;
+	}
+	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+		vm_unmap_ram(mem, count);
+		return NULL;
+	}
+	return mem;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+void __init vmalloc_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct vmap_block_queue *vbq;
+
+		vbq = &per_cpu(vmap_block_queue, i);
+		spin_lock_init(&vbq->lock);
+		INIT_LIST_HEAD(&vbq->free);
+		INIT_LIST_HEAD(&vbq->dirty);
+		vbq->nr_dirty = 0;
+	}
+}
+
+void unmap_kernel_range(unsigned long addr, unsigned long size)
+{
+	unsigned long end = addr + size;
+	vunmap_page_range(addr, end);
+	flush_tlb_kernel_range(addr, end);
+}
+
+int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long end = addr + area->size - PAGE_SIZE;
+	int err;
+
+	err = vmap_page_range(addr, end, prot, *pages);
+	if (err > 0) {
+		*pages += err;
+		err = 0;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(map_vm_area);
+
+/*** Old vmalloc interfaces ***/
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+static struct vm_struct *__get_vm_area_node(unsigned long size,
+		unsigned long flags, unsigned long start, unsigned long end,
+		int node, gfp_t gfp_mask, void *caller)
+{
+	static struct vmap_area *va;
+	struct vm_struct *area;
+	struct vm_struct *tmp, **p;
+	unsigned long align = 1;
 
 	BUG_ON(in_interrupt());
 	if (flags & VM_IOREMAP) {
@@ -232,13 +976,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
 
 		align = 1ul << bit;
 	}
-	addr = ALIGN(start, align);
+
 	size = PAGE_ALIGN(size);
 	if (unlikely(!size))
 		return NULL;
 
 	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
-
 	if (unlikely(!area))
 		return NULL;
 
@@ -247,48 +990,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
 	 */
 	size += PAGE_SIZE;
 
-	write_lock(&vmlist_lock);
-	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
-		if ((unsigned long)tmp->addr < addr) {
-			if((unsigned long)tmp->addr + tmp->size >= addr)
-				addr = ALIGN(tmp->size + 
-					     (unsigned long)tmp->addr, align);
-			continue;
-		}
-		if ((size + addr) < addr)
-			goto out;
-		if (size + addr <= (unsigned long)tmp->addr)
-			goto found;
-		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
-		if (addr > end - size)
-			goto out;
+	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	if (IS_ERR(va)) {
+		kfree(area);
+		return NULL;
 	}
-	if ((size + addr) < addr)
-		goto out;
-	if (addr > end - size)
-		goto out;
-
-found:
-	area->next = *p;
-	*p = area;
 
 	area->flags = flags;
-	area->addr = (void *)addr;
+	area->addr = (void *)va->va_start;
 	area->size = size;
 	area->pages = NULL;
 	area->nr_pages = 0;
 	area->phys_addr = 0;
 	area->caller = caller;
+	va->private = area;
+	va->flags |= VM_VM_AREA;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+		if (tmp->addr >= area->addr)
+			break;
+	}
+	area->next = *p;
+	*p = area;
 	write_unlock(&vmlist_lock);
 
 	return area;
-
-out:
-	write_unlock(&vmlist_lock);
-	kfree(area);
-	if (printk_ratelimit())
-		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
-	return NULL;
 }
 
 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -328,39 +1055,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
 				  gfp_mask, __builtin_return_address(0));
 }
 
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__find_vm_area(const void *addr)
+static struct vm_struct *find_vm_area(const void *addr)
 {
-	struct vm_struct *tmp;
+	struct vmap_area *va;
 
-	for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
-		 if (tmp->addr == addr)
-			break;
-	}
-
-	return tmp;
-}
-
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__remove_vm_area(const void *addr)
-{
-	struct vm_struct **p, *tmp;
+	va = find_vmap_area((unsigned long)addr);
+	if (va && va->flags & VM_VM_AREA)
+		return va->private;
 
-	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
-		 if (tmp->addr == addr)
-			 goto found;
-	}
 	return NULL;
-
-found:
-	unmap_vm_area(tmp);
-	*p = tmp->next;
-
-	/*
-	 * Remove the guard page.
-	 */
-	tmp->size -= PAGE_SIZE;
-	return tmp;
 }
 
 /**
@@ -373,11 +1076,24 @@ found:
  */
 struct vm_struct *remove_vm_area(const void *addr)
 {
-	struct vm_struct *v;
-	write_lock(&vmlist_lock);
-	v = __remove_vm_area(addr);
-	write_unlock(&vmlist_lock);
-	return v;
+	struct vmap_area *va;
+
+	va = find_vmap_area((unsigned long)addr);
+	if (va && va->flags & VM_VM_AREA) {
+		struct vm_struct *vm = va->private;
+		struct vm_struct *tmp, **p;
+		free_unmap_vmap_area(va);
+		vm->size -= PAGE_SIZE;
+
+		write_lock(&vmlist_lock);
+		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+			;
+		*p = tmp->next;
+		write_unlock(&vmlist_lock);
+
+		return vm;
+	}
+	return NULL;
 }
 
 static void __vunmap(const void *addr, int deallocate_pages)
@@ -487,6 +1203,8 @@ void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+			    int node, void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node, void *caller)
 {
@@ -613,10 +1331,8 @@ void *vmalloc_user(unsigned long size)
 
 	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
-		write_lock(&vmlist_lock);
-		area = __find_vm_area(ret);
+		area = find_vm_area(ret);
 		area->flags |= VM_USERMAP;
-		write_unlock(&vmlist_lock);
 	}
 	return ret;
 }
@@ -696,10 +1412,8 @@ void *vmalloc_32_user(unsigned long size)
 
 	ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
-		write_lock(&vmlist_lock);
-		area = __find_vm_area(ret);
+		area = find_vm_area(ret);
 		area->flags |= VM_USERMAP;
-		write_unlock(&vmlist_lock);
 	}
 	return ret;
 }
@@ -800,26 +1514,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 	struct vm_struct *area;
 	unsigned long uaddr = vma->vm_start;
 	unsigned long usize = vma->vm_end - vma->vm_start;
-	int ret;
 
 	if ((PAGE_SIZE-1) & (unsigned long)addr)
 		return -EINVAL;
 
-	read_lock(&vmlist_lock);
-	area = __find_vm_area(addr);
+	area = find_vm_area(addr);
 	if (!area)
-		goto out_einval_locked;
+		return -EINVAL;
 
 	if (!(area->flags & VM_USERMAP))
-		goto out_einval_locked;
+		return -EINVAL;
 
 	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
-		goto out_einval_locked;
-	read_unlock(&vmlist_lock);
+		return -EINVAL;
 
 	addr += pgoff << PAGE_SHIFT;
 	do {
 		struct page *page = vmalloc_to_page(addr);
+		int ret;
+
 		ret = vm_insert_page(vma, uaddr, page);
 		if (ret)
 			return ret;
@@ -832,11 +1545,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
 	vma->vm_flags |= VM_RESERVED;
 
-	return ret;
-
-out_einval_locked:
-	read_unlock(&vmlist_lock);
-	return -EINVAL;
+	return 0;
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-- 
cgit v1.2.3


From e575f111dc0f27044e170580e7de50985ab3e011 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:27:08 -0700
Subject: coredump_filter: add hugepage dumping

Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped.  But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g.  vmalloc, sound related).

Thus hugepages are never dumped and it can't be debugged easily.  Many
developers want hugepages to be included into core-dump.

However, we can't read a generic VM_RESERVED area because such an area is
often an IO mapping, and reading it may change device state.  That is
definitely an undesirable side-effect.

So adding a hugepage specific bit to the coredump filter is better.  It
enables hugepage core dumping and doesn't cause any side-effect to any
i/o devices.

In addition, libhugetlbfs uses hugetlb private mapping pages as anonymous
pages.  Therefore hugetlb private mapping pages should be core dumped by
default.

So /proc/[pid]/coredump_filter gets two new bits.

 - bit 5 controls whether hugetlb private mapping pages are dumped. (default: yes)
 - bit 6 controls whether hugetlb shared mapping pages are dumped.  (default: no)

I tested by following method.

% ulimit -c unlimited
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>

#include "hugetlbfs.h"

/*
 * Map nr_pages huge pages (shared by default, private with -p), touch the
 * second huge page and then crash on purpose so that a core dump is written.
 */
int main(int argc, char** argv){
	char* p;
	int ch;
	int mmap_flags = MAP_SHARED;
	int fd;
	int nr_pages;

	while((ch = getopt(argc, argv, "p")) != -1) {
		switch (ch) {
		case 'p':
			mmap_flags &= ~MAP_SHARED;
			mmap_flags |= MAP_PRIVATE;
			break;
		default:
			/* nothing*/
			break;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc == 0){
		printf("need # of pages\n");
		exit(1);
	}

	/* the write below touches the second huge page, so 2 is the minimum */
	nr_pages = atoi(argv[0]);
	if (nr_pages < 2) {
		printf("nr_pages must be >= 2\n");
		exit(1);
	}

	/* bail out on setup failure so the only crash is the intended one */
	fd = hugetlbfs_unlinked_fd();
	if (fd < 0) {
		fprintf(stderr, "hugetlbfs_unlinked_fd failed\n");
		exit(1);
	}

	p = mmap(NULL, nr_pages * gethugepagesize(),
		 PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	sleep(2);

	*(p + gethugepagesize()) = 1; /* COW */
	sleep(2);

	/* crash! */
	*(int*)0 = 1;

	return 0;
}

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 15 ++++++++++-----
 fs/binfmt_elf.c                    | 12 ++++++++++--
 include/linux/sched.h              |  7 +++++--
 3 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index c032bf39e8b9..02cb7faeed6b 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -2412,24 +2412,29 @@ will be dumped when the <pid> process is dumped. coredump_filter is a bitmask
 of memory types. If a bit of the bitmask is set, memory segments of the
 corresponding memory type are dumped, otherwise they are not dumped.
 
-The following 4 memory types are supported:
+The following 7 memory types are supported:
   - (bit 0) anonymous private memory
   - (bit 1) anonymous shared memory
   - (bit 2) file-backed private memory
   - (bit 3) file-backed shared memory
   - (bit 4) ELF header pages in file-backed private memory areas (it is
             effective only if the bit 2 is cleared)
+  - (bit 5) hugetlb private memory
+  - (bit 6) hugetlb shared memory
 
   Note that MMIO pages such as frame buffer are never dumped and vDSO pages
   are always dumped regardless of the bitmask status.
 
-Default value of coredump_filter is 0x3; this means all anonymous memory
-segments are dumped.
+  Note bits 0-4 don't affect any hugetlb memory. hugetlb memory is only
+  affected by bits 5-6.
+
+Default value of coredump_filter is 0x23; this means all anonymous memory
+segments and hugetlb private memory are dumped.
 
 If you don't want to dump all shared memory segments attached to pid 1234,
-write 1 to the process's proc file.
+write 0x21 to the process's proc file.
 
-  $ echo 0x1 > /proc/1234/coredump_filter
+  $ echo 0x21 > /proc/1234/coredump_filter
 
 When a new process is created, the process inherits the bitmask status from its
 parent. It is useful to set up coredump_filter before the program runs.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c76afa26edf7..e2159063198a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off)
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
 				   unsigned long mm_flags)
 {
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
 	/* The vma can be set up to tell us the answer directly.  */
 	if (vma->vm_flags & VM_ALWAYSDUMP)
 		goto whole;
 
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
 	/* Do not dump I/O mapped devices or special mappings */
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
-
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c226c7b82946..017cc914ef1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -403,12 +403,15 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_DUMP_MAPPED_PRIVATE	4
 #define MMF_DUMP_MAPPED_SHARED	5
 #define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
 #define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS	5
+#define MMF_DUMP_FILTER_BITS	7
 #define MMF_DUMP_FILTER_MASK \
 	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
 #define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED))
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE))
 
 struct sighand_struct {
 	atomic_t		count;
-- 
cgit v1.2.3


From 8174f1503f4bf7e9a14b3fbbfdb30c6be6e29f77 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Sat, 18 Oct 2008 20:27:19 -0700
Subject: container freezer: make refrigerator always available

Now that the TIF_FREEZE flag is available in all architectures, extract
the refrigerator() and freeze_task() from kernel/power/process.c and make
it available to all.

The refrigerator() can now be used in a control group subsystem
implementing a control group freezer.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Matt Helsley <matthltc@us.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/freezer.h |  14 ++++--
 kernel/Makefile         |   1 +
 kernel/freezer.c        | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/Kconfig    |   3 ++
 kernel/power/process.c  | 116 ---------------------------------------------
 5 files changed, 137 insertions(+), 119 deletions(-)
 create mode 100644 kernel/freezer.c

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index deddeedf3257..17e3bb42dd3c 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -6,7 +6,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_FREEZER
 /*
  * Check if a process has been frozen
  */
@@ -39,6 +39,11 @@ static inline void clear_freeze_flag(struct task_struct *p)
 	clear_tsk_thread_flag(p, TIF_FREEZE);
 }
 
+static inline bool should_send_signal(struct task_struct *p)
+{
+	return !(p->flags & PF_FREEZER_NOSIG);
+}
+
 /*
  * Wake up a frozen process
  *
@@ -75,6 +80,9 @@ static inline int try_to_freeze(void)
 		return 0;
 }
 
+extern bool freeze_task(struct task_struct *p, bool sig_only);
+extern void cancel_freezing(struct task_struct *p);
+
 /*
  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
  * calls wait_for_completion(&vfork) and reset right after it returns from this
@@ -166,7 +174,7 @@ static inline void set_freezable_with_signal(void)
 	} while (try_to_freeze());					\
 	__retval;							\
 })
-#else /* !CONFIG_PM_SLEEP */
+#else /* !CONFIG_FREEZER */
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
 static inline void set_freeze_flag(struct task_struct *p) {}
@@ -191,6 +199,6 @@ static inline void set_freezable_with_signal(void) {}
 #define wait_event_freezable_timeout(wq, condition, timeout)		\
 		wait_event_interruptible_timeout(wq, condition, timeout)
 
-#endif /* !CONFIG_PM_SLEEP */
+#endif /* !CONFIG_FREEZER */
 
 #endif	/* FREEZER_H_INCLUDED */
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..e8194d15d5f4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..cb0931f89306
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,122 @@
+/*
+ * kernel/freezer.c - Function to freeze a process
+ *
+ * Originally from kernel/power/process.c
+ */
+
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+
+/*
+ * freezing is complete, mark current process as frozen
+ */
+static inline void frozen_process(void)
+{
+	if (!unlikely(current->flags & PF_NOFREEZE)) {
+		current->flags |= PF_FROZEN;
+		wmb();
+	}
+	clear_freeze_flag(current);
+}
+
+/* Refrigerator is place where frozen processes are stored :-). */
+void refrigerator(void)
+{
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
+	long save;
+
+	task_lock(current);
+	if (freezing(current)) {
+		frozen_process();
+		task_unlock(current);
+	} else {
+		task_unlock(current);
+		return;
+	}
+	save = current->state;
+	pr_debug("%s entered refrigerator\n", current->comm);
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!frozen(current))
+			break;
+		schedule();
+	}
+	pr_debug("%s left refrigerator\n", current->comm);
+	__set_current_state(save);
+}
+EXPORT_SYMBOL(refrigerator);
+
+static void fake_signal_wake_up(struct task_struct *p)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	signal_wake_up(p, 0);
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+}
+
+/**
+ *	freeze_task - send a freeze request to given task
+ *	@p: task to send the request to
+ *	@sig_only: if set, the request will only be sent if the task has the
+ *		PF_FREEZER_NOSIG flag unset
+ *	Return value: 'false', if @sig_only is set and the task has
+ *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
+ *
+ *	The freeze request is sent by setting the task's TIF_FREEZE flag and
+ *	either sending a fake signal to it or waking it up, depending on whether
+ *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
+ *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
+ *	TIF_FREEZE flag will not be set.
+ */
+bool freeze_task(struct task_struct *p, bool sig_only)
+{
+	/*
+	 * We first check if the task is freezing and next if it has already
+	 * been frozen to avoid the race with frozen_process() which first marks
+	 * the task as frozen and next clears its TIF_FREEZE.
+	 */
+	if (!freezing(p)) {
+		rmb();
+		if (frozen(p))
+			return false;
+
+		if (!sig_only || should_send_signal(p))
+			set_freeze_flag(p);
+		else
+			return false;
+	}
+
+	if (should_send_signal(p)) {
+		if (!signal_pending(p))
+			fake_signal_wake_up(p);
+	} else if (sig_only) {
+		return false;
+	} else {
+		wake_up_state(p, TASK_INTERRUPTIBLE);
+	}
+
+	return true;
+}
+
+void cancel_freezing(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (freezing(p)) {
+		pr_debug("  clean up: %s\n", p->comm);
+		clear_freeze_flag(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		recalc_sigpending_and_wake(p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dcd165f92a88..ebdd7f55273d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -85,6 +85,9 @@ config PM_SLEEP
 	depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
 	default y
 
+config FREEZER
+	def_bool PM_SLEEP
+
 config SUSPEND
 	bool "Suspend to RAM and standby"
 	depends on PM && ARCH_SUSPEND_POSSIBLE
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 278946aecaf0..444cea80fde8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)
 	return 1;
 }
 
-/*
- * freezing is complete, mark current process as frozen
- */
-static inline void frozen_process(void)
-{
-	if (!unlikely(current->flags & PF_NOFREEZE)) {
-		current->flags |= PF_FROZEN;
-		wmb();
-	}
-	clear_freeze_flag(current);
-}
-
-/* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
-{
-	/* Hmm, should we be allowed to suspend when there are realtime
-	   processes around? */
-	long save;
-
-	task_lock(current);
-	if (freezing(current)) {
-		frozen_process();
-		task_unlock(current);
-	} else {
-		task_unlock(current);
-		return;
-	}
-	save = current->state;
-	pr_debug("%s entered refrigerator\n", current->comm);
-
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* We sent fake signal, clean it up */
-	spin_unlock_irq(&current->sighand->siglock);
-
-	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!frozen(current))
-			break;
-		schedule();
-	}
-	pr_debug("%s left refrigerator\n", current->comm);
-	__set_current_state(save);
-}
-
-static void fake_signal_wake_up(struct task_struct *p)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, 0);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-}
-
-static inline bool should_send_signal(struct task_struct *p)
-{
-	return !(p->flags & PF_FREEZER_NOSIG);
-}
-
-/**
- *	freeze_task - send a freeze request to given task
- *	@p: task to send the request to
- *	@sig_only: if set, the request will only be sent if the task has the
- *		PF_FREEZER_NOSIG flag unset
- *	Return value: 'false', if @sig_only is set and the task has
- *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
- *
- *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
- *	either sending a fake signal to it or waking it up, depending on whether
- *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
- *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- *	TIF_FREEZE flag will not be set.
- */
-static bool freeze_task(struct task_struct *p, bool sig_only)
-{
-	/*
-	 * We first check if the task is freezing and next if it has already
-	 * been frozen to avoid the race with frozen_process() which first marks
-	 * the task as frozen and next clears its TIF_FREEZE.
-	 */
-	if (!freezing(p)) {
-		rmb();
-		if (frozen(p))
-			return false;
-
-		if (!sig_only || should_send_signal(p))
-			set_freeze_flag(p);
-		else
-			return false;
-	}
-
-	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
-	} else if (sig_only) {
-		return false;
-	} else {
-		wake_up_state(p, TASK_INTERRUPTIBLE);
-	}
-
-	return true;
-}
-
-static void cancel_freezing(struct task_struct *p)
-{
-	unsigned long flags;
-
-	if (freezing(p)) {
-		pr_debug("  clean up: %s\n", p->comm);
-		clear_freeze_flag(p);
-		spin_lock_irqsave(&p->sighand->siglock, flags);
-		recalc_sigpending_and_wake(p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	}
-}
-
 static int try_to_freeze_tasks(bool sig_only)
 {
 	struct task_struct *g, *p;
@@ -264,4 +149,3 @@ void thaw_processes(void)
 	printk("done.\n");
 }
 
-EXPORT_SYMBOL(refrigerator);
-- 
cgit v1.2.3


From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Sat, 18 Oct 2008 20:27:21 -0700
Subject: container freezer: implement freezer cgroup subsystem

This patch implements a new freezer subsystem in the control groups
framework.  It provides a way to stop and resume execution of all tasks in
a cgroup by writing in the cgroup filesystem.

The freezer subsystem in the container filesystem defines a file named
freezer.state.  Writing "FROZEN" to the state file will freeze all tasks
in the cgroup.  Subsequently writing "RUNNING" will unfreeze the tasks in
the cgroup.  Reading will return the current state.

* Examples of usage :

   # mkdir /containers/freezer
   # mount -t cgroup -ofreezer freezer  /containers
   # mkdir /containers/0
   # echo $some_pid > /containers/0/tasks

to get status of the freezer subsystem :

   # cat /containers/0/freezer.state
   RUNNING

to freeze all tasks in the container :

   # echo FROZEN > /containers/0/freezer.state
   # cat /containers/0/freezer.state
   FREEZING
   # cat /containers/0/freezer.state
   FROZEN

to unfreeze all tasks in the container :

   # echo RUNNING > /containers/0/freezer.state
   # cat /containers/0/freezer.state
   RUNNING

This is the basic mechanism which should do the right thing for user space
tasks in a simple scenario.

It's important to note that freezing can be incomplete.  In that case we
return EBUSY.  This means that some tasks in the cgroup are busy doing
something that prevents us from completely freezing the cgroup at this
time.  After EBUSY, the cgroup will remain partially frozen -- reflected
by freezer.state reporting "FREEZING" when read.  The state will remain
"FREEZING" until one of these things happens:

	1) Userspace cancels the freezing operation by writing "RUNNING" to
		the freezer.state file
	2) Userspace retries the freezing operation by writing "FROZEN" to
		the freezer.state file (writing "FREEZING" is not legal
		and returns EIO)
	3) The tasks that blocked the cgroup from entering the "FROZEN"
		state disappear from the cgroup's set of tasks.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: export thaw_process]
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/Kconfig            |   1 +
 arch/arm/Kconfig              |   2 +
 arch/avr32/Kconfig            |   2 +
 arch/blackfin/Kconfig         |   3 +
 arch/cris/Kconfig             |   2 +
 arch/frv/Kconfig              |   2 +
 arch/h8300/Kconfig            |   2 +
 arch/ia64/Kconfig             |   2 +
 arch/m32r/Kconfig             |   2 +
 arch/m68k/Kconfig             |   2 +
 arch/m68knommu/Kconfig        |   2 +
 arch/mips/Kconfig             |   2 +
 arch/mn10300/Kconfig          |   2 +
 arch/parisc/Kconfig           |   2 +
 arch/powerpc/Kconfig          |   2 +
 arch/s390/Kconfig             |   2 +
 arch/sh/Kconfig               |   2 +
 arch/sparc/Kconfig            |   2 +
 arch/sparc64/Kconfig          |   1 +
 arch/um/Kconfig               |   2 +
 arch/x86/Kconfig              |   1 +
 arch/xtensa/Kconfig           |   1 +
 include/linux/cgroup_subsys.h |   6 +
 include/linux/freezer.h       |  29 ++--
 init/Kconfig                  |   7 +
 kernel/Kconfig.freezer        |   2 +
 kernel/Makefile               |   1 +
 kernel/cgroup_freezer.c       | 366 ++++++++++++++++++++++++++++++++++++++++++
 kernel/freezer.c              |  32 ++++
 kernel/power/Kconfig          |   3 -
 30 files changed, 465 insertions(+), 22 deletions(-)
 create mode 100644 kernel/Kconfig.freezer
 create mode 100644 kernel/cgroup_freezer.c

(limited to 'include/linux')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index a0f642b6a4b9..6110197757a3 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -70,6 +70,7 @@ config AUTO_IRQ_AFFINITY
 	default y
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 
 menu "System setup"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4853f9df37bd..df39d20f7425 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -192,6 +192,8 @@ config VECTORS_BASE
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System Type"
 
 choice
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 7c239a916275..33a5b2969eb4 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -72,6 +72,8 @@ config GENERIC_BUG
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System Type and features"
 
 source "kernel/time/Kconfig"
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 8102c79aaa94..29e71ed6b8a7 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -64,8 +64,11 @@ config HARDWARE_PM
 	depends on OPROFILE
 
 source "init/Kconfig"
+
 source "kernel/Kconfig.preempt"
 
+source "kernel/Kconfig.freezer"
+
 menu "Blackfin Processor Options"
 
 comment "Processor and Board Settings"
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 9389d38f222f..07335e719bf8 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -62,6 +62,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "General setup"
 
 source "fs/Kconfig.binfmt"
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index a5aac1b07562..9d1552a9ee2c 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Fujitsu FR-V system setup"
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index c7966746fbfe..bd1995403c67 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -90,6 +90,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "arch/h8300/Kconfig.cpu"
 
 menu "Executable file formats"
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3b7aa38254a8..912c57db2d21 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Processor type and features"
 
 config IA64
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 00289c178f89..dbaed4a63815 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -42,6 +42,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Processor type and features"
 
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 677c93a490f6..836fb66f080d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -62,6 +62,8 @@ mainmenu "Linux/68k Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Platform dependent setup"
 
 config EISA
diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig
index 0a8998315e5e..76b66feb74df 100644
--- a/arch/m68knommu/Kconfig
+++ b/arch/m68knommu/Kconfig
@@ -75,6 +75,8 @@ config NO_IOPORT
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Processor type and features"
 
 choice
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index b905744d7915..5f149b030c0f 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
 	  add initrd or initramfs image to the kernel image.
 	  Otherwise, say N.
 
+source "kernel/Kconfig.freezer"
+
 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
 
 config HW_HAS_EISA
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index dd557c9cf001..9a9f43358879 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -68,6 +68,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Matsushita MN10300 system setup"
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8313fccced5e..2bd1f6ef5db0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -90,6 +90,8 @@ config ARCH_MAY_HAVE_PC_FDC
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Processor type and features"
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 380baa1780e9..9391199d9e77 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -230,6 +230,8 @@ config PPC_OF_PLATFORM_PCI
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "arch/powerpc/sysdev/Kconfig"
 source "arch/powerpc/platforms/Kconfig"
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bc581d8a7cd9..70b7645ce745 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -78,6 +78,8 @@ config S390
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Base setup"
 
 comment "Processor type and features"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5131d50f851a..2ed5713b7540 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -106,6 +106,8 @@ config IO_TRAPPED
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System type"
 
 #
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 97671dac12a6..e594559c8dba 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -37,6 +37,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "General machine setup"
 
 config SMP
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 5446e2a499b1..035b15af90d8 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -96,6 +96,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
 	def_bool y
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 6976812cfb18..393bccfe1785 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -229,6 +229,8 @@ endmenu
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "drivers/block/Kconfig"
 
 source "arch/um/Kconfig.char"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bd3c2c53873e..49349ba77d80 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -193,6 +193,7 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 02e417d3d8e9..a213260b51e5 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -55,6 +55,7 @@ config HZ
 	default 100
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e2877454ec82..9c22396e8b50 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FREEZER
+SUBSYS(freezer)
+#endif
+
+/* */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 17e3bb42dd3c..8f225339eee9 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -46,26 +46,11 @@ static inline bool should_send_signal(struct task_struct *p)
 
 /*
  * Wake up a frozen process
- *
- * task_lock() is taken to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails.  Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
  */
-static inline int thaw_process(struct task_struct *p)
-{
-	task_lock(p);
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
-		task_unlock(p);
-		wake_up_process(p);
-		return 1;
-	}
-	clear_freeze_flag(p);
-	task_unlock(p);
-	return 0;
-}
+extern int __thaw_process(struct task_struct *p);
+
+/* Takes and releases task alloc lock using task_lock() */
+extern int thaw_process(struct task_struct *p);
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
@@ -83,6 +68,12 @@ static inline int try_to_freeze(void)
 extern bool freeze_task(struct task_struct *p, bool sig_only);
 extern void cancel_freezing(struct task_struct *p);
 
+#ifdef CONFIG_CGROUP_FREEZER
+extern int cgroup_frozen(struct task_struct *task);
+#else /* !CONFIG_CGROUP_FREEZER */
+static inline int cgroup_frozen(struct task_struct *task) { return 0; }
+#endif /* !CONFIG_CGROUP_FREEZER */
+
 /*
  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
  * calls wait_for_completion(&vfork) and reset right after it returns from this
diff --git a/init/Kconfig b/init/Kconfig
index 5ceff3249a2d..8828ed0b2051 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -299,6 +299,13 @@ config CGROUP_NS
           for instance virtual servers and checkpoint/restart
           jobs.
 
+config CGROUP_FREEZER
+        bool "control group freezer subsystem"
+        depends on CGROUPS
+        help
+          Provides a way to freeze and unfreeze all tasks in a
+	  cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	depends on CGROUPS && EXPERIMENTAL
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
+config FREEZER
+	def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Makefile b/kernel/Makefile
index e8194d15d5f4..066550aa61c5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..b08722de610c
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,366 @@
+/*
+ * cgroup_freezer.c -  control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+
+enum freezer_state {
+	STATE_RUNNING = 0,
+	STATE_FREEZING,
+	STATE_FROZEN,
+};
+
+struct freezer {
+	struct cgroup_subsys_state css;
+	enum freezer_state state;
+	spinlock_t lock; /* protects _writes_ to state */
+};
+
+static inline struct freezer *cgroup_freezer(
+		struct cgroup *cgroup)
+{
+	return container_of(
+		cgroup_subsys_state(cgroup, freezer_subsys_id),
+		struct freezer, css);
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, freezer_subsys_id),
+			    struct freezer, css);
+}
+
+int cgroup_frozen(struct task_struct *task)
+{
+	struct freezer *freezer;
+	enum freezer_state state;
+
+	task_lock(task);
+	freezer = task_freezer(task);
+	state = freezer->state;
+	task_unlock(task);
+
+	return state == STATE_FROZEN;
+}
+
+/*
+ * cgroups_write_string() limits the size of freezer state strings to
+ * CGROUP_LOCAL_BUFFER_SIZE
+ */
+static const char *freezer_state_strs[] = {
+	"RUNNING",
+	"FREEZING",
+	"FROZEN",
+};
+
+/*
+ * State diagram
+ * Transitions are caused by userspace writes to the freezer.state file.
+ * The values in parenthesis are state labels. The rest are edge labels.
+ *
+ * (RUNNING) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
+ *    ^ ^                     |                       |
+ *    | \_______RUNNING_______/                       |
+ *    \_____________________________RUNNING___________/
+ */
+
+struct cgroup_subsys freezer_subsys;
+
+/* Locks taken and their ordering
+ * ------------------------------
+ * css_set_lock
+ * cgroup_mutex (AKA cgroup_lock)
+ * task->alloc_lock (AKA task_lock)
+ * freezer->lock
+ * task->sighand->siglock
+ *
+ * cgroup code forces css_set_lock to be taken before task->alloc_lock
+ *
+ * freezer_create(), freezer_destroy():
+ * cgroup_mutex [ by cgroup core ]
+ *
+ * can_attach():
+ * cgroup_mutex
+ *
+ * cgroup_frozen():
+ * task->alloc_lock (to get task's cgroup)
+ *
+ * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
+ * task->alloc_lock (to get task's cgroup)
+ * freezer->lock
+ *  sighand->siglock (if the cgroup is freezing)
+ *
+ * freezer_read():
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *
+ * freezer_write() (freeze):
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *    sighand->siglock
+ *
+ * freezer_write() (unfreeze):
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *    task->alloc_lock (to prevent races with freeze_task())
+ *     sighand->siglock
+ */
+static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
+						  struct cgroup *cgroup)
+{
+	struct freezer *freezer;
+
+	freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+	if (!freezer)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&freezer->lock);
+	freezer->state = STATE_RUNNING;
+	return &freezer->css;
+}
+
+static void freezer_destroy(struct cgroup_subsys *ss,
+			    struct cgroup *cgroup)
+{
+	kfree(cgroup_freezer(cgroup));
+}
+
+
+static int freezer_can_attach(struct cgroup_subsys *ss,
+			      struct cgroup *new_cgroup,
+			      struct task_struct *task)
+{
+	struct freezer *freezer;
+	int retval = 0;
+
+	/*
+	 * The call to cgroup_lock() in the freezer.state write method prevents
+	 * a write to that file racing against an attach, and hence the
+	 * can_attach() result will remain valid until the attach completes.
+	 */
+	freezer = cgroup_freezer(new_cgroup);
+	if (freezer->state == STATE_FROZEN)
+		retval = -EBUSY;
+	return retval;
+}
+
+static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct freezer *freezer;
+
+	task_lock(task);
+	freezer = task_freezer(task);
+	task_unlock(task);
+
+	BUG_ON(freezer->state == STATE_FROZEN);
+	spin_lock_irq(&freezer->lock);
+	/* Locking avoids race with FREEZING -> RUNNING transitions. */
+	if (freezer->state == STATE_FREEZING)
+		freeze_task(task, true);
+	spin_unlock_irq(&freezer->lock);
+}
+
+/*
+ * caller must hold freezer->lock
+ */
+static void check_if_frozen(struct cgroup *cgroup,
+			     struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+	unsigned int nfrozen = 0, ntotal = 0;
+
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		ntotal++;
+		/*
+		 * Task is frozen or will freeze immediately when next it gets
+		 * woken
+		 */
+		if (frozen(task) ||
+		    (task_is_stopped_or_traced(task) && freezing(task)))
+			nfrozen++;
+	}
+
+	/*
+	 * Transition to FROZEN when no new tasks can be added ensures
+	 * that we never exist in the FROZEN state while there are unfrozen
+	 * tasks.
+	 */
+	if (nfrozen == ntotal)
+		freezer->state = STATE_FROZEN;
+	cgroup_iter_end(cgroup, &it);
+}
+
+static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
+			struct seq_file *m)
+{
+	struct freezer *freezer;
+	enum freezer_state state;
+
+	if (!cgroup_lock_live_group(cgroup))
+		return -ENODEV;
+
+	freezer = cgroup_freezer(cgroup);
+	spin_lock_irq(&freezer->lock);
+	state = freezer->state;
+	if (state == STATE_FREEZING) {
+		/* We change from FREEZING to FROZEN lazily if the cgroup was
+		 * only partially frozen when we exitted write. */
+		check_if_frozen(cgroup, freezer);
+		state = freezer->state;
+	}
+	spin_unlock_irq(&freezer->lock);
+	cgroup_unlock();
+
+	seq_puts(m, freezer_state_strs[state]);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+	unsigned int num_cant_freeze_now = 0;
+
+	freezer->state = STATE_FREEZING;
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		if (!freeze_task(task, true))
+			continue;
+		if (task_is_stopped_or_traced(task) && freezing(task))
+			/*
+			 * The freeze flag is set so these tasks will
+			 * immediately go into the fridge upon waking.
+			 */
+			continue;
+		if (!freezing(task) && !freezer_should_skip(task))
+			num_cant_freeze_now++;
+	}
+	cgroup_iter_end(cgroup, &it);
+
+	return num_cant_freeze_now ? -EBUSY : 0;
+}
+
+static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		int do_wake;
+
+		task_lock(task);
+		do_wake = __thaw_process(task);
+		task_unlock(task);
+		if (do_wake)
+			wake_up_process(task);
+	}
+	cgroup_iter_end(cgroup, &it);
+	freezer->state = STATE_RUNNING;
+
+	return 0;
+}
+
+static int freezer_change_state(struct cgroup *cgroup,
+				enum freezer_state goal_state)
+{
+	struct freezer *freezer;
+	int retval = 0;
+
+	freezer = cgroup_freezer(cgroup);
+	spin_lock_irq(&freezer->lock);
+	check_if_frozen(cgroup, freezer); /* may update freezer->state */
+	if (goal_state == freezer->state)
+		goto out;
+	switch (freezer->state) {
+	case STATE_RUNNING:
+		retval = try_to_freeze_cgroup(cgroup, freezer);
+		break;
+	case STATE_FREEZING:
+		if (goal_state == STATE_FROZEN) {
+			/* Userspace is retrying after
+			 * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
+			retval = try_to_freeze_cgroup(cgroup, freezer);
+			break;
+		}
+		/* state == FREEZING and goal_state == RUNNING, so unfreeze */
+	case STATE_FROZEN:
+		retval = unfreeze_cgroup(cgroup, freezer);
+		break;
+	default:
+		break;
+	}
+out:
+	spin_unlock_irq(&freezer->lock);
+
+	return retval;
+}
+
+static int freezer_write(struct cgroup *cgroup,
+			 struct cftype *cft,
+			 const char *buffer)
+{
+	int retval;
+	enum freezer_state goal_state;
+
+	if (strcmp(buffer, freezer_state_strs[STATE_RUNNING]) == 0)
+		goal_state = STATE_RUNNING;
+	else if (strcmp(buffer, freezer_state_strs[STATE_FROZEN]) == 0)
+		goal_state = STATE_FROZEN;
+	else
+		return -EIO;
+
+	if (!cgroup_lock_live_group(cgroup))
+		return -ENODEV;
+	retval = freezer_change_state(cgroup, goal_state);
+	cgroup_unlock();
+	return retval;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "state",
+		.read_seq_string = freezer_read,
+		.write_string = freezer_write,
+	},
+};
+
+static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
+}
+
+struct cgroup_subsys freezer_subsys = {
+	.name		= "freezer",
+	.create		= freezer_create,
+	.destroy	= freezer_destroy,
+	.populate	= freezer_populate,
+	.subsys_id	= freezer_subsys_id,
+	.can_attach	= freezer_can_attach,
+	.attach		= NULL,
+	.fork		= freezer_fork,
+	.exit		= NULL,
+};
diff --git a/kernel/freezer.c b/kernel/freezer.c
index cb0931f89306..ba6248b323ef 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -120,3 +120,35 @@ void cancel_freezing(struct task_struct *p)
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 }
+
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails.  Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
+int __thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		return 1;
+	}
+	clear_freeze_flag(p);
+	return 0;
+}
+
+int thaw_process(struct task_struct *p)
+{
+	task_lock(p);
+	if (__thaw_process(p) == 1) {
+		task_unlock(p);
+		wake_up_process(p);
+		return 1;
+	}
+	task_unlock(p);
+	return 0;
+}
+EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ebdd7f55273d..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -85,9 +85,6 @@ config PM_SLEEP
 	depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
 	default y
 
-config FREEZER
-	def_bool PM_SLEEP
-
 config SUSPEND
 	bool "Suspend to RAM and standby"
 	depends on PM && ARCH_SUSPEND_POSSIBLE
-- 
cgit v1.2.3


From 3e680aae4e53ab54cdbb0c29257dae0cbb158e1c Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Sat, 18 Oct 2008 20:27:51 -0700
Subject: fb: convert lock/unlock_kernel() into local fb mutex

Change lock_kernel()/unlock_kernel() to local fb mutex.  Each frame buffer
instance has its own mutex.

The one line try_to_load() function is unrolled to request_module() in two
places for readability.

[righi.andrea@gmail.com: fb: fix NULL pointer BUG dereference in fb_open()]
Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/fbmem.c | 39 ++++++++++++++++++++-------------------
 include/linux/fb.h    |  1 +
 2 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 2a0f013dc37b..cd5f20da738a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1018,12 +1018,12 @@ fb_ioctl(struct file *file, unsigned int cmd,
 	void __user *argp = (void __user *)arg;
 	long ret = 0;
 
-	lock_kernel();
 	info = registered_fb[fbidx];
+	mutex_lock(&info->lock);
 	fb = info->fbops;
 
 	if (!fb) {
-		unlock_kernel();
+		mutex_unlock(&info->lock);
 		return -ENODEV;
 	}
 	switch (cmd) {
@@ -1126,7 +1126,7 @@ fb_ioctl(struct file *file, unsigned int cmd,
 		else
 			ret = fb->fb_ioctl(info, cmd, arg);
 	}
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return ret;
 }
 
@@ -1253,7 +1253,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct fb_ops *fb = info->fbops;
 	long ret = -ENOIOCTLCMD;
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 	switch(cmd) {
 	case FBIOGET_VSCREENINFO:
 	case FBIOPUT_VSCREENINFO:
@@ -1279,7 +1279,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			ret = fb->fb_compat_ioctl(info, cmd, arg);
 		break;
 	}
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return ret;
 }
 #endif
@@ -1301,13 +1301,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 		return -ENODEV;
 	if (fb->fb_mmap) {
 		int res;
-		lock_kernel();
+		mutex_lock(&info->lock);
 		res = fb->fb_mmap(info, vma);
-		unlock_kernel();
+		mutex_unlock(&info->lock);
 		return res;
 	}
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 
 	/* frame buffer memory */
 	start = info->fix.smem_start;
@@ -1316,13 +1316,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 		/* memory mapped io */
 		off -= len;
 		if (info->var.accel_flags) {
-			unlock_kernel();
+			mutex_unlock(&info->lock);
 			return -EINVAL;
 		}
 		start = info->fix.mmio_start;
 		len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.mmio_len);
 	}
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	start &= PAGE_MASK;
 	if ((vma->vm_end - vma->vm_start + off) > len)
 		return -EINVAL;
@@ -1346,13 +1346,13 @@ fb_open(struct inode *inode, struct file *file)
 
 	if (fbidx >= FB_MAX)
 		return -ENODEV;
-	lock_kernel();
-	if (!(info = registered_fb[fbidx]))
+	info = registered_fb[fbidx];
+	if (!info)
 		request_module("fb%d", fbidx);
-	if (!(info = registered_fb[fbidx])) {
-		res = -ENODEV;
-		goto out;
-	}
+	info = registered_fb[fbidx];
+	if (!info)
+		return -ENODEV;
+	mutex_lock(&info->lock);
 	if (!try_module_get(info->fbops->owner)) {
 		res = -ENODEV;
 		goto out;
@@ -1368,7 +1368,7 @@ fb_open(struct inode *inode, struct file *file)
 		fb_deferred_io_open(info, inode, file);
 #endif
 out:
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return res;
 }
 
@@ -1377,11 +1377,11 @@ fb_release(struct inode *inode, struct file *file)
 {
 	struct fb_info * const info = file->private_data;
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 	if (info->fbops->fb_release)
 		info->fbops->fb_release(info,1);
 	module_put(info->fbops->owner);
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return 0;
 }
 
@@ -1460,6 +1460,7 @@ register_framebuffer(struct fb_info *fb_info)
 		if (!registered_fb[i])
 			break;
 	fb_info->node = i;
+	mutex_init(&fb_info->lock);
 
 	fb_info->dev = device_create(fb_class, fb_info->device,
 				     MKDEV(FB_MAJOR, i), NULL, "fb%d", i);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 531ccd5f5960..75a81eaf3430 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -808,6 +808,7 @@ struct fb_tile_ops {
 struct fb_info {
 	int node;
 	int flags;
+	struct mutex lock;		/* Lock for open/release/ioctl funcs */
 	struct fb_var_screeninfo var;	/* Current var */
 	struct fb_fix_screeninfo fix;	/* Current fix */
 	struct fb_monspecs monspecs;	/* Current Monitor specs */
-- 
cgit v1.2.3


From 0e4fb5e283870757024294bc4567a7c59d936f0b Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:57 -0700
Subject: ext3: add an option to control error handling on file data

If the journal doesn't abort when it gets an IO error in file data blocks,
the file data corruption will spread silently.  Because most
applications and commands do buffered writes without fsync(), they don't
notice the IO error.  That is scary for mission-critical systems.  On the
other hand, if the journal aborts whenever it gets an IO error in file
data blocks, the system will easily become inoperable.  So this patch
introduces a filesystem option to determine whether it aborts the journal
or just calls printk() when it gets an IO error in file data.

If you mount an ext3 fs with the data_err=abort option, the journal is
aborted on a file data write error.  If you mount it with data_err=ignore,
it doesn't abort; it just calls printk().  data_err=ignore is the default.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/ext3.txt |  5 +++++
 fs/ext3/super.c                    | 16 ++++++++++++++++
 fs/jbd/commit.c                    |  2 ++
 include/linux/ext3_fs.h            |  2 ++
 include/linux/jbd.h                |  3 +++
 5 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 295f26cd895a..9dd2a3bb2acc 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -96,6 +96,11 @@ errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=continue		Keep going on a filesystem error.
 errors=panic		Panic and halt the machine if an error occurs.
 
+data_err=ignore(*)	Just print an error message if an error occurs
+			in a file data buffer in ordered mode.
+data_err=abort		Abort the journal if an error occurs in a file
+			data buffer in ordered mode.
+
 grpid			Give objects the same group ID as their creator.
 bsdgroups
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 399a96a6c556..3a260af5544d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -754,6 +757,7 @@ enum {
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static const match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JFS_BARRIER;
 	else
 		journal->j_flags &= ~JFS_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index d6a6659f3e46..25719d902c51 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal)
 		printk(KERN_WARNING
 			"JBD: Detected IO errors while flushing file data "
 			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
 		err = 0;
 	}
 
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 159d9b476cd7..d14f02918483 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -380,6 +380,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
+						  * error in ordered mode */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 7ebbcb1c9ba4..35d4f6342fac 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -816,6 +816,9 @@ struct journal_s
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
 #define JFS_BARRIER	0x020	/* Use IDE barriers */
+#define JFS_ABORT_ON_SYNCDATA_ERR	0x040  /* Abort the journal on file
+						* data write error in ordered
+						* mode */
 
 /*
  * Function declarations for the journaling transaction and buffer
-- 
cgit v1.2.3


From 146aa1bd0511f88ddb4e92fafa2b8aad4f2f65f3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:03 -0700
Subject: cgroups: fix probable race with put_css_set[_taskexit] and
 find_css_set

put_css_set_taskexit may be called while find_css_set is running on another
CPU, in which case the following race can occur:

put_css_set_taskexit side                    find_css_set side

                                        |
atomic_dec_and_test(&kref->refcount)    |
    /* kref->refcount = 0 */            |
....................................................................
                                        |  read_lock(&css_set_lock)
                                        |  find_existing_css_set
                                        |  get_css_set
                                        |  read_unlock(&css_set_lock);
....................................................................
__release_css_set                       |
....................................................................
                                        | /* use a released css_set */
                                        |

[put_css_set has the same problem.  But in the current code, all put_css_set
calls are made inside the cgroup mutex critical region, the same as find_css_set.]

[akpm@linux-foundation.org: repair comments]
[menage@google.com: eliminate race in css_set refcounting]
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  3 +--
 kernel/cgroup.c        | 43 ++++++++++++++++++++-----------------------
 kernel/cgroup_debug.c  |  4 ++--
 3 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 30934e4bfaab..7166023e07d2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -9,7 +9,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/kref.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
@@ -149,7 +148,7 @@ struct cgroup {
 struct css_set {
 
 	/* Reference count */
-	struct kref ref;
+	atomic_t refcount;
 
 	/*
 	 * List running through all cgroup groups in the same hash
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8c6e1c17e6d3..1e49218457e0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
 
-	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
 	css_set_count--;
 
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
-
-	write_unlock(&css_set_lock);
 }
 
-static void __release_css_set(struct kref *k, int taskexit)
+static void __put_css_set(struct css_set *cg, int taskexit)
 {
 	int i;
-	struct css_set *cg = container_of(k, struct css_set, ref);
-
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cg->refcount, -1, 1))
+		return;
+	write_lock(&css_set_lock);
+	if (!atomic_dec_and_test(&cg->refcount)) {
+		write_unlock(&css_set_lock);
+		return;
+	}
 	unlink_css_set(cg);
+	write_unlock(&css_set_lock);
 
 	rcu_read_lock();
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
 	kfree(cg);
 }
 
-static void release_css_set(struct kref *k)
-{
-	__release_css_set(k, 0);
-}
-
-static void release_css_set_taskexit(struct kref *k)
-{
-	__release_css_set(k, 1);
-}
-
 /*
  * refcounted get/put for css_set objects
  */
 static inline void get_css_set(struct css_set *cg)
 {
-	kref_get(&cg->ref);
+	atomic_inc(&cg->refcount);
 }
 
 static inline void put_css_set(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set);
+	__put_css_set(cg, 0);
 }
 
 static inline void put_css_set_taskexit(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set_taskexit);
+	__put_css_set(cg, 1);
 }
 
 /*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
 		return NULL;
 	}
 
-	kref_init(&res->ref);
+	atomic_set(&res->refcount, 1);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
 	INIT_HLIST_NODE(&res->hlist);
@@ -1728,7 +1726,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
 
 	read_lock(&css_set_lock);
 	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-		count += atomic_read(&link->cg->ref.refcount);
+		count += atomic_read(&link->cg->refcount);
 	}
 	read_unlock(&css_set_lock);
 	return count;
@@ -2495,8 +2493,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 int __init cgroup_init_early(void)
 {
 	int i;
-	kref_init(&init_css_set.ref);
-	kref_get(&init_css_set.ref);
+	atomic_set(&init_css_set.refcount, 1);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 	u64 count;
 
 	rcu_read_lock();
-	count = atomic_read(&current->cgroups->ref.refcount);
+	count = atomic_read(&current->cgroups->refcount);
 	rcu_read_unlock();
 	return count;
 }
@@ -90,7 +90,7 @@ static struct cftype files[] =  {
 	{
 		.name = "releasable",
 		.read_u64 = releasable_read,
-	}
+	},
 };
 
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-- 
cgit v1.2.3


From cc31edceee04a7b87f2be48f9489ebb72d264844 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Sat, 18 Oct 2008 20:28:04 -0700
Subject: cgroups: convert tasks file to use a seq_file with shared pid array

Rather than pre-generating the entire text for the "tasks" file each
time the file is opened, we instead just generate/update the array of
process ids and use a seq_file to report these to userspace.  All open
file handles on the same "tasks" file can share a pid array, which may
be updated any time that no thread is actively reading the array.  By
sharing the array, the potential for userspace to DoS the system by
opening many handles on the same "tasks" file is removed.

[Based on a patch by Lai Jiangshan, extended to use seq_file]

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  10 +++
 kernel/cgroup.c        | 222 +++++++++++++++++++++++++++++++------------------
 2 files changed, 149 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7166023e07d2..8ab91880a0ad 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
+#include <linux/rwsem.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -136,6 +137,15 @@ struct cgroup {
 	 * release_list_lock
 	 */
 	struct list_head release_list;
+
+	/* pids_mutex protects the fields below */
+	struct rw_semaphore pids_mutex;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 };
 
 /* A css_set is a structure holding pointers to a set of
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1e49218457e0..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -868,6 +868,14 @@ static struct super_operations cgroup_ops = {
 	.remount_fs = cgroup_remount,
 };
 
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+	INIT_LIST_HEAD(&cgrp->sibling);
+	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->release_list);
+	init_rwsem(&cgrp->pids_mutex);
+}
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
@@ -876,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1995,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here).  The struct
- * ctr_struct * is stored in file->private_data.  Its resources will
- * be freed by release() when the file is closed.  The array is used
- * to sprintf the PIDs and then used by read().
  */
-struct ctr_struct {
-	char *buf;
-	int bufsz;
-};
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2086,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
  */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 {
-	int cnt = 0;
-	int i;
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct cgroup *cgrp = s->private;
+	int index = 0, pid = *pos;
+	int *iter;
 
-	for (i = 0; i < npids; i++)
-		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-	return cnt;
+	down_read(&cgrp->pids_mutex);
+	if (pid) {
+		int end = cgrp->pids_length;
+		int i;
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (cgrp->tasks_pids[mid] == pid) {
+				index = mid;
+				break;
+			} else if (cgrp->tasks_pids[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= cgrp->pids_length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = cgrp->tasks_pids + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+	struct cgroup *cgrp = s->private;
+	up_read(&cgrp->pids_mutex);
 }
 
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct cgroup *cgrp = s->private;
+	int *p = v;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+	return seq_printf(s, "%d\n", *(int *)v);
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+	.start = cgroup_tasks_start,
+	.stop = cgroup_tasks_stop,
+	.next = cgroup_tasks_next,
+	.show = cgroup_tasks_show,
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+	down_write(&cgrp->pids_mutex);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
+	}
+	up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	release_cgroup_pid_array(cgrp);
+	return seq_release(inode, file);
+}
+
+static struct file_operations cgroup_tasks_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.write = cgroup_file_write,
+	.release = cgroup_tasks_release,
+};
+
 /*
- * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * Handle an open on 'tasks' file.  Prepare an array containing the
  * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
  */
+
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
-	char c;
+	int retval;
 
+	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-	if (!ctr)
-		goto err0;
-
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
@@ -2129,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	if (npids) {
-		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-		if (!pidarray)
-			goto err1;
-
-		npids = pid_array_load(pidarray, npids, cgrp);
-		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-		/* Call pid_array_to_buf() twice, first just to get bufsz */
-		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-		if (!ctr->buf)
-			goto err2;
-		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-		kfree(pidarray);
-	} else {
-		ctr->buf = NULL;
-		ctr->bufsz = 0;
-	}
-	file->private_data = ctr;
-	return 0;
-
-err2:
-	kfree(pidarray);
-err1:
-	kfree(ctr);
-err0:
-	return -ENOMEM;
-}
-
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-				    struct cftype *cft,
-				    struct file *file, char __user *buf,
-				    size_t nbytes, loff_t *ppos)
-{
-	struct ctr_struct *ctr = file->private_data;
+	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	if (!pidarray)
+		return -ENOMEM;
+	npids = pid_array_load(pidarray, npids, cgrp);
+	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+	/*
+	 * Store the array in the cgroup, freeing the old
+	 * array if necessary
+	 */
+	down_write(&cgrp->pids_mutex);
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
+	up_write(&cgrp->pids_mutex);
 
-static int cgroup_tasks_release(struct inode *unused_inode,
-					struct file *file)
-{
-	struct ctr_struct *ctr;
+	file->f_op = &cgroup_tasks_operations;
 
-	if (file->f_mode & FMODE_READ) {
-		ctr = file->private_data;
-		kfree(ctr->buf);
-		kfree(ctr);
+	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	if (retval) {
+		release_cgroup_pid_array(cgrp);
+		return retval;
 	}
+	((struct seq_file *)file->private_data)->private = cgrp;
 	return 0;
 }
 
@@ -2208,7 +2268,6 @@ static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
-		.read = cgroup_tasks_read,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
@@ -2298,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	mutex_lock(&cgroup_mutex);
 
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
-- 
cgit v1.2.3


From 886465f407e57d6c3c81013c919ea670ce1ae0d0 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Sat, 18 Oct 2008 20:28:05 -0700
Subject: cgroups: fix declaration of cgroup_mm_owner_callbacks

The choice of the real/dummy declaration for cgroup_mm_owner_callbacks()
shouldn't be based on CONFIG_MM_OWNER, but on CONFIG_CGROUPS.  Otherwise
kernel/exit.c fails to compile when something other than a cgroups
controller selects CONFIG_MM_OWNER.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8ab91880a0ad..8b00f6643e93 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -403,6 +403,9 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+void cgroup_mm_owner_callbacks(struct task_struct *old,
+			       struct task_struct *new);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
@@ -421,15 +424,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 	return -EINVAL;
 }
 
+static inline void cgroup_mm_owner_callbacks(struct task_struct *old,
+					     struct task_struct *new) {}
+
 #endif /* !CONFIG_CGROUPS */
 
-#ifdef CONFIG_MM_OWNER
-extern void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
-#else /* !CONFIG_MM_OWNER */
-static inline void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-}
-#endif /* CONFIG_MM_OWNER */
 #endif /* _LINUX_CGROUP_H */
-- 
cgit v1.2.3


From 52d4b9ac0b985168009c2a57098324e67bae171f Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:16 -0700
Subject: memcg: allocate all page_cgroup at boot

Allocate all page_cgroup structures at boot and remove the page_cgroup
pointer from struct page.  This patch adds the following interface:

 struct page_cgroup *lookup_page_cgroup(struct page*)

FLATMEM, DISCONTIGMEM, SPARSEMEM and MEMORY_HOTPLUG are all supported.

Removing the page_cgroup pointer reduces the amount of memory used by
 - 4 bytes per PAGE_SIZE.
 - 8 bytes per PAGE_SIZE
if the memory controller is disabled (even if configured).

On usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
On my x86-64 server with 48GB of memory, this saves 96MB of memory.
I think this reduction makes sense.

With pre-allocation, kmalloc/kfree in charge/uncharge are removed.
This means
  - we need not be afraid of kmalloc failure.
    (this can happen because of gfp_mask type.)
  - we can avoid calling kmalloc/kfree.
  - we can avoid allocating tons of small objects which can be fragmented.
  - we can know what amount of memory will be used for this extra-lru handling.

I added a printk message:

	"allocated %ld bytes of page_cgroup"
        "please try cgroup_disable=memory option if you don't want"

This should be informative enough for users.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h  |  13 +--
 include/linux/mm_types.h    |   3 -
 include/linux/mmzone.h      |  14 ++-
 include/linux/page_cgroup.h | 103 ++++++++++++++++++
 mm/Makefile                 |   3 +-
 mm/memcontrol.c             | 247 ++++++++++++++------------------------------
 mm/page_alloc.c             |  12 +--
 mm/page_cgroup.c            | 237 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 438 insertions(+), 194 deletions(-)
 create mode 100644 include/linux/page_cgroup.h
 create mode 100644 mm/page_cgroup.c

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ee1b2fcb4410..1fbe14d39521 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,9 +27,6 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
-
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -72,16 +69,8 @@ extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 					int priority, enum lru_list lru);
 
-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void page_reset_bad_cgroup(struct page *page)
-{
-}
-
-static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return NULL;
-}
 
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline int mem_cgroup_charge(struct page *page,
 					struct mm_struct *mm, gfp_t gfp_mask)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9d49fa36bbef..fe825471d5aa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,9 +94,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
 };
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index da2d053a95f1..35a7b5e19465 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -601,8 +601,11 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	struct page_cgroup *node_page_cgroup;
+#endif
 #endif
 	struct bootmem_data *bdata;
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -931,6 +934,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #endif
 
 struct page;
+struct page_cgroup;
 struct mem_section {
 	/*
 	 * This is, logically, a pointer to an array of struct
@@ -948,6 +952,14 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	/*
+	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
+	 * section. (see memcontrol.h/page_cgroup.h about this.)
+	 */
+	struct page_cgroup *page_cgroup;
+	unsigned long pad;
+#endif
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
new file mode 100644
index 000000000000..0fd39f2231ec
--- /dev/null
+++ b/include/linux/page_cgroup.h
@@ -0,0 +1,103 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#include <linux/bit_spinlock.h>
+/*
+ * Page Cgroup can be considered as an extended mem_map.
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ * All page cgroups are allocated at boot or memory hotplug event,
+ * then the page cgroup for pfn always exists.
+ */
+struct page_cgroup {
+	unsigned long flags;
+	struct mem_cgroup *mem_cgroup;
+	struct page *page;
+	struct list_head lru;		/* per cgroup LRU list */
+};
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
+void __init page_cgroup_init(void);
+struct page_cgroup *lookup_page_cgroup(struct page *page);
+
+enum {
+	/* flags for mem_cgroup */
+	PCG_LOCK,  /* page cgroup is locked */
+	PCG_CACHE, /* charged as cache */
+	PCG_USED, /* this object is in use. */
+	/* flags for LRU placement */
+	PCG_ACTIVE, /* page is active in this cgroup */
+	PCG_FILE, /* page is file system backed */
+	PCG_UNEVICTABLE, /* page is unevictableable */
+};
+
+#define TESTPCGFLAG(uname, lname)			\
+static inline int PageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_bit(PCG_##lname, &pc->flags); }
+
+#define SETPCGFLAG(uname, lname)			\
+static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
+	{ set_bit(PCG_##lname, &pc->flags);  }
+
+#define CLEARPCGFLAG(uname, lname)			\
+static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ clear_bit(PCG_##lname, &pc->flags);  }
+
+/* Cache flag is set only once (at allocation) */
+TESTPCGFLAG(Cache, CACHE)
+
+TESTPCGFLAG(Used, USED)
+CLEARPCGFLAG(Used, USED)
+
+/* LRU management flags (from global-lru definition) */
+TESTPCGFLAG(File, FILE)
+SETPCGFLAG(File, FILE)
+CLEARPCGFLAG(File, FILE)
+
+TESTPCGFLAG(Active, ACTIVE)
+SETPCGFLAG(Active, ACTIVE)
+CLEARPCGFLAG(Active, ACTIVE)
+
+TESTPCGFLAG(Unevictable, UNEVICTABLE)
+SETPCGFLAG(Unevictable, UNEVICTABLE)
+CLEARPCGFLAG(Unevictable, UNEVICTABLE)
+
+static inline int page_cgroup_nid(struct page_cgroup *pc)
+{
+	return page_to_nid(pc->page);
+}
+
+static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+{
+	return page_zonenum(pc->page);
+}
+
+static inline void lock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_lock(PCG_LOCK, &pc->flags);
+}
+
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
+{
+	return bit_spin_trylock(PCG_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_unlock(PCG_LOCK, &pc->flags);
+}
+
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+struct page_cgroup;
+
+static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+}
+
+static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	return NULL;
+}
+#endif
+#endif
diff --git a/mm/Makefile b/mm/Makefile
index da4ccf015aea..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
-
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 031682e7ef0c..d4a92b63e98e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,11 +33,11 @@
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 /*
@@ -135,79 +135,6 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	unsigned long flags;
-};
-
-enum {
-	/* flags for mem_cgroup */
-	PCG_CACHE, /* charged as cache */
-	/* flags for LRU placement */
-	PCG_ACTIVE, /* page is active in this cgroup */
-	PCG_FILE, /* page is file system backed */
-	PCG_UNEVICTABLE, /* page is unevictableable */
-};
-
-#define TESTPCGFLAG(uname, lname)			\
-static inline int PageCgroup##uname(struct page_cgroup *pc)	\
-	{ return test_bit(PCG_##lname, &pc->flags); }
-
-#define SETPCGFLAG(uname, lname)			\
-static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
-	{ set_bit(PCG_##lname, &pc->flags);  }
-
-#define CLEARPCGFLAG(uname, lname)			\
-static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
-	{ clear_bit(PCG_##lname, &pc->flags);  }
-
-
-/* Cache flag is set only once (at allocation) */
-TESTPCGFLAG(Cache, CACHE)
-
-/* LRU management flags (from global-lru definition) */
-TESTPCGFLAG(File, FILE)
-SETPCGFLAG(File, FILE)
-CLEARPCGFLAG(File, FILE)
-
-TESTPCGFLAG(Active, ACTIVE)
-SETPCGFLAG(Active, ACTIVE)
-CLEARPCGFLAG(Active, ACTIVE)
-
-TESTPCGFLAG(Unevictable, UNEVICTABLE)
-SETPCGFLAG(Unevictable, UNEVICTABLE)
-CLEARPCGFLAG(Unevictable, UNEVICTABLE)
-
-static int page_cgroup_nid(struct page_cgroup *pc)
-{
-	return page_to_nid(pc->page);
-}
-
-static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
-	return page_zonenum(pc->page);
-}
-
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -216,12 +143,18 @@ enum charge_type {
 	NR_CHARGE_TYPE,
 };
 
+/* Flag masks, used only in this file (for easy reading). */
+#define PCGF_CACHE	(1UL << PCG_CACHE)
+#define PCGF_USED	(1UL << PCG_USED)
+#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
+#define PCGF_LOCK	(1UL << PCG_LOCK)
+#define PCGF_FILE	(1UL << PCG_FILE)
 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-	((1 << PCG_CACHE) | (1 << PCG_FILE)),
-	((1 << PCG_ACTIVE)),
-	((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
-	0,
+	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	0, /* FORCE */
 };
 
 /*
@@ -303,37 +236,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
-{
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
@@ -436,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = lookup_page_cgroup(page);
+	if (!trylock_page_cgroup(pc))
 		return;
-
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	if (pc && PageCgroupUsed(pc)) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
 		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -533,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 		if (scan >= nr_to_scan)
 			break;
+		if (unlikely(!PageCgroupUsed(pc)))
+			continue;
 		page = pc->page;
 
 		if (unlikely(!PageLRU(page)))
@@ -576,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
-	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
-	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
-	if (unlikely(pc == NULL))
-		goto err;
-
+	pc = lookup_page_cgroup(page);
+	/* can happen at boot */
+	if (unlikely(!pc))
+		return 0;
+	prefetchw(pc);
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
+
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 		if (unlikely(!mem)) {
 			rcu_read_unlock();
-			kmem_cache_free(page_cgroup_cache, pc);
 			return 0;
 		}
 		/*
@@ -631,36 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		}
 	}
 
+
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+
+		goto done;
+	}
 	pc->mem_cgroup = mem;
-	pc->page = page;
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
 	 */
 	pc->flags = pcg_default_flags[ctype];
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
-		kmem_cache_free(page_cgroup_cache, pc);
-		goto done;
-	}
-	page_assign_page_cgroup(page, pc);
-
 	mz = page_cgroup_zoneinfo(pc);
+
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	unlock_page_cgroup(page);
 done:
 	return 0;
 out:
 	css_put(&mem->css);
-	kmem_cache_free(page_cgroup_cache, pc);
-err:
 	return -ENOMEM;
 }
 
@@ -668,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -689,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * Corner case handling. This is called from add_to_page_cache()
 	 * in usual. But some FS (shmem) precharges this page before calling it
@@ -702,15 +605,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		if (pc) {
-			VM_BUG_ON(pc->page != page);
-			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+
+		pc = lookup_page_cgroup(page);
+		if (!pc)
+			return 0;
+		lock_page_cgroup(pc);
+		if (PageCgroupUsed(pc)) {
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
@@ -741,37 +645,39 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (unlikely(!pc))
-		goto unlock;
-
-	VM_BUG_ON(pc->page != page);
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc || !PageCgroupUsed(pc)))
+		return;
 
-	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
-	    && ((PageCgroupCache(pc) || page_mapped(page))))
-		goto unlock;
+	lock_page_cgroup(pc);
+	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
+	     || !PageCgroupUsed(pc)) {
+		/* Happens on a race in zap_pte_range() and do_swap_page(). */
+		unlock_page_cgroup(pc);
+		return;
+	}
+	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	kmem_cache_free(page_cgroup_cache, pc);
 	return;
-unlock:
-	unlock_page_cgroup(page);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
 {
+	/* early check. */
+	if (page_mapped(page))
+		return;
+	if (page->mapping && !PageAnon(page))
+		return;
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
@@ -795,9 +701,9 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
 		if (PageCgroupCache(pc)) {
@@ -807,7 +713,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 		}
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -832,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
 	 */
 	if (!newpage->mapping)
 		__mem_cgroup_uncharge_common(newpage,
-					 MEM_CGROUP_CHARGE_TYPE_FORCE);
+				MEM_CGROUP_CHARGE_TYPE_FORCE);
 	else if (PageAnon(newpage))
 		mem_cgroup_uncharge_page(newpage);
 }
@@ -918,6 +824,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	while (!list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
+		if (!PageCgroupUsed(pc))
+			break;
 		get_page(page);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 		/*
@@ -932,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 				count = FORCE_UNCHARGE_BATCH;
 				cond_resched();
 			}
-		} else
-			cond_resched();
+		} else {
+			spin_lock_irqsave(&mz->lru_lock, flags);
+			break;
+		}
 		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -957,6 +867,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 	while (mem->res.usage > 0) {
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+		/* Make sure all *used* pages are on the LRU. */
+		lru_add_drain_all();
 		for_each_node_state(node, N_POSSIBLE)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
@@ -965,6 +877,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 				for_each_lru(l)
 					mem_cgroup_force_empty_list(mem, mz, l);
 			}
+		cond_resched();
 	}
 	ret = 0;
 out:
@@ -1175,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
+		page_cgroup_init();
 		mem = &init_mem_cgroup;
-		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f2fc44ec1d44..d0a240fbb8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 
 #include <asm/tlbflush.h>
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
 
 static void bad_page(struct page *page)
 {
-	void *pc = page_get_page_cgroup(page);
-
 	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
 		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
 		current->comm, page, (int)(2*sizeof(unsigned long)),
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
-	if (pc) {
-		printk(KERN_EMERG "cgroup:%p\n", pc);
-		page_reset_bad_cgroup(page);
-	}
+
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 		KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -457,7 +452,6 @@ static inline int free_pages_check(struct page *page)
 	free_page_mlock(page);
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
 		bad_page(page);
@@ -603,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
 		bad_page(page);
@@ -3438,6 +3431,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
+	pgdat_page_cgroup_init(pgdat);
 	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..5d86550701f2
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,237 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/bit_spinlock.h>
+#include <linux/page_cgroup.h>
+#include <linux/hash.h>
+#include <linux/memory.h>
+
+static void __meminit
+__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+{
+	pc->flags = 0;
+	pc->mem_cgroup = NULL;
+	pc->page = pfn_to_page(pfn);
+}
+static unsigned long total_usage;
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+	pgdat->node_page_cgroup = NULL;
+}
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long offset;
+	struct page_cgroup *base;
+
+	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+	if (unlikely(!base))
+		return NULL;
+
+	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
+	return base + offset;
+}
+
+static int __init alloc_node_page_cgroup(int nid)
+{
+	struct page_cgroup *base, *pc;
+	unsigned long table_size;
+	unsigned long start_pfn, nr_pages, index;
+
+	start_pfn = NODE_DATA(nid)->node_start_pfn;
+	nr_pages = NODE_DATA(nid)->node_spanned_pages;
+
+	table_size = sizeof(struct page_cgroup) * nr_pages;
+
+	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	if (!base)
+		return -ENOMEM;
+	for (index = 0; index < nr_pages; index++) {
+		pc = base + index;
+		__init_page_cgroup(pc, start_pfn + index);
+	}
+	NODE_DATA(nid)->node_page_cgroup = base;
+	total_usage += table_size;
+	return 0;
+}
+
+void __init page_cgroup_init(void)
+{
+
+	int nid, fail;
+
+	for_each_online_node(nid)  {
+		fail = alloc_node_page_cgroup(nid);
+		if (fail)
+			goto fail;
+	}
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you"
+	" don't want\n");
+	return;
+fail:
+	printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
+	printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
+	panic("Out of memory");
+}
+
+#else /* CONFIG_SPARSEMEM */
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	struct mem_section *section = __pfn_to_section(pfn);
+
+	return section->page_cgroup + pfn;
+}
+
+int __meminit init_section_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *section;
+	struct page_cgroup *base, *pc;
+	unsigned long table_size;
+	int nid, index;
+
+	section = __pfn_to_section(pfn);
+
+	if (section->page_cgroup)
+		return 0;
+
+	nid = page_to_nid(pfn_to_page(pfn));
+
+	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+	base = kmalloc_node(table_size, GFP_KERNEL, nid);
+	if (!base)
+		base = vmalloc_node(table_size, nid);
+
+	if (!base) {
+		printk(KERN_ERR "page cgroup allocation failure\n");
+		return -ENOMEM;
+	}
+
+	for (index = 0; index < PAGES_PER_SECTION; index++) {
+		pc = base + index;
+		__init_page_cgroup(pc, pfn + index);
+	}
+
+	section = __pfn_to_section(pfn);
+	section->page_cgroup = base - pfn;
+	total_usage += table_size;
+	return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __free_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *ms;
+	struct page_cgroup *base;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_cgroup)
+		return;
+	base = ms->page_cgroup + pfn;
+	ms->page_cgroup = NULL;
+	if (is_vmalloc_addr(base))
+		vfree(base);
+	else
+		kfree(base);
+}
+
+/* Set up page_cgroup tables for each present section; 0 or -ENOMEM. */
+int online_page_cgroup(unsigned long start_pfn,
+			unsigned long nr_pages, int nid)
+{
+	unsigned long start, end, pfn;
+	int fail = 0;
+
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);	/* round down to section */
+	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		fail = init_section_page_cgroup(pfn);
+	}
+	if (!fail)
+		return 0;
+
+	/* rollback: free the table of every section in the range */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_cgroup(pfn);
+
+	return -ENOMEM;
+}
+
+/* Free the page_cgroup tables of all sections covering the pfn range. */
+int offline_page_cgroup(unsigned long start_pfn,
+		unsigned long nr_pages, int nid)
+{
+	unsigned long start, end, pfn;
+
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);	/* round down to section */
+	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_cgroup(pfn);
+	return 0;
+}
+
+static int page_cgroup_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+	int ret = 0;
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		ret = online_page_cgroup(mn->start_pfn,
+				   mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_CANCEL_ONLINE:
+	case MEM_OFFLINE:
+		offline_page_cgroup(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_GOING_OFFLINE:
+		break;
+	case MEM_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+	ret = notifier_from_errno(ret);
+	return ret;
+}
+
+#endif
+
+void __init page_cgroup_init(void)
+{
+	unsigned long pfn;
+	int fail = 0;
+
+	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		fail = init_section_page_cgroup(pfn);
+	}
+	if (fail) {
+		printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
+		panic("Out of memory");
+	} else {
+		hotplug_memory_notifier(page_cgroup_callback, 0);
+	}
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
+	" want\n");
+}
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+	return;
+}
+
+#endif
-- 
cgit v1.2.3


From 3eda20118000941e7e8994fc5fac8706d8c10f00 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:19 -0700
Subject: seq_file: add seq_cpumask_list(), seq_nodemask_list()

seq_cpumask_list() and seq_nodemask_list() are very similar to
seq_cpumask() and seq_nodemask(), but they print a human-readable string.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 16 ++++++++++++++++
 include/linux/seq_file.h | 13 +++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 11c85fec6b4f..eba2eabcd2b8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -465,6 +465,22 @@ int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
 }
 EXPORT_SYMBOL(seq_bitmap);
 
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits)
+{
+	if (m->count < m->size) {
+		int len = bitmap_scnlistprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_bitmap_list);
+
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index a1783b229ef4..dc50bcc282a8 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -60,6 +60,19 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
 	return seq_bitmap(m, mask->bits, MAX_NUMNODES);
 }
 
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits);
+
+static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, NR_CPUS);
+}
+
+static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, MAX_NUMNODES);
+}
+
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
-- 
cgit v1.2.3


From c4596435404976b0ded9cdf18b456ca2e1408ddd Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:21 -0700
Subject: bitmask: remove bitmap_scnprintf_len()

bitmap_scnprintf_len() is not used now, so we remove it.

Otherwise we have to maintain it and make its return
value always equal to bitmap_scnprintf()'s return value.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h |  1 -
 lib/bitmap.c           | 11 -----------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 89781fd48859..1abfe664c444 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -110,7 +110,6 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);
 
 extern int bitmap_scnprintf(char *buf, unsigned int len,
 			const unsigned long *src, int nbits);
-extern int bitmap_scnprintf_len(unsigned int nr_bits);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
 			unsigned long *dst, int nbits);
 extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 06fb57c86de0..482df94ea21e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -315,17 +315,6 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
 }
 EXPORT_SYMBOL(bitmap_scnprintf);
 
-/**
- * bitmap_scnprintf_len - return buffer length needed to convert
- * bitmap to an ASCII hex string
- * @nr_bits: number of bits to be converted
- */
-int bitmap_scnprintf_len(unsigned int nr_bits)
-{
-	unsigned int nr_nibbles = ALIGN(nr_bits, 4) / 4;
-	return nr_nibbles + ALIGN(nr_nibbles, CHUNKSZ / 4) / (CHUNKSZ / 4) - 1;
-}
-
 /**
  * __bitmap_parse - convert an ASCII hex string into a bitmap.
  * @buf: pointer to buffer containing string.
-- 
cgit v1.2.3


From b747c8c102cc0677a7a8056a093f58d7c9b500e7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:21 -0700
Subject: make ptrace_untrace() static

ptrace_untrace() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 1 -
 kernel/ptrace.c        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ea7416c901d1..22641d5d45df 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,7 +94,6 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
-extern void ptrace_untrace(struct task_struct *child);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 356699a96d56..1e68e4c39e2c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
  * TASK_TRACED, resume it now.
  * Requires that irqs be disabled.
  */
-void ptrace_untrace(struct task_struct *child)
+static void ptrace_untrace(struct task_struct *child)
 {
 	spin_lock(&child->sighand->siglock);
 	if (task_is_traced(child)) {
-- 
cgit v1.2.3


From 656eb2cd5da153762f2e8419ca117ce12ef522c3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sat, 18 Oct 2008 20:28:23 -0700
Subject: add CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS

This adds a kconfig option to change the /proc/PID/coredump_filter default.
Fedora has been carrying a trivial patch to change the hard-wired value for
this default, since Fedora 8.  The default default can't change safely
because there are old GDB versions out there (all before 6.7) that are
confused by the core dump files created by the MMF_DUMP_ELF_HEADERS setting.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig.binfmt     | 22 ++++++++++++++++++++++
 include/linux/sched.h |  8 +++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 801db1341811..ce9fb3fbfae4 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+	bool "Write ELF core dumps with partial segments"
+	default n
+	depends on BINFMT_ELF
+	help
+	  ELF core dump files describe each memory mapping of the crashed
+	  process, and can contain or omit the memory contents of each one.
+	  The contents of an unmodified text mapping are omitted by default.
+
+	  For an unmodified text mapping of an ELF object, including just
+	  the first page of the file in a core dump makes it possible to
+	  identify the build ID bits in the file, without paying the i/o
+	  cost and disk space to dump all the text.  However, versions of
+	  GDB before 6.7 are confused by ELF core dump files in this format.
+
+	  The core dump behavior can be controlled per process using
+	  the /proc/PID/coredump_filter pseudo-file; this setting is
+	  inherited.  See Documentation/filesystems/proc.txt for details.
+
+	  This config option changes the default setting of coredump_filter
+	  seen at boot time.  If unsure, say N.
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
 	depends on !MMU && (!FRV || BROKEN)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 017cc914ef1f..f52dbd3587a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -411,7 +411,13 @@ extern int get_dumpable(struct mm_struct *mm);
 	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
 #define MMF_DUMP_FILTER_DEFAULT \
 	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
-	 (1 << MMF_DUMP_HUGETLB_PRIVATE))
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
 
 struct sighand_struct {
 	atomic_t		count;
-- 
cgit v1.2.3


From 57cac4d1880527e0baf6c2fda529d2ad1d815aec Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Sat, 18 Oct 2008 20:28:25 -0700
Subject: kdump: make elfcorehdr_addr independent of CONFIG_PROC_VMCORE

o elfcorehdr_addr is used by not only the code under CONFIG_PROC_VMCORE
  but also by the code which is not inside CONFIG_PROC_VMCORE.  For
  example, is_kdump_kernel() is used by powerpc code to determine if
  kernel is booting after a panic then use previous kernel's TCE table.
  So even if CONFIG_PROC_VMCORE is not set in second kernel, one should be
  able to correctly determine that we are booting after a panic and setup
  calgary iommu accordingly.

o So remove the assumption that elfcorehdr_addr is under
  CONFIG_PROC_VMCORE.

o Move definition of elfcorehdr_addr to arch dependent crash files.
  (Unfortunately crash dump does not have an arch independent file
  otherwise that would have been the best place).

o kexec.c is not the right place as one can have CRASH_DUMP enabled in
  the second kernel without KEXEC being enabled.

o I don't see sh setup code parsing the command line for
  elfcorehdr_addr.  I am wondering how the vmcore interface works on sh.
  Anyway, I am at least defining elfcorehdr_addr so that compilation is
  not broken on sh.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Simon Horman <horms@verge.net.au>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/crash_dump.c    |  4 ++++
 arch/ia64/kernel/setup.c         |  9 ++++++++-
 arch/powerpc/kernel/crash_dump.c | 10 ++++++++--
 arch/sh/kernel/crash_dump.c      |  3 +++
 arch/x86/kernel/crash_dump_32.c  |  3 +++
 arch/x86/kernel/crash_dump_64.c  |  3 +++
 arch/x86/kernel/setup.c          |  8 +++++++-
 fs/proc/vmcore.c                 |  3 ---
 include/linux/crash_dump.h       | 14 ++++++++++----
 9 files changed, 46 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
index da60e90eeeb1..23e91290e41f 100644
--- a/arch/ia64/kernel/crash_dump.c
+++ b/arch/ia64/kernel/crash_dump.c
@@ -8,10 +8,14 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index de636b215677..a0286be6c235 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -478,7 +478,12 @@ static __init int setup_nomca(char *s)
 }
 early_param("nomca", setup_nomca);
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+#ifdef CONFIG_CRASH_DUMP
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
  */
@@ -491,7 +496,9 @@ static int __init parse_elfcorehdr(char *arg)
 	return 0;
 }
 early_param("elfcorehdr", parse_elfcorehdr);
+#endif
 
+#ifdef CONFIG_PROC_VMCORE
 int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end)
 {
 	unsigned long length;
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index a323c9b32ee1..97e056379728 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -27,6 +27,9 @@
 #define DBG(fmt...)
 #endif
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 void __init reserve_kdump_trampoline(void)
 {
 	lmb_reserve(0, KDUMP_RESERVE_LIMIT);
@@ -66,7 +69,11 @@ void __init setup_kdump_trampoline(void)
 	DBG(" <- setup_kdump_trampoline()\n");
 }
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
 static int __init parse_elfcorehdr(char *p)
 {
 	if (p)
@@ -75,7 +82,6 @@ static int __init parse_elfcorehdr(char *p)
 	return 1;
 }
 __setup("elfcorehdr=", parse_elfcorehdr);
-#endif
 
 static int __init parse_savemaxmem(char *p)
 {
diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c
index 4a2ecbe27d8e..95d216255565 100644
--- a/arch/sh/kernel/crash_dump.c
+++ b/arch/sh/kernel/crash_dump.c
@@ -10,6 +10,9 @@
 #include <linux/io.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
 
 static void *kdump_buf_page;
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e90a60ef10c2..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,9 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2255782e8d4b..b2c97874ec0f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
 
 }
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+
+#ifdef CONFIG_CRASH_DUMP
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
  * by kexec loader to the capture kernel.
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 841368b87a29..4c65ca432d30 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,9 +32,6 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 025e4f575103..de027d1db745 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -9,11 +9,7 @@
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
 
-#ifdef CONFIG_PROC_VMCORE
 extern unsigned long long elfcorehdr_addr;
-#else
-static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-#endif
 
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
@@ -28,6 +24,16 @@ extern struct proc_dir_entry *proc_vmcore;
 
 #define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 
+/*
+ * is_kdump_kernel() checks whether this kernel is booting after a panic of
+ * previous kernel or not. This is determined by checking if previous kernel
+ * has passed the elf core header address on command line.
+ *
+ * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will
+ * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of
+ * previous kernel.
+ */
+
 static inline int is_kdump_kernel(void)
 {
 	return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
-- 
cgit v1.2.3


From 85a0ee342e0c06c19d78fdf48307211c6cf18fcb Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sat, 18 Oct 2008 20:28:29 -0700
Subject: kdump: add is_vmcore_usable() and vmcore_unusable()

The usage of elfcorehdr_addr has changed recently such that being set to
ELFCORE_ADDR_MAX is used by is_kdump_kernel() to indicate if the code is
executing in a kernel executed as a crash kernel.

However, arch/ia64/kernel/setup.c:reserve_elfcorehdr will reset
elfcorehdr_addr to ELFCORE_ADDR_MAX on error, which means any subsequent
calls to is_kdump_kernel() will return 0, even though they should return
1.

Ok, at this point in time there are no subsequent calls, but I think it's
fair to say that there is ample scope for error or at the very least
confusion.

This patch adds an extra state, ELFCORE_ADDR_ERR, which indicates that
elfcorehdr_addr was passed on the command line, and thus execution is
taking place in a crashdump kernel, but vmcore can't be used for some
reason.  This is tested for using is_vmcore_usable() and set using
vmcore_unusable().  A subsequent patch makes use of this new code.

To summarise, the states that elfcorehdr_addr can now be in are as follows:

ELFCORE_ADDR_MAX: not a crashdump kernel
ELFCORE_ADDR_ERR: crashdump kernel but vmcore is unusable
any other value:  crash dump kernel and vmcore is usable

Signed-off-by: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/setup.c   |  4 ++--
 fs/proc/vmcore.c           |  2 +-
 include/linux/crash_dump.h | 24 ++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index a0286be6c235..60286522d54a 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -509,11 +509,11 @@ int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end)
 	 * to work properly.
 	 */
 
-	if (elfcorehdr_addr >= ELFCORE_ADDR_MAX)
+	if (!is_vmcore_usable())
 		return -EINVAL;
 
 	if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
-		elfcorehdr_addr = ELFCORE_ADDR_MAX;
+		vmcore_unusable();
 		return -EINVAL;
 	}
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4c65ca432d30..cd9ca67f841b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -644,7 +644,7 @@ static int __init vmcore_init(void)
 	int rc = 0;
 
 	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
-	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+	if (!(is_vmcore_usable()))
 		return rc;
 	rc = parse_crash_elf_headers();
 	if (rc) {
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index de027d1db745..0acf3b737e2e 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -8,6 +8,7 @@
 #include <linux/proc_fs.h>
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
+#define ELFCORE_ADDR_ERR	(-2ULL)
 
 extern unsigned long long elfcorehdr_addr;
 
@@ -38,6 +39,29 @@ static inline int is_kdump_kernel(void)
 {
 	return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
 }
+
+/* is_vmcore_usable() checks if the kernel is booting after a panic and
+ * the vmcore region is usable.
+ *
+ * This makes use of the fact that due to alignment -2ULL is not
+ * a valid pointer, much in the vain of IS_ERR(), except
+ * dealing directly with an unsigned long long rather than a pointer.
+ */
+
+static inline int is_vmcore_usable(void)
+{
+	return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0;
+}
+
+/* vmcore_unusable() marks the vmcore as unusable,
+ * without disturbing the logic of is_kdump_kernel()
+ */
+
+static inline void vmcore_unusable(void)
+{
+	if (is_kdump_kernel())
+		elfcorehdr_addr = ELFCORE_ADDR_ERR;
+}
 #else /* !CONFIG_CRASH_DUMP */
 static inline int is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
-- 
cgit v1.2.3


From b8e465f4945bc0e9f324e3bbe15f5180a8e9a6fe Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:35 -0700
Subject: byteorder: add new headers for make headers-install

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/Kbuild | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index bf9aca548f14..e531783e5d78 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -183,6 +183,7 @@ unifdef-y += auto_fs.h
 unifdef-y += auxvec.h
 unifdef-y += binfmts.h
 unifdef-y += blktrace_api.h
+unifdef-y += byteorder.h
 unifdef-y += capability.h
 unifdef-y += capi.h
 unifdef-y += cciss_ioctl.h
@@ -340,6 +341,7 @@ unifdef-y += soundcard.h
 unifdef-y += stat.h
 unifdef-y += stddef.h
 unifdef-y += string.h
+unifdef-y += swab.h
 unifdef-y += synclink.h
 unifdef-y += sysctl.h
 unifdef-y += tcp.h
-- 
cgit v1.2.3


From acf0108a84edae22b99655eb2f6f6c9f7ec4d449 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:36 -0700
Subject: byteorder: use generic C version for value byteswapping

This makes the new implementation of the byteorder helpers match the old
in how it degraded when an arch-defined version was not available:

1) swab()
	- look for arch defined
	- if not, use generic c version

2) swabp()
	- look for arch-defined
	- if not, deref pointer and use swab()

3) swabs()
	- look for arch defined
	- if not, use swabp

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swab.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swab.h b/include/linux/swab.h
index 270d5c208a89..bbed279f3b32 100644
--- a/include/linux/swab.h
+++ b/include/linux/swab.h
@@ -47,8 +47,6 @@ static inline __attribute_const__ __u16 ___swab16(__u16 val)
 {
 #ifdef __arch_swab16
 	return __arch_swab16(val);
-#elif defined(__arch_swab16p)
-	return __arch_swab16p(&val);
 #else
 	return __const_swab16(val);
 #endif
@@ -58,8 +56,6 @@ static inline __attribute_const__ __u32 ___swab32(__u32 val)
 {
 #ifdef __arch_swab32
 	return __arch_swab32(val);
-#elif defined(__arch_swab32p)
-	return __arch_swab32p(&val);
 #else
 	return __const_swab32(val);
 #endif
@@ -69,8 +65,6 @@ static inline __attribute_const__ __u64 ___swab64(__u64 val)
 {
 #ifdef __arch_swab64
 	return __arch_swab64(val);
-#elif defined(__arch_swab64p)
-	return __arch_swab64p(&val);
 #elif defined(__SWAB_64_THRU_32__)
 	__u32 h = val >> 32;
 	__u32 l = val & ((1ULL << 32) - 1);
@@ -84,8 +78,6 @@ static inline __attribute_const__ __u32 ___swahw32(__u32 val)
 {
 #ifdef __arch_swahw32
 	return __arch_swahw32(val);
-#elif defined(__arch_swahw32p)
-	return __arch_swahw32p(&val);
 #else
 	return __const_swahw32(val);
 #endif
@@ -95,8 +87,6 @@ static inline __attribute_const__ __u32 ___swahb32(__u32 val)
 {
 #ifdef __arch_swahb32
 	return __arch_swahb32(val);
-#elif defined(__arch_swahb32p)
-	return __arch_swahb32p(&val);
 #else
 	return __const_swahb32(val);
 #endif
-- 
cgit v1.2.3


From 1d8cca44b6a244b7e378546d719041819049a0f9 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:37 -0700
Subject: byteorder: provide swabb.h generically in asm/byteorder.h

This is needed during the transition to the new byteorder headers as the
swabb.h functionality will be provided from asm/byteorder.h in the new
version.  To avoid breakage on arches still using the old implementation,
provide swabb.h from asm/byteorder.h as well.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/byteorder/Kbuild          | 1 +
 include/linux/byteorder/big_endian.h    | 1 +
 include/linux/byteorder/little_endian.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/byteorder/Kbuild b/include/linux/byteorder/Kbuild
index 1133d5f9d818..fbaa7f9cee32 100644
--- a/include/linux/byteorder/Kbuild
+++ b/include/linux/byteorder/Kbuild
@@ -1,3 +1,4 @@
 unifdef-y += big_endian.h
 unifdef-y += little_endian.h
 unifdef-y += swab.h
+unifdef-y += swabb.h
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 44f95b92393b..1cba3f3efe5f 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)(__u32)(x))
 #define __constant_ntohl(x) ((__force __u32)(__be32)(x))
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 4cc170a31762..cedc1b5a289c 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)___constant_swab32((x)))
 #define __constant_ntohl(x) ___constant_swab32((__force __be32)(x))
-- 
cgit v1.2.3


From fdd2e5f88a259a537bb239e0c03c973cb6ea402a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:38 -0700
Subject: make mm/rmap.c:anon_vma_cachep static

This patch makes the needlessly global anon_vma_cachep static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 12 ------------
 mm/rmap.c            | 12 +++++++++++-
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1da48db8db09..89f0564b10c8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -39,18 +39,6 @@ struct anon_vma {
 
 #ifdef CONFIG_MMU
 
-extern struct kmem_cache *anon_vma_cachep;
-
-static inline struct anon_vma *anon_vma_alloc(void)
-{
-	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
-}
-
-static inline void anon_vma_free(struct anon_vma *anon_vma)
-{
-	kmem_cache_free(anon_vma_cachep, anon_vma);
-}
-
 static inline void anon_vma_lock(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8701d5fce732..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -55,7 +55,17 @@
 
 #include "internal.h"
 
-struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+	kmem_cache_free(anon_vma_cachep, anon_vma);
+}
 
 /**
  * anon_vma_prepare - attach an anon_vma to a memory region
-- 
cgit v1.2.3


From a0098efd6ee4e8c04d82d761aa1bb9ec7a0aa32d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:47 -0700
Subject: remove the obsolete BCD*BIN/BIN*BCD macros

Remove the following obsolete macros:

- BCD2BIN
- BIN2BCD
- BCD_TO_BIN
- BIN_TO_BCD

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bcd.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bcd.h b/include/linux/bcd.h
index 7ac518e3c152..75f6d6e699a0 100644
--- a/include/linux/bcd.h
+++ b/include/linux/bcd.h
@@ -15,11 +15,4 @@
 unsigned bcd2bin(unsigned char val) __attribute_const__;
 unsigned char bin2bcd(unsigned val) __attribute_const__;
 
-#define BCD2BIN(val)	bcd2bin(val)
-#define BIN2BCD(val)	bin2bcd(val)
-
-/* backwards compat */
-#define BCD_TO_BIN(val) ((val)=BCD2BIN(val))
-#define BIN_TO_BCD(val) ((val)=BIN2BCD(val))
-
 #endif /* _BCD_H */
-- 
cgit v1.2.3


From 5a85a7dda15f88b7f9c96c67fe826b5d0486d601 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:48 -0700
Subject: include/linux/bcd.h: remove comments

- the macros are gone
- there's no more code in this file,
  LGPL + GPL = GPL,
  and the code that was moved to lib/bcd.c is anyway trivial

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bcd.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bcd.h b/include/linux/bcd.h
index 75f6d6e699a0..22ea563ba3eb 100644
--- a/include/linux/bcd.h
+++ b/include/linux/bcd.h
@@ -1,12 +1,3 @@
-/* Permission is hereby granted to copy, modify and redistribute this code
- * in terms of the GNU Library General Public License, Version 2 or later,
- * at your option.
- */
-
-/* macros to translate to/from binary and binary-coded decimal (frequently
- * found in RTC chips).
- */
-
 #ifndef _BCD_H
 #define _BCD_H
 
-- 
cgit v1.2.3


From 01e8ef11bc1a74e65678ed55795f59266d4add01 Mon Sep 17 00:00:00 2001
From: Parag Warudkar <parag.lkml@gmail.com>
Date: Sat, 18 Oct 2008 20:28:50 -0700
Subject: x86: sysfs: kill owner field from attribute

Tejun's commit 7b595756ec1f49e0049a9e01a1298d53a7faaa15 made sysfs
attribute->owner unnecessary.  But the field was left in the structure to
ease the merge.  It's been over a year since that change and it is now
time to start killing attribute->owner along with its users - one arch at
a time!

This patch is attempt #1 to get rid of attribute->owner only for
CONFIG_X86_64 or CONFIG_X86_32.  We will deal with other arches later on
as and when possible - avr32 will be the next since that is something I
can test.  Compile (make allyesconfig / make allmodconfig / custom config)
and boot tested.

akpm: the idea is that we put the declaration of attribute.owner inside
`#ifndef CONFIG_X86'.  But that proved to be too ambitious for now because
new usages kept on turning up in subsystem trees.

[akpm: remove the ifdef for now]
Signed-off-by: Parag Warudkar <parag.lkml@gmail.com>
Cc: Greg KH <greg@kroah.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Roland Dreier <rolandd@cisco.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/battery.c              | 2 +-
 drivers/acpi/sbs.c                  | 2 +-
 drivers/acpi/system.c               | 1 -
 drivers/block/aoe/aoeblk.c          | 2 +-
 drivers/block/nbd.c                 | 2 +-
 drivers/firmware/iscsi_ibft.c       | 1 -
 drivers/i2c/chips/at24.c            | 1 -
 drivers/i2c/chips/ds1682.c          | 1 -
 drivers/infiniband/core/cm.c        | 2 +-
 drivers/memstick/core/mspro_block.c | 1 -
 drivers/power/power_supply_sysfs.c  | 2 +-
 drivers/rtc/rtc-cmos.c              | 1 -
 drivers/rtc/rtc-ds1305.c            | 1 -
 drivers/rtc/rtc-ds1307.c            | 1 -
 drivers/rtc/rtc-ds1511.c            | 1 -
 drivers/rtc/rtc-m48t59.c            | 1 -
 drivers/rtc/rtc-stk17ta8.c          | 1 -
 drivers/scsi/arcmsr/arcmsr_attr.c   | 3 ---
 drivers/w1/slaves/w1_ds2760.c       | 1 -
 include/linux/sysfs.h               | 5 +++--
 20 files changed, 9 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index b1c723f9f58d..70f7f60929ca 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -431,7 +431,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev,
 }
 
 static struct device_attribute alarm_attr = {
-	.attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE},
+	.attr = {.name = "alarm", .mode = 0644},
 	.show = acpi_battery_alarm_show,
 	.store = acpi_battery_alarm_store,
 };
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 10a36512647c..7b011e7e29fe 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -463,7 +463,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev,
 }
 
 static struct device_attribute alarm_attr = {
-	.attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE},
+	.attr = {.name = "alarm", .mode = 0644},
 	.show = acpi_battery_alarm_show,
 	.store = acpi_battery_alarm_store,
 };
diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c
index 91dec448b3ed..24e80fd927e2 100644
--- a/drivers/acpi/system.c
+++ b/drivers/acpi/system.c
@@ -115,7 +115,6 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr,
 	table_attr->attr.read = acpi_table_show;
 	table_attr->attr.attr.name = table_attr->name;
 	table_attr->attr.attr.mode = 0444;
-	table_attr->attr.attr.owner = THIS_MODULE;
 
 	return;
 }
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index b82654e883a7..d876ad861237 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -90,7 +90,7 @@ static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
 static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
 static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
 static struct device_attribute dev_attr_firmware_version = {
-	.attr = { .name = "firmware-version", .mode = S_IRUGO, .owner = THIS_MODULE },
+	.attr = { .name = "firmware-version", .mode = S_IRUGO },
 	.show = aoedisk_show_fwver,
 };
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7b3351260d56..9034ca585afd 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -391,7 +391,7 @@ static ssize_t pid_show(struct device *dev,
 }
 
 static struct device_attribute pid_attr = {
-	.attr = { .name = "pid", .mode = S_IRUGO, .owner = THIS_MODULE },
+	.attr = { .name = "pid", .mode = S_IRUGO},
 	.show = pid_show,
 };
 
diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index deb154aa47c4..4353414a0b77 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -732,7 +732,6 @@ static int __init ibft_create_attribute(struct ibft_kobject *kobj_data,
 
 	attr->attr.name = name;
 	attr->attr.mode = S_IRUSR;
-	attr->attr.owner = THIS_MODULE;
 
 	attr->hdr = hdr;
 	attr->show = show;
diff --git a/drivers/i2c/chips/at24.c b/drivers/i2c/chips/at24.c
index 2a4acb269569..d4775528abc6 100644
--- a/drivers/i2c/chips/at24.c
+++ b/drivers/i2c/chips/at24.c
@@ -460,7 +460,6 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	 */
 	at24->bin.attr.name = "eeprom";
 	at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR;
-	at24->bin.attr.owner = THIS_MODULE;
 	at24->bin.read = at24_bin_read;
 	at24->bin.size = chip.byte_len;
 
diff --git a/drivers/i2c/chips/ds1682.c b/drivers/i2c/chips/ds1682.c
index 23be4d42cb02..f3ee4a1abb77 100644
--- a/drivers/i2c/chips/ds1682.c
+++ b/drivers/i2c/chips/ds1682.c
@@ -190,7 +190,6 @@ static struct bin_attribute ds1682_eeprom_attr = {
 	.attr = {
 		.name = "eeprom",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = DS1682_EEPROM_SIZE,
 	.read = ds1682_eeprom_read,
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index a78d35aecee3..f1e82a92e61e 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -122,7 +122,7 @@ struct cm_counter_attribute {
 
 #define CM_COUNTER_ATTR(_name, _index) \
 struct cm_counter_attribute cm_##_name##_counter_attr = { \
-	.attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \
+	.attr = { .name = __stringify(_name), .mode = 0444 }, \
 	.index = _index \
 }
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 6e291bf8237a..5263913e0c69 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1044,7 +1044,6 @@ static int mspro_block_read_attributes(struct memstick_dev *card)
 
 		s_attr->dev_attr.attr.name = s_attr->name;
 		s_attr->dev_attr.attr.mode = S_IRUGO;
-		s_attr->dev_attr.attr.owner = THIS_MODULE;
 		s_attr->dev_attr.show = mspro_block_attr_show(s_attr->id);
 
 		if (!rc)
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index fe2aeb11939b..23ae8460f5c1 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -30,7 +30,7 @@
 
 #define POWER_SUPPLY_ATTR(_name)					\
 {									\
-	.attr = { .name = #_name, .mode = 0444, .owner = THIS_MODULE },	\
+	.attr = { .name = #_name, .mode = 0444 },	\
 	.show = power_supply_show_property,				\
 	.store = NULL,							\
 }
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 957365e4a746..5549231179a2 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -592,7 +592,6 @@ static struct bin_attribute nvram = {
 	.attr = {
 		.name	= "nvram",
 		.mode	= S_IRUGO | S_IWUSR,
-		.owner	= THIS_MODULE,
 	},
 
 	.read	= cmos_nvram_read,
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c
index 57b470f3fc0b..fc372df6534b 100644
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -606,7 +606,6 @@ ds1305_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 static struct bin_attribute nvram = {
 	.attr.name	= "nvram",
 	.attr.mode	= S_IRUGO | S_IWUSR,
-	.attr.owner	= THIS_MODULE,
 	.read		= ds1305_nvram_read,
 	.write		= ds1305_nvram_write,
 	.size		= DS1305_NVRAM_LEN,
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index cad23bcfebd4..162330b9d1dc 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -551,7 +551,6 @@ static struct bin_attribute nvram = {
 	.attr = {
 		.name	= "nvram",
 		.mode	= S_IRUGO | S_IWUSR,
-		.owner	= THIS_MODULE,
 	},
 
 	.read	= ds1307_nvram_read,
diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c
index 7bb0a962ad21..25caada78398 100644
--- a/drivers/rtc/rtc-ds1511.c
+++ b/drivers/rtc/rtc-ds1511.c
@@ -481,7 +481,6 @@ static struct bin_attribute ds1511_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = DS1511_RAM_MAX,
 	.read = ds1511_nvram_read,
diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c
index ed671e29e07d..04b63dab6932 100644
--- a/drivers/rtc/rtc-m48t59.c
+++ b/drivers/rtc/rtc-m48t59.c
@@ -360,7 +360,6 @@ static struct bin_attribute m48t59_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.read = m48t59_nvram_read,
 	.write = m48t59_nvram_write,
diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c
index 0947f8c23957..f4cd46e15af9 100644
--- a/drivers/rtc/rtc-stk17ta8.c
+++ b/drivers/rtc/rtc-stk17ta8.c
@@ -280,7 +280,6 @@ static struct bin_attribute stk17ta8_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = RTC_OFFSET,
 	.read = stk17ta8_nvram_read,
diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c
index 69f8346aa288..5877f29a6005 100644
--- a/drivers/scsi/arcmsr/arcmsr_attr.c
+++ b/drivers/scsi/arcmsr/arcmsr_attr.c
@@ -189,7 +189,6 @@ static struct bin_attribute arcmsr_sysfs_message_read_attr = {
 	.attr = {
 		.name = "mu_read",
 		.mode = S_IRUSR ,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.read = arcmsr_sysfs_iop_message_read,
@@ -199,7 +198,6 @@ static struct bin_attribute arcmsr_sysfs_message_write_attr = {
 	.attr = {
 		.name = "mu_write",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.write = arcmsr_sysfs_iop_message_write,
@@ -209,7 +207,6 @@ static struct bin_attribute arcmsr_sysfs_message_clear_attr = {
 	.attr = {
 		.name = "mu_clear",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1,
 	.write = arcmsr_sysfs_iop_message_clear,
diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c
index ed6b0576208c..1f09d4e4144c 100644
--- a/drivers/w1/slaves/w1_ds2760.c
+++ b/drivers/w1/slaves/w1_ds2760.c
@@ -80,7 +80,6 @@ static struct bin_attribute w1_ds2760_bin_attr = {
 	.attr = {
 		.name = "w1_slave",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = DS2760_DATA_SIZE,
 	.read = w1_ds2760_read_bin,
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b330e289d71f..9d68fed50f11 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -21,8 +21,9 @@ struct kobject;
 struct module;
 
 /* FIXME
- * The *owner field is no longer used, but leave around
- * until the tree gets cleaned up fully.
+ * The *owner field is no longer used.
+ * x86 tree has been cleaned up. The owner
+ * attribute is still left for other arches.
  */
 struct attribute {
 	const char		*name;
-- 
cgit v1.2.3


From 606576ce816603d9fe1fb453a88bc6eea16ca709 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 6 Oct 2008 19:06:12 -0400
Subject: ftrace: rename FTRACE to FUNCTION_TRACER

Due to confusion between the ftrace infrastructure and the gcc profiling
tracer "ftrace", this patch renames the config options from FTRACE to
FUNCTION_TRACER.  The other two names that are offspring from FTRACE,
DYNAMIC_FTRACE and FTRACE_MCOUNT_RECORD, will stay the same.

This patch was generated mostly by script, and partially by hand.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Makefile                                 |  2 +-
 arch/arm/Kconfig                         |  4 ++--
 arch/arm/boot/compressed/Makefile        |  2 +-
 arch/arm/include/asm/ftrace.h            |  2 +-
 arch/arm/kernel/armksyms.c               |  2 +-
 arch/arm/kernel/entry-common.S           |  4 ++--
 arch/powerpc/Kconfig                     |  2 +-
 arch/powerpc/Makefile                    |  2 +-
 arch/powerpc/include/asm/ftrace.h        |  2 +-
 arch/powerpc/kernel/Makefile             |  2 +-
 arch/powerpc/kernel/entry_32.S           |  2 +-
 arch/powerpc/kernel/entry_64.S           |  2 +-
 arch/powerpc/kernel/ppc_ksyms.c          |  2 +-
 arch/powerpc/platforms/powermac/Makefile |  2 +-
 arch/sparc64/Kconfig                     |  2 +-
 arch/sparc64/Kconfig.debug               |  2 +-
 arch/sparc64/lib/mcount.S                |  4 ++--
 arch/x86/Kconfig                         |  2 +-
 arch/x86/kernel/Makefile                 |  2 +-
 arch/x86/kernel/entry_32.S               |  4 ++--
 arch/x86/kernel/entry_64.S               |  4 ++--
 arch/x86/kernel/i386_ksyms_32.c          |  2 +-
 arch/x86/kernel/x8664_ksyms_64.c         |  2 +-
 arch/x86/xen/Makefile                    |  2 +-
 include/asm-x86/ftrace.h                 |  4 ++--
 include/linux/ftrace.h                   | 12 ++++++------
 kernel/Makefile                          |  4 ++--
 kernel/sysctl.c                          |  2 +-
 kernel/trace/Kconfig                     | 17 +++++++++--------
 kernel/trace/Makefile                    |  6 +++---
 kernel/trace/trace.c                     |  2 +-
 kernel/trace/trace.h                     |  2 +-
 kernel/trace/trace_irqsoff.c             |  4 ++--
 kernel/trace/trace_sched_wakeup.c        |  4 ++--
 kernel/trace/trace_selftest.c            |  4 ++--
 lib/Makefile                             |  2 +-
 36 files changed, 61 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index 16e3fbb968a8..b7eb70b13cad 100644
--- a/Makefile
+++ b/Makefile
@@ -536,7 +536,7 @@ KBUILD_CFLAGS	+= -g
 KBUILD_AFLAGS	+= -gdwarf-2
 endif
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 KBUILD_CFLAGS	+= -pg
 endif
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4853f9df37bd..c2f18ea40500 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,8 +16,8 @@ config ARM
 	select HAVE_ARCH_KGDB
 	select HAVE_KPROBES if (!XIP_KERNEL)
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
-	select HAVE_FTRACE if (!XIP_KERNEL)
-	select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE)
+	select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
+	select HAVE_DYNAMIC_FTRACE if (HAVE_FUNCTION_TRACER)
 	select HAVE_GENERIC_DMA_COHERENT
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 7a03f2007882..c47f2a3f8f8f 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -70,7 +70,7 @@ SEDFLAGS	= s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/
 targets       := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \
 		 head.o misc.o $(OBJS)
 
-ifeq ($(CONFIG_FTRACE),y)
+ifeq ($(CONFIG_FUNCTION_TRACER),y)
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
 endif
diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index 584ef9a8e5a5..39c8bc1a006a 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_ARM_FTRACE
 #define _ASM_ARM_FTRACE
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
 
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 2357b1cf1cf9..c74f766ffc12 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -183,6 +183,6 @@ EXPORT_SYMBOL(_find_next_bit_be);
 
 EXPORT_SYMBOL(copy_page);
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 EXPORT_SYMBOL(mcount);
 #endif
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 3aa14dcc5bab..06269ea375c5 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -101,7 +101,7 @@ ENDPROC(ret_from_fork)
 #undef CALL
 #define CALL(x) .long x
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
 	stmdb sp!, {r0-r3, lr}
@@ -149,7 +149,7 @@ trace:
 ftrace_stub:
 	mov pc, lr
 
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 /*=============================================================================
  * SWI handler
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 380baa1780e9..97d86702e2d5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -112,7 +112,7 @@ config PPC
 	bool
 	default y
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE
+	select HAVE_FUNCTION_TRACER
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 24dd1a37f8fb..1f0667069940 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -122,7 +122,7 @@ KBUILD_CFLAGS		+= -mcpu=powerpc
 endif
 
 # Work around a gcc code-gen bug with -fno-omit-frame-pointer.
-ifeq ($(CONFIG_FTRACE),y)
+ifeq ($(CONFIG_FUNCTION_TRACER),y)
 KBUILD_CFLAGS		+= -mno-sched-epilog
 endif
 
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index de921326cca8..b298f7a631e6 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_POWERPC_FTRACE
 #define _ASM_POWERPC_FTRACE
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
 
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fdb58253fa5b..92673b43858d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -12,7 +12,7 @@ CFLAGS_prom_init.o      += -fPIC
 CFLAGS_btext.o		+= -fPIC
 endif
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 # Do not trace early boot code
 CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 1cbbf7033641..7ecc0d1855c3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1158,7 +1158,7 @@ machine_check_in_rtas:
 
 #endif /* CONFIG_PPC_RTAS */
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index fd8b4bae9b04..e6d52845854f 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -884,7 +884,7 @@ _GLOBAL(enter_prom)
 	mtlr    r0
         blr
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 8edc2359c419..260089dccfb0 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(single_step_exception);
 EXPORT_SYMBOL(sys_sigreturn);
 #endif
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 EXPORT_SYMBOL(_mcount);
 #endif
 
diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile
index be60d64be7ad..50f169392551 100644
--- a/arch/powerpc/platforms/powermac/Makefile
+++ b/arch/powerpc/platforms/powermac/Makefile
@@ -1,6 +1,6 @@
 CFLAGS_bootx_init.o  		+= -fPIC
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 # Do not trace early boot code
 CFLAGS_REMOVE_bootx_init.o = -pg -mno-sched-epilog
 endif
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 5446e2a499b1..d269400d2868 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -12,7 +12,7 @@ config SPARC64
 	bool
 	default y
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE
+	select HAVE_FUNCTION_TRACER
 	select HAVE_IDE
 	select HAVE_LMB
 	select HAVE_ARCH_KGDB
diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug
index d6d32d178fc8..c40515c06690 100644
--- a/arch/sparc64/Kconfig.debug
+++ b/arch/sparc64/Kconfig.debug
@@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC
 
 config MCOUNT
 	bool
-	depends on STACK_DEBUG || FTRACE
+	depends on STACK_DEBUG || FUNCTION_TRACER
 	default y
 
 config FRAME_POINTER
diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S
index fad90ddb3a28..7ce9c65f3592 100644
--- a/arch/sparc64/lib/mcount.S
+++ b/arch/sparc64/lib/mcount.S
@@ -93,7 +93,7 @@ mcount:
 	 nop
 1:
 #endif
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 	mov		%o7, %o0
 	.globl		mcount_call
@@ -119,7 +119,7 @@ mcount_call:
 	.size		_mcount,.-_mcount
 	.size		mcount,.-mcount
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 	.globl		ftrace_stub
 	.type		ftrace_stub,#function
 ftrace_stub:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 40ee80809562..290e21aa774d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,7 +28,7 @@ config X86
 	select HAVE_KRETPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE
+	select HAVE_FUNCTION_TRACER
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0d41f0343dc0..ec3d30136bf0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 4e4269c73bb7..9d49facc21f2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1149,7 +1149,7 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 ENTRY(mcount)
@@ -1204,7 +1204,7 @@ trace:
 	jmp ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 .section .rodata,"a"
 #include "syscall_table_32.S"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 09e7145484c5..b86f332c96a6 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,7 +61,7 @@
 
 	.code64
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
 	retq
@@ -138,7 +138,7 @@ trace:
 	jmp ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index dd7ebee446af..43cec6bdda63 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -5,7 +5,7 @@
 #include <asm/desc.h>
 #include <asm/ftrace.h>
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 /* mcount is defined in assembly */
 EXPORT_SYMBOL(mcount);
 #endif
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b545f371b5f5..695e426aa354 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -12,7 +12,7 @@
 #include <asm/desc.h>
 #include <asm/ftrace.h>
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 /* mcount is defined in assembly */
 EXPORT_SYMBOL(mcount);
 #endif
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 313947940a1a..6dcefba7836f 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_spinlock.o = -pg
 CFLAGS_REMOVE_time.o = -pg
diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h
index 1bb6f9bbe1ab..233bb9b869c0 100644
--- a/include/asm-x86/ftrace.h
+++ b/include/asm-x86/ftrace.h
@@ -1,7 +1,7 @@
 #ifndef ASM_X86__FTRACE_H
 #define ASM_X86__FTRACE_H
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
 
@@ -19,6 +19,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 }
 #endif
 
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #endif /* ASM_X86__FTRACE_H */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index a3d46151be19..0e9529589151 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -8,7 +8,7 @@
 #include <linux/types.h>
 #include <linux/kallsyms.h>
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
 extern int
@@ -36,12 +36,12 @@ void clear_ftrace_function(void);
 
 extern void ftrace_stub(unsigned long a0, unsigned long a1);
 
-#else /* !CONFIG_FTRACE */
+#else /* !CONFIG_FUNCTION_TRACER */
 # define register_ftrace_function(ops) do { } while (0)
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 static inline void ftrace_kill_atomic(void) { }
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 # define FTRACE_HASHBITS	10
@@ -101,7 +101,7 @@ void ftrace_kill_atomic(void);
 
 static inline void tracer_disable(void)
 {
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 	ftrace_enabled = 0;
 #endif
 }
@@ -113,7 +113,7 @@ static inline void tracer_disable(void)
  */
 static inline int __ftrace_enabled_save(void)
 {
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 	int saved_ftrace_enabled = ftrace_enabled;
 	ftrace_enabled = 0;
 	return saved_ftrace_enabled;
@@ -124,7 +124,7 @@ static inline int __ftrace_enabled_save(void)
 
 static inline void __ftrace_enabled_restore(int enabled)
 {
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 	ftrace_enabled = enabled;
 #endif
 }
diff --git a/kernel/Makefile b/kernel/Makefile
index 8f9ce7ec21b6..85f588a9d0b1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,7 +13,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 
 CFLAGS_REMOVE_sched.o = -mno-spe
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
 CFLAGS_REMOVE_lockdep_proc.o = -pg
@@ -86,7 +86,7 @@ obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
-obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 617d41e4d6a0..619eb9f3acd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -464,7 +464,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "ftrace_enabled",
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5866edbc2ed1..3533c583df47 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,11 +1,12 @@
 #
-# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+# Architectures that offer an FUNCTION_TRACER implementation should
+#  select HAVE_FUNCTION_TRACER:
 #
 
 config NOP_TRACER
 	bool
 
-config HAVE_FTRACE
+config HAVE_FUNCTION_TRACER
 	bool
 	select NOP_TRACER
 
@@ -28,9 +29,9 @@ config TRACING
 	select STACKTRACE
 	select TRACEPOINTS
 
-config FTRACE
+config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
-	depends on HAVE_FTRACE
+	depends on HAVE_FUNCTION_TRACER
 	depends on DEBUG_KERNEL
 	select FRAME_POINTER
 	select TRACING
@@ -136,9 +137,9 @@ config BOOT_TRACER
 
 config STACK_TRACER
 	bool "Trace max stack"
-	depends on HAVE_FTRACE
+	depends on HAVE_FUNCTION_TRACER
 	depends on DEBUG_KERNEL
-	select FTRACE
+	select FUNCTION_TRACER
 	select STACKTRACE
 	help
 	  This special tracer records the maximum stack footprint of the
@@ -155,7 +156,7 @@ config STACK_TRACER
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
-	depends on FTRACE
+	depends on FUNCTION_TRACER
 	depends on HAVE_DYNAMIC_FTRACE
 	depends on DEBUG_KERNEL
 	default y
@@ -165,7 +166,7 @@ config DYNAMIC_FTRACE
 	 with a No-Op instruction) as they are called. A table is
 	 created to dynamically enable them again.
 
-	 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
 	 has native performance as long as no tracing is active.
 
 	 The changes to the code are done by a kernel thread that
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a85dfba88ba0..c8228b1a49e9 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,7 +1,7 @@
 
 # Do not instrument the tracer itself:
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 
@@ -10,13 +10,13 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
 
-obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
-obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d345d649d073..aeb2f2505bc5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -851,7 +851,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	preempt_enable_notrace();
 }
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 static void
 function_trace_call(unsigned long ip, unsigned long parent_ip)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f1f99572cde7..6889ca48f1f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -335,7 +335,7 @@ void update_max_tr_single(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 void tracing_start_function_trace(void);
 void tracing_stop_function_trace(void);
 #else
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a7db7f040ae0..9c74071c10e0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -63,7 +63,7 @@ irq_trace(void)
  */
 static __cacheline_aligned_in_smp	unsigned long max_sequence;
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 /*
  * irqsoff uses its own tracer function to keep the overhead down:
  */
@@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = irqsoff_tracer_call,
 };
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 /*
  * Should this new latency be reported/recorded?
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fe4a252c2363..3ae93f16b565 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock =
 
 static void __wakeup_reset(struct trace_array *tr);
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 /*
  * irqsoff uses its own tracer function to keep the overhead down:
  */
@@ -96,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = wakeup_tracer_call,
 };
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 /*
  * Should this new latency be reported/recorded?
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 09cf230d7eca..95815d26a041 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -70,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	return ret;
 }
 
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
@@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 
 	return ret;
 }
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_IRQSOFF_TRACER
 int
diff --git a/lib/Makefile b/lib/Makefile
index 16feaab057b2..7cb65d85aeb0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for some libs needed in the kernel.
 #
 
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif
-- 
cgit v1.2.3


From edbc25caaa492a82e19baa915f1f6b0a0db6554d Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Thu, 10 Jul 2008 16:29:37 -0500
Subject: PCI: remove dynids.use_driver_data

The driver flag dynids.use_driver_data is almost consistently not set,
and causes more problems than it solves.  It was initially intended as a
flag to indicate whether a driver's usage of driver_data had been
carefully inspected and was ready for values from userspace.  That audit
was never done, so most drivers just get a 0 for driver_data when new
IDs are added from userspace via sysfs.  So remove the flag, allowing
drivers to see the data directly (a follow-on patch validates the passed
driver_data value against what the drivers expect).

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/i2c/busses/i2c-amd756.c | 1 -
 drivers/i2c/busses/i2c-viapro.c | 1 -
 drivers/pci/pci-driver.c        | 3 +--
 drivers/scsi/ipr.c              | 1 -
 include/linux/pci.h             | 1 -
 5 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c
index 1ea39254dac6..a3542b053c8e 100644
--- a/drivers/i2c/busses/i2c-amd756.c
+++ b/drivers/i2c/busses/i2c-amd756.c
@@ -412,7 +412,6 @@ static struct pci_driver amd756_driver = {
 	.id_table	= amd756_ids,
 	.probe		= amd756_probe,
 	.remove		= __devexit_p(amd756_remove),
-	.dynids.use_driver_data = 1,
 };
 
 static int __init amd756_init(void)
diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c
index 73dc52e114eb..2324780484c0 100644
--- a/drivers/i2c/busses/i2c-viapro.c
+++ b/drivers/i2c/busses/i2c-viapro.c
@@ -483,7 +483,6 @@ static struct pci_driver vt596_driver = {
 	.name		= "vt596_smbus",
 	.id_table	= vt596_ids,
 	.probe		= vt596_probe,
-	.dynids.use_driver_data = 1,
 };
 
 static int __init i2c_vt596_init(void)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index a13f53486114..4940a53c56a3 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -65,8 +65,7 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count)
 	dynid->id.subdevice = subdevice;
 	dynid->id.class = class;
 	dynid->id.class_mask = class_mask;
-	dynid->id.driver_data = pdrv->dynids.use_driver_data ?
-		driver_data : 0UL;
+	dynid->id.driver_data = driver_data;
 
 	spin_lock(&pdrv->dynids.lock);
 	list_add_tail(&dynid->node, &pdrv->dynids.list);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d30eb7ba018e..098739deb02e 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -7859,7 +7859,6 @@ static struct pci_driver ipr_driver = {
 	.remove = ipr_remove,
 	.shutdown = ipr_shutdown,
 	.err_handler = &ipr_err_handler,
-	.dynids.use_driver_data = 1
 };
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index acf8f24037cd..c989f58d09bf 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -347,7 +347,6 @@ struct pci_bus_region {
 struct pci_dynids {
 	spinlock_t lock;            /* protects list, index */
 	struct list_head list;      /* for IDs added at runtime */
-	unsigned int use_driver_data:1; /* pci_device_id->driver_data is used */
 };
 
 /* ---------------------------------------------------------------- */
-- 
cgit v1.2.3


From 0235c4fc7fc6f621dc0dd89eba102ad5aa373390 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 18 Aug 2008 21:38:00 +0200
Subject: PCI PM: Introduce function pci_wake_from_d3

Many device drivers use the following sequence of statements to enable
the device to wake up the system while being in the D3_hot or D3_cold
low power state:

        pci_enable_wake(pdev, PCI_D3hot, 1);
        pci_enable_wake(pdev, PCI_D3cold, 1);

However, the second call is not necessary if the first one succeeds (the
ordering of the statements above doesn't matter here) and it may even be
harmful, because we are not supposed to enable PME# after the wake-up
power has been enabled for the device.

To allow drivers to overcome this problem, introduce function
pci_wake_from_d3() that will enable the device to wake up the system
from any of D3_hot and D3_cold as long as the wake-up from at least one
of them is supported.

Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c   | 22 ++++++++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index dbe9f39f4436..2797112c9400 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1126,6 +1126,27 @@ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable)
 	return pme_done ? 0 : error;
 }
 
+/**
+ * pci_wake_from_d3 - enable/disable device to wake up from D3_hot or D3_cold
+ * @dev: PCI device to prepare
+ * @enable: True to enable wake-up event generation; false to disable
+ *
+ * Many drivers want the device to wake up the system from D3_hot or D3_cold
+ * and this function allows them to set that up cleanly - pci_enable_wake()
+ * should not be called twice in a row to enable wake-up due to PCI PM vs ACPI
+ * ordering constraints.
+ *
+ * This function only returns error code if the device is not capable of
+ * generating PME# from both D3_hot and D3_cold, and the platform is unable to
+ * enable wake-up power for it.
+ */
+int pci_wake_from_d3(struct pci_dev *dev, bool enable)
+{
+	return pci_pme_capable(dev, PCI_D3cold) ?
+			pci_enable_wake(dev, PCI_D3cold, enable) :
+			pci_enable_wake(dev, PCI_D3hot, enable);
+}
+
 /**
  * pci_target_state - find an appropriate low power state for a given PCI dev
  * @dev: PCI device
@@ -1942,6 +1963,7 @@ EXPORT_SYMBOL(pci_restore_state);
 EXPORT_SYMBOL(pci_pme_capable);
 EXPORT_SYMBOL(pci_pme_active);
 EXPORT_SYMBOL(pci_enable_wake);
+EXPORT_SYMBOL(pci_wake_from_d3);
 EXPORT_SYMBOL(pci_target_state);
 EXPORT_SYMBOL(pci_prepare_to_sleep);
 EXPORT_SYMBOL(pci_back_from_sleep);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c989f58d09bf..f7e7dbc09194 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -644,6 +644,7 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
 bool pci_pme_capable(struct pci_dev *dev, pci_power_t state);
 void pci_pme_active(struct pci_dev *dev, bool enable);
 int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
+int pci_wake_from_d3(struct pci_dev *dev, bool enable);
 pci_power_t pci_target_state(struct pci_dev *dev);
 int pci_prepare_to_sleep(struct pci_dev *dev);
 int pci_back_from_sleep(struct pci_dev *dev);
-- 
cgit v1.2.3


From 16dbef4a831782466b10d4ae56837c5ba17d1948 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 15 Aug 2008 19:36:45 -0700
Subject: PCI: change MSI-x vector to 32bit

We are using a 28-bit value (PCI bus/dev/fn + 12 bits) as the irq number,
so the cache for the irq number should be 32 bits wide too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Vasquez <andrew.vasquez@qlogic.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/scsi/qla2xxx/qla_def.h | 2 +-
 include/linux/pci.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 83c819216771..f25f41a499e5 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -2108,7 +2108,7 @@ struct scsi_qla_host;
 
 struct qla_msix_entry {
 	int have_irq;
-	uint16_t msix_vector;
+	uint32_t msix_vector;
 	uint16_t msix_entry;
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index f7e7dbc09194..8a4d0bebc311 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -725,7 +725,7 @@ enum pci_dma_burst_strategy {
 };
 
 struct msix_entry {
-	u16 	vector;	/* kernel uses to write allocated vector */
+	u32	vector;	/* kernel uses to write allocated vector */
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
-- 
cgit v1.2.3


From 37a84ec668ba251ae02cf2c2c664baf6b247ae1f Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Thu, 28 Aug 2008 15:40:59 -0700
Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak DeviceIDs

This patch updates the Intel Ibex Peak (PCH) LPC and SMBus Controller
DeviceIDs.

The LPC Controller ID is set by Firmware within the range of
0x3b00-3b1f.  This range is included in pci_ids.h using min and max
values, and irq.c now has code to handle the range (in lieu of 32
additions to a SWITCH statement).

The SMBus Controller ID is a fixed value and will not change.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/irq.c      | 11 +++++++++--
 include/linux/pci_ids.h |  6 +++---
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 52a1de1128c1..bf69dbe08bff 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 	case PCI_DEVICE_ID_INTEL_ICH10_1:
 	case PCI_DEVICE_ID_INTEL_ICH10_2:
 	case PCI_DEVICE_ID_INTEL_ICH10_3:
-	case PCI_DEVICE_ID_INTEL_PCH_0:
-	case PCI_DEVICE_ID_INTEL_PCH_1:
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
 		return 1;
 	}
+
+	if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && 
+		(device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
+		r->name = "PIIX/ICH";
+		r->get = pirq_piix_get;
+		r->set = pirq_piix_set;
+		return 1;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8edddc240e4f..e5d344bfcb7e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2454,9 +2454,9 @@
 #define PCI_DEVICE_ID_INTEL_ICH10_3	0x3a1a
 #define PCI_DEVICE_ID_INTEL_ICH10_4	0x3a30
 #define PCI_DEVICE_ID_INTEL_ICH10_5	0x3a60
-#define PCI_DEVICE_ID_INTEL_PCH_0	0x3b10
-#define PCI_DEVICE_ID_INTEL_PCH_1	0x3b11
-#define PCI_DEVICE_ID_INTEL_PCH_2	0x3b30
+#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN	0x3b00
+#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX	0x3b1f
+#define PCI_DEVICE_ID_INTEL_PCH_SMBUS	0x3b30
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB	0x402f
 #define PCI_DEVICE_ID_INTEL_5100_16	0x65f0
 #define PCI_DEVICE_ID_INTEL_5100_21	0x65f5
-- 
cgit v1.2.3


From c322b28a04c084a467a862766f74c40c917a721c Mon Sep 17 00:00:00 2001
From: "Zhao, Yu" <yu.zhao@intel.com>
Date: Mon, 13 Oct 2008 19:36:05 +0800
Subject: PCI: use same arg names in PCI_VDEVICE comment

This cleanup makes the argument names in PCI_VDEVICE comment consistent
with those used in its definition.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a4d0bebc311..008005674b60 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -455,8 +455,8 @@ struct pci_driver {
 
 /**
  * PCI_VDEVICE - macro used to describe a specific pci device in short form
- * @vend: the vendor name
- * @dev: the 16 bit PCI Device ID
+ * @vendor: the vendor name
+ * @device: the 16 bit PCI Device ID
  *
  * This macro is used to create a struct pci_device_id that matches a
  * specific PCI device.  The subvendor, and subdevice fields will be set
-- 
cgit v1.2.3


From 58c3a727cb73b75a9104d295f096cca12959a5a5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Tue, 14 Oct 2008 14:02:53 +0800
Subject: PCI: support PCIe ARI capability

This patch adds support for PCI Express Alternative Routing-ID
Interpretation (ARI) capability.

The ARI capability extends the Function Number field of the PCI Express
Endpoint by reusing the Device Number which is otherwise hardwired to 0.
With ARI, an Endpoint can have up to 256 functions.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c        | 32 ++++++++++++++++++++++++++++++++
 drivers/pci/pci.h        | 12 ++++++++++++
 drivers/pci/probe.c      |  3 +++
 include/linux/pci.h      |  1 +
 include/linux/pci_regs.h | 14 ++++++++++++++
 5 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 553ca6657955..4db261e13e69 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1299,6 +1299,38 @@ void pci_pm_init(struct pci_dev *dev)
 	}
 }
 
+/**
+ * pci_enable_ari - enable ARI forwarding if hardware supports it
+ * @dev: the PCI device
+ */
+void pci_enable_ari(struct pci_dev *dev)
+{
+	int pos;
+	u32 cap;
+	u16 ctrl;
+
+	if (!dev->is_pcie)
+		return;
+
+	if (dev->pcie_type != PCI_EXP_TYPE_ROOT_PORT &&
+	    dev->pcie_type != PCI_EXP_TYPE_DOWNSTREAM)
+		return;
+
+	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+
+	pci_read_config_dword(dev, pos + PCI_EXP_DEVCAP2, &cap);
+	if (!(cap & PCI_EXP_DEVCAP2_ARI))
+		return;
+
+	pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &ctrl);
+	ctrl |= PCI_EXP_DEVCTL2_ARI;
+	pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, ctrl);
+
+	dev->ari_enabled = 1;
+}
+
 int
 pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 601abdc8dd9f..39684c1415c5 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -151,4 +151,16 @@ struct pci_slot_attribute {
 };
 #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr)
 
+extern void pci_enable_ari(struct pci_dev *dev);
+/**
+ * pci_ari_enabled - query ARI forwarding status
+ * @dev: the PCI device
+ *
+ * Returns 1 if ARI forwarding is enabled, or 0 if not enabled;
+ */
+static inline int pci_ari_enabled(struct pci_dev *dev)
+{
+	return dev->ari_enabled;
+}
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 8c158b9abd41..3141e8deeac4 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1025,6 +1025,9 @@ static void pci_init_capabilities(struct pci_dev *dev)
 
 	/* Vital Product Data */
 	pci_vpd_pci22_init(dev);
+
+	/* Alternative Routing-ID Forwarding */
+	pci_enable_ari(dev);
 }
 
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 008005674b60..7e9a1f0715e6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -214,6 +214,7 @@ struct pci_dev {
 	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
 	unsigned int 	msi_enabled:1;
 	unsigned int	msix_enabled:1;
+	unsigned int	ari_enabled:1;	/* ARI forwarding */
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
 	pci_dev_flags_t dev_flags;
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 450684f7eaac..eb6686b88f9a 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -419,6 +419,10 @@
 #define  PCI_EXP_RTCTL_CRSSVE	0x10	/* CRS Software Visibility Enable */
 #define PCI_EXP_RTCAP		30	/* Root Capabilities */
 #define PCI_EXP_RTSTA		32	/* Root Status */
+#define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
+#define  PCI_EXP_DEVCAP2_ARI	0x20	/* Alternative Routing-ID */
+#define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
+#define  PCI_EXP_DEVCTL2_ARI	0x20	/* Alternative Routing-ID */
 
 /* Extended Capabilities (PCI-X 2.0 and Express) */
 #define PCI_EXT_CAP_ID(header)		(header & 0x0000ffff)
@@ -429,6 +433,7 @@
 #define PCI_EXT_CAP_ID_VC	2
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
+#define PCI_EXT_CAP_ID_ARI	14
 
 /* Advanced Error Reporting */
 #define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
@@ -536,5 +541,14 @@
 #define HT_CAPTYPE_GEN3		0xD0	/* Generation 3 hypertransport configuration */
 #define HT_CAPTYPE_PM		0xE0	/* Hypertransport powermanagement configuration */
 
+/* Alternative Routing-ID Interpretation */
+#define PCI_ARI_CAP		0x04	/* ARI Capability Register */
+#define  PCI_ARI_CAP_MFVC	0x0001	/* MFVC Function Groups Capability */
+#define  PCI_ARI_CAP_ACS	0x0002	/* ACS Function Groups Capability */
+#define  PCI_ARI_CAP_NFN(x)	(((x) >> 8) & 0xff) /* Next Function Number */
+#define PCI_ARI_CTRL		0x06	/* ARI Control Register */
+#define  PCI_ARI_CTRL_MFVC	0x0001	/* MFVC Function Groups Enable */
+#define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
+#define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From aa42d7c6138afdc54f74e971456a0fbfec16b77b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sun, 28 Sep 2008 16:36:11 -0700
Subject: PCI: introduce an pci_ioremap(pdev, barnr) function

A common thing in many PCI drivers is to ioremap() an entire bar.  This
is a slightly fragile thing right now, needing both an address and a
size, and many driver writers do.. various things there.

This patch introduces a pci_ioremap_bar() function that takes just a PCI
device struct and the BAR number as arguments, and figures out the
address and size itself, in one place.  In addition, we can add various
sanity checks to this function (the patch already checks to make sure
that the BAR in question really is a MEM BAR; few to no drivers do that
sort of thing).

Hopefully with this type of API we get less chance of mistakes in
drivers with ioremap() operations.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7e9a1f0715e6..46ad282ffe4d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1119,5 +1119,18 @@ static inline void pci_mmcfg_early_init(void) { }
 static inline void pci_mmcfg_late_init(void) { }
 #endif
 
+static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
+{
+	/*
+	 * Make sure the BAR is actually a memory resource, not an IO resource
+	 */
+	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	return ioremap_nocache(pci_resource_start(pdev, bar),
+				     pci_resource_len(pdev, bar));
+}
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 0927678f55c9a50c296f7e6dae85e87b8236e155 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Sat, 18 Oct 2008 17:33:19 -0700
Subject: PCI: use pci_find_ext_capability everywhere

Remove some open coded (and buggy) versions of pci_find_ext_capability
in favor of the real routine in the PCI core.

Tested-by: Tomasz Czernecki <czernecki@gmail.com>
Acked-by: Andrew Vasquez <andrew.vasquez@qlogic.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pcie/aer/aerdrv.c      |  6 ++---
 drivers/pci/pcie/aer/aerdrv_core.c | 47 ++++++++------------------------------
 drivers/pci/pcie/portdrv_core.c    | 23 ++++---------------
 drivers/scsi/qla2xxx/qla_os.c      |  5 ++--
 include/linux/aer.h                |  4 ----
 5 files changed, 20 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 77036f46acfe..e390707661dd 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -105,7 +105,7 @@ static irqreturn_t aer_irq(int irq, void *context)
 	unsigned long flags;
 	int pos;
 
-	pos = pci_find_aer_capability(pdev->port);
+	pos = pci_find_ext_capability(pdev->port, PCI_EXT_CAP_ID_ERR);
 	/*
 	 * Must lock access to Root Error Status Reg, Root Error ID Reg,
 	 * and Root error producer/consumer index
@@ -252,7 +252,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
 	u32 status;
 	int pos;
 
-	pos = pci_find_aer_capability(dev);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 
 	/* Disable Root's interrupt in response to error messages */
 	pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, 0);
@@ -316,7 +316,7 @@ static void aer_error_resume(struct pci_dev *dev)
 	pci_write_config_word(dev, pos + PCI_EXP_DEVSTA, reg16);
 
 	/* Clean AER Root Error Status */
-	pos = pci_find_aer_capability(dev);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
 	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
 	if (dev->error_state == pci_channel_io_normal)
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index ee5e7b5176d0..1ff21f6045d6 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -28,36 +28,6 @@
 static int forceload;
 module_param(forceload, bool, 0);
 
-#define PCI_CFG_SPACE_SIZE	(0x100)
-int pci_find_aer_capability(struct pci_dev *dev)
-{
-	int pos;
-	u32 reg32 = 0;
-
-	/* Check if it's a pci-express device */
-	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
-	if (!pos)
-		return 0;
-
-	/* Check if it supports pci-express AER */
-	pos = PCI_CFG_SPACE_SIZE;
-	while (pos) {
-		if (pci_read_config_dword(dev, pos, &reg32))
-			return 0;
-
-		/* some broken boards return ~0 */
-		if (reg32 == 0xffffffff)
-			return 0;
-
-		if (PCI_EXT_CAP_ID(reg32) == PCI_EXT_CAP_ID_ERR)
-			break;
-
-		pos = reg32 >> 20;
-	}
-
-	return pos;
-}
-
 int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
 	u16 reg16 = 0;
@@ -67,6 +37,10 @@ int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 	if (!pos)
 		return -EIO;
 
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	if (!pos)
+		return -EIO;
+
 	pci_read_config_word(dev, pos+PCI_EXP_DEVCTL, &reg16);
 	reg16 = reg16 |
 		PCI_EXP_DEVCTL_CERE |
@@ -102,7 +76,7 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 	int pos;
 	u32 status, mask;
 
-	pos = pci_find_aer_capability(dev);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 	if (!pos)
 		return -EIO;
 
@@ -123,7 +97,7 @@ int pci_cleanup_aer_correct_error_status(struct pci_dev *dev)
 	int pos;
 	u32 status;
 
-	pos = pci_find_aer_capability(dev);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 	if (!pos)
 		return -EIO;
 
@@ -502,7 +476,7 @@ static void handle_error_source(struct pcie_device * aerdev,
 		 * Correctable error does not need software intevention.
 		 * No need to go through error recovery process.
 		 */
-		pos = pci_find_aer_capability(dev);
+		pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 		if (pos)
 			pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
 					info.status);
@@ -542,7 +516,7 @@ void aer_enable_rootport(struct aer_rpc *rpc)
 	reg16 &= ~(SYSTEM_ERROR_INTR_ON_MESG_MASK);
 	pci_write_config_word(pdev, pos + PCI_EXP_RTCTL, reg16);
 
-	aer_pos = pci_find_aer_capability(pdev);
+	aer_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
 	/* Clear error status */
 	pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, &reg32);
 	pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32);
@@ -579,7 +553,7 @@ static void disable_root_aer(struct aer_rpc *rpc)
 	u32 reg32;
 	int pos;
 
-	pos = pci_find_aer_capability(pdev);
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
 	/* Disable Root's interrupt in response to error messages */
 	pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, 0);
 
@@ -618,7 +592,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
 {
 	int pos;
 
-	pos = pci_find_aer_capability(dev);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 
 	/* The device might not support AER */
 	if (!pos)
@@ -755,7 +729,6 @@ int aer_init(struct pcie_device *dev)
 	return AER_SUCCESS;
 }
 
-EXPORT_SYMBOL_GPL(pci_find_aer_capability);
 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting);
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 890f0d2b370a..2e091e014829 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -195,24 +195,11 @@ static int get_port_device_capability(struct pci_dev *dev)
 	/* PME Capable - root port capability */
 	if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT)
 		services |= PCIE_PORT_SERVICE_PME;
-	
-	pos = PCI_CFG_SPACE_SIZE;
-	while (pos) {
-		pci_read_config_dword(dev, pos, &reg32);
-		switch (reg32 & 0xffff) {
-		case PCI_EXT_CAP_ID_ERR:
-			services |= PCIE_PORT_SERVICE_AER;
-			pos = reg32 >> 20;
-			break;
-		case PCI_EXT_CAP_ID_VC:
-			services |= PCIE_PORT_SERVICE_VC;
-			pos = reg32 >> 20;
-			break;
-		default:
-			pos = 0;
-			break;
-		}
-	}
+
+	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR))
+		services |= PCIE_PORT_SERVICE_AER;
+	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC))
+		services |= PCIE_PORT_SERVICE_VC;
 
 	return services;
 }
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 2aed4721c0d0..21dd182ad512 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1566,9 +1566,8 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 			goto probe_out;
 	}
 
-	if (pci_find_aer_capability(pdev))
-		if (pci_enable_pcie_error_reporting(pdev))
-			goto probe_out;
+	/* This may fail but that's ok */
+	pci_enable_pcie_error_reporting(pdev);
 
 	host = scsi_host_alloc(sht, sizeof(scsi_qla_host_t));
 	if (host == NULL) {
diff --git a/include/linux/aer.h b/include/linux/aer.h
index f2518141de88..a2383a72356a 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -18,10 +18,6 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
-static inline int pci_find_aer_capability(struct pci_dev *dev)
-{
-	return 0;
-}
 static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 270c66be9b4a6f2be53ef3aec5dc8e7b07782ec9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Sun, 19 Oct 2008 20:35:20 +0800
Subject: PCI: fix AER capability check

The 'use pci_find_ext_capability everywhere' cleanup brought a new bug,
which makes the AER stop working.  Fix it by actually using find_ext_cap
instead of just find_cap.  Drop the unused config space size define while
we're at it.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/PCI/pcieaer-howto.txt | 11 +++--------
 drivers/pci/pcie/aer/aerdrv_core.c  |  4 ++--
 drivers/pci/pcie/portdrv.h          |  1 -
 include/linux/aer.h                 |  1 -
 4 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index 16c251230c82..ddeb14beacc8 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -203,22 +203,17 @@ to mmio_enabled.
 
 3.3 helper functions
 
-3.3.1 int pci_find_aer_capability(struct pci_dev *dev);
-pci_find_aer_capability locates the PCI Express AER capability
-in the device configuration space. If the device doesn't support
-PCI-Express AER, the function returns 0.
-
-3.3.2 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
+3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 pci_enable_pcie_error_reporting enables the device to send error
 messages to root port when an error is detected. Note that devices
 don't enable the error reporting by default, so device drivers need
 call this function to enable it.
 
-3.3.3 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
 pci_disable_pcie_error_reporting disables the device to send error
 messages to root port when an error is detected.
 
-3.3.4 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
+3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
 error status register.
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 1ff21f6045d6..dfc63d01f20a 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -33,11 +33,11 @@ int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 	u16 reg16 = 0;
 	int pos;
 
-	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 	if (!pos)
 		return -EIO;
 
-	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
 	if (!pos)
 		return -EIO;
 
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 3656e0349dd1..2529f3f2ea5a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -25,7 +25,6 @@
 #define PCIE_CAPABILITIES_REG		0x2
 #define PCIE_SLOT_CAPABILITIES_REG	0x14
 #define PCIE_PORT_DEVICE_MAXSERVICES	4
-#define PCI_CFG_SPACE_SIZE		256
 
 #define get_descriptor_id(type, service) (((type - 4) << 4) | service)
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index a2383a72356a..f7df1eefc107 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -10,7 +10,6 @@
 #if defined(CONFIG_PCIEAER)
 /* pci-e port driver needs this function to enable aer */
 extern int pci_enable_pcie_error_reporting(struct pci_dev *dev);
-extern int pci_find_aer_capability(struct pci_dev *dev);
 extern int pci_disable_pcie_error_reporting(struct pci_dev *dev);
 extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 #else
-- 
cgit v1.2.3


From 96499871f45b9126157b1a5c512d6e30f1635225 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 20 Oct 2008 19:45:43 +0200
Subject: PCI: fix pci_ioremap_bar() on s390

s390 doesn't have ioremap_*, so protect the definition of the new
pci_ioremap_bar function with CONFIG_HAS_IOMEM to avoid build breakage.

Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 46ad282ffe4d..085187be29c7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1119,6 +1119,7 @@ static inline void pci_mmcfg_early_init(void) { }
 static inline void pci_mmcfg_late_init(void) { }
 #endif
 
+#ifdef CONFIG_HAS_IOMEM
 static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
 {
 	/*
@@ -1131,6 +1132,7 @@ static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
 	return ioremap_nocache(pci_resource_start(pdev, bar),
 				     pci_resource_len(pdev, bar));
 }
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 326bb8a5a12c6298a6bf6c74af490b1858b2f12c Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@freescale.com>
Date: Mon, 13 Oct 2008 10:13:01 +0100
Subject: leds: Make default trigger fields const

The default_trigger fields of struct gpio_led and thus struct
led_classdev are pretty much always assigned from a string literal,
which means the string can't be modified.  Which is fine, since there is
no reason to modify the string and in fact it never is.

But they should be marked const to prevent such code from being added,
to prevent warnings if -Wwrite-strings is used, when assigned from a
constant string other than a string literal (which produces a warning
under current kernel compiler flags), and for general good coding
practices.

Signed-off-by: Trent Piepho <tpiepho@freescale.com>
Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
---
 include/linux/leds.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index d41ccb56146a..d3a73f5a48c3 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -123,7 +123,7 @@ extern void ledtrig_ide_activity(void);
  */
 struct led_info {
 	const char	*name;
-	char		*default_trigger;
+	const char	*default_trigger;
 	int		flags;
 };
 
@@ -135,7 +135,7 @@ struct led_platform_data {
 /* For the leds-gpio driver */
 struct gpio_led {
 	const char *name;
-	char *default_trigger;
+	const char *default_trigger;
 	unsigned 	gpio;
 	u8 		active_low;
 };
-- 
cgit v1.2.3


From b62b75905d571c29262a6c38cf9e5f089c203871 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 21 Oct 2008 13:25:21 +1100
Subject: md: use sysfs_notify_dirent to notify changes to md/array_state

Now that we have sysfs_notify_dirent, use it to notify changes
to md/array_state.
As sysfs_notify_dirent can be called in atomic context, we can
remove the delayed notify and the MD_NOTIFY_ARRAY_STATE flag.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 30 ++++++++++++++++--------------
 include/linux/raid/md_k.h |  5 ++++-
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aaa3d465de4e..feea72dc4b69 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -222,6 +222,9 @@ static void mddev_put(mddev_t *mddev)
 		list_del(&mddev->all_mddevs);
 		spin_unlock(&all_mddevs_lock);
 		blk_cleanup_queue(mddev->queue);
+		if (mddev->sysfs_state)
+			sysfs_put(mddev->sysfs_state);
+		mddev->sysfs_state = NULL;
 		kobject_put(&mddev->kobj);
 	} else
 		spin_unlock(&all_mddevs_lock);
@@ -2770,7 +2773,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	else {
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
+		sysfs_notify_dirent(mddev->sysfs_state);
 		return len;
 	}
 }
@@ -3465,8 +3468,10 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	if (error)
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
 		       disk->disk_name);
-	else
+	else {
 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
+		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
+	}
 	return NULL;
 }
 
@@ -3477,7 +3482,7 @@ static void md_safemode_timeout(unsigned long data)
 	if (!atomic_read(&mddev->writes_pending)) {
 		mddev->safemode = 1;
 		if (mddev->external)
-			set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags);
+			sysfs_notify_dirent(mddev->sysfs_state);
 	}
 	md_wakeup_thread(mddev->thread);
 }
@@ -3740,7 +3745,7 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->changed = 1;
 	md_new_event(mddev);
-	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify_dirent(mddev->sysfs_state);
 	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
@@ -3767,7 +3772,7 @@ static int restart_array(mddev_t *mddev)
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread);
-	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify_dirent(mddev->sysfs_state);
 	return 0;
 }
 
@@ -3847,7 +3852,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			module_put(mddev->pers->owner);
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
-			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			sysfs_notify_dirent(mddev->sysfs_state);
 
 			set_capacity(disk, 0);
 			mddev->changed = 1;
@@ -3933,7 +3938,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			mdname(mddev));
 	err = 0;
 	md_new_event(mddev);
-	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify_dirent(mddev->sysfs_state);
 out:
 	return err;
 }
@@ -4938,7 +4943,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			sysfs_notify_dirent(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			md_wakeup_thread(mddev->thread);
 		} else {
@@ -5612,7 +5617,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 		spin_unlock_irq(&mddev->write_lock);
 	}
 	if (did_change)
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
+		sysfs_notify_dirent(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
 		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -5655,7 +5660,7 @@ int md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
+		sysfs_notify_dirent(mddev->sysfs_state);
 	} else
 		spin_unlock_irq(&mddev->write_lock);
 
@@ -6048,9 +6053,6 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev->bitmap)
 		bitmap_daemon_work(mddev->bitmap);
 
-	if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags))
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
-
 	if (mddev->ro)
 		return;
 
@@ -6103,7 +6105,7 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
 			if (did_change)
-				sysfs_notify(&mddev->kobj, NULL, "array_state");
+				sysfs_notify_dirent(mddev->sysfs_state);
 		}
 
 		if (mddev->flags)
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index c200b9a34aff..b16ad867e944 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -128,7 +128,6 @@ struct mddev_s
 #define MD_CHANGE_DEVS	0	/* Some device status has changed */
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* superblock update in progress */
-#define MD_NOTIFY_ARRAY_STATE 3	/* atomic context wants to notify userspace */
 
 	int				ro;
 
@@ -239,6 +238,10 @@ struct mddev_s
 	sector_t			resync_max;	/* resync should pause
 							 * when it gets here */
 
+	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
+							 * file in sysfs.
+							 */
+
 	spinlock_t			write_lock;
 	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
 	atomic_t			pending_writes;	/* number of active superblock writes */
-- 
cgit v1.2.3


From 3c0ee63a64a20351ed6c16ec797e1f8c850741ea Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 21 Oct 2008 13:25:28 +1100
Subject: md: use sysfs_notify_dirent to notify changes to md/dev-xxx/state

The 'state' file for a device reports, for example, when the device
has failed.  Changes should be reported to userspace ASAP without
the possibility of blocking on low-memory.  sysfs_notify does
have that possibility (as it takes a mutex which can be held
across a kmalloc) so use sysfs_notify_dirent instead.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 21 ++++++++++++---------
 include/linux/raid/md_k.h |  3 +++
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index feea72dc4b69..8b303477c77b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1462,6 +1462,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		kobject_del(&rdev->kobj);
 		goto fail;
 	}
+	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
+
 	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 	return 0;
@@ -1491,7 +1493,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
-
+	sysfs_put(rdev->sysfs_state);
+	rdev->sysfs_state = NULL;
 	/* We need to delay this, otherwise we can deadlock when
 	 * writing to 'remove' to "dev/state".  We also need
 	 * to delay it due to rcu usage.
@@ -1926,8 +1929,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
-	if (!err)
-		sysfs_notify(&rdev->kobj, NULL, "state");
+	if (!err && rdev->sysfs_state)
+		sysfs_notify_dirent(rdev->sysfs_state);
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -2022,7 +2025,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			rdev->raid_disk = -1;
 			return err;
 		} else
-			sysfs_notify(&rdev->kobj, NULL, "state");
+			sysfs_notify_dirent(rdev->sysfs_state);
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
 			printk(KERN_WARNING
@@ -2039,7 +2042,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
-		sysfs_notify(&rdev->kobj, NULL, "state");
+		sysfs_notify_dirent(rdev->sysfs_state);
 	}
 	return len;
 }
@@ -3583,7 +3586,7 @@ static int do_md_run(mddev_t * mddev)
 				return -EINVAL;
 			}
 		}
-		sysfs_notify(&rdev->kobj, NULL, "state");
+		sysfs_notify_dirent(rdev->sysfs_state);
 	}
 
 	md_probe(mddev->unit, NULL, NULL);
@@ -4302,7 +4305,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		if (err)
 			export_rdev(rdev);
 		else
-			sysfs_notify(&rdev->kobj, NULL, "state");
+			sysfs_notify_dirent(rdev->sysfs_state);
 
 		md_update_sb(mddev, 1);
 		if (mddev->degraded)
@@ -6113,7 +6116,7 @@ void md_check_recovery(mddev_t *mddev)
 
 		rdev_for_each(rdev, rtmp, mddev)
 			if (test_and_clear_bit(StateChanged, &rdev->flags))
-				sysfs_notify(&rdev->kobj, NULL, "state");
+				sysfs_notify_dirent(rdev->sysfs_state);
 
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -6223,7 +6226,7 @@ void md_check_recovery(mddev_t *mddev)
 
 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
-	sysfs_notify(&rdev->kobj, NULL, "state");
+	sysfs_notify_dirent(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
 			   !test_bit(Blocked, &rdev->flags),
 			   msecs_to_jiffies(5000));
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index b16ad867e944..8fc909ef6787 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -115,6 +115,9 @@ struct mdk_rdev_s
 					   * in superblock.
 					   */
 	struct work_struct del_work;	/* used for delayed sysfs removal */
+
+	struct sysfs_dirent *sysfs_state; /* handle for 'state'
+					   * sysfs entry */
 };
 
 struct mddev_s
-- 
cgit v1.2.3


From a5598ca0d49821912a5053c05f07fd650671eb6d Mon Sep 17 00:00:00 2001
From: Carl Love <cel@us.ibm.com>
Date: Tue, 14 Oct 2008 23:37:01 +0000
Subject: powerpc/oprofile: Fix mutex locking for cell spu-oprofile

The issue is the SPU code is not holding the kernel mutex lock while
adding samples to the kernel buffer.

This patch creates per SPU buffers to hold the data.  Data
is added to the buffers from interrupt context.  The data
is periodically pushed to the kernel buffer via a new Oprofile
function oprofile_put_buff(). The oprofile_put_buff() function
is called via a work queue enabling the function to acquire the
mutex lock.

The existing user controls for adjusting the per CPU buffer
size are used to control the size of the per SPU buffers.
Similarly, overflows of the SPU buffers are reported by
incrementing the per CPU buffer stats.  This eliminates the
need to have architecture specific controls for the per SPU
buffers which is not acceptable to the OProfile user tool
maintainer.

The export of the oprofile add_event_entry() is removed as it
is no longer needed given this patch.

Note, this patch has not addressed the issue of indexing arrays
by the spu number.  This still needs to be fixed as the spu
numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/oprofile/cell/pr_util.h       |  13 ++
 arch/powerpc/oprofile/cell/spu_profiler.c  |   4 +-
 arch/powerpc/oprofile/cell/spu_task_sync.c | 236 +++++++++++++++++++++++++----
 drivers/oprofile/buffer_sync.c             |  24 +++
 drivers/oprofile/cpu_buffer.c              |  15 +-
 drivers/oprofile/event_buffer.h            |   7 +
 include/linux/oprofile.h                   |  16 +-
 7 files changed, 279 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
index 22e4e8d4eb2c..628009c01958 100644
--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE   8
+#define DEFAULT_TIMER_EXPIRE  (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info {	/* map of sections within an SPU overlay */
 	unsigned int vma;	/* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map {	/* map of sections within an SPU program */
 
 };
 
+struct spu_buffer {
+	int last_guard_val;
+	int ctx_sw_seen;
+	unsigned long *buff;
+	unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 380d7e217531..6edaebd5099a 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -23,12 +23,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY   4
-#define SPUS_PER_NODE	     8
 
 #define SPU_PC_MASK	     0xFFFF
 
@@ -208,6 +207,7 @@ int start_spu_profiling(unsigned int cycles_reset)
 
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
 	return 0;
 }
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2a9b4a049329..2949126d28d1 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  Add entries to the
+	 * head.  Head is the index to store the next value.
+	 * The buffer is full when there is one available entry
+	 * in the queue, i.e. head and tail can't be equal.
+	 * That way we can tell the difference between the
+	 * buffer being full versus empty.
+	 *
+	 *  ASSUMPTION: the buffer_lock is held when this function
+	 *             is called to lock the buffer, head and tail.
+	 */
+	int full = 1;
+
+	if (spu_buff[spu].head >= spu_buff[spu].tail) {
+		if ((spu_buff[spu].head - spu_buff[spu].tail)
+		    <  (max_spu_buff - 1))
+			full = 0;
+
+	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
+		if ((spu_buff[spu].tail - spu_buff[spu].head)
+		    > 1)
+			full = 0;
+	}
+
+	if (!full) {
+		spu_buff[spu].buff[spu_buff[spu].head] = value;
+		spu_buff[spu].head++;
+
+		if (spu_buff[spu].head >= max_spu_buff)
+			spu_buff[spu].head = 0;
+	} else {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+	}
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Hold the lock to make sure the head/tail
+		 * doesn't change while spu_buff_add() is
+		 * deciding if the buffer is full or not.
+		 * Being a little paranoid.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the current contents to the kernel buffer.
+		 * data can still be added to the head of the buffer.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* move data from spu buffers to kernel buffer */
+	sync_spu_buff();
+
+	/* only reschedule if profiling is not done */
+	if (spu_prof_running)
+		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct spu *spu, unsigned long objectId)
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
+	spu_buff_add(ESCAPE_CODE, spu->number);
+	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+	spu_buff_add(spu->number, spu->number);
+	spu_buff_add(spu->pid, spu->number);
+	spu_buff_add(spu->tgid, spu->number);
+	spu_buff_add(app_dcookie, spu->number);
+	spu_buff_add(spu_cookie, spu->number);
+	spu_buff_add(offset, spu->number);
+
+	/* Set flag to indicate SPU PC data can now be written out.  If
+	 * the SPU program counter data is seen before an SPU context
+	 * record is seen, the postprocessing will fail.
+	 */
+	spu_buff[spu->number].ctx_sw_seen = 1;
+
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();	/* insure spu event buffer updates are written */
 			/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
         return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.
+		 */
+
+		spu_buff[spu].buff = kzalloc((max_spu_buff
+					      * sizeof(unsigned long)),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d:  oprofile_spu_buff_create "
+		       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = 0;
+				spu--;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-	int k;
+	int spu;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
 	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
+	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+	/* create buffer for storing the SPU data to put in
+	 * the kernel buffer.
+	 */
+	ret = oprofile_spu_buff_create();
+	if (ret)
+		goto out;
 
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff_add(ESCAPE_CODE, spu);
+		spu_buff_add(SPU_PROFILING_CODE, spu);
+		spu_buff_add(num_spu_nodes, spu);
+	}
 	spin_unlock_irqrestore(&buffer_lock, flags);
 
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff[spu].ctx_sw_seen = 0;
+		spu_buff[spu].last_guard_val = 0;
+	}
+
 	/* Register for SPU events  */
 	register_ret = spu_switch_event_register(&spu_active);
 	if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
 		goto out;
 	}
 
-	for (k = 0; k < (MAX_NUMNODES * 8); k++)
-		last_guard_val[k] = 0;
 	pr_debug("spu_sync_start -- running.\n");
 out:
 	return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsigned int *samples,
 		 * use.	 We need to discard samples taken during the time
 		 * period which an overlay occurs (i.e., guard value changes).
 		 */
-		if (grd_val && grd_val != last_guard_val[spu_num]) {
-			last_guard_val[spu_num] = grd_val;
+		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+			spu_buff[spu_num].last_guard_val = grd_val;
 			/* Drop the rest of the samples. */
 			break;
 		}
 
-		add_event_entry(file_offset | spu_num_shifted);
+		/* We must ensure that the SPU context switch has been written
+		 * out before samples for the SPU.  Otherwise, the SPU context
+		 * information is not available and the postprocessing of the
+		 * SPU PC will fail with no available anonymous map information.
+		 */
+		if (spu_buff[spu_num].ctx_sw_seen)
+			spu_buff_add((file_offset | spu_num_shifted),
+					 spu_num);
 	}
 	spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
-	int ret = spu_switch_event_unregister(&spu_active);
-	if (ret) {
+	int ret;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
 		printk(KERN_ERR "SPU_PROF: "
-			"%s, line %d: spu_switch_event_unregister returned %d\n",
-			__func__, __LINE__, ret);
-		goto out;
-	}
+		       "%s, line %d: spu_switch_event_unregister "	\
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* flush any remaining data in the per SPU buffers */
+	sync_spu_buff();
 
 	spin_lock_irqsave(&cache_lock, flags);
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* remove scheduled work queue item rather than waiting
+	 * for every queued entry to execute.  Then flush pending
+	 * system wide buffer to event buffer.
+	 */
+	cancel_delayed_work(&spu_work);
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/*
+		 * spu_buff[k].buff will be NULL if there was a problem
+		 * allocating the buffer.  kfree(NULL) is a no-op.
+		 */
+		kfree(spu_buff[k].buff);
+		spu_buff[k].buff = 0;
+	}
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }
 
-
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index ed982273fb8b..37681700b61a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -628,3 +628,27 @@ void sync_buffer(int cpu)
 
 	mutex_unlock(&buffer_mutex);
 }
+
+/* The function can be used to add a buffer worth of data directly to
+ * the kernel buffer. The buffer is assumed to be a circular buffer.
+ * Take the entries from index start up to, but not including, index
+ * stop, wrapping to index 0 at max.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+		       unsigned int stop, unsigned int max)
+{
+	int i;
+
+	i = start;
+
+	mutex_lock(&buffer_mutex);
+	while (i != stop) {
+		add_event_entry(buf[i++]);
+
+		if (i >= max)
+			i = 0;
+	}
+
+	mutex_unlock(&buffer_mutex);
+}
+
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index e1bd5a937f6c..7ba39fe20a8a 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -38,13 +38,26 @@ static int work_enabled;
 void free_cpu_buffers(void)
 {
 	int i;
- 
+
 	for_each_online_cpu(i) {
 		vfree(per_cpu(cpu_buffer, i).buffer);
 		per_cpu(cpu_buffer, i).buffer = NULL;
 	}
 }
 
+unsigned long oprofile_get_cpu_buffer_size(void)
+{
+	return fs_cpu_buffer_size;
+}
+
+void oprofile_cpu_buffer_inc_smpl_lost(void)
+{
+	struct oprofile_cpu_buffer *cpu_buf
+		= &__get_cpu_var(cpu_buffer);
+
+	cpu_buf->sample_lost_overflow++;
+}
+
 int alloc_cpu_buffers(void)
 {
 	int i;
diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h
index 5076ed1ebd8f..84bf324c5771 100644
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -17,6 +17,13 @@ int alloc_event_buffer(void);
 
 void free_event_buffer(void);
  
+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
 
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index bcb8f725427c..5231861f357d 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -85,13 +85,6 @@ int oprofile_arch_init(struct oprofile_operations * ops);
  */
 void oprofile_arch_exit(void);
 
-/**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
 /**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
@@ -162,5 +155,14 @@ int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, siz
 
 /** lock for read/write safety */
 extern spinlock_t oprofilefs_lock;
+
+/**
+ * Add the contents of a circular buffer to the event buffer.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+			unsigned int stop, unsigned int max);
+
+unsigned long oprofile_get_cpu_buffer_size(void);
+void oprofile_cpu_buffer_inc_smpl_lost(void);
  
 #endif /* OPROFILE_H */
-- 
cgit v1.2.3


From aeb5d727062a0238a2f96c9c380fbd2be4640c6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 2 Sep 2008 15:28:45 -0400
Subject: [PATCH] introduce fmode_t, do annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bsg.c                         |  7 ++++---
 block/cmd-filter.c                  |  2 +-
 block/scsi_ioctl.c                  |  5 +++--
 drivers/block/amiflop.c             |  4 ++--
 drivers/block/ataflop.c             |  4 ++--
 drivers/block/floppy.c              |  4 ++--
 drivers/block/paride/pf.c           |  2 +-
 drivers/block/paride/pt.c           |  2 +-
 drivers/block/pktcdvd.c             |  2 +-
 drivers/block/swim3.c               |  4 ++--
 drivers/char/nvram.c                |  6 +++---
 drivers/ide/ide-floppy_ioctl.c      |  2 +-
 drivers/ide/ide-gd.c                |  2 +-
 drivers/md/dm-ioctl.c               |  4 ++--
 drivers/md/dm-table.c               | 12 ++++++------
 drivers/mtd/mtdchar.c               | 10 +++++-----
 drivers/parisc/eisa_eeprom.c        |  2 +-
 fs/block_dev.c                      | 10 +++++-----
 fs/fifo.c                           |  6 +++---
 fs/file_table.c                     |  4 ++--
 fs/hostfs/hostfs_kern.c             |  5 +++--
 fs/locks.c                          |  3 ++-
 fs/open.c                           |  2 +-
 fs/proc/base.c                      |  4 ++--
 fs/reiserfs/journal.c               |  2 +-
 include/linux/blkdev.h              |  3 ++-
 include/linux/device-mapper.h       |  8 ++++----
 include/linux/file.h                |  4 ++--
 include/linux/fs.h                  | 16 ++++++++--------
 include/linux/fsnotify.h            |  2 +-
 include/linux/types.h               |  1 +
 ipc/shm.c                           |  2 +-
 sound/core/oss/pcm_oss.c            |  2 +-
 sound/oss/au1550_ac97.c             |  2 +-
 sound/oss/dmasound/dmasound.h       |  4 ++--
 sound/oss/dmasound/dmasound_atari.c |  4 ++--
 sound/oss/dmasound/dmasound_core.c  | 10 +++++-----
 sound/oss/msnd.h                    |  2 +-
 sound/oss/sound_config.h            | 20 ++++++--------------
 sound/oss/swarm_cs4297a.c           |  2 +-
 sound/oss/vwsnd.c                   |  2 +-
 41 files changed, 96 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 034112bfe1f3..2d36b127f384 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -173,7 +173,7 @@ unlock:
 
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
-				int has_write_perm)
+				fmode_t has_write_perm)
 {
 	if (hdr->request_len > BLK_MAX_CDB) {
 		rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
@@ -242,7 +242,7 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw)
  * map sg_io_v4 to a request.
  */
 static struct request *
-bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
+bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 {
 	struct request_queue *q = bd->queue;
 	struct request *rq, *next_rq = NULL;
@@ -601,7 +601,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 }
 
 static int __bsg_write(struct bsg_device *bd, const char __user *buf,
-		       size_t count, ssize_t *bytes_written, int has_write_perm)
+		       size_t count, ssize_t *bytes_written,
+		       fmode_t has_write_perm)
 {
 	struct bsg_command *bc;
 	struct request *rq;
diff --git a/block/cmd-filter.c b/block/cmd-filter.c
index e669aed4c6bc..504b275e1b90 100644
--- a/block/cmd-filter.c
+++ b/block/cmd-filter.c
@@ -27,7 +27,7 @@
 #include <linux/cdrom.h>
 
 int blk_verify_command(struct blk_cmd_filter *filter,
-		       unsigned char *cmd, int has_write_perm)
+		       unsigned char *cmd, fmode_t has_write_perm)
 {
 	/* root can do any command. */
 	if (capable(CAP_SYS_RAWIO))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index c34272a348fe..c525905f9d35 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -384,7 +384,8 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 		  struct gendisk *disk, struct scsi_ioctl_command __user *sic)
 {
 	struct request *rq;
-	int err, write_perm = 0;
+	int err;
+	fmode_t write_perm = 0;
 	unsigned int in_len, out_len, bytes, opcode, cmdlen;
 	char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
 
@@ -428,7 +429,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 
 	/* scsi_ioctl passes NULL */
 	if (file && (file->f_mode & FMODE_WRITE))
-		write_perm = 1;
+		write_perm = FMODE_WRITE;
 
 	err = blk_verify_command(&q->cmd_filter, rq->cmd, write_perm);
 	if (err)
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 7516baff3bb9..d19c5a939fe8 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1560,9 +1560,9 @@ static int floppy_open(struct inode *inode, struct file *filp)
 	if (fd_ref[drive] && old_dev != system)
 		return -EBUSY;
 
-	if (filp && filp->f_mode & 3) {
+	if (filp && filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
 		check_disk_change(inode->i_bdev);
-		if (filp->f_mode & 2 ) {
+		if (filp->f_mode & FMODE_WRITE ) {
 			int wrprot;
 
 			get_fdc(drive);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 432cf4018291..e1db285b72cd 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1826,9 +1826,9 @@ static int floppy_open( struct inode *inode, struct file *filp )
 	if (filp->f_flags & O_NDELAY)
 		return 0;
 
-	if (filp->f_mode & 3) {
+	if (filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
 		check_disk_change(inode->i_bdev);
-		if (filp->f_mode & 2) {
+		if (filp->f_mode & FMODE_WRITE) {
 			if (p->wpstat) {
 				if (p->ref < 0)
 					p->ref = 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 2cea27aba9a0..ae3ef8945f3f 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3761,14 +3761,14 @@ static int floppy_open(struct inode *inode, struct file *filp)
 		UFDCS->rawcmd = 2;
 
 	if (!(filp->f_flags & O_NDELAY)) {
-		if (filp->f_mode & 3) {
+		if (filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
 			UDRS->last_checked = 0;
 			check_disk_change(inode->i_bdev);
 			if (UTESTF(FD_DISK_CHANGED))
 				goto out;
 		}
 		res = -EROFS;
-		if ((filp->f_mode & 2) && !(UTESTF(FD_DISK_WRITABLE)))
+		if ((filp->f_mode & FMODE_WRITE) && !(UTESTF(FD_DISK_WRITABLE)))
 			goto out;
 	}
 	mutex_unlock(&open_lock);
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e7fe6ca97dd8..a902d84fd330 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -305,7 +305,7 @@ static int pf_open(struct inode *inode, struct file *file)
 	if (pf->media_status == PF_NM)
 		return -ENODEV;
 
-	if ((pf->media_status == PF_RO) && (file->f_mode & 2))
+	if ((pf->media_status == PF_RO) && (file->f_mode & FMODE_WRITE))
 		return -EROFS;
 
 	pf->access++;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 5ae229656eaa..1e4006e18f03 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -667,7 +667,7 @@ static int pt_open(struct inode *inode, struct file *file)
 		goto out;
 
 	err = -EROFS;
-	if ((!(tape->flags & PT_WRITE_OK)) && (file->f_mode & 2))
+	if ((!(tape->flags & PT_WRITE_OK)) && (file->f_mode & FMODE_WRITE))
 		goto out;
 
 	if (!(iminor(inode) & 128))
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 195ca7c720f5..4d581e8ba9f6 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2320,7 +2320,7 @@ static int pkt_open_write(struct pktcdvd_device *pd)
 /*
  * called at open time.
  */
-static int pkt_open_dev(struct pktcdvd_device *pd, int write)
+static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 {
 	int ret;
 	long lba;
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 730ccea78e45..a53ca54bee12 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -908,13 +908,13 @@ static int floppy_open(struct inode *inode, struct file *filp)
 		return -EBUSY;
 
 	if (err == 0 && (filp->f_flags & O_NDELAY) == 0
-	    && (filp->f_mode & 3)) {
+	    && (filp->f_mode & (FMODE_READ|FMODE_WRITE))) {
 		check_disk_change(inode->i_bdev);
 		if (fs->ejected)
 			err = -ENXIO;
 	}
 
-	if (err == 0 && (filp->f_mode & 2)) {
+	if (err == 0 && (filp->f_mode & FMODE_WRITE)) {
 		if (fs->write_prot < 0)
 			fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 		if (fs->write_prot)
diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c
index 39f6357e3b5d..8054ee839b3c 100644
--- a/drivers/char/nvram.c
+++ b/drivers/char/nvram.c
@@ -338,7 +338,7 @@ nvram_open(struct inode *inode, struct file *file)
 
 	if ((nvram_open_cnt && (file->f_flags & O_EXCL)) ||
 	    (nvram_open_mode & NVRAM_EXCL) ||
-	    ((file->f_mode & 2) && (nvram_open_mode & NVRAM_WRITE))) {
+	    ((file->f_mode & FMODE_WRITE) && (nvram_open_mode & NVRAM_WRITE))) {
 		spin_unlock(&nvram_state_lock);
 		unlock_kernel();
 		return -EBUSY;
@@ -346,7 +346,7 @@ nvram_open(struct inode *inode, struct file *file)
 
 	if (file->f_flags & O_EXCL)
 		nvram_open_mode |= NVRAM_EXCL;
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		nvram_open_mode |= NVRAM_WRITE;
 	nvram_open_cnt++;
 
@@ -366,7 +366,7 @@ nvram_release(struct inode *inode, struct file *file)
 	/* if only one instance is open, clear the EXCL bit */
 	if (nvram_open_mode & NVRAM_EXCL)
 		nvram_open_mode &= ~NVRAM_EXCL;
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		nvram_open_mode &= ~NVRAM_WRITE;
 
 	spin_unlock(&nvram_state_lock);
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 409e4c15f9b7..0d5f5054ab64 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -250,7 +250,7 @@ static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
 	case IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY:
 		return ide_floppy_get_format_capacities(drive, argp);
 	case IDEFLOPPY_IOCTL_FORMAT_START:
-		if (!(file->f_mode & 2))
+		if (!(file->f_mode & FMODE_WRITE))
 			return -EPERM;
 		return ide_floppy_format_unit(drive, (int __user *)argp);
 	case IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS:
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index d44898f46c33..d367473098f9 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -202,7 +202,7 @@ static int ide_gd_open(struct inode *inode, struct file *filp)
 			goto out_put_idkp;
 		}
 
-		if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) {
+		if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & FMODE_WRITE)) {
 			ret = -EROFS;
 			goto out_put_idkp;
 		}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index dca401dc70a0..777c948180f9 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -988,9 +988,9 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
 	return r;
 }
 
-static inline int get_mode(struct dm_ioctl *param)
+static inline fmode_t get_mode(struct dm_ioctl *param)
 {
-	int mode = FMODE_READ | FMODE_WRITE;
+	fmode_t mode = FMODE_READ | FMODE_WRITE;
 
 	if (param->flags & DM_READONLY_FLAG)
 		mode = FMODE_READ;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a740a6950f59..7c8671b06fe3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -43,7 +43,7 @@ struct dm_table {
 	 * device.  This should be a combination of FMODE_READ
 	 * and FMODE_WRITE.
 	 */
-	int mode;
+	fmode_t mode;
 
 	/* a list of devices used by this table */
 	struct list_head devices;
@@ -217,7 +217,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
 	return 0;
 }
 
-int dm_table_create(struct dm_table **result, int mode,
+int dm_table_create(struct dm_table **result, fmode_t mode,
 		    unsigned num_targets, struct mapped_device *md)
 {
 	struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
@@ -395,7 +395,7 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start,
  * careful to leave things as they were if we fail to reopen the
  * device.
  */
-static int upgrade_mode(struct dm_dev_internal *dd, int new_mode,
+static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 			struct mapped_device *md)
 {
 	int r;
@@ -421,7 +421,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, int new_mode,
  */
 static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 			      const char *path, sector_t start, sector_t len,
-			      int mode, struct dm_dev **result)
+			      fmode_t mode, struct dm_dev **result)
 {
 	int r;
 	dev_t uninitialized_var(dev);
@@ -537,7 +537,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
-		  sector_t len, int mode, struct dm_dev **result)
+		  sector_t len, fmode_t mode, struct dm_dev **result)
 {
 	int r = __table_get_device(ti->table, ti, path,
 				   start, len, mode, result);
@@ -887,7 +887,7 @@ struct list_head *dm_table_get_devices(struct dm_table *t)
 	return &t->devices;
 }
 
-int dm_table_get_mode(struct dm_table *t)
+fmode_t dm_table_get_mode(struct dm_table *t)
 {
 	return t->mode;
 }
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 963840e9b5bf..bcffeda2df3d 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -96,7 +96,7 @@ static int mtd_open(struct inode *inode, struct file *file)
 		return -ENODEV;
 
 	/* You can't open the RO devices RW */
-	if ((file->f_mode & 2) && (minor & 1))
+	if ((file->f_mode & FMODE_WRITE) && (minor & 1))
 		return -EACCES;
 
 	lock_kernel();
@@ -114,7 +114,7 @@ static int mtd_open(struct inode *inode, struct file *file)
 	}
 
 	/* You can't open it RW if it's not a writeable device */
-	if ((file->f_mode & 2) && !(mtd->flags & MTD_WRITEABLE)) {
+	if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) {
 		put_mtd_device(mtd);
 		ret = -EACCES;
 		goto out;
@@ -144,7 +144,7 @@ static int mtd_close(struct inode *inode, struct file *file)
 	DEBUG(MTD_DEBUG_LEVEL0, "MTD_close\n");
 
 	/* Only sync if opened RW */
-	if ((file->f_mode & 2) && mtd->sync)
+	if ((file->f_mode & FMODE_WRITE) && mtd->sync)
 		mtd->sync(mtd);
 
 	put_mtd_device(mtd);
@@ -443,7 +443,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 	{
 		struct erase_info *erase;
 
-		if(!(file->f_mode & 2))
+		if(!(file->f_mode & FMODE_WRITE))
 			return -EPERM;
 
 		erase=kzalloc(sizeof(struct erase_info),GFP_KERNEL);
@@ -497,7 +497,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		struct mtd_oob_buf __user *user_buf = argp;
 	        uint32_t retlen;
 
-		if(!(file->f_mode & 2))
+		if(!(file->f_mode & FMODE_WRITE))
 			return -EPERM;
 
 		if (copy_from_user(&buf, argp, sizeof(struct mtd_oob_buf)))
diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c
index 5ac207932fd7..685d94e69d44 100644
--- a/drivers/parisc/eisa_eeprom.c
+++ b/drivers/parisc/eisa_eeprom.c
@@ -86,7 +86,7 @@ static int eisa_eeprom_open(struct inode *inode, struct file *file)
 {
 	cycle_kernel_lock();
 
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		return -EINVAL;
    
 	return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 218408eed1bb..8897f3b02e98 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -840,7 +840,7 @@ EXPORT_SYMBOL_GPL(bd_release_from_disk);
  * to be used for internal purposes.  If you ever need it - reconsider
  * your API.
  */
-struct block_device *open_by_devnum(dev_t dev, unsigned mode)
+struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
@@ -975,7 +975,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part);
 static int __blkdev_put(struct block_device *bdev, int for_part);
 
@@ -1104,7 +1104,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	return ret;
 }
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part)
 {
 	/*
@@ -1123,7 +1123,7 @@ static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
 	return do_open(bdev, &fake_file, for_part);
 }
 
-int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
+int blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags)
 {
 	return __blkdev_get(bdev, mode, flags, 0);
 }
@@ -1315,7 +1315,7 @@ EXPORT_SYMBOL(lookup_bdev);
 struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
 {
 	struct block_device *bdev;
-	mode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
 	bdev = lookup_bdev(path);
diff --git a/fs/fifo.c b/fs/fifo.c
index 987bf9411495..f8f97b8b6d44 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -51,7 +51,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
 
 	switch (filp->f_mode) {
-	case 1:
+	case FMODE_READ:
 	/*
 	 *  O_RDONLY
 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
@@ -76,7 +76,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 2:
+	case FMODE_WRITE:
 	/*
 	 *  O_WRONLY
 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
@@ -98,7 +98,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 3:
+	case FMODE_READ | FMODE_WRITE:
 	/*
 	 *  O_RDWR
 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
diff --git a/fs/file_table.c b/fs/file_table.c
index f45a4493f9e7..efc06faede6c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL(get_empty_filp);
  * code should be moved into this function.
  */
 struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
-		mode_t mode, const struct file_operations *fop)
+		fmode_t mode, const struct file_operations *fop)
 {
 	struct file *file;
 	struct path;
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(alloc_file);
  * of this should be moving to alloc_file().
  */
 int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
-	   mode_t mode, const struct file_operations *fop)
+	   fmode_t mode, const struct file_operations *fop)
 {
 	int error = 0;
 	file->f_path.dentry = dentry;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d6ecabf4d231..7f34f4385de0 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -20,7 +20,7 @@
 struct hostfs_inode_info {
 	char *host_filename;
 	int fd;
-	int mode;
+	fmode_t mode;
 	struct inode vfs_inode;
 };
 
@@ -373,7 +373,8 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 int hostfs_file_open(struct inode *ino, struct file *file)
 {
 	char *name;
-	int mode = 0, r = 0, w = 0, fd;
+	fmode_t mode = 0;
+	int r = 0, w = 0, fd;
 
 	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
 	if ((mode & HOSTFS_I(ino)->mode) == mode)
diff --git a/fs/locks.c b/fs/locks.c
index 5eb259e3cd38..20457486d6b2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1580,7 +1580,8 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
 	cmd &= ~LOCK_NB;
 	unlock = (cmd == LOCK_UN);
 
-	if (!unlock && !(cmd & LOCK_MAND) && !(filp->f_mode & 3))
+	if (!unlock && !(cmd & LOCK_MAND) &&
+	    !(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
 	error = flock_make_lock(filp, &lock, cmd);
diff --git a/fs/open.c b/fs/open.c
index 5596049863bf..83cdb9dee0c1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -798,7 +798,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	int error;
 
 	f->f_flags = flags;
-	f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
+	f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b5918ae8ca79..486cf3fe7139 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1712,9 +1712,9 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock;
-	if (file->f_mode & 1)
+	if (file->f_mode & FMODE_READ)
 		inode->i_mode |= S_IRUSR | S_IXUSR;
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c21df71943a6..b89d193a00d9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,7 +2593,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	int blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b4fe68fe3a57..a4413ec3cb3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,7 +910,8 @@ static inline int sb_issue_discard(struct super_block *sb,
 * command filter functions
 */
 extern int blk_verify_command(struct blk_cmd_filter *filter,
-			      unsigned char *cmd, int has_write_perm);
+			      unsigned char *cmd, fmode_t has_write_perm);
+extern void blk_unregister_filter(struct gendisk *disk);
 extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define MAX_PHYS_SEGMENTS 128
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 08d783592b73..3f8d4e763672 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -85,7 +85,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev);
 
 struct dm_dev {
 	struct block_device *bdev;
-	int mode;
+	fmode_t mode;
 	char name[16];
 };
 
@@ -95,7 +95,7 @@ struct dm_dev {
  * FIXME: too many arguments.
  */
 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
-		  sector_t len, int mode, struct dm_dev **result);
+		  sector_t len, fmode_t mode, struct dm_dev **result);
 void dm_put_device(struct dm_target *ti, struct dm_dev *d);
 
 /*
@@ -223,7 +223,7 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
 /*
  * First create an empty table.
  */
-int dm_table_create(struct dm_table **result, int mode,
+int dm_table_create(struct dm_table **result, fmode_t mode,
 		    unsigned num_targets, struct mapped_device *md);
 
 /*
@@ -254,7 +254,7 @@ void dm_table_put(struct dm_table *t);
  */
 sector_t dm_table_get_size(struct dm_table *t);
 unsigned int dm_table_get_num_targets(struct dm_table *t);
-int dm_table_get_mode(struct dm_table *t);
+fmode_t dm_table_get_mode(struct dm_table *t);
 struct mapped_device *dm_table_get_md(struct dm_table *t);
 
 /*
diff --git a/include/linux/file.h b/include/linux/file.h
index a20259e248a5..335a0a5c316e 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -19,10 +19,10 @@ struct file_operations;
 struct vfsmount;
 struct dentry;
 extern int init_file(struct file *, struct vfsmount *mnt,
-		struct dentry *dentry, mode_t mode,
+		struct dentry *dentry, fmode_t mode,
 		const struct file_operations *fop);
 extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry,
-		mode_t mode, const struct file_operations *fop);
+		fmode_t mode, const struct file_operations *fop);
 
 static inline void fput_light(struct file *file, int fput_needed)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6a625be13fc..60a7a581ba91 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -63,18 +63,18 @@ extern int dir_notify_enable;
 #define MAY_ACCESS 16
 #define MAY_OPEN 32
 
-#define FMODE_READ 1
-#define FMODE_WRITE 2
+#define FMODE_READ ((__force fmode_t)1)
+#define FMODE_WRITE ((__force fmode_t)2)
 
 /* Internal kernel extensions */
-#define FMODE_LSEEK	4
-#define FMODE_PREAD	8
+#define FMODE_LSEEK	((__force fmode_t)4)
+#define FMODE_PREAD	((__force fmode_t)8)
 #define FMODE_PWRITE	FMODE_PREAD	/* These go hand in hand */
 
 /* File is being opened for execution. Primary users of this flag are
    distributed filesystems that can use it to achieve correct ETXTBUSY
    behavior for cross-node execution/opening_for_writing of files */
-#define FMODE_EXEC	16
+#define FMODE_EXEC	((__force fmode_t)16)
 
 #define RW_MASK		1
 #define RWA_MASK	2
@@ -825,7 +825,7 @@ struct file {
 	const struct file_operations	*f_op;
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
-	mode_t			f_mode;
+	fmode_t			f_mode;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	unsigned int		f_uid, f_gid;
@@ -1714,7 +1714,7 @@ extern struct block_device *bdget(dev_t);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
-extern struct block_device *open_by_devnum(dev_t, unsigned);
+extern struct block_device *open_by_devnum(dev_t, fmode_t);
 #else
 static inline void bd_forget(struct inode *inode) {}
 #endif
@@ -1729,7 +1729,7 @@ extern int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 			       struct gendisk *disk, unsigned cmd,
 			       unsigned long arg);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-extern int blkdev_get(struct block_device *, mode_t, unsigned);
+extern int blkdev_get(struct block_device *, fmode_t, unsigned);
 extern int blkdev_put(struct block_device *);
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index a89513188ce7..00fbd5b245c9 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -188,7 +188,7 @@ static inline void fsnotify_close(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	const char *name = dentry->d_name.name;
-	mode_t mode = file->f_mode;
+	fmode_t mode = file->f_mode;
 	u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
diff --git a/include/linux/types.h b/include/linux/types.h
index f24f7beb47df..1d98330b1f2c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -190,6 +190,7 @@ typedef __u32 __bitwise __wsum;
 
 #ifdef __KERNEL__
 typedef unsigned __bitwise__ gfp_t;
+typedef unsigned __bitwise__ fmode_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;
diff --git a/ipc/shm.c b/ipc/shm.c
index 0add3fa5f547..867e5d6a55c2 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -817,7 +817,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
 	struct ipc_namespace *ns;
 	struct shm_file_data *sfd;
 	struct path path;
-	mode_t f_mode;
+	fmode_t f_mode;
 
 	err = -EINVAL;
 	if (shmid < 0)
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 1af62b8b86c6..e17836680f49 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2283,7 +2283,7 @@ static int snd_pcm_oss_open_file(struct file *file,
 	int idx, err;
 	struct snd_pcm_oss_file *pcm_oss_file;
 	struct snd_pcm_substream *substream;
-	unsigned int f_mode = file->f_mode;
+	fmode_t f_mode = file->f_mode;
 
 	if (rpcm_oss_file)
 		*rpcm_oss_file = NULL;
diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c
index 23018a7c063a..81e1f443d094 100644
--- a/sound/oss/au1550_ac97.c
+++ b/sound/oss/au1550_ac97.c
@@ -93,7 +93,7 @@ static struct au1550_state {
 	spinlock_t      lock;
 	struct mutex open_mutex;
 	struct mutex sem;
-	mode_t          open_mode;
+	fmode_t          open_mode;
 	wait_queue_head_t open_wait;
 
 	struct dmabuf {
diff --git a/sound/oss/dmasound/dmasound.h b/sound/oss/dmasound/dmasound.h
index d978b0096564..1cb13fe56ec4 100644
--- a/sound/oss/dmasound/dmasound.h
+++ b/sound/oss/dmasound/dmasound.h
@@ -129,7 +129,7 @@ typedef struct {
     int (*mixer_ioctl)(u_int, u_long);	/* optional */
     int (*write_sq_setup)(void);	/* optional */
     int (*read_sq_setup)(void);		/* optional */
-    int (*sq_open)(mode_t);		/* optional */
+    int (*sq_open)(fmode_t);		/* optional */
     int (*state_info)(char *, size_t);	/* optional */
     void (*abort_read)(void);		/* optional */
     int min_dsp_speed;
@@ -235,7 +235,7 @@ struct sound_queue {
      */
     int active;
     wait_queue_head_t action_queue, open_queue, sync_queue;
-    int open_mode;
+    fmode_t open_mode;
     int busy, syncing, xruns, died;
 };
 
diff --git a/sound/oss/dmasound/dmasound_atari.c b/sound/oss/dmasound/dmasound_atari.c
index 285239d64b82..4d45bd63718b 100644
--- a/sound/oss/dmasound/dmasound_atari.c
+++ b/sound/oss/dmasound/dmasound_atari.c
@@ -143,7 +143,7 @@ static int AtaMixerIoctl(u_int cmd, u_long arg);
 static int TTMixerIoctl(u_int cmd, u_long arg);
 static int FalconMixerIoctl(u_int cmd, u_long arg);
 static int AtaWriteSqSetup(void);
-static int AtaSqOpen(mode_t mode);
+static int AtaSqOpen(fmode_t mode);
 static int TTStateInfo(char *buffer, size_t space);
 static int FalconStateInfo(char *buffer, size_t space);
 
@@ -1461,7 +1461,7 @@ static int AtaWriteSqSetup(void)
 	return 0 ;
 }
 
-static int AtaSqOpen(mode_t mode)
+static int AtaSqOpen(fmode_t mode)
 {
 	write_sq_ignore_int = 1;
 	return 0 ;
diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c
index 95fc5c681755..b8239f3168fb 100644
--- a/sound/oss/dmasound/dmasound_core.c
+++ b/sound/oss/dmasound/dmasound_core.c
@@ -212,7 +212,7 @@ static int irq_installed;
 #endif /* MODULE */
 
 /* control over who can modify resources shared between play/record */
-static mode_t shared_resource_owner;
+static fmode_t shared_resource_owner;
 static int shared_resources_initialised;
 
     /*
@@ -668,7 +668,7 @@ static inline void sq_init_waitqueue(struct sound_queue *sq)
 
 #if 0 /* blocking open() */
 static inline void sq_wake_up(struct sound_queue *sq, struct file *file,
-			      mode_t mode)
+			      fmode_t mode)
 {
 	if (file->f_mode & mode) {
 		sq->busy = 0; /* CHECK: IS THIS OK??? */
@@ -677,7 +677,7 @@ static inline void sq_wake_up(struct sound_queue *sq, struct file *file,
 }
 #endif
 
-static int sq_open2(struct sound_queue *sq, struct file *file, mode_t mode,
+static int sq_open2(struct sound_queue *sq, struct file *file, fmode_t mode,
 		    int numbufs, int bufsize)
 {
 	int rc = 0;
@@ -891,10 +891,10 @@ static int sq_release(struct inode *inode, struct file *file)
    is the owner - if we have problems.
 */
 
-static int shared_resources_are_mine(mode_t md)
+static int shared_resources_are_mine(fmode_t md)
 {
 	if (shared_resource_owner)
-		return (shared_resource_owner & md ) ;
+		return (shared_resource_owner & md) != 0;
 	else {
 		shared_resource_owner = md ;
 		return 1 ;
diff --git a/sound/oss/msnd.h b/sound/oss/msnd.h
index 61b3955481c5..c8be47ec2b7e 100644
--- a/sound/oss/msnd.h
+++ b/sound/oss/msnd.h
@@ -211,7 +211,7 @@ typedef struct multisound_dev {
 
 	/* State variables */
 	enum { msndClassic, msndPinnacle } type;
-	mode_t mode;
+	fmode_t mode;
 	unsigned long flags;
 #define F_RESETTING			0
 #define F_HAVEDIGITAL			1
diff --git a/sound/oss/sound_config.h b/sound/oss/sound_config.h
index 1a00a3210616..55271fbe7f49 100644
--- a/sound/oss/sound_config.h
+++ b/sound/oss/sound_config.h
@@ -110,24 +110,16 @@ struct channel_info {
 #define OPEN_WRITE	PCM_ENABLE_OUTPUT
 #define OPEN_READWRITE	(OPEN_READ|OPEN_WRITE)
 
-#if OPEN_READ == FMODE_READ && OPEN_WRITE == FMODE_WRITE
-
-static inline int translate_mode(struct file *file)
-{
-	return file->f_mode;
-}
-
-#else
-
 static inline int translate_mode(struct file *file)
 {
-	return ((file->f_mode & FMODE_READ) ? OPEN_READ : 0) |
-		((file->f_mode & FMODE_WRITE) ? OPEN_WRITE : 0);
+	if (OPEN_READ == (__force int)FMODE_READ &&
+	    OPEN_WRITE == (__force int)FMODE_WRITE)
+		return (__force int)(file->f_mode & (FMODE_READ | FMODE_WRITE));
+	else
+		return ((file->f_mode & FMODE_READ) ? OPEN_READ : 0) |
+			((file->f_mode & FMODE_WRITE) ? OPEN_WRITE : 0);
 }
 
-#endif
-
-
 #include "sound_calls.h"
 #include "dev_table.h"
 
diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c
index 044453a4ee5b..41562ecde5bb 100644
--- a/sound/oss/swarm_cs4297a.c
+++ b/sound/oss/swarm_cs4297a.c
@@ -295,7 +295,7 @@ struct cs4297a_state {
 	struct mutex open_mutex;
 	struct mutex open_sem_adc;
 	struct mutex open_sem_dac;
-	mode_t open_mode;
+	fmode_t open_mode;
 	wait_queue_head_t open_wait;
 	wait_queue_head_t open_wait_adc;
 	wait_queue_head_t open_wait_dac;
diff --git a/sound/oss/vwsnd.c b/sound/oss/vwsnd.c
index dcbb3f739e61..78b8acc7c3b9 100644
--- a/sound/oss/vwsnd.c
+++ b/sound/oss/vwsnd.c
@@ -1509,7 +1509,7 @@ typedef struct vwsnd_dev {
 	struct mutex open_mutex;
 	struct mutex io_mutex;
 	struct mutex mix_mutex;
-	mode_t		open_mode;
+	fmode_t		open_mode;
 	wait_queue_head_t open_wait;
 
 	lithium_t	lith;
-- 
cgit v1.2.3


From 86d434dede14108dd917b25af0f29c0cb28b8d18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2007 19:50:05 -0400
Subject: [PATCH] eliminate use of ->f_flags in block methods

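Store the needed information in f_mode: do_open() now derives FMODE_NDELAY,
FMODE_EXCL and FMODE_WRITE_IOCTL from ->f_flags, and the block drivers test
those bits instead of O_NDELAY/O_NONBLOCK and O_EXCL.  floppy.c checks
FMODE_WRITE|FMODE_WRITE_IOCTL in place of its ->private_data marker.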

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/ataflop.c |  6 +++---
 drivers/block/floppy.c  | 15 ++++-----------
 drivers/block/swim3.c   |  6 +++---
 drivers/block/ub.c      |  2 +-
 drivers/cdrom/cdrom.c   |  4 ++--
 drivers/ide/ide-gd.c    |  2 +-
 drivers/scsi/sd.c       |  2 +-
 fs/block_dev.c          |  7 +++++++
 include/linux/fs.h      |  4 ++++
 9 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index e1db285b72cd..85d56a26f7c6 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1813,17 +1813,17 @@ static int floppy_open( struct inode *inode, struct file *filp )
 	if (p->ref && p->type != type)
 		return -EBUSY;
 
-	if (p->ref == -1 || (p->ref && filp->f_flags & O_EXCL))
+	if (p->ref == -1 || (p->ref && filp->f_mode & FMODE_EXCL))
 		return -EBUSY;
 
-	if (filp->f_flags & O_EXCL)
+	if (filp->f_mode & FMODE_EXCL)
 		p->ref = -1;
 	else
 		p->ref++;
 
 	p->type = type;
 
-	if (filp->f_flags & O_NDELAY)
+	if (filp->f_mode & FMODE_NDELAY)
 		return 0;
 
 	if (filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index ae3ef8945f3f..5d60c05a736a 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3453,7 +3453,7 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		    unsigned long param)
 {
-#define FD_IOCTL_ALLOWED ((filp) && (filp)->private_data)
+#define FD_IOCTL_ALLOWED ((filp) && (filp)->f_mode & (FMODE_WRITE|FMODE_WRITE_IOCTL))
 #define OUT(c,x) case c: outparam = (const char *) (x); break
 #define IN(c,x,tag) case c: *(x) = inparam. tag ; return 0
 
@@ -3690,7 +3690,6 @@ static int floppy_open(struct inode *inode, struct file *filp)
 	int res = -EBUSY;
 	char *tmp;
 
-	filp->private_data = (void *)0;
 	mutex_lock(&open_lock);
 	old_dev = UDRS->fd_device;
 	if (opened_bdev[drive] && opened_bdev[drive] != inode->i_bdev)
@@ -3701,10 +3700,10 @@ static int floppy_open(struct inode *inode, struct file *filp)
 		USETF(FD_VERIFY);
 	}
 
-	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (filp->f_flags & O_EXCL)))
+	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (filp->f_mode & FMODE_EXCL)))
 		goto out2;
 
-	if (filp->f_flags & O_EXCL)
+	if (filp->f_mode & FMODE_EXCL)
 		UDRS->fd_ref = -1;
 	else
 		UDRS->fd_ref++;
@@ -3751,16 +3750,10 @@ static int floppy_open(struct inode *inode, struct file *filp)
 			buffer_track = -1;
 	}
 
-	/* Allow ioctls if we have write-permissions even if read-only open.
-	 * Needed so that programs such as fdrawcmd still can work on write
-	 * protected disks */
-	if ((filp->f_mode & FMODE_WRITE) || !file_permission(filp, MAY_WRITE))
-		filp->private_data = (void *)8;
-
 	if (UFDCS->rawcmd == 1)
 		UFDCS->rawcmd = 2;
 
-	if (!(filp->f_flags & O_NDELAY)) {
+	if (!(filp->f_mode & FMODE_NDELAY)) {
 		if (filp->f_mode & (FMODE_READ|FMODE_WRITE)) {
 			UDRS->last_checked = 0;
 			check_disk_change(inode->i_bdev);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index a53ca54bee12..5c45d5556ae8 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -904,10 +904,10 @@ static int floppy_open(struct inode *inode, struct file *filp)
 		swim3_action(fs, SETMFM);
 		swim3_select(fs, RELAX);
 
-	} else if (fs->ref_count == -1 || filp->f_flags & O_EXCL)
+	} else if (fs->ref_count == -1 || filp->f_mode & FMODE_EXCL)
 		return -EBUSY;
 
-	if (err == 0 && (filp->f_flags & O_NDELAY) == 0
+	if (err == 0 && (filp->f_mode & FMODE_NDELAY) == 0
 	    && (filp->f_mode & (FMODE_READ|FMODE_WRITE))) {
 		check_disk_change(inode->i_bdev);
 		if (fs->ejected)
@@ -930,7 +930,7 @@ static int floppy_open(struct inode *inode, struct file *filp)
 		return err;
 	}
 
-	if (filp->f_flags & O_EXCL)
+	if (filp->f_mode & FMODE_EXCL)
 		fs->ref_count = -1;
 	else
 		++fs->ref_count;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index f60e41833f69..85d41eb67c0b 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1691,7 +1691,7 @@ static int ub_bd_open(struct inode *inode, struct file *filp)
 	 * under some pretty murky conditions (a failure of READ CAPACITY).
 	 * We may need it one day.
 	 */
-	if (lun->removable && lun->changed && !(filp->f_flags & O_NDELAY)) {
+	if (lun->removable && lun->changed && !(filp->f_mode & FMODE_NDELAY)) {
 		rc = -ENOMEDIUM;
 		goto err_open;
 	}
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index d47f2f80accd..4feefa622aed 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -982,7 +982,7 @@ int cdrom_open(struct cdrom_device_info *cdi, struct inode *ip, struct file *fp)
 	/* if this was a O_NONBLOCK open and we should honor the flags,
 	 * do a quick open without drive/disc integrity checks. */
 	cdi->use_count++;
-	if ((fp->f_flags & O_NONBLOCK) && (cdi->options & CDO_USE_FFLAGS)) {
+	if ((fp->f_mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
 		ret = cdi->ops->open(cdi, 1);
 	} else {
 		ret = open_for_data(cdi);
@@ -1205,7 +1205,7 @@ int cdrom_release(struct cdrom_device_info *cdi, struct file *fp)
 	}
 
 	opened_for_data = !(cdi->options & CDO_USE_FFLAGS) ||
-		!(fp && fp->f_flags & O_NONBLOCK);
+		!(fp && fp->f_mode & FMODE_NDELAY);
 
 	/*
 	 * flush cache on last write release
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index d367473098f9..66bbb0a22f57 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -197,7 +197,7 @@ static int ide_gd_open(struct inode *inode, struct file *filp)
 		 * unreadable disk, so that we can get the format capacity
 		 * of the drive or begin the format - Sam
 		 */
-		if (ret && (filp->f_flags & O_NDELAY) == 0) {
+		if (ret && (filp->f_mode & FMODE_NDELAY) == 0) {
 			ret = -EIO;
 			goto out_put_idkp;
 		}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 7c4d2e68df1c..202c1ed9abd7 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -640,7 +640,7 @@ static int sd_open(struct inode *inode, struct file *filp)
 	 */
 	retval = -ENOMEDIUM;
 	if (sdev->removable && !sdkp->media_present &&
-	    !(filp->f_flags & O_NDELAY))
+	    !(filp->f_mode & FMODE_NDELAY))
 		goto error_out;
 
 	/*
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8897f3b02e98..b9022694e9f7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1007,6 +1007,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		return ret;
 	}
 
+	if (file->f_flags & O_NDELAY)
+		file->f_mode |= FMODE_NDELAY;
+	if (file->f_flags & O_EXCL)
+		file->f_mode |= FMODE_EXCL;
+	if ((file->f_flags & O_ACCMODE) == 3)
+		file->f_mode |= FMODE_WRITE_IOCTL;
+
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60a7a581ba91..5ab5579a5162 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -76,6 +76,10 @@ extern int dir_notify_enable;
    behavior for cross-node execution/opening_for_writing of files */
 #define FMODE_EXEC	((__force fmode_t)16)
 
+#define FMODE_NDELAY	((__force fmode_t)32)
+#define FMODE_EXCL	((__force fmode_t)64)
+#define FMODE_WRITE_IOCTL	((__force fmode_t)128)
+
 #define RW_MASK		1
 #define RWA_MASK	2
 #define READ 0
-- 
cgit v1.2.3


From e915e872ed921d707bc32b3f2184d43abfa8c9e2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 2 Sep 2008 17:16:41 -0400
Subject: [PATCH] switch sg_scsi_ioctl() to passing fmode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/scsi_ioctl.c        | 13 ++++---------
 drivers/scsi/scsi_ioctl.c |  2 +-
 drivers/scsi/sg.c         |  2 +-
 include/linux/blkdev.h    |  4 ++--
 4 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a441559118d..375e25df8adc 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -379,12 +379,11 @@ out:
  *      bytes in one int) where the lowest byte is the SCSI status.
  */
 #define OMAX_SB_LEN 16          /* For backward compatibility */
-int sg_scsi_ioctl(struct file *file, struct request_queue *q,
-		  struct gendisk *disk, struct scsi_ioctl_command __user *sic)
+int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
+		struct scsi_ioctl_command __user *sic)
 {
 	struct request *rq;
 	int err;
-	fmode_t write_perm = 0;
 	unsigned int in_len, out_len, bytes, opcode, cmdlen;
 	char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
 
@@ -426,11 +425,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	/* scsi_ioctl passes NULL */
-	if (file && (file->f_mode & FMODE_WRITE))
-		write_perm = FMODE_WRITE;
-
-	err = blk_verify_command(&q->cmd_filter, rq->cmd, write_perm);
+	err = blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE);
 	if (err)
 		goto error;
 
@@ -636,7 +631,7 @@ int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
 			if (!arg)
 				break;
 
-			err = sg_scsi_ioctl(file, q, bd_disk, arg);
+			err = sg_scsi_ioctl(q, bd_disk, file ? file->f_mode : 0, arg);
 			break;
 		case CDROMCLOSETRAY:
 			err = blk_send_start_stop(q, bd_disk, 0x03);
diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c
index 28b19ef26309..1f08f5a2f8fd 100644
--- a/drivers/scsi/scsi_ioctl.c
+++ b/drivers/scsi/scsi_ioctl.c
@@ -237,7 +237,7 @@ int scsi_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
 	case SCSI_IOCTL_SEND_COMMAND:
 		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
 			return -EACCES;
-		return sg_scsi_ioctl(NULL, sdev->request_queue, NULL, arg);
+		return sg_scsi_ioctl(sdev->request_queue, NULL, 0, arg);
 	case SCSI_IOCTL_DOORLOCK:
 		return scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
 	case SCSI_IOCTL_DOORUNLOCK:
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 93bd59a1ed79..9adf35bd8b56 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 			if (sg_allow_access(filp, &opcode))
 				return -EPERM;
 		}
-		return sg_scsi_ioctl(filp, sdp->device->request_queue, NULL, p);
+		return sg_scsi_ioctl(sdp->device->request_queue, NULL, filp->f_mode, p);
 	case SG_SET_DEBUG:
 		result = get_user(val, ip);
 		if (result)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a4413ec3cb3a..8945c30e9936 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -719,8 +719,8 @@ extern int blk_remove_plug(struct request_queue *);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
 			  struct gendisk *, unsigned int, void __user *);
-extern int sg_scsi_ioctl(struct file *, struct request_queue *,
-		struct gendisk *, struct scsi_ioctl_command __user *);
+extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
+			 struct scsi_ioctl_command __user *);
 
 /*
  * Temporary export, until SCSI gets fixed up.
-- 
cgit v1.2.3


From 74f3c8aff36ad6552ea609c8b20bfd588fa16f38 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2007 15:38:10 -0400
Subject: [PATCH] switch scsi_cmd_ioctl() to passing fmode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bsg.c                    |  2 +-
 block/scsi_ioctl.c             | 10 +++++-----
 drivers/block/cciss.c          |  3 ++-
 drivers/block/ub.c             |  2 +-
 drivers/block/virtio_blk.c     |  4 ++--
 drivers/cdrom/cdrom.c          |  2 +-
 drivers/ide/ide-floppy_ioctl.c |  4 ++--
 drivers/scsi/sd.c              |  3 ++-
 drivers/scsi/st.c              |  4 ++--
 include/linux/blkdev.h         |  4 ++--
 10 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 2d36b127f384..e8bd2475682a 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -914,7 +914,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case SG_EMULATED_HOST:
 	case SCSI_IOCTL_SEND_COMMAND: {
 		void __user *uarg = (void __user *) arg;
-		return scsi_cmd_ioctl(file, bd->queue, NULL, cmd, uarg);
+		return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg);
 	}
 	case SG_IO: {
 		struct request *rq;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 375e25df8adc..5963cf91a3a0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -517,8 +517,8 @@ static inline int blk_send_start_stop(struct request_queue *q,
 	return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
 }
 
-int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
-		   struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
+int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
+		   unsigned int cmd, void __user *arg)
 {
 	int err;
 
@@ -559,7 +559,7 @@ int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
 			err = -EFAULT;
 			if (copy_from_user(&hdr, arg, sizeof(hdr)))
 				break;
-			err = sg_io(q, bd_disk, &hdr, file ? file->f_mode : 0);
+			err = sg_io(q, bd_disk, &hdr, mode);
 			if (err == -EFAULT)
 				break;
 
@@ -607,7 +607,7 @@ int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
 			hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
 			hdr.cmd_len = sizeof(cgc.cmd);
 
-			err = sg_io(q, bd_disk, &hdr, file ? file->f_mode : 0);
+			err = sg_io(q, bd_disk, &hdr, mode);
 			if (err == -EFAULT)
 				break;
 
@@ -631,7 +631,7 @@ int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
 			if (!arg)
 				break;
 
-			err = sg_scsi_ioctl(q, bd_disk, file ? file->f_mode : 0, arg);
+			err = sg_scsi_ioctl(q, bd_disk, mode, arg);
 			break;
 		case CDROMCLOSETRAY:
 			err = blk_send_start_stop(q, bd_disk, 0x03);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 1e1f9153000c..d9b1c15b8113 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1232,7 +1232,8 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 	case SG_EMULATED_HOST:
 	case SG_IO:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return scsi_cmd_ioctl(filep, disk->queue, disk, cmd, argp);
+		return scsi_cmd_ioctl(disk->queue, disk,
+				      filep ? filep->f_mode : 0, cmd, argp);
 
 	/* scsi_cmd_ioctl would normally handle these, below, but */
 	/* they aren't a good fit for cciss, as CD-ROMs are */
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 85d41eb67c0b..bc04330f3683 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1729,7 +1729,7 @@ static int ub_bd_ioctl(struct inode *inode, struct file *filp,
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	void __user *usermem = (void __user *) arg;
 
-	return scsi_cmd_ioctl(filp, disk->queue, disk, cmd, usermem);
+	return scsi_cmd_ioctl(disk->queue, disk, filp ? filp->f_mode : 0, cmd, usermem);
 }
 
 /*
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6ec5fc052786..7643cd16fd67 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -149,8 +149,8 @@ static void do_virtblk_request(struct request_queue *q)
 static int virtblk_ioctl(struct inode *inode, struct file *filp,
 			 unsigned cmd, unsigned long data)
 {
-	return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue,
-			      inode->i_bdev->bd_disk, cmd,
+	return scsi_cmd_ioctl(inode->i_bdev->bd_disk->queue,
+			      inode->i_bdev->bd_disk, filp->f_mode, cmd,
 			      (void __user *)data);
 }
 
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 4feefa622aed..e286eb5d1f6a 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2672,7 +2672,7 @@ int cdrom_ioctl(struct file * file, struct cdrom_device_info *cdi,
 	/*
 	 * Try the generic SCSI command ioctl's first.
 	 */
-	ret = scsi_cmd_ioctl(file, disk->queue, disk, cmd, argp);
+	ret = scsi_cmd_ioctl(disk->queue, disk, file ? file->f_mode : 0, cmd, argp);
 	if (ret != -ENOTTY)
 		return ret;
 
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 0d5f5054ab64..3c9da4ac9341 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -280,8 +280,8 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
 	 * and CDROM_SEND_PACKET (legacy) ioctls
 	 */
 	if (cmd != CDROM_SEND_PACKET && cmd != SCSI_IOCTL_SEND_COMMAND)
-		err = scsi_cmd_ioctl(file, bdev->bd_disk->queue,
-					bdev->bd_disk, cmd, argp);
+		err = scsi_cmd_ioctl(bdev->bd_disk->queue, bdev->bd_disk,
+				file ? file->f_mode : 0, cmd, argp);
 
 	if (err == -ENOTTY)
 		err = generic_ide_ioctl(drive, file, bdev, cmd, arg);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 202c1ed9abd7..5d74413f591a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -775,7 +775,8 @@ static int sd_ioctl(struct inode * inode, struct file * filp,
 		case SCSI_IOCTL_GET_BUS_NUMBER:
 			return scsi_ioctl(sdp, cmd, p);
 		default:
-			error = scsi_cmd_ioctl(filp, disk->queue, disk, cmd, p);
+			error = scsi_cmd_ioctl(disk->queue, disk,
+					filp ? filp->f_mode : 0, cmd, p);
 			if (error != -ENOTTY)
 				return error;
 	}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5c28d08f18f4..8dffac9f3419 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3567,8 +3567,8 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg)
 			    !capable(CAP_SYS_RAWIO))
 				i = -EPERM;
 			else
-				i = scsi_cmd_ioctl(file, STp->disk->queue,
-						   STp->disk, cmd_in, p);
+				i = scsi_cmd_ioctl(STp->disk->queue, STp->disk,
+						   file->f_mode, cmd_in, p);
 			if (i != -ENOTTY)
 				return i;
 			break;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8945c30e9936..48f41b991adb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -717,8 +717,8 @@ extern void blk_plug_device(struct request_queue *);
 extern void blk_plug_device_unlocked(struct request_queue *);
 extern int blk_remove_plug(struct request_queue *);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
-extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
-			  struct gendisk *, unsigned int, void __user *);
+extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
+			  unsigned int, void __user *);
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-- 
cgit v1.2.3


From 1bddd9e6453ef1c7bc5b6f4ddbf7d31f4aee7a44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 2 Sep 2008 17:19:43 -0400
Subject: [PATCH] lose the unused file argument in generic_ide_ioctl()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/ide/ide-cd.c           | 2 +-
 drivers/ide/ide-disk_ioctl.c   | 2 +-
 drivers/ide/ide-floppy_ioctl.c | 2 +-
 drivers/ide/ide-ioctls.c       | 3 +--
 drivers/ide/ide-tape.c         | 2 +-
 drivers/scsi/ide-scsi.c        | 2 +-
 include/linux/ide.h            | 3 +--
 7 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 13265a8827da..cd21b34fe509 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -2174,7 +2174,7 @@ static int idecd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-	err = generic_ide_ioctl(info->drive, file, bdev, cmd, arg);
+	err = generic_ide_ioctl(info->drive, bdev, cmd, arg);
 	if (err == -EINVAL)
 		err = cdrom_ioctl(file, &info->devinfo, inode, cmd, arg);
 
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
index a49698bcf966..41832af400d6 100644
--- a/drivers/ide/ide-disk_ioctl.c
+++ b/drivers/ide/ide-disk_ioctl.c
@@ -23,5 +23,5 @@ int ide_disk_ioctl(ide_drive_t *drive, struct inode *inode, struct file *file,
 	if (err != -EOPNOTSUPP)
 		return err;
 
-	return generic_ide_ioctl(drive, file, bdev, cmd, arg);
+	return generic_ide_ioctl(drive, bdev, cmd, arg);
 }
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 3c9da4ac9341..5af70a2c9ef8 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -284,7 +284,7 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
 				file ? file->f_mode : 0, cmd, argp);
 
 	if (err == -ENOTTY)
-		err = generic_ide_ioctl(drive, file, bdev, cmd, arg);
+		err = generic_ide_ioctl(drive, bdev, cmd, arg);
 
 	return err;
 }
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index a90945f49792..fcde16bb53a7 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -240,8 +240,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	return ret;
 }
 
-int generic_ide_ioctl(ide_drive_t *drive, struct file *file,
-		      struct block_device *bdev,
+int generic_ide_ioctl(ide_drive_t *drive, struct block_device *bdev,
 		      unsigned int cmd, unsigned long arg)
 {
 	int err;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b2b2e5e8d38e..2b263281ffea 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2368,7 +2368,7 @@ static int idetape_ioctl(struct inode *inode, struct file *file,
 	struct block_device *bdev = inode->i_bdev;
 	struct ide_tape_obj *tape = ide_drv_g(bdev->bd_disk, ide_tape_obj);
 	ide_drive_t *drive = tape->drive;
-	int err = generic_ide_ioctl(drive, file, bdev, cmd, arg);
+	int err = generic_ide_ioctl(drive, bdev, cmd, arg);
 	if (err == -EINVAL)
 		err = idetape_blkdev_ioctl(drive, cmd, arg);
 	return err;
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index afc96e844a25..5bcc04e82c28 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -478,7 +478,7 @@ static int idescsi_ide_ioctl(struct inode *inode, struct file *file,
 {
 	struct block_device *bdev = inode->i_bdev;
 	struct ide_scsi_obj *scsi = ide_scsi_g(bdev->bd_disk);
-	return generic_ide_ioctl(scsi->drive, file, bdev, cmd, arg);
+	return generic_ide_ioctl(scsi->drive, bdev, cmd, arg);
 }
 
 static struct block_device_operations idescsi_ops = {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 89e53cfbc787..0d03e83f7194 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1158,8 +1158,7 @@ struct ide_ioctl_devset {
 int ide_setting_ioctl(ide_drive_t *, struct block_device *, unsigned int,
 		      unsigned long, const struct ide_ioctl_devset *);
 
-int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *,
-		      unsigned, unsigned long);
+int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned long);
 
 extern int ide_vlb_clk;
 extern int ide_pci_clk;
-- 
cgit v1.2.3


From 647b3d0084158c47b1aea8f34d13cab9cd0a5b49 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Aug 2007 22:15:59 -0400
Subject: [PATCH] lose unused arguments in dm ioctl callbacks

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/dm-linear.c        | 3 +--
 drivers/md/dm-mpath.c         | 3 +--
 drivers/md/dm.c               | 2 +-
 include/linux/device-mapper.h | 3 +--
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6449bcdf84ca..fa358385eed3 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -110,8 +110,7 @@ static int linear_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
-static int linear_ioctl(struct dm_target *ti, struct inode *inode,
-			struct file *filp, unsigned int cmd,
+static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
 			unsigned long arg)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 9bf3460c5540..c681d5e5f45c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1395,8 +1395,7 @@ error:
 	return -EINVAL;
 }
 
-static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
-			   struct file *filp, unsigned int cmd,
+static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 			   unsigned long arg)
 {
 	struct multipath *m = (struct multipath *) ti->private;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 327de03a5bdf..5f0f4c8bcd3e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -345,7 +345,7 @@ static int dm_blk_ioctl(struct inode *inode, struct file *file,
 	}
 
 	if (tgt->type->ioctl)
-		r = tgt->type->ioctl(tgt, inode, file, cmd, arg);
+		r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
 	dm_table_put(map);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3f8d4e763672..a567bbc5293a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -69,8 +69,7 @@ typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
 
 typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
 
-typedef int (*dm_ioctl_fn) (struct dm_target *ti, struct inode *inode,
-			    struct file *filp, unsigned int cmd,
+typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 			    unsigned long arg);
 
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
-- 
cgit v1.2.3


From 08f85851215100d0eebf026810955ee6ad456c38 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:26:20 -0400
Subject: [PATCH] move block_device_operations to blkdev.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/xip.c          |  1 +
 include/linux/blkdev.h | 17 +++++++++++++++++
 include/linux/fs.h     | 15 +--------------
 3 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 4fb94c20041b..b72b85884223 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -11,6 +11,7 @@
 #include <linux/buffer_head.h>
 #include <linux/ext2_fs_sb.h>
 #include <linux/ext2_fs.h>
+#include <linux/blkdev.h>
 #include "ext2.h"
 #include "xip.h"
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 48f41b991adb..48ec8862a11a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1057,6 +1057,23 @@ static inline int blk_integrity_rq(struct request *rq)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+struct file;
+struct inode;
+
+struct block_device_operations {
+	int (*open) (struct inode *, struct file *);
+	int (*release) (struct inode *, struct file *);
+	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
+	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
+	int (*direct_access) (struct block_device *, sector_t,
+						void **, unsigned long *);
+	int (*media_changed) (struct gendisk *);
+	int (*revalidate_disk) (struct gendisk *);
+	int (*getgeo)(struct block_device *, struct hd_geometry *);
+	struct module *owner;
+};
+
 #else /* CONFIG_BLOCK */
 /*
  * stubs for when the block layer is configured out
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5ab5579a5162..58bbf689fef7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1270,20 +1270,7 @@ int generic_osync_inode(struct inode *, struct address_space *, int);
  * to have different dirent layouts depending on the binary type.
  */
 typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
-
-struct block_device_operations {
-	int (*open) (struct inode *, struct file *);
-	int (*release) (struct inode *, struct file *);
-	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
-	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
-	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
-	int (*direct_access) (struct block_device *, sector_t,
-						void **, unsigned long *);
-	int (*media_changed) (struct gendisk *);
-	int (*revalidate_disk) (struct gendisk *);
-	int (*getgeo)(struct block_device *, struct hd_geometry *);
-	struct module *owner;
-};
+struct block_device_operations;
 
 /* These macros are for out of kernel modules to test that
  * the kernel supports the unlocked_ioctl and compat_ioctl
-- 
cgit v1.2.3


From bbc1cc978404105da23d505163ce9fd5598ed5b1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Oct 2007 17:54:28 -0400
Subject: [PATCH] switch cdrom_{open,release,ioctl} to sane APIs

... and convert the callers to them

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/paride/pcd.c |  8 +++++---
 drivers/cdrom/cdrom.c      | 23 +++++++++++------------
 drivers/cdrom/gdrom.c      |  7 ++++---
 drivers/cdrom/viocd.c      |  8 +++++---
 drivers/ide/ide-cd.c       |  7 ++++---
 drivers/scsi/sr.c          | 11 ++++-------
 include/linux/cdrom.h      | 10 +++++-----
 7 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index b8a994a2b013..8bd557e2a659 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -226,20 +226,22 @@ static int pcd_warned;		/* Have we logged a phase warning ? */
 static int pcd_block_open(struct inode *inode, struct file *file)
 {
 	struct pcd_unit *cd = inode->i_bdev->bd_disk->private_data;
-	return cdrom_open(&cd->info, inode, file);
+	return cdrom_open(&cd->info, inode->i_bdev, file->f_mode);
 }
 
 static int pcd_block_release(struct inode *inode, struct file *file)
 {
 	struct pcd_unit *cd = inode->i_bdev->bd_disk->private_data;
-	return cdrom_release(&cd->info, file);
+	cdrom_release(&cd->info, file ? file->f_mode : 0);
+	return 0;
 }
 
 static int pcd_block_ioctl(struct inode *inode, struct file *file,
 				unsigned cmd, unsigned long arg)
 {
 	struct pcd_unit *cd = inode->i_bdev->bd_disk->private_data;
-	return cdrom_ioctl(file, &cd->info, inode, cmd, arg);
+	return cdrom_ioctl(&cd->info, inode->i_bdev,
+			   file ? file->f_mode : 0, cmd, arg);
 }
 
 static int pcd_block_media_changed(struct gendisk *disk)
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index e286eb5d1f6a..d16b02423d61 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -973,7 +973,7 @@ static int cdrom_close_write(struct cdrom_device_info *cdi)
  * is in their own interest: device control becomes a lot easier
  * this way.
  */
-int cdrom_open(struct cdrom_device_info *cdi, struct inode *ip, struct file *fp)
+int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode)
 {
 	int ret;
 
@@ -982,14 +982,14 @@ int cdrom_open(struct cdrom_device_info *cdi, struct inode *ip, struct file *fp)
 	/* if this was a O_NONBLOCK open and we should honor the flags,
 	 * do a quick open without drive/disc integrity checks. */
 	cdi->use_count++;
-	if ((fp->f_mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
+	if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
 		ret = cdi->ops->open(cdi, 1);
 	} else {
 		ret = open_for_data(cdi);
 		if (ret)
 			goto err;
 		cdrom_mmc3_profile(cdi);
-		if (fp->f_mode & FMODE_WRITE) {
+		if (mode & FMODE_WRITE) {
 			ret = -EROFS;
 			if (cdrom_open_write(cdi))
 				goto err_release;
@@ -1007,7 +1007,7 @@ int cdrom_open(struct cdrom_device_info *cdi, struct inode *ip, struct file *fp)
 			cdi->name, cdi->use_count);
 	/* Do this on open.  Don't wait for mount, because they might
 	    not be mounting, but opening with O_NONBLOCK */
-	check_disk_change(ip->i_bdev);
+	check_disk_change(bdev);
 	return 0;
 err_release:
 	if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
@@ -1184,7 +1184,7 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
 	return 0;
 }
 
-int cdrom_release(struct cdrom_device_info *cdi, struct file *fp)
+void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode)
 {
 	struct cdrom_device_ops *cdo = cdi->ops;
 	int opened_for_data;
@@ -1205,7 +1205,7 @@ int cdrom_release(struct cdrom_device_info *cdi, struct file *fp)
 	}
 
 	opened_for_data = !(cdi->options & CDO_USE_FFLAGS) ||
-		!(fp && fp->f_mode & FMODE_NDELAY);
+		!(mode & FMODE_NDELAY);
 
 	/*
 	 * flush cache on last write release
@@ -1219,7 +1219,6 @@ int cdrom_release(struct cdrom_device_info *cdi, struct file *fp)
 		    cdi->options & CDO_AUTO_EJECT && CDROM_CAN(CDC_OPEN_TRAY))
 			cdo->tray_move(cdi, 1);
 	}
-	return 0;
 }
 
 static int cdrom_read_mech_status(struct cdrom_device_info *cdi, 
@@ -2662,17 +2661,17 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
  * these days.
  * ATAPI / SCSI specific code now mainly resides in mmc_ioctl().
  */
-int cdrom_ioctl(struct file * file, struct cdrom_device_info *cdi,
-		struct inode *ip, unsigned int cmd, unsigned long arg)
+int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
+		fmode_t mode, unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	int ret;
-	struct gendisk *disk = ip->i_bdev->bd_disk;
+	struct gendisk *disk = bdev->bd_disk;
 
 	/*
 	 * Try the generic SCSI command ioctl's first.
 	 */
-	ret = scsi_cmd_ioctl(disk->queue, disk, file ? file->f_mode : 0, cmd, argp);
+	ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
 	if (ret != -ENOTTY)
 		return ret;
 
@@ -2696,7 +2695,7 @@ int cdrom_ioctl(struct file * file, struct cdrom_device_info *cdi,
 	case CDROM_SELECT_DISC:
 		return cdrom_ioctl_select_disc(cdi, arg);
 	case CDROMRESET:
-		return cdrom_ioctl_reset(cdi, ip->i_bdev);
+		return cdrom_ioctl_reset(cdi, bdev);
 	case CDROM_LOCKDOOR:
 		return cdrom_ioctl_lock_door(cdi, arg);
 	case CDROM_DEBUG:
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index d6ba77a2dd7b..0959edf2afdb 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -492,12 +492,12 @@ static struct cdrom_device_ops gdrom_ops = {
 
 static int gdrom_bdops_open(struct inode *inode, struct file *file)
 {
-	return cdrom_open(gd.cd_info, inode, file);
+	return cdrom_open(gd.cd_info, inode->i_bdev, file->f_mode);
 }
 
 static int gdrom_bdops_release(struct inode *inode, struct file *file)
 {
-	return cdrom_release(gd.cd_info, file);
+	return cdrom_release(gd.cd_info, file ? file->f_mode : 0);
 }
 
 static int gdrom_bdops_mediachanged(struct gendisk *disk)
@@ -508,7 +508,8 @@ static int gdrom_bdops_mediachanged(struct gendisk *disk)
 static int gdrom_bdops_ioctl(struct inode *inode, struct file *file,
 	unsigned cmd, unsigned long arg)
 {
-	return cdrom_ioctl(file, gd.cd_info, inode, cmd, arg);
+	return cdrom_ioctl(gd.cd_info, inode->i_bdev,
+			file ? file->f_mode : 0, cmd, arg);
 }
 
 static struct block_device_operations gdrom_bdops = {
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 031e0e1a1a3b..abc4079c3f41 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -154,20 +154,22 @@ static const struct file_operations proc_viocd_operations = {
 static int viocd_blk_open(struct inode *inode, struct file *file)
 {
 	struct disk_info *di = inode->i_bdev->bd_disk->private_data;
-	return cdrom_open(&di->viocd_info, inode, file);
+	return cdrom_open(&di->viocd_info, inode->i_bdev, file->f_mode);
 }
 
 static int viocd_blk_release(struct inode *inode, struct file *file)
 {
 	struct disk_info *di = inode->i_bdev->bd_disk->private_data;
-	return cdrom_release(&di->viocd_info, file);
+	cdrom_release(&di->viocd_info, file ? file->f_mode : 0);
+	return 0;
 }
 
 static int viocd_blk_ioctl(struct inode *inode, struct file *file,
 		unsigned cmd, unsigned long arg)
 {
 	struct disk_info *di = inode->i_bdev->bd_disk->private_data;
-	return cdrom_ioctl(file, &di->viocd_info, inode, cmd, arg);
+	return cdrom_ioctl(&di->viocd_info, inode->i_bdev,
+			   file ? file->f_mode : 0, cmd, arg);
 }
 
 static int viocd_blk_media_changed(struct gendisk *disk)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index cd21b34fe509..87d90200b169 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -2099,7 +2099,7 @@ static int idecd_open(struct inode *inode, struct file *file)
 	if (!info)
 		return -ENXIO;
 
-	rc = cdrom_open(&info->devinfo, inode, file);
+	rc = cdrom_open(&info->devinfo, inode->i_bdev, file->f_mode);
 
 	if (rc < 0)
 		ide_cd_put(info);
@@ -2112,7 +2112,7 @@ static int idecd_release(struct inode *inode, struct file *file)
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
 
-	cdrom_release(&info->devinfo, file);
+	cdrom_release(&info->devinfo, file ? file->f_mode : 0);
 
 	ide_cd_put(info);
 
@@ -2176,7 +2176,8 @@ static int idecd_ioctl(struct inode *inode, struct file *file,
 
 	err = generic_ide_ioctl(info->drive, bdev, cmd, arg);
 	if (err == -EINVAL)
-		err = cdrom_ioctl(file, &info->devinfo, inode, cmd, arg);
+		err = cdrom_ioctl(&info->devinfo, bdev,
+				  file ? file->f_mode : 0, cmd, arg);
 
 	return err;
 }
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0f17009c99d2..b92e2dac9aa9 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -480,7 +480,7 @@ static int sr_block_open(struct inode *inode, struct file *file)
 	if(!(cd = scsi_cd_get(disk)))
 		return -ENXIO;
 
-	if((ret = cdrom_open(&cd->cdi, inode, file)) != 0)
+	if((ret = cdrom_open(&cd->cdi, inode->i_bdev, file->f_mode)) != 0)
 		scsi_cd_put(cd);
 
 	return ret;
@@ -488,12 +488,8 @@ static int sr_block_open(struct inode *inode, struct file *file)
 
 static int sr_block_release(struct inode *inode, struct file *file)
 {
-	int ret;
 	struct scsi_cd *cd = scsi_cd(inode->i_bdev->bd_disk);
-	ret = cdrom_release(&cd->cdi, file);
-	if(ret)
-		return ret;
-	
+	cdrom_release(&cd->cdi, file ? file->f_mode : 0);
 	scsi_cd_put(cd);
 
 	return 0;
@@ -517,7 +513,8 @@ static int sr_block_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		return scsi_ioctl(sdev, cmd, argp);
 	}
 
-	ret = cdrom_ioctl(file, &cd->cdi, inode, cmd, arg);
+	ret = cdrom_ioctl(&cd->cdi, inode->i_bdev,
+			  file ? file->f_mode : 0, cmd, arg);
 	if (ret != -ENOSYS)
 		return ret;
 
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index 5db265ea60f6..0b49e08d3cb0 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -987,11 +987,11 @@ struct cdrom_device_ops {
 };
 
 /* the general block_device operations structure: */
-extern int cdrom_open(struct cdrom_device_info *cdi, struct inode *ip,
-			struct file *fp);
-extern int cdrom_release(struct cdrom_device_info *cdi, struct file *fp);
-extern int cdrom_ioctl(struct file *file, struct cdrom_device_info *cdi,
-		struct inode *ip, unsigned int cmd, unsigned long arg);
+extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev,
+			fmode_t mode);
+extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode);
+extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
+		       fmode_t mode, unsigned int cmd, unsigned long arg);
 extern int cdrom_media_changed(struct cdrom_device_info *);
 
 extern int register_cdrom(struct cdrom_device_info *cdi);
-- 
cgit v1.2.3


From 633a08b81206122469365b4c72eaeb71f04f2cb4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 29 Aug 2007 20:34:12 -0400
Subject: [PATCH] introduce __blkdev_driver_ioctl()

Analog of blkdev_driver_ioctl() with sane arguments.  For now it uses
a fake struct file; by the end of the series it won't, and
blkdev_driver_ioctl() will become a wrapper around it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/ioctl.c           | 31 +++++++++++++++++++++++++++++++
 drivers/block/pktcdvd.c |  4 ++--
 drivers/md/dm-linear.c  | 10 +---------
 drivers/md/dm-mpath.c   | 11 +++--------
 include/linux/blkdev.h  |  2 ++
 5 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 38bee321e1fa..9a26ace6d042 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -283,6 +283,37 @@ int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 }
 EXPORT_SYMBOL_GPL(blkdev_driver_ioctl);
 
+int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned cmd, unsigned long arg)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	int ret;
+	/* you bet it'll go away by the end of patch series */
+	struct file fake_file = {};
+	struct dentry fake_dentry = {};
+	fake_file.f_mode = mode;
+	fake_file.f_path.dentry = &fake_dentry;
+	fake_dentry.d_inode = bdev->bd_inode;
+
+	if (disk->fops->unlocked_ioctl)
+		return disk->fops->unlocked_ioctl(&fake_file, cmd, arg);
+
+	if (disk->fops->ioctl) {
+		lock_kernel();
+		ret = disk->fops->ioctl(bdev->bd_inode, &fake_file, cmd, arg);
+		unlock_kernel();
+		return ret;
+	}
+
+	return -ENOTTY;
+}
+/*
+ * For the record: _GPL here is only because somebody decided to slap it
+ * on the previous export.  Sheer idiocy, since it wasn't copyrightable
+ * at all and could be open-coded without any exports by anybody who cares.
+ */
+EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
+
 /*
  * always keep this in sync with compat_blkdev_ioctl() and
  * compat_blkdev_locked_ioctl()
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index bdd49abcb546..a0ba4023953b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2819,8 +2819,8 @@ static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, u
 	case CDROM_LAST_WRITTEN:
 	case CDROM_SEND_PACKET:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return blkdev_driver_ioctl(pd->bdev->bd_inode, pd->bdev->bd_disk,
-					file, cmd, arg);
+		return __blkdev_driver_ioctl(pd->bdev, file ? file->f_mode : 0,
+					cmd, arg);
 
 	default:
 		VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index fa358385eed3..373442b1e98f 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -114,15 +114,7 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
 			unsigned long arg)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
-	struct block_device *bdev = lc->dev->bdev;
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-
-	fake_file.f_mode = lc->dev->mode;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
+	return __blkdev_driver_ioctl(lc->dev->bdev, lc->dev->mode, cmd, arg);
 }
 
 static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c681d5e5f45c..d85c65a46433 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1400,13 +1400,10 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 {
 	struct multipath *m = (struct multipath *) ti->private;
 	struct block_device *bdev = NULL;
+	fmode_t mode = 0;
 	unsigned long flags;
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
 	int r = 0;
 
-	fake_file.f_path.dentry = &fake_dentry;
-
 	spin_lock_irqsave(&m->lock, flags);
 
 	if (!m->current_pgpath)
@@ -1414,8 +1411,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
 	if (m->current_pgpath) {
 		bdev = m->current_pgpath->path.dev->bdev;
-		fake_dentry.d_inode = bdev->bd_inode;
-		fake_file.f_mode = m->current_pgpath->path.dev->mode;
+		mode = m->current_pgpath->path.dev->mode;
 	}
 
 	if (m->queue_io)
@@ -1425,8 +1421,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
-					 bdev->bd_disk, cmd, arg);
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
 
 /*-----------------------------------------------------------------
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 48ec8862a11a..2bad616b9949 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1074,6 +1074,8 @@ struct block_device_operations {
 	struct module *owner;
 };
 
+extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
+				 unsigned long);
 #else /* CONFIG_BLOCK */
 /*
  * stubs for when the block layer is configured out
-- 
cgit v1.2.3


From badf8082c33d18b118d3a6f1b32d5ea6b97d3839 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 16 Oct 2008 10:23:20 -0400
Subject: [PATCH] switch ide_disk_ops ->ioctl() to sane prototype

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/ide/ide-disk.h         |  2 +-
 drivers/ide/ide-disk_ioctl.c   |  3 +--
 drivers/ide/ide-floppy.h       |  4 ++--
 drivers/ide/ide-floppy_ioctl.c | 13 ++++++-------
 drivers/ide/ide-gd.c           |  2 +-
 include/linux/ide.h            |  4 ++--
 6 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-disk.h b/drivers/ide/ide-disk.h
index b234b0feaf7b..d511dab7c4aa 100644
--- a/drivers/ide/ide-disk.h
+++ b/drivers/ide/ide-disk.h
@@ -13,7 +13,7 @@ ide_decl_devset(wcache);
 ide_decl_devset(acoustic);
 
 /* ide-disk_ioctl.c */
-int ide_disk_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
+int ide_disk_ioctl(ide_drive_t *, struct block_device *, fmode_t, unsigned int,
 		   unsigned long);
 
 #ifdef CONFIG_IDE_PROC_FS
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
index 41832af400d6..7b783dd7c0be 100644
--- a/drivers/ide/ide-disk_ioctl.c
+++ b/drivers/ide/ide-disk_ioctl.c
@@ -13,10 +13,9 @@ static const struct ide_ioctl_devset ide_disk_ioctl_settings[] = {
 { 0 }
 };
 
-int ide_disk_ioctl(ide_drive_t *drive, struct inode *inode, struct file *file,
+int ide_disk_ioctl(ide_drive_t *drive, struct block_device *bdev, fmode_t mode,
 		   unsigned int cmd, unsigned long arg)
 {
-	struct block_device *bdev = inode->i_bdev;
 	int err;
 
 	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h
index c17124dd6079..6dd2beb48434 100644
--- a/drivers/ide/ide-floppy.h
+++ b/drivers/ide/ide-floppy.h
@@ -23,8 +23,8 @@ void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8);
 void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *);
 
 /* ide-floppy_ioctl.c */
-int ide_floppy_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int,
-		     unsigned long);
+int ide_floppy_ioctl(ide_drive_t *, struct block_device *, fmode_t,
+		     unsigned int, unsigned long);
 
 #ifdef CONFIG_IDE_PROC_FS
 /* ide-floppy_proc.c */
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 5af70a2c9ef8..2bc51ff73fee 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -241,7 +241,7 @@ static int ide_floppy_lockdoor(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	return 0;
 }
 
-static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
+static int ide_floppy_format_ioctl(ide_drive_t *drive, fmode_t mode,
 				   unsigned int cmd, void __user *argp)
 {
 	switch (cmd) {
@@ -250,7 +250,7 @@ static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
 	case IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY:
 		return ide_floppy_get_format_capacities(drive, argp);
 	case IDEFLOPPY_IOCTL_FORMAT_START:
-		if (!(file->f_mode & FMODE_WRITE))
+		if (!(mode & FMODE_WRITE))
 			return -EPERM;
 		return ide_floppy_format_unit(drive, (int __user *)argp);
 	case IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS:
@@ -260,10 +260,9 @@ static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file,
 	}
 }
 
-int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
-		     struct file *file, unsigned int cmd, unsigned long arg)
+int ide_floppy_ioctl(ide_drive_t *drive, struct block_device *bdev,
+		     fmode_t mode, unsigned int cmd, unsigned long arg)
 {
-	struct block_device *bdev = inode->i_bdev;
 	struct ide_atapi_pc pc;
 	void __user *argp = (void __user *)arg;
 	int err;
@@ -271,7 +270,7 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
 	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR)
 		return ide_floppy_lockdoor(drive, &pc, arg, cmd);
 
-	err = ide_floppy_format_ioctl(drive, file, cmd, argp);
+	err = ide_floppy_format_ioctl(drive, mode, cmd, argp);
 	if (err != -ENOTTY)
 		return err;
 
@@ -281,7 +280,7 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode,
 	 */
 	if (cmd != CDROM_SEND_PACKET && cmd != SCSI_IOCTL_SEND_COMMAND)
 		err = scsi_cmd_ioctl(bdev->bd_disk->queue, bdev->bd_disk,
-				file ? file->f_mode : 0, cmd, argp);
+				mode, cmd, argp);
 
 	if (err == -ENOTTY)
 		err = generic_ide_ioctl(drive, bdev, cmd, arg);
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 66bbb0a22f57..948af08abe23 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -293,7 +293,7 @@ static int ide_gd_ioctl(struct inode *inode, struct file *file,
 	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 
-	return drive->disk_ops->ioctl(drive, inode, file, cmd, arg);
+	return drive->disk_ops->ioctl(drive, bdev, file ? file->f_mode : 0, cmd, arg);
 }
 
 static struct block_device_operations ide_gd_ops = {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 0d03e83f7194..54525be4b5f8 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -474,8 +474,8 @@ struct ide_disk_ops {
 	ide_startstop_t	(*do_request)(struct ide_drive_s *, struct request *,
 				      sector_t);
 	int		(*end_request)(struct ide_drive_s *, int, int);
-	int		(*ioctl)(struct ide_drive_s *, struct inode *,
-				 struct file *, unsigned int, unsigned long);
+	int		(*ioctl)(struct ide_drive_s *, struct block_device *,
+				 fmode_t, unsigned int, unsigned long);
 };
 
 /* ATAPI device flags */
-- 
cgit v1.2.3


From d4430d62fa77208824a37fe6f85ab2831d274769 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Mar 2008 09:09:22 -0500
Subject: [PATCH] beginning of methods conversion

To keep the size of changesets sane we split the switch by drivers;
to keep the damn thing bisectable we do the following:
	1) rename the affected methods, add ones with correct
prototypes, make (few) callers handle both.  That's this changeset.
	2) for each driver convert to new methods.  *ALL* drivers
are converted in this series.
	3) kill the old (renamed) methods.

Note that it _is_ a flagday; all in-tree drivers are converted and by the
end of this series no trace of the old methods remains.  The only reason why
we do it this way is to keep the damn thing bisectable and allow per-driver
debugging if anything goes wrong.

New methods:
	open(bdev, mode)
	release(disk, mode)
	ioctl(bdev, mode, cmd, arg)		/* Called without BKL */
	compat_ioctl(bdev, mode, cmd, arg)
	locked_ioctl(bdev, mode, cmd, arg)	/* Called with BKL, legacy */

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/drivers/ubd_kern.c          |  6 +++---
 block/compat_ioctl.c                | 17 +++++++++--------
 block/ioctl.c                       | 35 ++++++++++++++++++++++++++---------
 drivers/block/DAC960.c              |  2 +-
 drivers/block/amiflop.c             |  6 +++---
 drivers/block/aoe/aoeblk.c          |  4 ++--
 drivers/block/ataflop.c             |  6 +++---
 drivers/block/brd.c                 |  2 +-
 drivers/block/cciss.c               |  8 ++++----
 drivers/block/cpqarray.c            |  6 +++---
 drivers/block/floppy.c              |  6 +++---
 drivers/block/loop.c                |  8 ++++----
 drivers/block/nbd.c                 |  2 +-
 drivers/block/paride/pcd.c          |  6 +++---
 drivers/block/paride/pd.c           |  6 +++---
 drivers/block/paride/pf.c           |  6 +++---
 drivers/block/pktcdvd.c             |  6 +++---
 drivers/block/swim3.c               |  6 +++---
 drivers/block/ub.c                  |  6 +++---
 drivers/block/viodasd.c             |  4 ++--
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xd.c                  |  2 +-
 drivers/block/xen-blkfront.c        |  4 ++--
 drivers/block/xsysace.c             |  4 ++--
 drivers/block/z2ram.c               |  4 ++--
 drivers/cdrom/gdrom.c               |  6 +++---
 drivers/cdrom/viocd.c               |  6 +++---
 drivers/ide/ide-cd.c                |  6 +++---
 drivers/ide/ide-gd.c                |  6 +++---
 drivers/ide/ide-tape.c              |  6 +++---
 drivers/md/dm.c                     |  6 +++---
 drivers/md/md.c                     |  6 +++---
 drivers/memstick/core/mspro_block.c |  4 ++--
 drivers/message/i2o/i2o_block.c     |  6 +++---
 drivers/mmc/card/block.c            |  4 ++--
 drivers/mtd/mtd_blkdevs.c           |  6 +++---
 drivers/s390/block/dasd.c           |  8 ++++----
 drivers/s390/block/dcssblk.c        |  4 ++--
 drivers/s390/char/tape_block.c      |  6 +++---
 drivers/scsi/ide-scsi.c             |  6 +++---
 drivers/scsi/sd.c                   |  8 ++++----
 drivers/scsi/sr.c                   |  6 +++---
 fs/block_dev.c                      | 18 +++++++++++++++---
 include/linux/blkdev.h              | 15 ++++++++++-----
 include/linux/fs.h                  |  1 +
 45 files changed, 167 insertions(+), 131 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index b58fb8941d8d..72569cc3cbb7 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -108,9 +108,9 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static struct block_device_operations ubd_blops = {
         .owner		= THIS_MODULE,
-        .open		= ubd_open,
-        .release	= ubd_release,
-        .ioctl		= ubd_ioctl,
+        .__open		= ubd_open,
+        .__release	= ubd_release,
+        .__ioctl		= ubd_ioctl,
 	.getgeo		= ubd_getgeo,
 };
 
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 1e559fba7bdf..576c4fd15463 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -708,17 +708,17 @@ static int compat_blkdev_driver_ioctl(struct inode *inode, struct file *file,
 		return -ENOIOCTLCMD;
 	}
 
-	if (disk->fops->unlocked_ioctl)
-		return disk->fops->unlocked_ioctl(file, cmd, arg);
+	if (disk->fops->__unlocked_ioctl)
+		return disk->fops->__unlocked_ioctl(file, cmd, arg);
 
-	if (disk->fops->ioctl) {
+	if (disk->fops->__ioctl) {
 		lock_kernel();
-		ret = disk->fops->ioctl(inode, file, cmd, arg);
+		ret = disk->fops->__ioctl(inode, file, cmd, arg);
 		unlock_kernel();
 		return ret;
 	}
 
-	return -ENOTTY;
+	return __blkdev_driver_ioctl(inode->i_bdev, file->f_mode, cmd, arg);
 }
 
 static int compat_blkdev_locked_ioctl(struct inode *inode, struct file *file,
@@ -805,10 +805,11 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 
 	lock_kernel();
 	ret = compat_blkdev_locked_ioctl(inode, file, bdev, cmd, arg);
-	/* FIXME: why do we assume -> compat_ioctl needs the BKL? */
-	if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
-		ret = disk->fops->compat_ioctl(file, cmd, arg);
+	if (ret == -ENOIOCTLCMD && disk->fops->__compat_ioctl)
+		ret = disk->fops->__compat_ioctl(file, cmd, arg);
 	unlock_kernel();
+	if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
+		ret = disk->fops->compat_ioctl(bdev, file->f_mode, cmd, arg);
 
 	if (ret != -ENOIOCTLCMD)
 		return ret;
diff --git a/block/ioctl.c b/block/ioctl.c
index 9a26ace6d042..01ff463bc801 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -269,17 +269,24 @@ int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 			struct gendisk *disk, unsigned cmd, unsigned long arg)
 {
 	int ret;
-	if (disk->fops->unlocked_ioctl)
-		return disk->fops->unlocked_ioctl(file, cmd, arg);
+	fmode_t mode = 0;
+	if (file) {
+		mode = file->f_mode;
+		if (file->f_flags & O_NDELAY)
+			mode |= FMODE_NDELAY_NOW;
+	}
+
+	if (disk->fops->__unlocked_ioctl)
+		return disk->fops->__unlocked_ioctl(file, cmd, arg);
 
-	if (disk->fops->ioctl) {
+	if (disk->fops->__ioctl) {
 		lock_kernel();
-		ret = disk->fops->ioctl(inode, file, cmd, arg);
+		ret = disk->fops->__ioctl(inode, file, cmd, arg);
 		unlock_kernel();
 		return ret;
 	}
 
-	return -ENOTTY;
+	return __blkdev_driver_ioctl(inode->i_bdev, mode, cmd, arg);
 }
 EXPORT_SYMBOL_GPL(blkdev_driver_ioctl);
 
@@ -295,12 +302,22 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 	fake_file.f_path.dentry = &fake_dentry;
 	fake_dentry.d_inode = bdev->bd_inode;
 
-	if (disk->fops->unlocked_ioctl)
-		return disk->fops->unlocked_ioctl(&fake_file, cmd, arg);
+	if (disk->fops->__unlocked_ioctl)
+		return disk->fops->__unlocked_ioctl(&fake_file, cmd, arg);
+
+	if (disk->fops->__ioctl) {
+		lock_kernel();
+		ret = disk->fops->__ioctl(bdev->bd_inode, &fake_file, cmd, arg);
+		unlock_kernel();
+		return ret;
+	}
+
+	if (disk->fops->ioctl)
+		return disk->fops->ioctl(bdev, mode, cmd, arg);
 
-	if (disk->fops->ioctl) {
+	if (disk->fops->locked_ioctl) {
 		lock_kernel();
-		ret = disk->fops->ioctl(bdev->bd_inode, &fake_file, cmd, arg);
+		ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg);
 		unlock_kernel();
 		return ret;
 	}
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index a002a381df92..4b90ebfa6676 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -153,7 +153,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk)
 
 static struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
-	.open			= DAC960_open,
+	.__open			= DAC960_open,
 	.getgeo			= DAC960_getgeo,
 	.media_changed		= DAC960_media_changed,
 	.revalidate_disk	= DAC960_revalidate_disk,
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index d19c5a939fe8..d5da4e3cb2ad 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1648,9 +1648,9 @@ static int amiga_floppy_change(struct gendisk *disk)
 
 static struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
-	.open		= floppy_open,
-	.release	= floppy_release,
-	.ioctl		= fd_ioctl,
+	.__open		= floppy_open,
+	.__release	= floppy_release,
+	.__ioctl		= fd_ioctl,
 	.getgeo		= fd_getgeo,
 	.media_changed	= amiga_floppy_change,
 };
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index d876ad861237..d4d9796d5ddd 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -239,8 +239,8 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 static struct block_device_operations aoe_bdops = {
-	.open = aoeblk_open,
-	.release = aoeblk_release,
+	.__open = aoeblk_open,
+	.__release = aoeblk_release,
 	.getgeo = aoeblk_getgeo,
 	.owner = THIS_MODULE,
 };
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 85d56a26f7c6..30166774327d 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1857,9 +1857,9 @@ static int floppy_release( struct inode * inode, struct file * filp )
 
 static struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
-	.open		= floppy_open,
-	.release	= floppy_release,
-	.ioctl		= fd_ioctl,
+	.__open		= floppy_open,
+	.__release	= floppy_release,
+	.__ioctl		= fd_ioctl,
 	.media_changed	= check_floppy_change,
 	.revalidate_disk= floppy_revalidate,
 };
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index d070d492e385..2ea99f947667 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -376,7 +376,7 @@ static int brd_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
-	.ioctl =		brd_ioctl,
+	.__ioctl =		brd_ioctl,
 #ifdef CONFIG_BLK_DEV_XIP
 	.direct_access =	brd_direct_access,
 #endif
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d9b1c15b8113..781b745181d2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -197,12 +197,12 @@ static long cciss_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg);
 
 static struct block_device_operations cciss_fops = {
 	.owner = THIS_MODULE,
-	.open = cciss_open,
-	.release = cciss_release,
-	.ioctl = cciss_ioctl,
+	.__open = cciss_open,
+	.__release = cciss_release,
+	.__ioctl = cciss_ioctl,
 	.getgeo = cciss_getgeo,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl = cciss_compat_ioctl,
+	.__compat_ioctl = cciss_compat_ioctl,
 #endif
 	.revalidate_disk = cciss_revalidate,
 };
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 3d967525e9a9..b71334b968b6 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -195,9 +195,9 @@ static inline ctlr_info_t *get_host(struct gendisk *disk)
 
 static struct block_device_operations ida_fops  = {
 	.owner		= THIS_MODULE,
-	.open		= ida_open,
-	.release	= ida_release,
-	.ioctl		= ida_ioctl,
+	.__open		= ida_open,
+	.__release	= ida_release,
+	.__ioctl		= ida_ioctl,
 	.getgeo		= ida_getgeo,
 	.revalidate_disk= ida_revalidate,
 };
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5d60c05a736a..72363df58953 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3902,9 +3902,9 @@ static int floppy_revalidate(struct gendisk *disk)
 
 static struct block_device_operations floppy_fops = {
 	.owner			= THIS_MODULE,
-	.open			= floppy_open,
-	.release		= floppy_release,
-	.ioctl			= fd_ioctl,
+	.__open			= floppy_open,
+	.__release		= floppy_release,
+	.__ioctl			= fd_ioctl,
 	.getgeo			= fd_getgeo,
 	.media_changed		= check_floppy_change,
 	.revalidate_disk	= floppy_revalidate,
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d3a25b027ff9..6faca2b7ae37 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1355,11 +1355,11 @@ static int lo_release(struct inode *inode, struct file *file)
 
 static struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
-	.open =		lo_open,
-	.release =	lo_release,
-	.ioctl =	lo_ioctl,
+	.__open =		lo_open,
+	.__release =	lo_release,
+	.__ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl =	lo_compat_ioctl,
+	.__compat_ioctl =	lo_compat_ioctl,
 #endif
 };
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9034ca585afd..36015e0945b1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -691,7 +691,7 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 static struct block_device_operations nbd_fops =
 {
 	.owner =	THIS_MODULE,
-	.ioctl =	nbd_ioctl,
+	.__ioctl =	nbd_ioctl,
 };
 
 /*
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 8bd557e2a659..6e6dcc1d4328 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -252,9 +252,9 @@ static int pcd_block_media_changed(struct gendisk *disk)
 
 static struct block_device_operations pcd_bdops = {
 	.owner		= THIS_MODULE,
-	.open		= pcd_block_open,
-	.release	= pcd_block_release,
-	.ioctl		= pcd_block_ioctl,
+	.__open		= pcd_block_open,
+	.__release	= pcd_block_release,
+	.__ioctl		= pcd_block_ioctl,
 	.media_changed	= pcd_block_media_changed,
 };
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 5fdfa7c888ce..b3023844947c 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -807,9 +807,9 @@ static int pd_revalidate(struct gendisk *p)
 
 static struct block_device_operations pd_fops = {
 	.owner		= THIS_MODULE,
-	.open		= pd_open,
-	.release	= pd_release,
-	.ioctl		= pd_ioctl,
+	.__open		= pd_open,
+	.__release	= pd_release,
+	.__ioctl		= pd_ioctl,
 	.getgeo		= pd_getgeo,
 	.media_changed	= pd_check_media,
 	.revalidate_disk= pd_revalidate
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index a902d84fd330..e08ca5161ad8 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -264,9 +264,9 @@ static char *pf_buf;		/* buffer for request in progress */
 
 static struct block_device_operations pf_fops = {
 	.owner		= THIS_MODULE,
-	.open		= pf_open,
-	.release	= pf_release,
-	.ioctl		= pf_ioctl,
+	.__open		= pf_open,
+	.__release	= pf_release,
+	.__ioctl		= pf_ioctl,
 	.getgeo		= pf_getgeo,
 	.media_changed	= pf_check_media,
 };
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a0ba4023953b..33ac8ddf4912 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2847,9 +2847,9 @@ static int pkt_media_changed(struct gendisk *disk)
 
 static struct block_device_operations pktcdvd_ops = {
 	.owner =		THIS_MODULE,
-	.open =			pkt_open,
-	.release =		pkt_close,
-	.ioctl =		pkt_ioctl,
+	.__open =			pkt_open,
+	.__release =		pkt_close,
+	.__ioctl =		pkt_ioctl,
 	.media_changed =	pkt_media_changed,
 };
 
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 5c45d5556ae8..9398af86a7aa 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -998,9 +998,9 @@ static int floppy_revalidate(struct gendisk *disk)
 }
 
 static struct block_device_operations floppy_fops = {
-	.open		= floppy_open,
-	.release	= floppy_release,
-	.ioctl		= floppy_ioctl,
+	.__open		= floppy_open,
+	.__release	= floppy_release,
+	.__ioctl		= floppy_ioctl,
 	.media_changed	= floppy_check_change,
 	.revalidate_disk= floppy_revalidate,
 };
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index bc04330f3683..5261773407da 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1791,9 +1791,9 @@ static int ub_bd_media_changed(struct gendisk *disk)
 
 static struct block_device_operations ub_bd_fops = {
 	.owner		= THIS_MODULE,
-	.open		= ub_bd_open,
-	.release	= ub_bd_release,
-	.ioctl		= ub_bd_ioctl,
+	.__open		= ub_bd_open,
+	.__release	= ub_bd_release,
+	.__ioctl		= ub_bd_ioctl,
 	.media_changed	= ub_bd_media_changed,
 	.revalidate_disk = ub_bd_revalidate,
 };
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 1730d29e6044..7f7beec29ebb 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -221,8 +221,8 @@ static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
  */
 static struct block_device_operations viodasd_fops = {
 	.owner = THIS_MODULE,
-	.open = viodasd_open,
-	.release = viodasd_release,
+	.__open = viodasd_open,
+	.__release = viodasd_release,
 	.getgeo = viodasd_getgeo,
 };
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7643cd16fd67..10f157ea7b0b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -180,7 +180,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 }
 
 static struct block_device_operations virtblk_fops = {
-	.ioctl  = virtblk_ioctl,
+	.__ioctl  = virtblk_ioctl,
 	.owner  = THIS_MODULE,
 	.getgeo = virtblk_getgeo,
 };
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 624d30f7da3f..316fa1da4b9c 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -132,7 +132,7 @@ static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static struct block_device_operations xd_fops = {
 	.owner	= THIS_MODULE,
-	.ioctl	= xd_ioctl,
+	.__ioctl	= xd_ioctl,
 	.getgeo = xd_getgeo,
 };
 static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 1a50ae70f716..7efac80c8dde 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1041,8 +1041,8 @@ static int blkif_release(struct inode *inode, struct file *filep)
 static struct block_device_operations xlvbd_block_fops =
 {
 	.owner = THIS_MODULE,
-	.open = blkif_open,
-	.release = blkif_release,
+	.__open = blkif_open,
+	.__release = blkif_release,
 	.getgeo = blkif_getgeo,
 	.ioctl = blkif_ioctl,
 };
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 4a7a059ebaf7..e4efe5b7ec22 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -919,8 +919,8 @@ static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 static struct block_device_operations ace_fops = {
 	.owner = THIS_MODULE,
-	.open = ace_open,
-	.release = ace_release,
+	.__open = ace_open,
+	.__release = ace_release,
 	.media_changed = ace_media_changed,
 	.revalidate_disk = ace_revalidate_disk,
 	.getgeo = ace_getgeo,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index be20a67f1fa8..4860d0f36870 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -314,8 +314,8 @@ z2_release( struct inode *inode, struct file *filp )
 static struct block_device_operations z2_fops =
 {
 	.owner		= THIS_MODULE,
-	.open		= z2_open,
-	.release	= z2_release,
+	.__open		= z2_open,
+	.__release	= z2_release,
 };
 
 static struct kobject *z2_find(dev_t dev, int *part, void *data)
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 0959edf2afdb..ab0c637f58be 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -514,10 +514,10 @@ static int gdrom_bdops_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations gdrom_bdops = {
 	.owner			= THIS_MODULE,
-	.open			= gdrom_bdops_open,
-	.release		= gdrom_bdops_release,
+	.__open			= gdrom_bdops_open,
+	.__release		= gdrom_bdops_release,
 	.media_changed		= gdrom_bdops_mediachanged,
-	.ioctl			= gdrom_bdops_ioctl,
+	.__ioctl			= gdrom_bdops_ioctl,
 };
 
 static irqreturn_t gdrom_command_interrupt(int irq, void *dev_id)
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index abc4079c3f41..57c2dced3e9d 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -180,9 +180,9 @@ static int viocd_blk_media_changed(struct gendisk *disk)
 
 struct block_device_operations viocd_fops = {
 	.owner =		THIS_MODULE,
-	.open =			viocd_blk_open,
-	.release =		viocd_blk_release,
-	.ioctl =		viocd_blk_ioctl,
+	.__open =			viocd_blk_open,
+	.__release =		viocd_blk_release,
+	.__ioctl =		viocd_blk_ioctl,
 	.media_changed =	viocd_blk_media_changed,
 };
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 87d90200b169..3533984355a6 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -2200,9 +2200,9 @@ static int idecd_revalidate_disk(struct gendisk *disk)
 
 static struct block_device_operations idecd_ops = {
 	.owner			= THIS_MODULE,
-	.open			= idecd_open,
-	.release		= idecd_release,
-	.ioctl			= idecd_ioctl,
+	.__open			= idecd_open,
+	.__release		= idecd_release,
+	.__ioctl			= idecd_ioctl,
 	.media_changed		= idecd_media_changed,
 	.revalidate_disk	= idecd_revalidate_disk
 };
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 948af08abe23..d118bbed7cd3 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -298,9 +298,9 @@ static int ide_gd_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations ide_gd_ops = {
 	.owner			= THIS_MODULE,
-	.open			= ide_gd_open,
-	.release		= ide_gd_release,
-	.ioctl			= ide_gd_ioctl,
+	.__open			= ide_gd_open,
+	.__release		= ide_gd_release,
+	.__ioctl			= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
 	.media_changed		= ide_gd_media_changed,
 	.revalidate_disk	= ide_gd_revalidate_disk
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2b263281ffea..c5df53c4838c 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -2376,9 +2376,9 @@ static int idetape_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations idetape_block_ops = {
 	.owner		= THIS_MODULE,
-	.open		= idetape_open,
-	.release	= idetape_release,
-	.ioctl		= idetape_ioctl,
+	.__open		= idetape_open,
+	.__release	= idetape_release,
+	.__ioctl		= idetape_ioctl,
 };
 
 static int ide_tape_probe(ide_drive_t *drive)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5f0f4c8bcd3e..8b4c92b1b6db 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1698,9 +1698,9 @@ int dm_noflush_suspending(struct dm_target *ti)
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
 static struct block_device_operations dm_blk_dops = {
-	.open = dm_blk_open,
-	.release = dm_blk_close,
-	.ioctl = dm_blk_ioctl,
+	.__open = dm_blk_open,
+	.__release = dm_blk_close,
+	.__ioctl = dm_blk_ioctl,
 	.getgeo = dm_blk_getgeo,
 	.owner = THIS_MODULE
 };
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aaa3d465de4e..21b04d39ba3b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5046,9 +5046,9 @@ static int md_revalidate(struct gendisk *disk)
 static struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
-	.open		= md_open,
-	.release	= md_release,
-	.ioctl		= md_ioctl,
+	.__open		= md_open,
+	.__release	= md_release,
+	.__ioctl		= md_ioctl,
 	.getgeo		= md_getgeo,
 	.media_changed	= md_media_changed,
 	.revalidate_disk= md_revalidate,
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 5263913e0c69..fbe5919789d0 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -237,8 +237,8 @@ static int mspro_block_bd_getgeo(struct block_device *bdev,
 }
 
 static struct block_device_operations ms_block_bdops = {
-	.open    = mspro_block_bd_open,
-	.release = mspro_block_bd_release,
+	.__open    = mspro_block_bd_open,
+	.__release = mspro_block_bd_release,
 	.getgeo  = mspro_block_bd_getgeo,
 	.owner   = THIS_MODULE
 };
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 81483de8c0fd..71500dda8eb3 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -931,9 +931,9 @@ static void i2o_block_request_fn(struct request_queue *q)
 /* I2O Block device operations definition */
 static struct block_device_operations i2o_block_fops = {
 	.owner = THIS_MODULE,
-	.open = i2o_block_open,
-	.release = i2o_block_release,
-	.ioctl = i2o_block_ioctl,
+	.__open = i2o_block_open,
+	.__release = i2o_block_release,
+	.__ioctl = i2o_block_ioctl,
 	.getgeo = i2o_block_getgeo,
 	.media_changed = i2o_block_media_changed
 };
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 24c97d3d16bb..8cba06f5e11d 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -130,8 +130,8 @@ mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 static struct block_device_operations mmc_bdops = {
-	.open			= mmc_blk_open,
-	.release		= mmc_blk_release,
+	.__open			= mmc_blk_open,
+	.__release		= mmc_blk_release,
 	.getgeo			= mmc_blk_getgeo,
 	.owner			= THIS_MODULE,
 };
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 681d5aca2af4..b00d07c53753 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -213,9 +213,9 @@ static int blktrans_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations mtd_blktrans_ops = {
 	.owner		= THIS_MODULE,
-	.open		= blktrans_open,
-	.release	= blktrans_release,
-	.ioctl		= blktrans_ioctl,
+	.__open		= blktrans_open,
+	.__release	= blktrans_release,
+	.__ioctl		= blktrans_ioctl,
 	.getgeo		= blktrans_getgeo,
 };
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 0a225ccda026..6bf68e5fe89f 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2087,10 +2087,10 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 struct block_device_operations
 dasd_device_operations = {
 	.owner		= THIS_MODULE,
-	.open		= dasd_open,
-	.release	= dasd_release,
-	.ioctl		= dasd_ioctl,
-	.compat_ioctl	= dasd_compat_ioctl,
+	.__open		= dasd_open,
+	.__release	= dasd_release,
+	.__ioctl		= dasd_ioctl,
+	.__compat_ioctl	= dasd_compat_ioctl,
 	.getgeo		= dasd_getgeo,
 };
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index a7ff167d5b81..413460cc3dd8 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -42,8 +42,8 @@ static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 static int dcssblk_major;
 static struct block_device_operations dcssblk_devops = {
 	.owner   	= THIS_MODULE,
-	.open    	= dcssblk_open,
-	.release 	= dcssblk_release,
+	.__open    	= dcssblk_open,
+	.__release 	= dcssblk_release,
 	.direct_access 	= dcssblk_direct_access,
 };
 
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index a25b8bf54f41..f1a741c9a6f0 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -52,9 +52,9 @@ static int tapeblock_revalidate_disk(struct gendisk *);
 
 static struct block_device_operations tapeblock_fops = {
 	.owner		 = THIS_MODULE,
-	.open		 = tapeblock_open,
-	.release	 = tapeblock_release,
-	.ioctl           = tapeblock_ioctl,
+	.__open		 = tapeblock_open,
+	.__release	 = tapeblock_release,
+	.__ioctl           = tapeblock_ioctl,
 	.media_changed   = tapeblock_medium_changed,
 	.revalidate_disk = tapeblock_revalidate_disk,
 };
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 5bcc04e82c28..9069afbad9d3 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -483,9 +483,9 @@ static int idescsi_ide_ioctl(struct inode *inode, struct file *file,
 
 static struct block_device_operations idescsi_ops = {
 	.owner		= THIS_MODULE,
-	.open		= idescsi_ide_open,
-	.release	= idescsi_ide_release,
-	.ioctl		= idescsi_ide_ioctl,
+	.__open		= idescsi_ide_open,
+	.__release	= idescsi_ide_release,
+	.__ioctl		= idescsi_ide_ioctl,
 };
 
 static int idescsi_slave_configure(struct scsi_device * sdp)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5a18528a69d0..c8b95e8d2859 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -962,12 +962,12 @@ static long sd_compat_ioctl(struct file *file, unsigned int cmd, unsigned long a
 
 static struct block_device_operations sd_fops = {
 	.owner			= THIS_MODULE,
-	.open			= sd_open,
-	.release		= sd_release,
-	.ioctl			= sd_ioctl,
+	.__open			= sd_open,
+	.__release		= sd_release,
+	.__ioctl			= sd_ioctl,
 	.getgeo			= sd_getgeo,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl		= sd_compat_ioctl,
+	.__compat_ioctl		= sd_compat_ioctl,
 #endif
 	.media_changed		= sd_media_changed,
 	.revalidate_disk	= sd_revalidate_disk,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 2fb8d4d2d6f6..9446cbf4de84 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -540,9 +540,9 @@ static int sr_block_media_changed(struct gendisk *disk)
 static struct block_device_operations sr_bdops =
 {
 	.owner		= THIS_MODULE,
-	.open		= sr_block_open,
-	.release	= sr_block_release,
-	.ioctl		= sr_block_ioctl,
+	.__open		= sr_block_open,
+	.__release	= sr_block_release,
+	.__ioctl		= sr_block_ioctl,
 	.media_changed	= sr_block_media_changed,
 	/* 
 	 * No compat_ioctl for now because sr_block_ioctl never
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b9022694e9f7..73b6ce47c861 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,8 +1033,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
+			if (disk->fops->__open) {
+				ret = disk->fops->__open(bdev->bd_inode, file);
+				if (ret)
+					goto out_first;
+			}
 			if (disk->fops->open) {
-				ret = disk->fops->open(bdev->bd_inode, file);
+				ret = disk->fops->open(bdev, file->f_mode);
 				if (ret)
 					goto out_clear;
 			}
@@ -1074,8 +1079,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
+			if (bdev->bd_disk->fops->__open) {
+				ret = bdev->bd_disk->fops->__open(bdev->bd_inode, file);
+				if (ret)
+					goto out;
+			}
 			if (bdev->bd_disk->fops->open) {
-				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
+				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
 				if (ret)
 					goto out_unlock_bdev;
 			}
@@ -1184,8 +1194,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		kill_bdev(bdev);
 	}
 	if (bdev->bd_contains == bdev) {
+		if (disk->fops->__release)
+			ret = disk->fops->__release(bd_inode, NULL);
 		if (disk->fops->release)
-			ret = disk->fops->release(bd_inode, NULL);
+			ret = disk->fops->release(disk, 0);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2bad616b9949..b573186ff1a1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1061,11 +1061,16 @@ struct file;
 struct inode;
 
 struct block_device_operations {
-	int (*open) (struct inode *, struct file *);
-	int (*release) (struct inode *, struct file *);
-	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
-	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
-	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
+	int (*__open) (struct inode *, struct file *);
+	int (*__release) (struct inode *, struct file *);
+	int (*__ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+	long (*__unlocked_ioctl) (struct file *, unsigned, unsigned long);
+	long (*__compat_ioctl) (struct file *, unsigned, unsigned long);
+	int (*open) (struct block_device *, fmode_t);
+	int (*release) (struct gendisk *, fmode_t);
+	int (*locked_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*direct_access) (struct block_device *, sector_t,
 						void **, unsigned long *);
 	int (*media_changed) (struct gendisk *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58bbf689fef7..b5894604ba5e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -79,6 +79,7 @@ extern int dir_notify_enable;
 #define FMODE_NDELAY	((__force fmode_t)32)
 #define FMODE_EXCL	((__force fmode_t)64)
 #define FMODE_WRITE_IOCTL	((__force fmode_t)128)
+#define FMODE_NDELAY_NOW	((__force fmode_t)256)
 
 #define RW_MASK		1
 #define RWA_MASK	2
-- 
cgit v1.2.3


From 90b8f2824ce68dd87d304641a1d5a048dfff39f5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Mar 2008 10:43:36 -0500
Subject: [PATCH] end of methods switch: remove the old ones

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/compat_ioctl.c   | 12 ------------
 block/ioctl.c          | 26 --------------------------
 fs/block_dev.c         | 13 -------------
 include/linux/blkdev.h |  8 --------
 4 files changed, 59 deletions(-)

(limited to 'include/linux')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 576c4fd15463..fd537fdb25a3 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -708,16 +708,6 @@ static int compat_blkdev_driver_ioctl(struct inode *inode, struct file *file,
 		return -ENOIOCTLCMD;
 	}
 
-	if (disk->fops->__unlocked_ioctl)
-		return disk->fops->__unlocked_ioctl(file, cmd, arg);
-
-	if (disk->fops->__ioctl) {
-		lock_kernel();
-		ret = disk->fops->__ioctl(inode, file, cmd, arg);
-		unlock_kernel();
-		return ret;
-	}
-
 	return __blkdev_driver_ioctl(inode->i_bdev, file->f_mode, cmd, arg);
 }
 
@@ -805,8 +795,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 
 	lock_kernel();
 	ret = compat_blkdev_locked_ioctl(inode, file, bdev, cmd, arg);
-	if (ret == -ENOIOCTLCMD && disk->fops->__compat_ioctl)
-		ret = disk->fops->__compat_ioctl(file, cmd, arg);
 	unlock_kernel();
 	if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
 		ret = disk->fops->compat_ioctl(bdev, file->f_mode, cmd, arg);
diff --git a/block/ioctl.c b/block/ioctl.c
index 01ff463bc801..0db89f95b15f 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -276,16 +276,6 @@ int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 			mode |= FMODE_NDELAY_NOW;
 	}
 
-	if (disk->fops->__unlocked_ioctl)
-		return disk->fops->__unlocked_ioctl(file, cmd, arg);
-
-	if (disk->fops->__ioctl) {
-		lock_kernel();
-		ret = disk->fops->__ioctl(inode, file, cmd, arg);
-		unlock_kernel();
-		return ret;
-	}
-
 	return __blkdev_driver_ioctl(inode->i_bdev, mode, cmd, arg);
 }
 EXPORT_SYMBOL_GPL(blkdev_driver_ioctl);
@@ -295,22 +285,6 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 {
 	struct gendisk *disk = bdev->bd_disk;
 	int ret;
-	/* you bet it'll go away by the end of patch series */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	if (disk->fops->__unlocked_ioctl)
-		return disk->fops->__unlocked_ioctl(&fake_file, cmd, arg);
-
-	if (disk->fops->__ioctl) {
-		lock_kernel();
-		ret = disk->fops->__ioctl(bdev->bd_inode, &fake_file, cmd, arg);
-		unlock_kernel();
-		return ret;
-	}
 
 	if (disk->fops->ioctl)
 		return disk->fops->ioctl(bdev, mode, cmd, arg);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 73b6ce47c861..55124ac8c7ad 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,11 +1033,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
-			if (disk->fops->__open) {
-				ret = disk->fops->__open(bdev->bd_inode, file);
-				if (ret)
-					goto out_first;
-			}
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, file->f_mode);
 				if (ret)
@@ -1079,11 +1074,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
-			if (bdev->bd_disk->fops->__open) {
-				ret = bdev->bd_disk->fops->__open(bdev->bd_inode, file);
-				if (ret)
-					goto out;
-			}
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
 				if (ret)
@@ -1180,7 +1170,6 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 static int __blkdev_put(struct block_device *bdev, int for_part)
 {
 	int ret = 0;
-	struct inode *bd_inode = bdev->bd_inode;
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
@@ -1194,8 +1183,6 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		kill_bdev(bdev);
 	}
 	if (bdev->bd_contains == bdev) {
-		if (disk->fops->__release)
-			ret = disk->fops->__release(bd_inode, NULL);
 		if (disk->fops->release)
 			ret = disk->fops->release(disk, 0);
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b573186ff1a1..a135256b272c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1057,15 +1057,7 @@ static inline int blk_integrity_rq(struct request *rq)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-struct file;
-struct inode;
-
 struct block_device_operations {
-	int (*__open) (struct inode *, struct file *);
-	int (*__release) (struct inode *, struct file *);
-	int (*__ioctl) (struct inode *, struct file *, unsigned, unsigned long);
-	long (*__unlocked_ioctl) (struct file *, unsigned, unsigned long);
-	long (*__compat_ioctl) (struct file *, unsigned, unsigned long);
 	int (*open) (struct block_device *, fmode_t);
 	int (*release) (struct gendisk *, fmode_t);
 	int (*locked_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-- 
cgit v1.2.3


From 9a1c3542768b5a58e45a9216921cd10a3bae1205 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 22 Feb 2008 20:40:24 -0500
Subject: [PATCH] pass fmode_t to blkdev_put()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/pktcdvd.c         |  8 ++++----
 drivers/char/raw.c              |  4 ++--
 drivers/md/dm-table.c           |  4 ++--
 drivers/md/md.c                 |  4 ++--
 drivers/s390/block/dasd_genhd.c |  2 +-
 fs/block_dev.c                  | 22 +++++++++++-----------
 fs/ext3/super.c                 |  4 ++--
 fs/ext4/super.c                 |  4 ++--
 fs/jfs/jfs_logmgr.c             |  4 ++--
 fs/ocfs2/cluster/heartbeat.c    |  4 ++--
 fs/partitions/check.c           |  2 +-
 fs/reiserfs/journal.c           |  4 ++--
 include/linux/fs.h              |  2 +-
 kernel/power/swap.c             |  8 ++++----
 14 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 90548da9c1cb..ce8c71901923 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2381,7 +2381,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 out_unclaim:
 	bd_release(pd->bdev);
 out_putdev:
-	blkdev_put(pd->bdev);
+	blkdev_put(pd->bdev, FMODE_READ);
 out:
 	return ret;
 }
@@ -2399,7 +2399,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
 	bd_release(pd->bdev);
-	blkdev_put(pd->bdev);
+	blkdev_put(pd->bdev, FMODE_READ);
 
 	pkt_shrink_pktlist(pd);
 }
@@ -2790,7 +2790,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	return 0;
 
 out_mem:
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
 	return ret;
@@ -2975,7 +2975,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	pkt_debugfs_dev_remove(pd);
 	pkt_sysfs_dev_remove(pd);
 
-	blkdev_put(pd->bdev);
+	blkdev_put(pd->bdev, FMODE_READ|FMODE_WRITE);
 
 	remove_proc_entry(pd->name, pkt_proc);
 	DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index e139372d0e69..bfd59e6bf54f 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -87,7 +87,7 @@ static int raw_open(struct inode *inode, struct file *filp)
 out2:
 	bd_release(bdev);
 out1:
-	blkdev_put(bdev);
+	blkdev_put(bdev, filp->f_mode);
 out:
 	mutex_unlock(&raw_mutex);
 	return err;
@@ -112,7 +112,7 @@ static int raw_release(struct inode *inode, struct file *filp)
 	mutex_unlock(&raw_mutex);
 
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, filp->f_mode);
 	return 0;
 }
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 7c8671b06fe3..dd8bd2e867cd 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -357,7 +357,7 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 		return PTR_ERR(bdev);
 	r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
 	if (r)
-		blkdev_put(bdev);
+		blkdev_put(bdev, d->dm_dev.mode);
 	else
 		d->dm_dev.bdev = bdev;
 	return r;
@@ -372,7 +372,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 		return;
 
 	bd_release_from_disk(d->dm_dev.bdev, dm_disk(md));
-	blkdev_put(d->dm_dev.bdev);
+	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
 	d->dm_dev.bdev = NULL;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 06ea991c7a40..c1a837ca193c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1520,7 +1520,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	if (err) {
 		printk(KERN_ERR "md: could not bd_claim %s.\n",
 			bdevname(bdev, b));
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return err;
 	}
 	if (!shared)
@@ -1536,7 +1536,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
 	if (!bdev)
 		MD_BUG();
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 void md_autodetect_dev(dev_t dev);
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index aee6565aaf98..3c1b6915c9ad 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -152,7 +152,7 @@ void dasd_destroy_partitions(struct dasd_block *block)
 
 	invalidate_partition(block->gdp, 0);
 	/* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ);
 	set_capacity(block->gdp, 0);
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 55124ac8c7ad..05131baf3cf8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -977,7 +977,7 @@ EXPORT_SYMBOL(bd_set_size);
 
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part);
-static int __blkdev_put(struct block_device *bdev, int for_part);
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
  * bd_mutex locking:
@@ -1095,7 +1095,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, 1);
+		__blkdev_put(bdev->bd_contains, file->f_mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1163,11 +1163,11 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (!(res = bd_claim(bdev, filp)))
 		return 0;
 
-	blkdev_put(bdev);
+	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
-static int __blkdev_put(struct block_device *bdev, int for_part)
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	int ret = 0;
 	struct gendisk *disk = bdev->bd_disk;
@@ -1184,7 +1184,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
-			ret = disk->fops->release(disk, 0);
+			ret = disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
@@ -1203,13 +1203,13 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	if (victim)
-		__blkdev_put(victim, 1);
+		__blkdev_put(victim, mode, 1);
 	return ret;
 }
 
-int blkdev_put(struct block_device *bdev)
+int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_put(bdev, 0);
+	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
 
@@ -1218,7 +1218,7 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	if (bdev->bd_holder == filp)
 		bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, filp->f_mode);
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1343,7 +1343,7 @@ struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
 	return bdev;
 	
 blkdev_put:
-	blkdev_put(bdev);
+	blkdev_put(bdev, mode);
 	return ERR_PTR(error);
 }
 
@@ -1359,7 +1359,7 @@ EXPORT_SYMBOL(open_bdev_excl);
 void close_bdev_excl(struct block_device *bdev)
 {
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, 0);	/* move up in the next patches */
 }
 
 EXPORT_SYMBOL(close_bdev_excl);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a260af5544d..15c38e69b694 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ fail:
 static int ext3_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -2066,7 +2066,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 		        "EXT3: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9b2b2bc4ec17..c12cf7a657a9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -399,7 +399,7 @@ fail:
 static int ext4_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -2553,7 +2553,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 			"EXT4: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cd2ec2988b59..335c4de6552d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1168,7 +1168,7 @@ journal_found:
 	bd_release(bdev);
 
       close:		/* close external log device */
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1514,7 +1514,7 @@ int lmLogClose(struct super_block *sb)
 	rc = lmLogShutdown(log);
 
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
 	kfree(log);
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7dce1612553e..4b6fdf591eed 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -976,7 +976,7 @@ static void o2hb_region_release(struct config_item *item)
 	}
 
 	if (reg->hr_bdev)
-		blkdev_put(reg->hr_bdev);
+		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
@@ -1358,7 +1358,7 @@ out:
 		iput(inode);
 	if (ret < 0) {
 		if (reg->hr_bdev) {
-			blkdev_put(reg->hr_bdev);
+			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 			reg->hr_bdev = NULL;
 		}
 	}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index cfb0c80690aa..5a35ff2e1a9b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -488,7 +488,7 @@ void register_disk(struct gendisk *disk)
 	err = blkdev_get(bdev, FMODE_READ, 0);
 	if (err < 0)
 		goto exit;
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ);
 
 exit:
 	/* announce disk after possible partitions are created */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b89d193a00d9..3261518478f4 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2575,7 +2575,7 @@ static int release_journal_dev(struct super_block *super,
 	if (journal->j_dev_bd != NULL) {
 		if (journal->j_dev_bd->bd_dev != super->s_dev)
 			bd_release(journal->j_dev_bd);
-		result = blkdev_put(journal->j_dev_bd);
+		result = blkdev_put(journal->j_dev_bd, 0); /* move up */
 		journal->j_dev_bd = NULL;
 	}
 
@@ -2618,7 +2618,7 @@ static int journal_init_dev(struct super_block *super,
 		} else if (jdev != super->s_dev) {
 			result = bd_claim(journal->j_dev_bd, journal);
 			if (result) {
-				blkdev_put(journal->j_dev_bd);
+				blkdev_put(journal->j_dev_bd, blkdev_mode);
 				return result;
 			}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b5894604ba5e..04c8dc41f454 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1722,7 +1722,7 @@ extern int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 			       unsigned long arg);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *, fmode_t, unsigned);
-extern int blkdev_put(struct block_device *);
+extern int blkdev_put(struct block_device *, fmode_t);
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
 #ifdef CONFIG_SYSFS
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ccac849e46..7b9d611c1106 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -178,7 +178,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 
 	res = set_blocksize(resume_bdev, PAGE_SIZE);
 	if (res < 0)
-		blkdev_put(resume_bdev);
+		blkdev_put(resume_bdev, FMODE_WRITE);
 
 	return res;
 }
@@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
 		error = load_image(&handle, &snapshot, header->pages - 1);
 	release_swap_reader(&handle);
 
-	blkdev_put(resume_bdev);
+	blkdev_put(resume_bdev, FMODE_READ);
 
 	if (!error)
 		pr_debug("PM: Image successfully loaded\n");
@@ -609,7 +609,7 @@ int swsusp_check(void)
 			return -EINVAL;
 		}
 		if (error)
-			blkdev_put(resume_bdev);
+			blkdev_put(resume_bdev, FMODE_READ);
 		else
 			pr_debug("PM: Signature found, resuming\n");
 	} else {
@@ -633,7 +633,7 @@ void swsusp_close(void)
 		return;
 	}
 
-	blkdev_put(resume_bdev);
+	blkdev_put(resume_bdev, 0); /* move up */
 }
 
 static int swsusp_header_init(void)
-- 
cgit v1.2.3


From 30c40d2c01f68c7eb1a41ab3552bdaf5dbf300d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 22 Feb 2008 19:50:45 -0500
Subject: [PATCH] propagate mode through open_bdev_excl/close_bdev_excl

Replace open_bdev_excl()/close_bdev_excl() with open_bdev_exclusive()/
close_bdev_exclusive(), which take an fmode_t instead of mount flags.
The superblock stores the mode its device was opened with in sb->s_mode.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/mtd/devices/block2mtd.c |  4 ++--
 fs/block_dev.c                  | 24 +++++++++++-------------
 fs/reiserfs/journal.c           |  3 ++-
 fs/super.c                      | 14 ++++++++++----
 fs/xfs/linux-2.6/xfs_super.c    |  4 ++--
 include/linux/fs.h              |  6 ++++--
 6 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index 91fbba767635..8c295f40d2ac 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 	if (dev->blkdev) {
 		invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
 					0, -1);
-		close_bdev_excl(dev->blkdev);
+		close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE);
 	}
 
 	kfree(dev);
@@ -246,7 +246,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		return NULL;
 
 	/* Get a handle on the device */
-	bdev = open_bdev_excl(devname, O_RDWR, NULL);
+	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, NULL);
 #ifndef MODULE
 	if (IS_ERR(bdev)) {
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05131baf3cf8..4b595904cefd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1309,32 +1309,29 @@ fail:
 EXPORT_SYMBOL(lookup_bdev);
 
 /**
- * open_bdev_excl  -  open a block device by name and set it up for use
+ * open_bdev_exclusive  -  open a block device by name and set it up for use
  *
  * @path:	special file representing the block device
- * @flags:	%MS_RDONLY for opening read-only
+ * @mode:	FMODE_... combination to be used
  * @holder:	owner for exclusion
  *
  * Open the blockdevice described by the special file at @path, claim it
  * for the @holder.
  */
-struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
+struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
 	struct block_device *bdev;
-	fmode_t mode = FMODE_READ;
 	int error = 0;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
-	if (!(flags & MS_RDONLY))
-		mode |= FMODE_WRITE;
 	error = blkdev_get(bdev, mode, 0);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
-	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto blkdev_put;
 	error = bd_claim(bdev, holder);
 	if (error)
@@ -1347,22 +1344,23 @@ blkdev_put:
 	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL(open_bdev_excl);
+EXPORT_SYMBOL(open_bdev_exclusive);
 
 /**
- * close_bdev_excl  -  release a blockdevice openen by open_bdev_excl()
+ * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
  *
  * @bdev:	blockdevice to close
+ * @mode:	mode, must match that used to open.
  *
- * This is the counterpart to open_bdev_excl().
+ * This is the counterpart to open_bdev_exclusive().
  */
-void close_bdev_excl(struct block_device *bdev)
+void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
 {
 	bd_release(bdev);
-	blkdev_put(bdev, 0);	/* move up in the next patches */
+	blkdev_put(bdev, mode);
 }
 
-EXPORT_SYMBOL(close_bdev_excl);
+EXPORT_SYMBOL(close_bdev_exclusive);
 
 int __invalidate_device(struct block_device *bdev)
 {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3261518478f4..70b896076676 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2628,7 +2628,8 @@ static int journal_init_dev(struct super_block *super,
 		return 0;
 	}
 
-	journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal);
+	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
+						FMODE_READ|FMODE_WRITE, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
diff --git a/fs/super.c b/fs/super.c
index e931ae9511fe..0d77ac20d03e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -760,9 +760,13 @@ int get_sb_bdev(struct file_system_type *fs_type,
 {
 	struct block_device *bdev;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
-	bdev = open_bdev_excl(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
@@ -785,11 +789,12 @@ int get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
+		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -807,7 +812,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 error:
 	return error;
 }
@@ -817,10 +822,11 @@ EXPORT_SYMBOL(get_sb_bdev);
 void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
+	fmode_t mode = sb->s_mode;
 
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 }
 
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e39013619b26..37ebe36056eb 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -589,7 +589,7 @@ xfs_blkdev_get(
 {
 	int			error = 0;
 
-	*bdevp = open_bdev_excl(name, 0, mp);
+	*bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
 		printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -603,7 +603,7 @@ xfs_blkdev_put(
 	struct block_device	*bdev)
 {
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04c8dc41f454..c6766314dc5e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1157,6 +1157,7 @@ struct super_block {
 	char s_id[32];				/* Informational name */
 
 	void 			*s_fs_info;	/* Filesystem private info */
+	fmode_t			s_mode;
 
 	/*
 	 * The next field is for VFS *only*. No filesystems have any business
@@ -1753,9 +1754,10 @@ extern void chrdev_show(struct seq_file *,off_t);
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
-extern struct block_device *open_bdev_excl(const char *, int, void *);
-extern void close_bdev_excl(struct block_device *);
+extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
+extern void close_bdev_exclusive(struct block_device *, fmode_t);
 extern void blkdev_show(struct seq_file *,off_t);
+
 #else
 #define BLKDEV_MAJOR_HASH_SIZE	0
 #endif
-- 
cgit v1.2.3


From e5eb8caa83a76191feb9705c1a0a689ca260b91e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:24:05 -0400
Subject: [PATCH] remember mode of reiserfs journal

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/journal.c          | 6 ++++--
 include/linux/reiserfs_fs_sb.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 70b896076676..9643c3bbeb3b 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2575,7 +2575,7 @@ static int release_journal_dev(struct super_block *super,
 	if (journal->j_dev_bd != NULL) {
 		if (journal->j_dev_bd->bd_dev != super->s_dev)
 			bd_release(journal->j_dev_bd);
-		result = blkdev_put(journal->j_dev_bd, 0); /* move up */
+		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
 
@@ -2608,6 +2608,7 @@ static int journal_init_dev(struct super_block *super,
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
 		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
 			journal->j_dev_bd = NULL;
@@ -2628,8 +2629,9 @@ static int journal_init_dev(struct super_block *super,
 		return 0;
 	}
 
+	journal->j_dev_mode = blkdev_mode;
 	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						FMODE_READ|FMODE_WRITE, journal);
+						blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 315517e8bfa1..bda6b562a1e0 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -178,6 +178,7 @@ struct reiserfs_journal {
 	struct reiserfs_journal_cnode *j_first;	/*  oldest journal block.  start here for traverse */
 
 	struct block_device *j_dev_bd;
+	fmode_t j_dev_mode;
 	int j_1st_reserved_block;	/* first block on s_dev of reserved area journal */
 
 	unsigned long j_state;
-- 
cgit v1.2.3


From 572c48921574dbe6dceb958cf965aa962baefde4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:24:05 -0400
Subject: [PATCH] sanitize blkdev_get() and friends

* get rid of fake struct file/struct dentry in __blkdev_get()
* merge __blkdev_get() and do_open()
* get rid of flags argument of blkdev_get()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/pktcdvd.c         |  4 +--
 drivers/char/raw.c              |  2 +-
 drivers/s390/block/dasd_genhd.c |  2 +-
 fs/block_dev.c                  | 65 ++++++++++++++---------------------------
 fs/ocfs2/cluster/heartbeat.c    |  2 +-
 fs/partitions/check.c           |  2 +-
 include/linux/fs.h              |  2 +-
 kernel/power/swap.c             |  2 +-
 8 files changed, 30 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ce8c71901923..f20bf359b84f 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2332,7 +2332,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	 * so bdget() can't fail.
 	 */
 	bdget(pd->bdev->bd_dev);
-	if ((ret = blkdev_get(pd->bdev, FMODE_READ, O_RDONLY)))
+	if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
 		goto out;
 
 	if ((ret = bd_claim(pd->bdev, pd)))
@@ -2765,7 +2765,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	bdev = bdget(dev);
 	if (!bdev)
 		return -ENOMEM;
-	ret = blkdev_get(bdev, FMODE_READ, O_RDONLY | O_NONBLOCK);
+	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
 	if (ret)
 		return ret;
 
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index bfd59e6bf54f..f3cf5eb9b7fb 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -65,7 +65,7 @@ static int raw_open(struct inode *inode, struct file *filp)
 	if (!bdev)
 		goto out;
 	igrab(bdev->bd_inode);
-	err = blkdev_get(bdev, filp->f_mode, 0);
+	err = blkdev_get(bdev, filp->f_mode);
 	if (err)
 		goto out;
 	err = bd_claim(bdev, raw_open);
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 3c1b6915c9ad..e99d566b69cc 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -99,7 +99,7 @@ int dasd_scan_partitions(struct dasd_block *block)
 	struct block_device *bdev;
 
 	bdev = bdget_disk(block->gdp, 0);
-	if (!bdev || blkdev_get(bdev, FMODE_READ, 1) < 0)
+	if (!bdev || blkdev_get(bdev, FMODE_READ) < 0)
 		return -ENODEV;
 	/*
 	 * See fs/partition/check.c:register_disk,rescan_partitions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4b595904cefd..b89c956e04f6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -844,9 +844,8 @@ struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
-	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
 	if (bdev)
-		err = blkdev_get(bdev, mode, flags);
+		err = blkdev_get(bdev, mode);
 	return err ? ERR_PTR(err) : bdev;
 }
 
@@ -975,8 +974,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
-			int for_part);
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -986,7 +983,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
  *    mutex_lock_nested(whole->bd_mutex, 1)
  */
 
-static int do_open(struct block_device *bdev, struct file *file, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
 	struct hd_struct *part = NULL;
@@ -994,9 +991,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	int partno;
 	int perm = 0;
 
-	if (file->f_mode & FMODE_READ)
+	if (mode & FMODE_READ)
 		perm |= MAY_READ;
-	if (file->f_mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
 	/*
 	 * hooks: /n/, see "layering violations".
@@ -1007,15 +1004,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		return ret;
 	}
 
-	if (file->f_flags & O_NDELAY)
-		file->f_mode |= FMODE_NDELAY;
-	if (file->f_flags & O_EXCL)
-		file->f_mode |= FMODE_EXCL;
-	if ((file->f_flags & O_ACCMODE) == 3)
-		file->f_mode |= FMODE_WRITE_IOCTL;
-
 	ret = -ENXIO;
-	file->f_mapping = bdev->bd_inode->i_mapping;
 
 	lock_kernel();
 
@@ -1034,7 +1023,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
-				ret = disk->fops->open(bdev, file->f_mode);
+				ret = disk->fops->open(bdev, mode);
 				if (ret)
 					goto out_clear;
 			}
@@ -1054,7 +1043,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (!whole)
 				goto out_clear;
 			BUG_ON(for_part);
-			ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
+			ret = __blkdev_get(whole, mode, 1);
 			if (ret)
 				goto out_clear;
 			bdev->bd_contains = whole;
@@ -1075,7 +1064,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
-				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
+				ret = bdev->bd_disk->fops->open(bdev, mode);
 				if (ret)
 					goto out_unlock_bdev;
 			}
@@ -1095,7 +1084,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, file->f_mode, 1);
+		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1111,28 +1100,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	return ret;
 }
 
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
-			int for_part)
-{
-	/*
-	 * This crockload is due to bad choice of ->open() type.
-	 * It will go away.
-	 * For now, block device ->open() routine must _not_
-	 * examine anything in 'inode' argument except ->i_rdev.
-	 */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_flags = flags;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	return do_open(bdev, &fake_file, for_part);
-}
-
-int blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags)
+int blkdev_get(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_get(bdev, mode, flags, 0);
+	return __blkdev_get(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_get);
 
@@ -1149,15 +1119,24 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	 */
 	filp->f_flags |= O_LARGEFILE;
 
+	if (filp->f_flags & O_NDELAY)
+		filp->f_mode |= FMODE_NDELAY;
+	if (filp->f_flags & O_EXCL)
+		filp->f_mode |= FMODE_EXCL;
+	if ((filp->f_flags & O_ACCMODE) == 3)
+		filp->f_mode |= FMODE_WRITE_IOCTL;
+
 	bdev = bd_acquire(inode);
 	if (bdev == NULL)
 		return -ENOMEM;
 
-	res = do_open(bdev, filp, 0);
+	filp->f_mapping = bdev->bd_inode->i_mapping;
+
+	res = blkdev_get(bdev, filp->f_mode);
 	if (res)
 		return res;
 
-	if (!(filp->f_flags & O_EXCL) )
+	if (!(filp->f_mode & FMODE_EXCL))
 		return 0;
 
 	if (!(res = bd_claim(bdev, filp)))
@@ -1327,7 +1306,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if (IS_ERR(bdev))
 		return bdev;
 
-	error = blkdev_get(bdev, mode, 0);
+	error = blkdev_get(bdev, mode);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4b6fdf591eed..6ebaa58e2c03 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1268,7 +1268,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out;
 
 	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
 	if (ret) {
 		reg->hr_bdev = NULL;
 		goto out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5a35ff2e1a9b..633f7a0ebb2c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -485,7 +485,7 @@ void register_disk(struct gendisk *disk)
 		goto exit;
 
 	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ, 0);
+	err = blkdev_get(bdev, FMODE_READ);
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev, FMODE_READ);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6766314dc5e..cb78e389699b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1722,7 +1722,7 @@ extern int blkdev_driver_ioctl(struct inode *inode, struct file *file,
 			       struct gendisk *disk, unsigned cmd,
 			       unsigned long arg);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-extern int blkdev_get(struct block_device *, fmode_t, unsigned);
+extern int blkdev_get(struct block_device *, fmode_t);
 extern int blkdev_put(struct block_device *, fmode_t);
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 178b001a4f17..b7713b53d07a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -172,7 +172,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 		return res;
 
 	root_swap = res;
-	res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
+	res = blkdev_get(resume_bdev, FMODE_WRITE);
 	if (res)
 		return res;
 
-- 
cgit v1.2.3


From e436fdae70a31102d2be32969b80fe8545edebd9 Mon Sep 17 00:00:00 2001
From: Al Viro <al@aretha.pdmi.ras.ru>
Date: Thu, 18 Sep 2008 03:38:12 -0400
Subject: [PATCH] get rid of blkdev_driver_ioctl()

convert remaining callers to __blkdev_driver_ioctl()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/ioctl.c      | 29 ++++++++++-------------------
 include/linux/fs.h |  3 ---
 2 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 0db89f95b15f..b4e0abed1b4b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -265,21 +265,6 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 	return -ENOIOCTLCMD;
 }
 
-int blkdev_driver_ioctl(struct inode *inode, struct file *file,
-			struct gendisk *disk, unsigned cmd, unsigned long arg)
-{
-	int ret;
-	fmode_t mode = 0;
-	if (file) {
-		mode = file->f_mode;
-		if (file->f_flags & O_NDELAY)
-			mode |= FMODE_NDELAY_NOW;
-	}
-
-	return __blkdev_driver_ioctl(inode->i_bdev, mode, cmd, arg);
-}
-EXPORT_SYMBOL_GPL(blkdev_driver_ioctl);
-
 int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned cmd, unsigned long arg)
 {
@@ -315,13 +300,19 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 	struct block_device *bdev = inode->i_bdev;
 	struct gendisk *disk = bdev->bd_disk;
 	int ret, n;
+	fmode_t mode = 0;
+	if (file) {
+		mode = file->f_mode;
+		if (file->f_flags & O_NDELAY)
+			mode |= FMODE_NDELAY_NOW;
+	}
 
 	switch(cmd) {
 	case BLKFLSBUF:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 
-		ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
+		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 		/* -EINVAL to handle old uncorrected drivers */
 		if (ret != -EINVAL && ret != -ENOTTY)
 			return ret;
@@ -333,7 +324,7 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		return 0;
 
 	case BLKROSET:
-		ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
+		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 		/* -EINVAL to handle old uncorrected drivers */
 		if (ret != -EINVAL && ret != -ENOTTY)
 			return ret;
@@ -349,7 +340,7 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 	case BLKDISCARD: {
 		uint64_t range[2];
 
-		if (!(file->f_mode & FMODE_WRITE))
+		if (!(mode & FMODE_WRITE))
 			return -EBADF;
 
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
@@ -387,6 +378,6 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 	if (ret != -ENOIOCTLCMD)
 		return ret;
 
-	return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
+	return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
 EXPORT_SYMBOL_GPL(blkdev_ioctl);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cb78e389699b..11de682c65a1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1718,9 +1718,6 @@ extern const struct file_operations def_fifo_fops;
 #ifdef CONFIG_BLOCK
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
-extern int blkdev_driver_ioctl(struct inode *inode, struct file *file,
-			       struct gendisk *disk, unsigned cmd,
-			       unsigned long arg);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *, fmode_t);
 extern int blkdev_put(struct block_device *, fmode_t);
-- 
cgit v1.2.3


From 56b26add02b4bdea81d5e0ebda60db1fe3311ad4 Mon Sep 17 00:00:00 2001
From: Al Viro <al@aretha.pdmi.ras.ru>
Date: Fri, 19 Sep 2008 03:17:36 -0400
Subject: [PATCH] kill the rest of struct file propagation in block ioctls

Now we can switch blkdev_ioctl() to block_device/mode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/compat_ioctl.c | 10 +++++-----
 block/ioctl.c        |  9 +--------
 drivers/char/raw.c   |  2 +-
 fs/block_dev.c       |  8 ++++++--
 include/linux/fs.h   |  2 +-
 5 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 5b3db0640d87..3098c92402fd 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -177,7 +177,7 @@ struct compat_blkpg_ioctl_arg {
 	compat_caddr_t data;
 };
 
-static int compat_blkpg_ioctl(struct inode *inode, struct file *file,
+static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
 		unsigned int cmd, struct compat_blkpg_ioctl_arg __user *ua32)
 {
 	struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
@@ -196,7 +196,7 @@ static int compat_blkpg_ioctl(struct inode *inode, struct file *file,
 	if (err)
 		return err;
 
-	return blkdev_ioctl(inode, file, cmd, (unsigned long)a);
+	return blkdev_ioctl(bdev, mode, cmd, (unsigned long)a);
 }
 
 #define BLKBSZGET_32		_IOR(0x12, 112, int)
@@ -715,13 +715,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	 * but we call blkdev_ioctl, which gets the lock for us
 	 */
 	case BLKRRPART:
-		return blkdev_ioctl(inode, file, cmd,
+		return blkdev_ioctl(bdev, mode, cmd,
 				(unsigned long)compat_ptr(arg));
 	case BLKBSZSET_32:
-		return blkdev_ioctl(inode, file, BLKBSZSET,
+		return blkdev_ioctl(bdev, mode, BLKBSZSET,
 				(unsigned long)compat_ptr(arg));
 	case BLKPG:
-		return compat_blkpg_ioctl(inode, file, cmd, compat_ptr(arg));
+		return compat_blkpg_ioctl(bdev, mode, cmd, compat_ptr(arg));
 	case BLKRAGET:
 	case BLKFRAGET:
 		if (!arg)
diff --git a/block/ioctl.c b/block/ioctl.c
index 14b7f2c10662..c832d639b6e2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -230,20 +230,13 @@ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
  * always keep this in sync with compat_blkdev_ioctl() and
  * compat_blkdev_locked_ioctl()
  */
-int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
+int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			unsigned long arg)
 {
-	struct block_device *bdev = inode->i_bdev;
 	struct gendisk *disk = bdev->bd_disk;
 	struct backing_dev_info *bdi;
 	loff_t size;
 	int ret, n;
-	fmode_t mode = 0;
-	if (file) {
-		mode = file->f_mode;
-		if (file->f_flags & O_NDELAY)
-			mode |= FMODE_NDELAY_NOW;
-	}
 
 	switch(cmd) {
 	case BLKFLSBUF:
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index f3cf5eb9b7fb..96adf28a17e4 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -125,7 +125,7 @@ raw_ioctl(struct inode *inode, struct file *filp,
 {
 	struct block_device *bdev = filp->private_data;
 
-	return blkdev_ioctl(bdev->bd_inode, NULL, command, arg);
+	return blkdev_ioctl(bdev, 0, command, arg);
 }
 
 static void bind_device(struct raw_config_request *rq)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b89c956e04f6..05865b93f7e1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1202,7 +1202,11 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	fmode_t mode = file->f_mode;
+	if (file->f_flags & O_NDELAY)
+		mode |= FMODE_NDELAY_NOW;
+	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
 static const struct address_space_operations def_blk_aops = {
@@ -1238,7 +1242,7 @@ int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
 	int res;
 	mm_segment_t old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
+	res = blkdev_ioctl(bdev, 0, cmd, arg);
 	set_fs(old_fs);
 	return res;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11de682c65a1..ff536e106b4e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1717,7 +1717,7 @@ extern const struct file_operations bad_sock_fops;
 extern const struct file_operations def_fifo_fops;
 #ifdef CONFIG_BLOCK
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
-extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
+extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *, fmode_t);
 extern int blkdev_put(struct block_device *, fmode_t);
-- 
cgit v1.2.3


From e9f95e637320efe1936b647308ddf4ec5b8e0311 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 21 Oct 2008 15:49:59 +0200
Subject: genirq: fix off by one and coding style

Fix off-by-one in for_each_irq_desc_reverse().

Impact is near zero in practice, because nothing substantial wants to
iterate down to IRQ#0 - but fix it nevertheless.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq.c | 4 ++--
 include/linux/irqnr.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index ccf6c503fc3b..d1d4dc52f649 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,7 +36,7 @@ void ack_bad_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_X86_32
-# define irq_stats(x)		(&per_cpu(irq_stat,x))
+# define irq_stats(x)		(&per_cpu(irq_stat, x))
 #else
 # define irq_stats(x)		cpu_pda(x)
 #endif
@@ -113,7 +113,7 @@ int show_interrupts(struct seq_file *p, void *v)
 	if (i == 0) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
-			seq_printf(p, "CPU%-8d",j);
+			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 3171ddc3b39d..452c280c8115 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -13,9 +13,9 @@ extern int nr_irqs;
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
 
-# define for_each_irq_desc_reverse(irq, desc)			\
-	for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 );	\
-	     irq > 0; irq--, desc--)
+# define for_each_irq_desc_reverse(irq, desc)				\
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
+	     irq >= 0; irq--, desc--)
 #endif
 
 #define for_each_irq_nr(irq)			\
-- 
cgit v1.2.3


From 733d710b09748a79f70cbb58547d036d28ec566e Mon Sep 17 00:00:00 2001
From: Sergio Aguirre <saaguirre@ti.com>
Date: Sat, 18 Oct 2008 12:26:47 -0300
Subject: V4L/DVB (9320): v4l2: Add 10-bit RAW Bayer formats

Add 10-bit raw bayer format expanded to 16 bits. Also add a definition
for 10-bit raw bayer format DPCM-compressed to 8 bits.

Signed-off-by: Sergio Aguirre <saaguirre@ti.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index d4b03034ee73..4669d7e72e75 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -315,6 +315,13 @@ struct v4l2_pix_format {
 /* see http://www.siliconimaging.com/RGB%20Bayer.htm */
 #define V4L2_PIX_FMT_SBGGR8  v4l2_fourcc('B', 'A', '8', '1') /*  8  BGBG.. GRGR.. */
 #define V4L2_PIX_FMT_SGBRG8  v4l2_fourcc('G', 'B', 'R', 'G') /*  8  GBGB.. RGRG.. */
+/*
+ * 10bit raw bayer, expanded to 16 bits
+ * xxxxrrrrrrrrrrxxxxgggggggggg xxxxggggggggggxxxxbbbbbbbbbb...
+ */
+#define V4L2_PIX_FMT_SGRBG10 v4l2_fourcc('B', 'A', '1', '0')
+/* 10bit raw bayer DPCM compressed to 8 bits */
+#define V4L2_PIX_FMT_SGRBG10DPCM8 v4l2_fourcc('B', 'D', '1', '0')
 #define V4L2_PIX_FMT_SBGGR16 v4l2_fourcc('B', 'Y', 'R', '2') /* 16  BGBG.. GRGR.. */
 
 /* compressed formats */
-- 
cgit v1.2.3


From d63a5ce3c0d25c96bdadc78792e5b48b846e899d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 21 Oct 2008 17:44:57 +0100
Subject: dm: publish array_too_big

Move array_too_big to include/linux/device-mapper.h because it is
used by targets.

Remove the test from dm-raid1 as the number of mirror legs is limited
such that it can never fail.  (Even for stripes it seems rather
unlikely.)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c         | 3 ---
 drivers/md/dm-stripe.c        | 4 ++--
 drivers/md/dm.h               | 9 ---------
 include/linux/device-mapper.h | 3 +++
 4 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 29913e42c4ab..ecfd82169cb3 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1315,9 +1315,6 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	size_t len;
 	struct mirror_set *ms = NULL;
 
-	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
-		return NULL;
-
 	len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
 
 	ms = kzalloc(len, GFP_KERNEL);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b745d8ac625b..287e24584730 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -60,8 +60,8 @@ static inline struct stripe_c *alloc_context(unsigned int stripes)
 {
 	size_t len;
 
-	if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
-			  stripes))
+	if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+			     stripes))
 		return NULL;
 
 	len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index cd189da2b2fa..0ade60cdef42 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -62,15 +62,6 @@ void dm_put_target_type(struct target_type *t);
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 					void *param), void *param);
 
-/*-----------------------------------------------------------------
- * Useful inlines.
- *---------------------------------------------------------------*/
-static inline int array_too_big(unsigned long fixed, unsigned long obj,
-				unsigned long num)
-{
-	return (num > (ULONG_MAX - fixed) / obj);
-}
-
 int dm_split_args(int *argc, char ***argvp, char *input);
 
 /*
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 08d783592b73..dfb30db475ed 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -354,6 +354,9 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
  */
 #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
 
+#define dm_array_too_big(fixed, obj, num) \
+	((num) > (UINT_MAX - (fixed)) / (obj))
+
 static inline sector_t to_sector(unsigned long n)
 {
 	return (n >> SECTOR_SHIFT);
-- 
cgit v1.2.3


From 1f965b19437017cea6d3f3f46acdc5acae5fd011 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <hjm@redhat.com>
Date: Tue, 21 Oct 2008 17:45:06 +0100
Subject: dm raid1: separate region_hash interface part1

Separate the region hash code from raid1 so it can be shared by forthcoming
targets.  Use BUG_ON() for failed async dm_io() calls.

Signed-off-by: Heinz Mauelshagen <hjm@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/Makefile            |   2 +-
 drivers/md/dm-raid1.c          | 789 ++++++-----------------------------------
 drivers/md/dm-region-hash.c    | 704 ++++++++++++++++++++++++++++++++++++
 include/linux/dm-region-hash.h | 104 ++++++
 4 files changed, 912 insertions(+), 687 deletions(-)
 create mode 100644 drivers/md/dm-region-hash.c
 create mode 100644 include/linux/dm-region-hash.h

(limited to 'include/linux')

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f1ef33dfd8cf..1c615804ea76 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
-obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o
+obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
 quiet_cmd_unroll = UNROLL  $@
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f358853af5cf..92dcc06832a4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1,118 +1,36 @@
 /*
  * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
 
-#include <linux/device-mapper.h>
-
 #include "dm-bio-list.h"
 #include "dm-bio-record.h"
 
-#include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-#include <linux/log2.h>
-#include <linux/hardirq.h>
+#include <linux/device-mapper.h>
 #include <linux/dm-io.h>
 #include <linux/dm-dirty-log.h>
 #include <linux/dm-kcopyd.h>
+#include <linux/dm-region-hash.h>
 
 #define DM_MSG_PREFIX "raid1"
+
+#define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
 #define DM_IO_PAGES 64
+#define DM_KCOPYD_PAGES 64
 
 #define DM_RAID1_HANDLE_ERRORS 0x01
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
 
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
-/*-----------------------------------------------------------------
- * Region hash
- *
- * The mirror splits itself up into discrete regions.  Each
- * region can be in one of three states: clean, dirty,
- * nosync.  There is no need to put clean regions in the hash.
- *
- * In addition to being present in the hash table a region _may_
- * be present on one of three lists.
- *
- *   clean_regions: Regions on this list have no io pending to
- *   them, they are in sync, we are no longer interested in them,
- *   they are dull.  rh_update_states() will remove them from the
- *   hash table.
- *
- *   quiesced_regions: These regions have been spun down, ready
- *   for recovery.  rh_recovery_start() will remove regions from
- *   this list and hand them to kmirrord, which will schedule the
- *   recovery io with kcopyd.
- *
- *   recovered_regions: Regions that kcopyd has successfully
- *   recovered.  rh_update_states() will now schedule any delayed
- *   io, up the recovery_count, and remove the region from the
- *   hash.
- *
- * There are 2 locks:
- *   A rw spin lock 'hash_lock' protects just the hash table,
- *   this is never held in write mode from interrupt context,
- *   which I believe means that we only have to disable irqs when
- *   doing a write lock.
- *
- *   An ordinary spin lock 'region_lock' that protects the three
- *   lists in the region_hash, with the 'state', 'list' and
- *   'bhs_delayed' fields of the regions.  This is used from irq
- *   context, so all other uses will have to suspend local irqs.
- *---------------------------------------------------------------*/
-struct mirror_set;
-struct region_hash {
-	struct mirror_set *ms;
-	uint32_t region_size;
-	unsigned region_shift;
-
-	/* holds persistent region state */
-	struct dm_dirty_log *log;
-
-	/* hash table */
-	rwlock_t hash_lock;
-	mempool_t *region_pool;
-	unsigned int mask;
-	unsigned int nr_buckets;
-	struct list_head *buckets;
-
-	spinlock_t region_lock;
-	atomic_t recovery_in_flight;
-	struct semaphore recovery_count;
-	struct list_head clean_regions;
-	struct list_head quiesced_regions;
-	struct list_head recovered_regions;
-	struct list_head failed_recovered_regions;
-};
-
-enum {
-	RH_CLEAN,
-	RH_DIRTY,
-	RH_NOSYNC,
-	RH_RECOVERING
-};
-
-struct region {
-	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
-	region_t key;
-	int state;
-
-	struct list_head hash_list;
-	struct list_head list;
-
-	atomic_t pending;
-	struct bio_list delayed_bios;
-};
-
-
 /*-----------------------------------------------------------------
  * Mirror set structures.
  *---------------------------------------------------------------*/
@@ -133,8 +51,7 @@ struct mirror {
 struct mirror_set {
 	struct dm_target *ti;
 	struct list_head list;
-	struct region_hash rh;
-	struct dm_kcopyd_client *kcopyd_client;
+
 	uint64_t features;
 
 	spinlock_t lock;	/* protects the lists */
@@ -142,6 +59,8 @@ struct mirror_set {
 	struct bio_list writes;
 	struct bio_list failures;
 
+	struct dm_region_hash *rh;
+	struct dm_kcopyd_client *kcopyd_client;
 	struct dm_io_client *io_client;
 	mempool_t *read_record_pool;
 
@@ -160,25 +79,14 @@ struct mirror_set {
 
 	struct work_struct trigger_event;
 
-	unsigned int nr_mirrors;
+	unsigned nr_mirrors;
 	struct mirror mirror[0];
 };
 
-/*
- * Conversion fns
- */
-static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
+static void wakeup_mirrord(void *context)
 {
-	return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
-}
+	struct mirror_set *ms = context;
 
-static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
-{
-	return region << rh->region_shift;
-}
-
-static void wake(struct mirror_set *ms)
-{
 	queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
 }
 
@@ -187,7 +95,7 @@ static void delayed_wake_fn(unsigned long data)
 	struct mirror_set *ms = (struct mirror_set *) data;
 
 	clear_bit(0, &ms->timer_pending);
-	wake(ms);
+	wakeup_mirrord(ms);
 }
 
 static void delayed_wake(struct mirror_set *ms)
@@ -201,473 +109,34 @@ static void delayed_wake(struct mirror_set *ms)
 	add_timer(&ms->timer);
 }
 
-/* FIXME move this */
-static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
-
-#define MIN_REGIONS 64
-#define MAX_RECOVERY 1
-static int rh_init(struct region_hash *rh, struct mirror_set *ms,
-		   struct dm_dirty_log *log, uint32_t region_size,
-		   region_t nr_regions)
-{
-	unsigned int nr_buckets, max_buckets;
-	size_t i;
-
-	/*
-	 * Calculate a suitable number of buckets for our hash
-	 * table.
-	 */
-	max_buckets = nr_regions >> 6;
-	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
-		;
-	nr_buckets >>= 1;
-
-	rh->ms = ms;
-	rh->log = log;
-	rh->region_size = region_size;
-	rh->region_shift = ffs(region_size) - 1;
-	rwlock_init(&rh->hash_lock);
-	rh->mask = nr_buckets - 1;
-	rh->nr_buckets = nr_buckets;
-
-	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
-	if (!rh->buckets) {
-		DMERR("unable to allocate region hash memory");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < nr_buckets; i++)
-		INIT_LIST_HEAD(rh->buckets + i);
-
-	spin_lock_init(&rh->region_lock);
-	sema_init(&rh->recovery_count, 0);
-	atomic_set(&rh->recovery_in_flight, 0);
-	INIT_LIST_HEAD(&rh->clean_regions);
-	INIT_LIST_HEAD(&rh->quiesced_regions);
-	INIT_LIST_HEAD(&rh->recovered_regions);
-	INIT_LIST_HEAD(&rh->failed_recovered_regions);
-
-	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
-						      sizeof(struct region));
-	if (!rh->region_pool) {
-		vfree(rh->buckets);
-		rh->buckets = NULL;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void rh_exit(struct region_hash *rh)
-{
-	unsigned int h;
-	struct region *reg, *nreg;
-
-	BUG_ON(!list_empty(&rh->quiesced_regions));
-	for (h = 0; h < rh->nr_buckets; h++) {
-		list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
-			BUG_ON(atomic_read(&reg->pending));
-			mempool_free(reg, rh->region_pool);
-		}
-	}
-
-	if (rh->log)
-		dm_dirty_log_destroy(rh->log);
-	if (rh->region_pool)
-		mempool_destroy(rh->region_pool);
-	vfree(rh->buckets);
-}
-
-#define RH_HASH_MULT 2654435387U
-
-static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
-{
-	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
-}
-
-static struct region *__rh_lookup(struct region_hash *rh, region_t region)
-{
-	struct region *reg;
-
-	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
-		if (reg->key == region)
-			return reg;
-
-	return NULL;
-}
-
-static void __rh_insert(struct region_hash *rh, struct region *reg)
-{
-	unsigned int h = rh_hash(rh, reg->key);
-	list_add(&reg->hash_list, rh->buckets + h);
-}
-
-static struct region *__rh_alloc(struct region_hash *rh, region_t region)
-{
-	struct region *reg, *nreg;
-
-	read_unlock(&rh->hash_lock);
-	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
-	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(struct region), GFP_NOIO);
-	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
-		RH_CLEAN : RH_NOSYNC;
-	nreg->rh = rh;
-	nreg->key = region;
-
-	INIT_LIST_HEAD(&nreg->list);
-
-	atomic_set(&nreg->pending, 0);
-	bio_list_init(&nreg->delayed_bios);
-	write_lock_irq(&rh->hash_lock);
-
-	reg = __rh_lookup(rh, region);
-	if (reg)
-		/* we lost the race */
-		mempool_free(nreg, rh->region_pool);
-
-	else {
-		__rh_insert(rh, nreg);
-		if (nreg->state == RH_CLEAN) {
-			spin_lock(&rh->region_lock);
-			list_add(&nreg->list, &rh->clean_regions);
-			spin_unlock(&rh->region_lock);
-		}
-		reg = nreg;
-	}
-	write_unlock_irq(&rh->hash_lock);
-	read_lock(&rh->hash_lock);
-
-	return reg;
-}
-
-static inline struct region *__rh_find(struct region_hash *rh, region_t region)
-{
-	struct region *reg;
-
-	reg = __rh_lookup(rh, region);
-	if (!reg)
-		reg = __rh_alloc(rh, region);
-
-	return reg;
-}
-
-static int rh_state(struct region_hash *rh, region_t region, int may_block)
-{
-	int r;
-	struct region *reg;
-
-	read_lock(&rh->hash_lock);
-	reg = __rh_lookup(rh, region);
-	read_unlock(&rh->hash_lock);
-
-	if (reg)
-		return reg->state;
-
-	/*
-	 * The region wasn't in the hash, so we fall back to the
-	 * dirty log.
-	 */
-	r = rh->log->type->in_sync(rh->log, region, may_block);
-
-	/*
-	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
-	 * taken as a RH_NOSYNC
-	 */
-	return r == 1 ? RH_CLEAN : RH_NOSYNC;
-}
-
-static inline int rh_in_sync(struct region_hash *rh,
-			     region_t region, int may_block)
-{
-	int state = rh_state(rh, region, may_block);
-	return state == RH_CLEAN || state == RH_DIRTY;
-}
-
-static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
-{
-	struct bio *bio;
-
-	while ((bio = bio_list_pop(bio_list))) {
-		queue_bio(ms, bio, WRITE);
-	}
-}
-
-static void complete_resync_work(struct region *reg, int success)
-{
-	struct region_hash *rh = reg->rh;
-
-	rh->log->type->set_region_sync(rh->log, reg->key, success);
-
-	/*
-	 * Dispatch the bios before we call 'wake_up_all'.
-	 * This is important because if we are suspending,
-	 * we want to know that recovery is complete and
-	 * the work queue is flushed.  If we wake_up_all
-	 * before we dispatch_bios (queue bios and call wake()),
-	 * then we risk suspending before the work queue
-	 * has been properly flushed.
-	 */
-	dispatch_bios(rh->ms, &reg->delayed_bios);
-	if (atomic_dec_and_test(&rh->recovery_in_flight))
-		wake_up_all(&_kmirrord_recovery_stopped);
-	up(&rh->recovery_count);
-}
-
-static void rh_update_states(struct region_hash *rh)
-{
-	struct region *reg, *next;
-
-	LIST_HEAD(clean);
-	LIST_HEAD(recovered);
-	LIST_HEAD(failed_recovered);
-
-	/*
-	 * Quickly grab the lists.
-	 */
-	write_lock_irq(&rh->hash_lock);
-	spin_lock(&rh->region_lock);
-	if (!list_empty(&rh->clean_regions)) {
-		list_splice_init(&rh->clean_regions, &clean);
-
-		list_for_each_entry(reg, &clean, list)
-			list_del(&reg->hash_list);
-	}
-
-	if (!list_empty(&rh->recovered_regions)) {
-		list_splice_init(&rh->recovered_regions, &recovered);
-
-		list_for_each_entry (reg, &recovered, list)
-			list_del(&reg->hash_list);
-	}
-
-	if (!list_empty(&rh->failed_recovered_regions)) {
-		list_splice_init(&rh->failed_recovered_regions,
-				 &failed_recovered);
-
-		list_for_each_entry(reg, &failed_recovered, list)
-			list_del(&reg->hash_list);
-	}
-
-	spin_unlock(&rh->region_lock);
-	write_unlock_irq(&rh->hash_lock);
-
-	/*
-	 * All the regions on the recovered and clean lists have
-	 * now been pulled out of the system, so no need to do
-	 * any more locking.
-	 */
-	list_for_each_entry_safe (reg, next, &recovered, list) {
-		rh->log->type->clear_region(rh->log, reg->key);
-		complete_resync_work(reg, 1);
-		mempool_free(reg, rh->region_pool);
-	}
-
-	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
-		complete_resync_work(reg, errors_handled(rh->ms) ? 0 : 1);
-		mempool_free(reg, rh->region_pool);
-	}
-
-	list_for_each_entry_safe(reg, next, &clean, list) {
-		rh->log->type->clear_region(rh->log, reg->key);
-		mempool_free(reg, rh->region_pool);
-	}
-
-	rh->log->type->flush(rh->log);
-}
-
-static void rh_inc(struct region_hash *rh, region_t region)
-{
-	struct region *reg;
-
-	read_lock(&rh->hash_lock);
-	reg = __rh_find(rh, region);
-
-	spin_lock_irq(&rh->region_lock);
-	atomic_inc(&reg->pending);
-
-	if (reg->state == RH_CLEAN) {
-		reg->state = RH_DIRTY;
-		list_del_init(&reg->list);	/* take off the clean list */
-		spin_unlock_irq(&rh->region_lock);
-
-		rh->log->type->mark_region(rh->log, reg->key);
-	} else
-		spin_unlock_irq(&rh->region_lock);
-
-
-	read_unlock(&rh->hash_lock);
-}
-
-static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
+static void wakeup_all_recovery_waiters(void *context)
 {
-	struct bio *bio;
-
-	for (bio = bios->head; bio; bio = bio->bi_next)
-		rh_inc(rh, bio_to_region(rh, bio));
+	wake_up_all(&_kmirrord_recovery_stopped);
 }
 
-static void rh_dec(struct region_hash *rh, region_t region)
+static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
 {
 	unsigned long flags;
-	struct region *reg;
 	int should_wake = 0;
+	struct bio_list *bl;
 
-	read_lock(&rh->hash_lock);
-	reg = __rh_lookup(rh, region);
-	read_unlock(&rh->hash_lock);
-
-	spin_lock_irqsave(&rh->region_lock, flags);
-	if (atomic_dec_and_test(&reg->pending)) {
-		/*
-		 * There is no pending I/O for this region.
-		 * We can move the region to corresponding list for next action.
-		 * At this point, the region is not yet connected to any list.
-		 *
-		 * If the state is RH_NOSYNC, the region should be kept off
-		 * from clean list.
-		 * The hash entry for RH_NOSYNC will remain in memory
-		 * until the region is recovered or the map is reloaded.
-		 */
-
-		/* do nothing for RH_NOSYNC */
-		if (reg->state == RH_RECOVERING) {
-			list_add_tail(&reg->list, &rh->quiesced_regions);
-		} else if (reg->state == RH_DIRTY) {
-			reg->state = RH_CLEAN;
-			list_add(&reg->list, &rh->clean_regions);
-		}
-		should_wake = 1;
-	}
-	spin_unlock_irqrestore(&rh->region_lock, flags);
+	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
+	spin_lock_irqsave(&ms->lock, flags);
+	should_wake = !(bl->head);
+	bio_list_add(bl, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	if (should_wake)
-		wake(rh->ms);
+		wakeup_mirrord(ms);
 }
 
-/*
- * Starts quiescing a region in preparation for recovery.
- */
-static int __rh_recovery_prepare(struct region_hash *rh)
+static void dispatch_bios(void *context, struct bio_list *bio_list)
 {
-	int r;
-	struct region *reg;
-	region_t region;
-
-	/*
-	 * Ask the dirty log what's next.
-	 */
-	r = rh->log->type->get_resync_work(rh->log, &region);
-	if (r <= 0)
-		return r;
-
-	/*
-	 * Get this region, and start it quiescing by setting the
-	 * recovering flag.
-	 */
-	read_lock(&rh->hash_lock);
-	reg = __rh_find(rh, region);
-	read_unlock(&rh->hash_lock);
-
-	spin_lock_irq(&rh->region_lock);
-	reg->state = RH_RECOVERING;
-
-	/* Already quiesced ? */
-	if (atomic_read(&reg->pending))
-		list_del_init(&reg->list);
-	else
-		list_move(&reg->list, &rh->quiesced_regions);
-
-	spin_unlock_irq(&rh->region_lock);
-
-	return 1;
-}
-
-static void rh_recovery_prepare(struct region_hash *rh)
-{
-	/* Extra reference to avoid race with rh_stop_recovery */
-	atomic_inc(&rh->recovery_in_flight);
-
-	while (!down_trylock(&rh->recovery_count)) {
-		atomic_inc(&rh->recovery_in_flight);
-		if (__rh_recovery_prepare(rh) <= 0) {
-			atomic_dec(&rh->recovery_in_flight);
-			up(&rh->recovery_count);
-			break;
-		}
-	}
-
-	/* Drop the extra reference */
-	if (atomic_dec_and_test(&rh->recovery_in_flight))
-		wake_up_all(&_kmirrord_recovery_stopped);
-}
-
-/*
- * Returns any quiesced regions.
- */
-static struct region *rh_recovery_start(struct region_hash *rh)
-{
-	struct region *reg = NULL;
-
-	spin_lock_irq(&rh->region_lock);
-	if (!list_empty(&rh->quiesced_regions)) {
-		reg = list_entry(rh->quiesced_regions.next,
-				 struct region, list);
-		list_del_init(&reg->list);	/* remove from the quiesced list */
-	}
-	spin_unlock_irq(&rh->region_lock);
-
-	return reg;
-}
-
-static void rh_recovery_end(struct region *reg, int success)
-{
-	struct region_hash *rh = reg->rh;
-
-	spin_lock_irq(&rh->region_lock);
-	if (success)
-		list_add(&reg->list, &reg->rh->recovered_regions);
-	else {
-		reg->state = RH_NOSYNC;
-		list_add(&reg->list, &reg->rh->failed_recovered_regions);
-	}
-	spin_unlock_irq(&rh->region_lock);
-
-	wake(rh->ms);
-}
-
-static int rh_flush(struct region_hash *rh)
-{
-	return rh->log->type->flush(rh->log);
-}
-
-static void rh_delay(struct region_hash *rh, struct bio *bio)
-{
-	struct region *reg;
-
-	read_lock(&rh->hash_lock);
-	reg = __rh_find(rh, bio_to_region(rh, bio));
-	bio_list_add(&reg->delayed_bios, bio);
-	read_unlock(&rh->hash_lock);
-}
-
-static void rh_stop_recovery(struct region_hash *rh)
-{
-	int i;
-
-	/* wait for any recovering regions */
-	for (i = 0; i < MAX_RECOVERY; i++)
-		down(&rh->recovery_count);
-}
-
-static void rh_start_recovery(struct region_hash *rh)
-{
-	int i;
-
-	for (i = 0; i < MAX_RECOVERY; i++)
-		up(&rh->recovery_count);
+	struct mirror_set *ms = context;
+	struct bio *bio;
 
-	wake(rh->ms);
+	while ((bio = bio_list_pop(bio_list)))
+		queue_bio(ms, bio, WRITE);
 }
 
 #define MIN_READ_RECORDS 20
@@ -777,8 +246,8 @@ out:
 static void recovery_complete(int read_err, unsigned long write_err,
 			      void *context)
 {
-	struct region *reg = (struct region *)context;
-	struct mirror_set *ms = reg->rh->ms;
+	struct dm_region *reg = context;
+	struct mirror_set *ms = dm_rh_region_context(reg);
 	int m, bit = 0;
 
 	if (read_err) {
@@ -804,31 +273,33 @@ static void recovery_complete(int read_err, unsigned long write_err,
 		}
 	}
 
-	rh_recovery_end(reg, !(read_err || write_err));
+	dm_rh_recovery_end(reg, !(read_err || write_err));
 }
 
-static int recover(struct mirror_set *ms, struct region *reg)
+static int recover(struct mirror_set *ms, struct dm_region *reg)
 {
 	int r;
-	unsigned int i;
+	unsigned i;
 	struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
 	struct mirror *m;
 	unsigned long flags = 0;
+	region_t key = dm_rh_get_region_key(reg);
+	sector_t region_size = dm_rh_get_region_size(ms->rh);
 
 	/* fill in the source */
 	m = get_default_mirror(ms);
 	from.bdev = m->dev->bdev;
-	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
-	if (reg->key == (ms->nr_regions - 1)) {
+	from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
+	if (key == (ms->nr_regions - 1)) {
 		/*
 		 * The final region may be smaller than
 		 * region_size.
 		 */
-		from.count = ms->ti->len & (reg->rh->region_size - 1);
+		from.count = ms->ti->len & (region_size - 1);
 		if (!from.count)
-			from.count = reg->rh->region_size;
+			from.count = region_size;
 	} else
-		from.count = reg->rh->region_size;
+		from.count = region_size;
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
@@ -837,7 +308,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 
 		m = ms->mirror + i;
 		dest->bdev = m->dev->bdev;
-		dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
+		dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
 		dest->count = from.count;
 		dest++;
 	}
@@ -854,22 +325,22 @@ static int recover(struct mirror_set *ms, struct region *reg)
 
 static void do_recovery(struct mirror_set *ms)
 {
+	struct dm_region *reg;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 	int r;
-	struct region *reg;
-	struct dm_dirty_log *log = ms->rh.log;
 
 	/*
 	 * Start quiescing some regions.
 	 */
-	rh_recovery_prepare(&ms->rh);
+	dm_rh_recovery_prepare(ms->rh);
 
 	/*
 	 * Copy any already quiesced regions.
 	 */
-	while ((reg = rh_recovery_start(&ms->rh))) {
+	while ((reg = dm_rh_recovery_start(ms->rh))) {
 		r = recover(ms, reg);
 		if (r)
-			rh_recovery_end(reg, 0);
+			dm_rh_recovery_end(reg, 0);
 	}
 
 	/*
@@ -910,9 +381,10 @@ static int default_ok(struct mirror *m)
 
 static int mirror_available(struct mirror_set *ms, struct bio *bio)
 {
-	region_t region = bio_to_region(&ms->rh, bio);
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+	region_t region = dm_rh_bio_to_region(ms->rh, bio);
 
-	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+	if (log->type->in_sync(log, region, 0))
 		return choose_mirror(ms,  bio->bi_sector) ? 1 : 0;
 
 	return 0;
@@ -986,7 +458,14 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
 
 	map_region(&io, m, bio);
 	bio_set_m(bio, m);
-	(void) dm_io(&io_req, 1, &io, NULL);
+	BUG_ON(dm_io(&io_req, 1, &io, NULL));
+}
+
+static inline int region_in_sync(struct mirror_set *ms, region_t region,
+				 int may_block)
+{
+	int state = dm_rh_get_state(ms->rh, region, may_block);
+	return state == DM_RH_CLEAN || state == DM_RH_DIRTY;
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -996,13 +475,13 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 	struct mirror *m;
 
 	while ((bio = bio_list_pop(reads))) {
-		region = bio_to_region(&ms->rh, bio);
+		region = dm_rh_bio_to_region(ms->rh, bio);
 		m = get_default_mirror(ms);
 
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (likely(rh_in_sync(&ms->rh, region, 1)))
+		if (likely(region_in_sync(ms, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
 		else if (m && atomic_read(&m->error_count))
 			m = NULL;
@@ -1025,57 +504,6 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
 
-/* __bio_mark_nosync
- * @ms
- * @bio
- * @done
- * @error
- *
- * The bio was written on some mirror(s) but failed on other mirror(s).
- * We can successfully endio the bio but should avoid the region being
- * marked clean by setting the state RH_NOSYNC.
- *
- * This function is _not_ safe in interrupt context!
- */
-static void __bio_mark_nosync(struct mirror_set *ms,
-			      struct bio *bio, unsigned done, int error)
-{
-	unsigned long flags;
-	struct region_hash *rh = &ms->rh;
-	struct dm_dirty_log *log = ms->rh.log;
-	struct region *reg;
-	region_t region = bio_to_region(rh, bio);
-	int recovering = 0;
-
-	/* We must inform the log that the sync count has changed. */
-	log->type->set_region_sync(log, region, 0);
-	ms->in_sync = 0;
-
-	read_lock(&rh->hash_lock);
-	reg = __rh_find(rh, region);
-	read_unlock(&rh->hash_lock);
-
-	/* region hash entry should exist because write was in-flight */
-	BUG_ON(!reg);
-	BUG_ON(!list_empty(&reg->list));
-
-	spin_lock_irqsave(&rh->region_lock, flags);
-	/*
-	 * Possible cases:
-	 *   1) RH_DIRTY
-	 *   2) RH_NOSYNC: was dirty, other preceeding writes failed
-	 *   3) RH_RECOVERING: flushing pending writes
-	 * Either case, the region should have not been connected to list.
-	 */
-	recovering = (reg->state == RH_RECOVERING);
-	reg->state = RH_NOSYNC;
-	BUG_ON(!list_empty(&reg->list));
-	spin_unlock_irqrestore(&rh->region_lock, flags);
-
-	bio_endio(bio, error);
-	if (recovering)
-		complete_resync_work(reg, 0);
-}
 
 static void write_callback(unsigned long error, void *context)
 {
@@ -1120,7 +548,7 @@ static void write_callback(unsigned long error, void *context)
 		bio_list_add(&ms->failures, bio);
 		spin_unlock_irqrestore(&ms->lock, flags);
 		if (should_wake)
-			wake(ms);
+			wakeup_mirrord(ms);
 		return;
 	}
 out:
@@ -1150,7 +578,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	 */
 	bio_set_m(bio, get_default_mirror(ms));
 
-	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
+	BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
 }
 
 static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -1170,18 +598,19 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&recover);
 
 	while ((bio = bio_list_pop(writes))) {
-		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
+		state = dm_rh_get_state(ms->rh,
+					dm_rh_bio_to_region(ms->rh, bio), 1);
 		switch (state) {
-		case RH_CLEAN:
-		case RH_DIRTY:
+		case DM_RH_CLEAN:
+		case DM_RH_DIRTY:
 			this_list = &sync;
 			break;
 
-		case RH_NOSYNC:
+		case DM_RH_NOSYNC:
 			this_list = &nosync;
 			break;
 
-		case RH_RECOVERING:
+		case DM_RH_RECOVERING:
 			this_list = &recover;
 			break;
 		}
@@ -1194,9 +623,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	 * be written to (writes to recover regions are going to
 	 * be delayed).
 	 */
-	rh_inc_pending(&ms->rh, &sync);
-	rh_inc_pending(&ms->rh, &nosync);
-	ms->log_failure = rh_flush(&ms->rh) ? 1 : 0;
+	dm_rh_inc_pending(ms->rh, &sync);
+	dm_rh_inc_pending(ms->rh, &nosync);
+	ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0;
 
 	/*
 	 * Dispatch io.
@@ -1205,13 +634,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		spin_lock_irq(&ms->lock);
 		bio_list_merge(&ms->failures, &sync);
 		spin_unlock_irq(&ms->lock);
-		wake(ms);
+		wakeup_mirrord(ms);
 	} else
 		while ((bio = bio_list_pop(&sync)))
 			do_write(ms, bio);
 
 	while ((bio = bio_list_pop(&recover)))
-		rh_delay(&ms->rh, bio);
+		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
 		map_bio(get_default_mirror(ms), bio);
@@ -1228,7 +657,10 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 
 	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures)))
-			__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+		/* Every failed write leaves the set out of sync. */
+		while ((bio = bio_list_pop(failures))) {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
+		}
 		return;
 	}
 
@@ -1281,8 +711,8 @@ static void trigger_event(struct work_struct *work)
  *---------------------------------------------------------------*/
 static void do_mirror(struct work_struct *work)
 {
-	struct mirror_set *ms =container_of(work, struct mirror_set,
-					    kmirrord_work);
+	struct mirror_set *ms = container_of(work, struct mirror_set,
+					     kmirrord_work);
 	struct bio_list reads, writes, failures;
 	unsigned long flags;
 
@@ -1295,7 +725,7 @@ static void do_mirror(struct work_struct *work)
 	bio_list_init(&ms->failures);
 	spin_unlock_irqrestore(&ms->lock, flags);
 
-	rh_update_states(&ms->rh);
+	dm_rh_update_states(ms->rh, errors_handled(ms));
 	do_recovery(ms);
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
@@ -1304,7 +734,6 @@ static void do_mirror(struct work_struct *work)
 	dm_table_unplug_all(ms->ti->table);
 }
 
-
 /*-----------------------------------------------------------------
  * Target functions
  *---------------------------------------------------------------*/
@@ -1351,7 +780,11 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
  		return NULL;
 	}
 
-	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
+	ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord,
+				       wakeup_all_recovery_waiters,
+				       ms->ti->begin, MAX_RECOVERY,
+				       dl, region_size, ms->nr_regions);
+	if (IS_ERR(ms->rh)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
 		mempool_destroy(ms->read_record_pool);
@@ -1369,7 +802,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 		dm_put_device(ti, ms->mirror[m].dev);
 
 	dm_io_client_destroy(ms->io_client);
-	rh_exit(&ms->rh);
+	dm_region_hash_destroy(ms->rh);
 	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
@@ -1409,10 +842,10 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
  * Create dirty log: log_type #log_params <log_params>
  */
 static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
-					  unsigned int argc, char **argv,
-					  unsigned int *args_used)
+					     unsigned argc, char **argv,
+					     unsigned *args_used)
 {
-	unsigned int param_count;
+	unsigned param_count;
 	struct dm_dirty_log *dl;
 
 	if (argc < 2) {
@@ -1543,7 +976,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->private = ms;
- 	ti->split_io = ms->rh.region_size;
+	ti->split_io = dm_rh_get_region_size(ms->rh);
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1578,11 +1011,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_destroy_wq;
 	}
 
-	r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
+	r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client);
 	if (r)
 		goto err_destroy_wq;
 
-	wake(ms);
+	wakeup_mirrord(ms);
 	return 0;
 
 err_destroy_wq:
@@ -1603,22 +1036,6 @@ static void mirror_dtr(struct dm_target *ti)
 	free_context(ms, ti, ms->nr_mirrors);
 }
 
-static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
-{
-	unsigned long flags;
-	int should_wake = 0;
-	struct bio_list *bl;
-
-	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-	spin_lock_irqsave(&ms->lock, flags);
-	should_wake = !(bl->head);
-	bio_list_add(bl, bio);
-	spin_unlock_irqrestore(&ms->lock, flags);
-
-	if (should_wake)
-		wake(ms);
-}
-
 /*
  * Mirror mapping function
  */
@@ -1629,16 +1046,16 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
 	struct dm_raid1_read_record *read_record = NULL;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
 	if (rw == WRITE) {
 		/* Save region for mirror_end_io() handler */
-		map_context->ll = bio_to_region(&ms->rh, bio);
+		map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	r = ms->rh.log->type->in_sync(ms->rh.log,
-				      bio_to_region(&ms->rh, bio), 0);
+	r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
@@ -1686,7 +1103,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		rh_dec(&ms->rh, map_context->ll);
+		dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
@@ -1742,7 +1159,7 @@ out:
 static void mirror_presuspend(struct dm_target *ti)
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	struct dm_dirty_log *log = ms->rh.log;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
 	atomic_set(&ms->suspend, 1);
 
@@ -1750,10 +1167,10 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * We must finish up all the work that we've
 	 * generated (i.e. recovery work).
 	 */
-	rh_stop_recovery(&ms->rh);
+	dm_rh_stop_recovery(ms->rh);
 
 	wait_event(_kmirrord_recovery_stopped,
-		   !atomic_read(&ms->rh.recovery_in_flight));
+		   !dm_rh_recovery_in_flight(ms->rh));
 
 	if (log->type->presuspend && log->type->presuspend(log))
 		/* FIXME: need better error handling */
@@ -1771,7 +1188,7 @@ static void mirror_presuspend(struct dm_target *ti)
 static void mirror_postsuspend(struct dm_target *ti)
 {
 	struct mirror_set *ms = ti->private;
-	struct dm_dirty_log *log = ms->rh.log;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
 	if (log->type->postsuspend && log->type->postsuspend(log))
 		/* FIXME: need better error handling */
@@ -1781,13 +1198,13 @@ static void mirror_postsuspend(struct dm_target *ti)
 static void mirror_resume(struct dm_target *ti)
 {
 	struct mirror_set *ms = ti->private;
-	struct dm_dirty_log *log = ms->rh.log;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
 	atomic_set(&ms->suspend, 0);
 	if (log->type->resume && log->type->resume(log))
 		/* FIXME: need better error handling */
 		DMWARN("log resume failed");
-	rh_start_recovery(&ms->rh);
+	dm_rh_start_recovery(ms->rh);
 }
 
 /*
@@ -1819,7 +1236,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 {
 	unsigned int m, sz = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	struct dm_dirty_log *log = ms->rh.log;
+	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 	char buffer[ms->nr_mirrors + 1];
 
 	switch (type) {
@@ -1832,15 +1249,15 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 		buffer[m] = '\0';
 
 		DMEMIT("%llu/%llu 1 %s ",
-		      (unsigned long long)log->type->get_sync_count(ms->rh.log),
+		      (unsigned long long)log->type->get_sync_count(log),
 		      (unsigned long long)ms->nr_regions, buffer);
 
-		sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
+		sz += log->type->status(log, type, result+sz, maxlen-sz);
 
 		break;
 
 	case STATUSTYPE_TABLE:
-		sz = log->type->status(ms->rh.log, type, result, maxlen);
+		sz = log->type->status(log, type, result, maxlen);
 
 		DMEMIT("%d", ms->nr_mirrors);
 		for (m = 0; m < ms->nr_mirrors; m++)
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
new file mode 100644
index 000000000000..59f8d9df9e1a
--- /dev/null
+++ b/drivers/md/dm-region-hash.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+
+#define	DM_MSG_PREFIX	"region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions.  Each
+ * region can be in one of three states: clean, dirty,
+ * nosync.  There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ *   clean_regions: Regions on this list have no io pending to
+ *   them, they are in sync, we are no longer interested in them,
+ *   they are dull.  dm_rh_update_states() will remove them from the
+ *   hash table.
+ *
+ *   quiesced_regions: These regions have been spun down, ready
+ *   for recovery.  dm_rh_recovery_start() will remove regions from
+ *   this list and hand them to kmirrord, which will schedule the
+ *   recovery io with kcopyd.
+ *
+ *   recovered_regions: Regions that kcopyd has successfully
+ *   recovered.  dm_rh_update_states() will now schedule any delayed
+ *   io, up the recovery_count, and remove the region from the
+ *   hash.
+ *
+ * There are 2 locks:
+ *   A rw spin lock 'hash_lock' protects just the hash table,
+ *   this is never held in write mode from interrupt context,
+ *   which I believe means that we only have to disable irqs when
+ *   doing a write lock.
+ *
+ *   An ordinary spin lock 'region_lock' that protects the three
+ *   lists in the region_hash, with the 'state', 'list' and
+ *   'delayed_bios' fields of the regions.  This is used from irq
+ *   context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+struct dm_region_hash {
+	uint32_t region_size;
+	unsigned region_shift;
+
+	/* holds persistent region state */
+	struct dm_dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	mempool_t *region_pool;
+	unsigned mask;
+	unsigned nr_buckets;
+	unsigned prime;
+	unsigned shift;
+	struct list_head *buckets;
+
+	unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+	spinlock_t region_lock;
+	atomic_t recovery_in_flight;
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+	struct list_head failed_recovered_regions;
+
+	void *context;
+	sector_t target_begin;
+
+	/* Callback function to schedule bio writes */
+	void (*dispatch_bios)(void *context, struct bio_list *bios);
+
+	/* Callback function to wake up the caller's worker thread. */
+	void (*wakeup_workers)(void *context);
+
+	/* Callback function to wake up the caller's recovery waiters. */
+	void (*wakeup_all_recovery_waiters)(void *context);
+};
+
+struct dm_region {
+	struct dm_region_hash *rh;	/* FIXME: can we get rid of this ? */
+	region_t key;
+	int state;
+
+	struct list_head hash_list;
+	struct list_head list;
+
+	atomic_t pending;
+	struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+{
+	return sector >> rh->region_shift;
+}
+
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
+{
+	return region << rh->region_shift;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
+
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
+{
+	return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
+}
+EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
+
+void *dm_rh_region_context(struct dm_region *reg)
+{
+	return reg->rh->context;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_context);
+
+region_t dm_rh_get_region_key(struct dm_region *reg)
+{
+	return reg->key;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
+
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
+{
+	return rh->region_size;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
+
+/*
+ * FIXME: shall we pass in a structure instead of all these args to
+ * dm_region_hash_create()????
+ */
+#define RH_HASH_MULT 2654435387U
+#define RH_HASH_SHIFT 12
+
+#define MIN_REGIONS 64
+struct dm_region_hash *dm_region_hash_create(
+		void *context, void (*dispatch_bios)(void *context,
+						     struct bio_list *bios),
+		void (*wakeup_workers)(void *context),
+		void (*wakeup_all_recovery_waiters)(void *context),
+		sector_t target_begin, unsigned max_recovery,
+		struct dm_dirty_log *log, uint32_t region_size,
+		region_t nr_regions)
+{
+	struct dm_region_hash *rh;
+	unsigned nr_buckets, max_buckets;
+	size_t i;
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+		;
+	nr_buckets >>= 1;
+
+	rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+	if (!rh) {
+		DMERR("unable to allocate region hash memory");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	rh->context = context;
+	rh->dispatch_bios = dispatch_bios;
+	rh->wakeup_workers = wakeup_workers;
+	rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
+	rh->target_begin = target_begin;
+	rh->max_recovery = max_recovery;
+	rh->log = log;
+	rh->region_size = region_size;
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->shift = RH_HASH_SHIFT;
+	rh->prime = RH_HASH_MULT;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash bucket memory");
+		kfree(rh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	sema_init(&rh->recovery_count, 0);
+	atomic_set(&rh->recovery_in_flight, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+	INIT_LIST_HEAD(&rh->failed_recovered_regions);
+
+	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
+						      sizeof(struct dm_region));
+	if (!rh->region_pool) {
+		vfree(rh->buckets);
+		kfree(rh);
+		rh = ERR_PTR(-ENOMEM);
+	}
+
+	return rh;
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_create);
+
+void dm_region_hash_destroy(struct dm_region_hash *rh)
+{
+	unsigned h;
+	struct dm_region *reg, *nreg;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+	for (h = 0; h < rh->nr_buckets; h++) {
+		list_for_each_entry_safe(reg, nreg, rh->buckets + h,
+					 hash_list) {
+			BUG_ON(atomic_read(&reg->pending));
+			mempool_free(reg, rh->region_pool);
+		}
+	}
+
+	if (rh->log)
+		dm_dirty_log_destroy(rh->log);
+
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+
+	vfree(rh->buckets);
+	kfree(rh);
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
+
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
+{
+	return rh->log;
+}
+EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
+
+static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
+{
+	return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
+}
+
+static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+	struct list_head *bucket = rh->buckets + rh_hash(rh, region);
+
+	list_for_each_entry(reg, bucket, hash_list)
+		if (reg->key == region)
+			return reg;
+
+	return NULL;
+}
+
+static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
+{
+	list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
+}
+
+static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg, *nreg;
+
+	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	if (unlikely(!nreg))
+		nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
+
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		      DM_RH_CLEAN : DM_RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+	INIT_LIST_HEAD(&nreg->list);
+	atomic_set(&nreg->pending, 0);
+	bio_list_init(&nreg->delayed_bios);
+
+	write_lock_irq(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* We lost the race. */
+		mempool_free(nreg, rh->region_pool);
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == DM_RH_CLEAN) {
+			spin_lock(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock(&rh->region_lock);
+		}
+
+		reg = nreg;
+	}
+	write_unlock_irq(&rh->hash_lock);
+
+	return reg;
+}
+
+static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg) {
+		read_unlock(&rh->hash_lock);
+		reg = __rh_alloc(rh, region);
+		read_lock(&rh->hash_lock);
+	}
+
+	return reg;
+}
+
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
+{
+	int r;
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (reg)
+		return reg->state;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the
+	 * dirty log.
+	 */
+	r = rh->log->type->in_sync(rh->log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+	 * taken as a DM_RH_NOSYNC
+	 */
+	return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_state);
+
+static void complete_resync_work(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+
+	rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+	/*
+	 * Dispatch the bios before we wake the recovery waiters.
+	 * This is important because if we are suspending,
+	 * we want to know that recovery is complete and
+	 * the work queue is flushed.  If we wake the waiters
+	 * before we dispatch_bios (queue bios and call wake()),
+	 * then we risk suspending before the work queue
+	 * has been properly flushed.
+	 */
+	rh->dispatch_bios(rh->context, &reg->delayed_bios);
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+	up(&rh->recovery_count);
+}
+
+/* dm_rh_mark_nosync
+ * @rh
+ * @bio
+ * @done
+ * @error
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state DM_RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+void dm_rh_mark_nosync(struct dm_region_hash *rh,
+		       struct bio *bio, unsigned done, int error)
+{
+	unsigned long flags;
+	struct dm_dirty_log *log = rh->log;
+	struct dm_region *reg;
+	region_t region = dm_rh_bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* region hash entry should exist because write was in-flight */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) DM_RH_DIRTY
+	 *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
+	 *   3) DM_RH_RECOVERING: flushing pending writes
+	 * In every case, the region should not be connected to any list.
+	 */
+	recovering = (reg->state == DM_RH_RECOVERING);
+	reg->state = DM_RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
+
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
+{
+	struct dm_region *reg, *next;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+	LIST_HEAD(failed_recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice_init(&rh->clean_regions, &clean);
+
+		list_for_each_entry(reg, &clean, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice_init(&rh->recovered_regions, &recovered);
+
+		list_for_each_entry(reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->failed_recovered_regions)) {
+		list_splice_init(&rh->failed_recovered_regions,
+				 &failed_recovered);
+
+		list_for_each_entry(reg, &failed_recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_entry_safe(reg, next, &recovered, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		complete_resync_work(reg, 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
+		complete_resync_work(reg, errors_handled ? 0 : 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &clean, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_update_states);
+
+static void rh_inc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+
+	spin_lock_irq(&rh->region_lock);
+	atomic_inc(&reg->pending);
+
+	if (reg->state == DM_RH_CLEAN) {
+		reg->state = DM_RH_DIRTY;
+		list_del_init(&reg->list);	/* take off the clean list */
+		spin_unlock_irq(&rh->region_lock);
+
+		rh->log->type->mark_region(rh->log, reg->key);
+	} else
+		spin_unlock_irq(&rh->region_lock);
+
+
+	read_unlock(&rh->hash_lock);
+}
+
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
+{
+	struct bio *bio;
+
+	for (bio = bios->head; bio; bio = bio->bi_next)
+		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+}
+EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
+
+void dm_rh_dec(struct dm_region_hash *rh, region_t region)
+{
+	unsigned long flags;
+	struct dm_region *reg;
+	int should_wake = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	if (atomic_dec_and_test(&reg->pending)) {
+		/*
+		 * There is no pending I/O for this region.
+		 * We can move the region to corresponding list for next action.
+		 * At this point, the region is not yet connected to any list.
+		 *
+		 * If the state is DM_RH_NOSYNC, the region should be kept off
+		 * the clean list.
+		 * The hash entry for DM_RH_NOSYNC will remain in memory
+		 * until the region is recovered or the map is reloaded.
+		 */
+
+		/* do nothing for DM_RH_NOSYNC */
+		if (reg->state == DM_RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else if (reg->state == DM_RH_DIRTY) {
+			reg->state = DM_RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+		should_wake = 1;
+	}
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	if (should_wake)
+		rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	int r;
+	region_t region;
+	struct dm_region *reg;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+	if (r <= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+	reg->state = DM_RH_RECOVERING;
+
+	/* Already quiesced ? */
+	if (atomic_read(&reg->pending))
+		list_del_init(&reg->list);
+	else
+		list_move(&reg->list, &rh->quiesced_regions);
+
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+void dm_rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	/* Extra reference to avoid race with dm_rh_stop_recovery */
+	atomic_inc(&rh->recovery_in_flight);
+
+	while (!down_trylock(&rh->recovery_count)) {
+		atomic_inc(&rh->recovery_in_flight);
+		if (__rh_recovery_prepare(rh) <= 0) {
+			atomic_dec(&rh->recovery_in_flight);
+			up(&rh->recovery_count);
+			break;
+		}
+	}
+
+	/* Drop the extra reference */
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
+
+/*
+ * Returns any quiesced regions.
+ */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
+{
+	struct dm_region *reg = NULL;
+
+	spin_lock_irq(&rh->region_lock);
+	if (!list_empty(&rh->quiesced_regions)) {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct dm_region, list);
+		list_del_init(&reg->list);  /* remove from the quiesced list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return reg;
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
+
+void dm_rh_recovery_end(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+
+	spin_lock_irq(&rh->region_lock);
+	if (success)
+		list_add(&reg->list, &reg->rh->recovered_regions);
+	else {
+		reg->state = DM_RH_NOSYNC;
+		list_add(&reg->list, &reg->rh->failed_recovered_regions);
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
+
+/* Return recovery in flight count. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
+{
+	return atomic_read(&rh->recovery_in_flight);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
+
+int dm_rh_flush(struct dm_region_hash *rh)
+{
+	return rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_flush);
+
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
+{
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay);
+
+void dm_rh_stop_recovery(struct dm_region_hash *rh)
+{
+	int i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i < rh->max_recovery; i++)
+		down(&rh->recovery_count);
+}
+EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
+
+void dm_rh_start_recovery(struct dm_region_hash *rh)
+{
+	int i;
+
+	for (i = 0; i < rh->max_recovery; i++)
+		up(&rh->recovery_count);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/dm-region-hash.h b/include/linux/dm-region-hash.h
new file mode 100644
index 000000000000..a9e652a41373
--- /dev/null
+++ b/include/linux/dm-region-hash.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-Mapper dirty region hash interface.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_REGION_HASH_H
+#define DM_REGION_HASH_H
+
+#include <linux/dm-dirty-log.h>
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *----------------------------------------------------------------*/
+struct dm_region_hash;
+struct dm_region;
+
+/*
+ * States a region can have.
+ */
+enum dm_rh_region_states {
+	DM_RH_CLEAN	 = 0x01,	/* No writes in flight. */
+	DM_RH_DIRTY	 = 0x02,	/* Writes in flight. */
+	DM_RH_NOSYNC	 = 0x04,	/* Out of sync. */
+	DM_RH_RECOVERING = 0x08,	/* Under resynchronization. */
+};
+
+/*
+ * Region hash create/destroy.
+ */
+struct bio_list;
+struct dm_region_hash *dm_region_hash_create(
+		void *context, void (*dispatch_bios)(void *context,
+						     struct bio_list *bios),
+		void (*wakeup_workers)(void *context),
+		void (*wakeup_all_recovery_waiters)(void *context),
+		sector_t target_begin, unsigned max_recovery,
+		struct dm_dirty_log *log, uint32_t region_size,
+		region_t nr_regions);
+void dm_region_hash_destroy(struct dm_region_hash *rh);
+
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh);
+
+/*
+ * Conversion functions.
+ */
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio);
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region);
+void *dm_rh_region_context(struct dm_region *reg);
+
+/*
+ * Get region size and key (ie. number of the region).
+ */
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh);
+region_t dm_rh_get_region_key(struct dm_region *reg);
+
+/*
+ * Get/set/update region state (and dirty log).
+ *
+ */
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block);
+void dm_rh_set_state(struct dm_region_hash *rh, region_t region,
+		     enum dm_rh_region_states state, int may_block);
+
+/* Non-zero errors_handled leaves the state of the region NOSYNC */
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled);
+
+/* Flush the region hash and dirty log. */
+int dm_rh_flush(struct dm_region_hash *rh);
+
+/* Inc/dec pending count on regions. */
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios);
+void dm_rh_dec(struct dm_region_hash *rh, region_t region);
+
+/* Delay bios on regions. */
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
+
+void dm_rh_mark_nosync(struct dm_region_hash *rh,
+		       struct bio *bio, unsigned done, int error);
+
+/*
+ * Region recovery control.
+ */
+
+/* Prepare some regions for recovery by starting to quiesce them. */
+void dm_rh_recovery_prepare(struct dm_region_hash *rh);
+
+/* Try fetching a quiesced region for recovery. */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh);
+
+/* Report recovery end on a region. */
+void dm_rh_recovery_end(struct dm_region *reg, int error);
+
+/* Returns number of regions with recovery work outstanding. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh);
+
+/* Start/stop recovery. */
+void dm_rh_start_recovery(struct dm_region_hash *rh);
+void dm_rh_stop_recovery(struct dm_region_hash *rh);
+
+#endif /* DM_REGION_HASH_H */
-- 
cgit v1.2.3


From 5e458cc0f4770eea45d3c07110f01b3a94c72aa5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 22 Oct 2008 10:00:13 -0500
Subject: module: simplify load_module.

Linus' recent catch of stack overflow in load_module led me to look
at the code.  A couple of helpers to get a section address and get
objects from a section can help clean things up a little.

(And in case you're wondering, the stack size also dropped from 328 to
284 bytes).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |   2 +-
 kernel/module.c        | 235 +++++++++++++++++++++----------------------------
 2 files changed, 100 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 5d2970cdce93..eddf27db442b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -277,7 +277,7 @@ struct module
 
 	/* Exception table */
 	unsigned int num_exentries;
-	const struct exception_table_entry *extable;
+	struct exception_table_entry *extable;
 
 	/* Startup function. */
 	int (*init)(void);
diff --git a/kernel/module.c b/kernel/module.c
index 0d8d21ee792c..3d256681ab64 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -132,6 +132,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
 	return 0;
 }
 
+/* Find a module section, or NULL. */
+static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
+			  const char *secstrings, const char *name)
+{
+	/* Section 0 has sh_addr 0. */
+	return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+}
+
+/* Find a module section, or NULL.  Fill in number of "objects" in section. */
+static void *section_objs(Elf_Ehdr *hdr,
+			  Elf_Shdr *sechdrs,
+			  const char *secstrings,
+			  const char *name,
+			  size_t object_size,
+			  unsigned int *num)
+{
+	unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+
+	/* Section 0 has sh_addr 0 and sh_size 0. */
+	*num = sechdrs[sec].sh_size / object_size;
+	return (void *)sechdrs[sec].sh_addr;
+}
+
 /* Provided by the linker */
 extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
@@ -1789,32 +1812,20 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
 
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
-static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
+static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
 {
-	struct mod_debug *debug_info;
-	unsigned long pos, end;
-	unsigned int num_verbose;
-
-	pos = sechdrs[verboseindex].sh_addr;
-	num_verbose = sechdrs[verboseindex].sh_size /
-				sizeof(struct mod_debug);
-	end = pos + (num_verbose * sizeof(struct mod_debug));
+#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+	unsigned int i;
 
-	for (; pos < end; pos += sizeof(struct mod_debug)) {
-		debug_info = (struct mod_debug *)pos;
-		register_dynamic_debug_module(debug_info->modname,
-			debug_info->type, debug_info->logical_modname,
-			debug_info->flag_names, debug_info->hash,
-			debug_info->hash2);
+	for (i = 0; i < num; i++) {
+		register_dynamic_debug_module(debug[i].modname,
+					      debug[i].type,
+					      debug[i].logical_modname,
+					      debug[i].flag_names,
+					      debug[i].hash, debug[i].hash2);
 	}
-}
-#else
-static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
-					unsigned int verboseindex)
-{
-}
 #endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+}
 
 static void *module_alloc_update_bounds(unsigned long size)
 {
@@ -1843,37 +1854,14 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int i;
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
-	unsigned int setupindex;
-	unsigned int exindex;
-	unsigned int exportindex;
-	unsigned int modindex;
-	unsigned int obsparmindex;
-	unsigned int infoindex;
-	unsigned int gplindex;
-	unsigned int crcindex;
-	unsigned int gplcrcindex;
-	unsigned int versindex;
-	unsigned int pcpuindex;
-	unsigned int gplfutureindex;
-	unsigned int gplfuturecrcindex;
+	unsigned int modindex, versindex, infoindex, pcpuindex;
 	unsigned int unwindex = 0;
-#ifdef CONFIG_UNUSED_SYMBOLS
-	unsigned int unusedindex;
-	unsigned int unusedcrcindex;
-	unsigned int unusedgplindex;
-	unsigned int unusedgplcrcindex;
-#endif
-	unsigned int markersindex;
-	unsigned int markersstringsindex;
-	unsigned int verboseindex;
-	unsigned int tracepointsindex;
-	unsigned int tracepointsstringsindex;
-	unsigned int mcountindex;
+	unsigned int num_kp, num_mcount;
+	struct kernel_param *kp;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-	void *mseg;
-	struct exception_table_entry *extable;
+	unsigned long *mseg;
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1937,6 +1925,7 @@ static noinline struct module *load_module(void __user *umod,
 		err = -ENOEXEC;
 		goto free_hdr;
 	}
+	/* This is temporary: point mod into copy of data. */
 	mod = (void *)sechdrs[modindex].sh_addr;
 
 	if (symindex == 0) {
@@ -1946,22 +1935,6 @@ static noinline struct module *load_module(void __user *umod,
 		goto free_hdr;
 	}
 
-	/* Optional sections */
-	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
-	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
-	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
-	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
-	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
-	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
-#ifdef CONFIG_UNUSED_SYMBOLS
-	unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
-	unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
-	unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
-	unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
-#endif
-	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
-	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
-	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
 	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -2117,42 +2090,57 @@ static noinline struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto cleanup;
 
-	/* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
-	mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
-	mod->syms = (void *)sechdrs[exportindex].sh_addr;
-	if (crcindex)
-		mod->crcs = (void *)sechdrs[crcindex].sh_addr;
-	mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
-	mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
-	if (gplcrcindex)
-		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
-	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
-					sizeof(*mod->gpl_future_syms);
-	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
-	if (gplfuturecrcindex)
-		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+	/* Now we've got everything in the final locations, we can
+	 * find optional sections. */
+	kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
+			  &num_kp);
+	mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
+				 sizeof(*mod->syms), &mod->num_syms);
+	mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
+	mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
+				     sizeof(*mod->gpl_syms),
+				     &mod->num_gpl_syms);
+	mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+	mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
+					    "__ksymtab_gpl_future",
+					    sizeof(*mod->gpl_future_syms),
+					    &mod->num_gpl_future_syms);
+	mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
+					    "__kcrctab_gpl_future");
 
 #ifdef CONFIG_UNUSED_SYMBOLS
-	mod->num_unused_syms = sechdrs[unusedindex].sh_size /
-					sizeof(*mod->unused_syms);
-	mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
-					sizeof(*mod->unused_gpl_syms);
-	mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
-	if (unusedcrcindex)
-		mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
-	mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
-	if (unusedgplcrcindex)
-		mod->unused_gpl_crcs
-			= (void *)sechdrs[unusedgplcrcindex].sh_addr;
+	mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
+					"__ksymtab_unused",
+					sizeof(*mod->unused_syms),
+					&mod->num_unused_syms);
+	mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
+					"__kcrctab_unused");
+	mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
+					    "__ksymtab_unused_gpl",
+					    sizeof(*mod->unused_gpl_syms),
+					    &mod->num_unused_gpl_syms);
+	mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
+					    "__kcrctab_unused_gpl");
+#endif
+
+#ifdef CONFIG_MARKERS
+	mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
+				    sizeof(*mod->markers), &mod->num_markers);
+#endif
+#ifdef CONFIG_TRACEPOINTS
+	mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
+					"__tracepoints",
+					sizeof(*mod->tracepoints),
+					&mod->num_tracepoints);
 #endif
 
 #ifdef CONFIG_MODVERSIONS
-	if ((mod->num_syms && !crcindex)
-	    || (mod->num_gpl_syms && !gplcrcindex)
-	    || (mod->num_gpl_future_syms && !gplfuturecrcindex)
+	if ((mod->num_syms && !mod->crcs)
+	    || (mod->num_gpl_syms && !mod->gpl_crcs)
+	    || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
 #ifdef CONFIG_UNUSED_SYMBOLS
-	    || (mod->num_unused_syms && !unusedcrcindex)
-	    || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+	    || (mod->num_unused_syms && !mod->unused_crcs)
+	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
 #endif
 		) {
 		printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
@@ -2161,16 +2149,6 @@ static noinline struct module *load_module(void __user *umod,
 			goto cleanup;
 	}
 #endif
-	markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
- 	markersstringsindex = find_sec(hdr, sechdrs, secstrings,
-					"__markers_strings");
-	verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
-	tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints");
-	tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
-					"__tracepoints_strings");
-
-	mcountindex = find_sec(hdr, sechdrs, secstrings,
-			       "__mcount_loc");
 
 	/* Now do relocations. */
 	for (i = 1; i < hdr->e_shnum; i++) {
@@ -2193,28 +2171,16 @@ static noinline struct module *load_module(void __user *umod,
 		if (err < 0)
 			goto cleanup;
 	}
-#ifdef CONFIG_MARKERS
-	mod->markers = (void *)sechdrs[markersindex].sh_addr;
-	mod->num_markers =
-		sechdrs[markersindex].sh_size / sizeof(*mod->markers);
-#endif
-#ifdef CONFIG_TRACEPOINTS
-	mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr;
-	mod->num_tracepoints =
-		sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints);
-#endif
-
 
         /* Find duplicate symbols */
 	err = verify_export_symbols(mod);
-
 	if (err < 0)
 		goto cleanup;
 
   	/* Set up and sort exception table */
-	mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
-	mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
-	sort_extable(extable, extable + mod->num_exentries);
+	mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
+				    sizeof(*mod->extable), &mod->num_exentries);
+	sort_extable(mod->extable, mod->extable + mod->num_exentries);
 
 	/* Finally, copy percpu area over. */
 	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2223,11 +2189,17 @@ static noinline struct module *load_module(void __user *umod,
 	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
 
 	if (!mod->taints) {
+		struct mod_debug *debug;
+		unsigned int num_debug;
+
 #ifdef CONFIG_MARKERS
 		marker_update_probe_range(mod->markers,
 			mod->markers + mod->num_markers);
 #endif
-	dynamic_printk_setup(sechdrs, verboseindex);
+		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
+				     sizeof(*debug), &num_debug);
+		dynamic_printk_setup(debug, num_debug);
+
 #ifdef CONFIG_TRACEPOINTS
 		tracepoint_update_probe_range(mod->tracepoints,
 			mod->tracepoints + mod->num_tracepoints);
@@ -2235,8 +2207,9 @@ static noinline struct module *load_module(void __user *umod,
 	}
 
 	/* sechdrs[0].sh_size is always zero */
-	mseg = (void *)sechdrs[mcountindex].sh_addr;
-	ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
+	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
+			    sizeof(*mseg), &num_mcount);
+	ftrace_init_module(mseg, mseg + num_mcount);
 
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
@@ -2261,7 +2234,7 @@ static noinline struct module *load_module(void __user *umod,
 	set_fs(old_fs);
 
 	mod->args = args;
-	if (obsparmindex)
+	if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
 		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
 		       mod->name);
 
@@ -2270,21 +2243,11 @@ static noinline struct module *load_module(void __user *umod,
          * strong_try_module_get() will fail. */
 	stop_machine(__link_module, mod, NULL);
 
-	/* Size of section 0 is 0, so this works well if no params */
-	err = parse_args(mod->name, mod->args,
-			 (struct kernel_param *)
-			 sechdrs[setupindex].sh_addr,
-			 sechdrs[setupindex].sh_size
-			 / sizeof(struct kernel_param),
-			 NULL);
+	err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
 	if (err < 0)
 		goto unlink;
 
-	err = mod_sysfs_setup(mod,
-			      (struct kernel_param *)
-			      sechdrs[setupindex].sh_addr,
-			      sechdrs[setupindex].sh_size
-			      / sizeof(struct kernel_param));
+	err = mod_sysfs_setup(mod, kp, num_kp);
 	if (err < 0)
 		goto unlink;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
-- 
cgit v1.2.3


From 730b69d225259565c705f5f5a11cb1aba69568f1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 22 Oct 2008 10:00:22 -0500
Subject: module: check kernel param length at compile time, not runtime

The kparam code tries to handle over-length parameter prefixes at
runtime.  Not only would I bet this has never been tested, it's not
clear that truncating names is a good idea either.

So let's check at compile time.  We need to move the #define to
moduleparam.h to do this, though.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h      | 2 +-
 include/linux/moduleparam.h | 6 +++++-
 kernel/params.c             | 7 ++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index eddf27db442b..196b499270da 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -29,7 +29,7 @@
 #define MODULE_SYMBOL_PREFIX ""
 #endif
 
-#define MODULE_NAME_LEN (64 - sizeof(unsigned long))
+#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN
 
 struct kernel_symbol
 {
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index ec624381c844..1eefe6d61b86 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -13,6 +13,9 @@
 #define MODULE_PARAM_PREFIX KBUILD_MODNAME "."
 #endif
 
+/* Chosen so that structs with an unsigned long line up. */
+#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))
+
 #ifdef MODULE
 #define ___module_cat(a,b) __mod_ ## a ## b
 #define __module_cat(a,b) ___module_cat(a,b)
@@ -79,7 +82,8 @@ struct kparam_array
 #define __module_param_call(prefix, name, set, get, arg, perm)		\
 	/* Default value instead of permissions? */			\
 	static int __param_perm_check_##name __attribute__((unused)) =	\
-	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2));	\
+	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2))	\
+	+ BUILD_BUG_ON_ZERO(sizeof(""prefix) > MAX_PARAM_PREFIX_LEN);	\
 	static const char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6d..aca07e1a050f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -585,17 +585,14 @@ static void __init param_sysfs_builtin(void)
 {
 	struct kernel_param *kp, *kp_begin = NULL;
 	unsigned int i, name_len, count = 0;
-	char modname[MODULE_NAME_LEN + 1] = "";
+	char modname[MODULE_NAME_LEN] = "";
 
 	for (i=0; i < __stop___param - __start___param; i++) {
 		char *dot;
-		size_t max_name_len;
 
 		kp = &__start___param[i];
-		max_name_len =
-			min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
 
-		dot = memchr(kp->name, '.', max_name_len);
+		dot = strchr(kp->name, '.');
 		if (!dot) {
 			DEBUGP("couldn't find period in first %d characters "
 			       "of %s\n", MODULE_NAME_LEN, kp->name);
-- 
cgit v1.2.3


From 9b473de87209fa86eb421b23386693b461612f30 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 22 Oct 2008 10:00:22 -0500
Subject: param: Fix duplicate module prefixes

Instead of insisting each new module_param sysfs entry is unique,
handle the case where it already exists (for builtin modules).

The current code assumes that all identical prefixes are together in
the section: true for normal uses, but not necessarily so if someone
overrides MODULE_PARAM_PREFIX.  More importantly, it's not true with
the new "core_param()" code which uses "kernel" as a prefix.

This simplifies the caller for the builtin case, at a slight loss of
efficiency (we do the lookup every time to see if the directory
exists).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/module.h |   2 +-
 kernel/params.c        | 261 ++++++++++++++++++++++++++-----------------------
 2 files changed, 142 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 196b499270da..3bfed013350b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -60,6 +60,7 @@ struct module_kobject
 	struct kobject kobj;
 	struct module *mod;
 	struct kobject *drivers_dir;
+	struct module_param_attrs *mp;
 };
 
 /* These are either module local, or the kernel's dummy ones. */
@@ -242,7 +243,6 @@ struct module
 
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
-	struct module_param_attrs *param_attrs;
 	struct module_attribute *modinfo_attrs;
 	const char *version;
 	const char *srcversion;
diff --git a/kernel/params.c b/kernel/params.c
index aca07e1a050f..f27c992a4625 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
 }
 
 /* sysfs output in /sys/modules/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
 
 extern struct kernel_param __start___param[], __stop___param[];
 
@@ -384,6 +386,7 @@ struct param_attribute
 
 struct module_param_attrs
 {
+	unsigned int num;
 	struct attribute_group grp;
 	struct param_attribute attrs[0];
 };
@@ -434,69 +437,84 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 
 #ifdef CONFIG_SYSFS
 /*
- * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
- * @mk: struct module_kobject (contains parent kobject)
- * @kparam: array of struct kernel_param, the actual parameter definitions
- * @num_params: number of entries in array
- * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kparam: the actual parameter definition to add to sysfs
+ * @name: name of parameter
  *
- * Create a kobject for a (per-module) group of parameters, and create files
- * in sysfs. A pointer to the param_kobject is returned on success,
- * NULL if there's no parameter to export, or other ERR_PTR(err).
+ * Create a kobject for a (per-module) parameter if mp is NULL, and
+ * create file in sysfs.  Returns an error on out of memory.  Always cleans up
+ * if there's an error.
  */
-static __modinit struct module_param_attrs *
-param_sysfs_setup(struct module_kobject *mk,
-		  struct kernel_param *kparam,
-		  unsigned int num_params,
-		  unsigned int name_skip)
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+				     struct kernel_param *kp,
+				     const char *name)
 {
-	struct module_param_attrs *mp;
-	unsigned int valid_attrs = 0;
-	unsigned int i, size[2];
-	struct param_attribute *pattr;
-	struct attribute **gattr;
-	int err;
-
-	for (i=0; i<num_params; i++) {
-		if (kparam[i].perm)
-			valid_attrs++;
+	struct module_param_attrs *new;
+	struct attribute **attrs;
+	int err, num;
+
+	/* We don't bother calling this with invisible parameters. */
+	BUG_ON(!kp->perm);
+
+	if (!mk->mp) {
+		num = 0;
+		attrs = NULL;
+	} else {
+		num = mk->mp->num;
+		attrs = mk->mp->grp.attrs;
 	}
 
-	if (!valid_attrs)
-		return NULL;
-
-	size[0] = ALIGN(sizeof(*mp) +
-			valid_attrs * sizeof(mp->attrs[0]),
-			sizeof(mp->grp.attrs[0]));
-	size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
-
-	mp = kzalloc(size[0] + size[1], GFP_KERNEL);
-	if (!mp)
-		return ERR_PTR(-ENOMEM);
+	/* Enlarge. */
+	new = krealloc(mk->mp,
+		       sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
+		       GFP_KERNEL);
+	if (!new) {
+		kfree(mk->mp);
+		err = -ENOMEM;
+		goto fail;
+	}
+	attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
+	if (!attrs) {
+		err = -ENOMEM;
+		goto fail_free_new;
+	}
 
-	mp->grp.name = "parameters";
-	mp->grp.attrs = (void *)mp + size[0];
+	/* Sysfs wants everything zeroed. */
+	memset(new, 0, sizeof(*new));
+	memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
+	memset(&attrs[num], 0, sizeof(attrs[num]));
+	new->grp.name = "parameters";
+	new->grp.attrs = attrs;
+
+	/* Tack new one on the end. */
+	new->attrs[num].param = kp;
+	new->attrs[num].mattr.show = param_attr_show;
+	new->attrs[num].mattr.store = param_attr_store;
+	new->attrs[num].mattr.attr.name = (char *)name;
+	new->attrs[num].mattr.attr.mode = kp->perm;
+	new->num = num+1;
+
+	/* Fix up all the pointers, since krealloc can move us */
+	for (num = 0; num < new->num; num++)
+		new->grp.attrs[num] = &new->attrs[num].mattr.attr;
+	new->grp.attrs[num] = NULL;
+
+	mk->mp = new;
+	return 0;
 
-	pattr = &mp->attrs[0];
-	gattr = &mp->grp.attrs[0];
-	for (i = 0; i < num_params; i++) {
-		struct kernel_param *kp = &kparam[i];
-		if (kp->perm) {
-			pattr->param = kp;
-			pattr->mattr.show = param_attr_show;
-			pattr->mattr.store = param_attr_store;
-			pattr->mattr.attr.name = (char *)&kp->name[name_skip];
-			pattr->mattr.attr.mode = kp->perm;
-			*(gattr++) = &(pattr++)->mattr.attr;
-		}
-	}
-	*gattr = NULL;
+fail_free_new:
+	kfree(new);
+fail:
+	mk->mp = NULL;
+	return err;
+}
 
-	if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
-		kfree(mp);
-		return ERR_PTR(err);
-	}
-	return mp;
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+	kfree(mk->mp->grp.attrs);
+	kfree(mk->mp);
+	mk->mp = NULL;
 }
 
 #ifdef CONFIG_MODULES
@@ -506,21 +524,33 @@ param_sysfs_setup(struct module_kobject *mk,
  * @kparam: module parameters (array)
  * @num_params: number of module parameters
  *
- * Adds sysfs entries for module parameters, and creates a link from
- * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
  */
 int module_param_sysfs_setup(struct module *mod,
 			     struct kernel_param *kparam,
 			     unsigned int num_params)
 {
-	struct module_param_attrs *mp;
+	int i, err;
+	bool params = false;
+
+	for (i = 0; i < num_params; i++) {
+		if (kparam[i].perm == 0)
+			continue;
+		err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+		if (err)
+			return err;
+		params = true;
+	}
 
-	mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
-	if (IS_ERR(mp))
-		return PTR_ERR(mp);
+	if (!params)
+		return 0;
 
-	mod->param_attrs = mp;
-	return 0;
+	/* Create the param group. */
+	err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+	if (err)
+		free_module_param_attrs(&mod->mkobj);
+	return err;
 }
 
 /*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
  */
 void module_param_sysfs_remove(struct module *mod)
 {
-	if (mod->param_attrs) {
-		sysfs_remove_group(&mod->mkobj.kobj,
-				   &mod->param_attrs->grp);
+	if (mod->mkobj.mp) {
+		sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
 		/* We are positive that no one is using any param
 		 * attrs at this point.  Deallocate immediately. */
-		kfree(mod->param_attrs);
-		mod->param_attrs = NULL;
+		free_module_param_attrs(&mod->mkobj);
 	}
 }
 #endif
 
-/*
- * kernel_param_sysfs_setup - wrapper for built-in params support
- */
-static void __init kernel_param_sysfs_setup(const char *name,
-					    struct kernel_param *kparam,
-					    unsigned int num_params,
-					    unsigned int name_skip)
+static void __init kernel_add_sysfs_param(const char *name,
+					  struct kernel_param *kparam,
+					  unsigned int name_skip)
 {
 	struct module_kobject *mk;
-	int ret;
+	struct kobject *kobj;
+	int err;
 
-	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
-	BUG_ON(!mk);
-
-	mk->mod = THIS_MODULE;
-	mk->kobj.kset = module_kset;
-	ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
-	if (ret) {
-		kobject_put(&mk->kobj);
-		printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
-		      "error number %d\n", name, ret);
-		printk(KERN_ERR	"The system will be unstable now.\n");
-		return;
+	kobj = kset_find_obj(module_kset, name);
+	if (kobj) {
+		/* We already have one.  Remove params so we can add more. */
+		mk = to_module_kobject(kobj);
+		/* We need to remove it before adding parameters. */
+		sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+	} else {
+		mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+		BUG_ON(!mk);
+
+		mk->mod = THIS_MODULE;
+		mk->kobj.kset = module_kset;
+		err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
+					   "%s", name);
+		if (err) {
+			kobject_put(&mk->kobj);
+			printk(KERN_ERR "Module '%s' failed add to sysfs, "
+			       "error number %d\n", name, err);
+			printk(KERN_ERR	"The system will be unstable now.\n");
+			return;
+		}
+		/* So that exit path is even. */
+		kobject_get(&mk->kobj);
 	}
-	param_sysfs_setup(mk, kparam, num_params, name_skip);
+
+	/* These should not fail at boot. */
+	err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
+	BUG_ON(err);
+	err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
+	BUG_ON(err);
 	kobject_uevent(&mk->kobj, KOBJ_ADD);
+	kobject_put(&mk->kobj);
 }
 
 /*
@@ -579,18 +621,19 @@ static void __init kernel_param_sysfs_setup(const char *name,
  * The "module" name (KBUILD_MODNAME) is stored before a dot, the
  * "parameter" name is stored behind a dot in kernel_param->name. So,
  * extract the "module" name for all built-in kernel_param-eters,
- * and for all who have the same, call kernel_param_sysfs_setup.
+ * and for all who have the same, call kernel_add_sysfs_param.
  */
 static void __init param_sysfs_builtin(void)
 {
-	struct kernel_param *kp, *kp_begin = NULL;
-	unsigned int i, name_len, count = 0;
-	char modname[MODULE_NAME_LEN] = "";
+	struct kernel_param *kp;
+	unsigned int name_len;
+	char modname[MODULE_NAME_LEN];
 
-	for (i=0; i < __stop___param - __start___param; i++) {
+	for (kp = __start___param; kp < __stop___param; kp++) {
 		char *dot;
 
-		kp = &__start___param[i];
+		if (kp->perm == 0)
+			continue;
 
 		dot = strchr(kp->name, '.');
 		if (!dot) {
@@ -599,37 +642,15 @@ static void __init param_sysfs_builtin(void)
 			continue;
 		}
 		name_len = dot - kp->name;
-
- 		/* new kbuild_modname? */
-		if (strlen(modname) != name_len
-		    || strncmp(modname, kp->name, name_len) != 0) {
-			/* add a new kobject for previous kernel_params. */
-			if (count)
-				kernel_param_sysfs_setup(modname,
-							 kp_begin,
-							 count,
-							 strlen(modname)+1);
-
-			strncpy(modname, kp->name, name_len);
-			modname[name_len] = '\0';
-			count = 0;
-			kp_begin = kp;
-		}
-		count++;
+		strncpy(modname, kp->name, name_len);
+		modname[name_len] = '\0';
+		kernel_add_sysfs_param(modname, kp, name_len+1);
 	}
-
-	/* last kernel_params need to be registered as well */
-	if (count)
-		kernel_param_sysfs_setup(modname, kp_begin, count,
-					 strlen(modname)+1);
 }
 
 
 /* module-related sysfs stuff */
 
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
-
 static ssize_t module_attr_show(struct kobject *kobj,
 				struct attribute *attr,
 				char *buf)
-- 
cgit v1.2.3


From 67e67ceaac5bf55dbdceb704ff2d763d438b5373 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 22 Oct 2008 10:00:23 -0500
Subject: core_param() for genuinely core kernel parameters

There are a lot of one-liner uses of __setup() in the kernel: they're
cumbersome and not queryable (definitely not settable) via /sys.  Yet
it's ugly to simplify them to module_param(), because by default that
inserts a prefix of the module name (usually filename).

So, introduce a "core_param".  The parameter gets no prefix, but
appears in /sys/module/kernel/parameters/ (if non-zero perms arg).  I
thought about using the name "core", but that's more common than
"kernel".  And if you create a module called "kernel", you will die
a horrible death.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleparam.h | 19 +++++++++++++++++++
 kernel/params.c             | 14 +++++++-------
 2 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 1eefe6d61b86..e4af3399ef48 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -104,6 +104,25 @@ struct kparam_array
 #define module_param(name, type, perm)				\
 	module_param_named(name, name, type, perm)
 
+#ifndef MODULE
+/**
+ * core_param - define a historical core kernel parameter.
+ * @name: the name of the cmdline and sysfs parameter (often the same as var)
+ * @var: the variable
+ * @type: the type (for param_set_##type and param_get_##type)
+ * @perm: visibility in sysfs
+ *
+ * core_param is just like module_param(), but cannot be modular and
+ * doesn't add a prefix (such as "printk.").  This is for compatibility
+ * with __setup(), and it makes sense as truly core parameters aren't
+ * tied to the particular file they're in.
+ */
+#define core_param(name, var, type, perm)				\
+	param_check_##type(name, &(var));				\
+	__module_param_call("", name, param_set_##type, param_get_##type, \
+			    &var, perm)
+#endif /* !MODULE */
+
 /* Actually copy string: maxlen param is usually sizeof(string). */
 #define module_param_string(name, string, len, perm)			\
 	static const struct kparam_string __param_string_##name		\
diff --git a/kernel/params.c b/kernel/params.c
index f27c992a4625..b077f1b045d3 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -637,14 +637,14 @@ static void __init param_sysfs_builtin(void)
 
 		dot = strchr(kp->name, '.');
 		if (!dot) {
-			DEBUGP("couldn't find period in first %d characters "
-			       "of %s\n", MODULE_NAME_LEN, kp->name);
-			continue;
+			/* This happens for core_param() */
+			strcpy(modname, "kernel");
+			name_len = 0;
+		} else {
+			name_len = dot - kp->name + 1;
+			strlcpy(modname, kp->name, name_len);
 		}
-		name_len = dot - kp->name;
-		strncpy(modname, kp->name, name_len);
-		modname[name_len] = '\0';
-		kernel_add_sysfs_param(modname, kp, name_len+1);
+		kernel_add_sysfs_param(modname, kp, name_len);
 	}
 }
 
-- 
cgit v1.2.3


From 0d557dc97f4bb501f086a03d0f00b99a7855d794 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 13 Oct 2008 23:50:09 +0200
Subject: workqueue: introduce create_rt_workqueue

create_rt_workqueue will create a real-time prioritized workqueue.
This is needed for the conversion of stop_machine to a workqueue-based
implementation.
This patch adds yet another parameter to __create_workqueue_key to tell
it that we want an rt workqueue.
However, it looks like we should rather have something like "int type"
instead of singlethread, freezeable and rt.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/linux/workqueue.h | 18 ++++++++++--------
 kernel/workqueue.c        |  7 ++++++-
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 5c158c477ac7..89a5a1231ffb 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -149,11 +149,11 @@ struct execute_work {
 
 extern struct workqueue_struct *
 __create_workqueue_key(const char *name, int singlethread,
-		       int freezeable, struct lock_class_key *key,
+		       int freezeable, int rt, struct lock_class_key *key,
 		       const char *lock_name);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, singlethread, freezeable)	\
+#define __create_workqueue(name, singlethread, freezeable, rt)	\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -164,17 +164,19 @@ __create_workqueue_key(const char *name, int singlethread,
 		__lock_name = #name;				\
 								\
 	__create_workqueue_key((name), (singlethread),		\
-			       (freezeable), &__key,		\
+			       (freezeable), (rt), &__key,	\
 			       __lock_name);			\
 })
 #else
-#define __create_workqueue(name, singlethread, freezeable)	\
-	__create_workqueue_key((name), (singlethread), (freezeable), NULL, NULL)
+#define __create_workqueue(name, singlethread, freezeable, rt)	\
+	__create_workqueue_key((name), (singlethread), (freezeable), (rt), \
+			       NULL, NULL)
 #endif
 
-#define create_workqueue(name) __create_workqueue((name), 0, 0)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
+#define create_workqueue(name) __create_workqueue((name), 0, 0, 0)
+#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
+#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 714afad46539..f928f2a87b9b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -62,6 +62,7 @@ struct workqueue_struct {
 	const char *name;
 	int singlethread;
 	int freezeable;		/* Freeze threads during suspend */
+	int rt;
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map lockdep_map;
 #endif
@@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct workqueue_struct *wq = cwq->wq;
 	const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
 	struct task_struct *p;
@@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 	 */
 	if (IS_ERR(p))
 		return PTR_ERR(p);
-
+	if (cwq->wq->rt)
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 	cwq->thread = p;
 
 	return 0;
@@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 struct workqueue_struct *__create_workqueue_key(const char *name,
 						int singlethread,
 						int freezeable,
+						int rt,
 						struct lock_class_key *key,
 						const char *lock_name)
 {
@@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	wq->singlethread = singlethread;
 	wq->freezeable = freezeable;
+	wq->rt = rt;
 	INIT_LIST_HEAD(&wq->list);
 
 	if (singlethread) {
-- 
cgit v1.2.3


From a30d46c042c8a17ef25de02f439fbd120ab8a8de Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 20 Oct 2008 23:46:28 +0200
Subject: mfd: twl4030 IRQ handling update

 - Move it into a separate file; clean and streamline it
 - Restructure the init code for reuse during secondary dispatch
 - Support both levels (primary, secondary) of IRQ dispatch
 - Use a workqueue for irq mask/unmask and trigger configuration

Code for two subchips currently share that secondary handler code.
One is the power subchip; its IRQs are now handled by this core,
courtesy of this patch.  The other is the GPIO module, which will
be supported through a later patch.

There are also minor changes to the header file, mostly related
to GPIO support; nothing yet in mainline cares about those.  A
few references to OMAP-specific symbols are disabled; when they
can all be removed, the TWL4030 support ceases being OMAP-specific.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 drivers/mfd/Makefile        |   2 +-
 drivers/mfd/twl4030-core.c  | 421 +------------------------
 drivers/mfd/twl4030-irq.c   | 743 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/twl4030.h |  28 +-
 4 files changed, 777 insertions(+), 417 deletions(-)
 create mode 100644 drivers/mfd/twl4030-irq.c

(limited to 'include/linux')

diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 68e237b830ad..0acefe8aff87 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -17,7 +17,7 @@ wm8350-objs			:= wm8350-core.o wm8350-regmap.o wm8350-gpio.o
 obj-$(CONFIG_MFD_WM8350)	+= wm8350.o
 obj-$(CONFIG_MFD_WM8350_I2C)	+= wm8350-i2c.o
 
-obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o
+obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
 
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index fd9a0160202c..dd843c4fbcc7 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -27,15 +27,11 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#include <linux/kernel_stat.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/random.h>
-#include <linux/kthread.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
+#include <linux/err.h>
 
 #include <linux/i2c.h>
 #include <linux/i2c/twl4030.h>
@@ -93,26 +89,6 @@
 #define twl_has_usb()	false
 #endif
 
-static inline void activate_irq(int irq)
-{
-#ifdef CONFIG_ARM
-	/* ARM requires an extra step to clear IRQ_NOREQUEST, which it
-	 * sets on behalf of every irq_chip.  Also sets IRQ_NOPROBE.
-	 */
-	set_irq_flags(irq, IRQF_VALID);
-#else
-	/* same effect on other architectures */
-	set_irq_noprobe(irq);
-#endif
-}
-
-/* Primary Interrupt Handler on TWL4030 Registers */
-
-/* Register Definitions */
-
-#define REG_PIH_ISR_P1			(0x1)
-#define REG_PIH_ISR_P2			(0x2)
-#define REG_PIH_SIR			(0x3)
 
 /* Triton Core internal information (BEGIN) */
 
@@ -175,138 +151,6 @@ static inline void activate_irq(int irq)
 
 /*----------------------------------------------------------------------*/
 
-/**
- * struct twl4030_mod_iregs - TWL module IMR/ISR regs to mask/clear at init
- * @mod_no: TWL4030 module number (e.g., TWL4030_MODULE_GPIO)
- * @sih_ctrl: address of module SIH_CTRL register
- * @reg_cnt: number of IMR/ISR regs
- * @imrs: pointer to array of TWL module interrupt mask register indices
- * @isrs: pointer to array of TWL module interrupt status register indices
- *
- * Ties together TWL4030 modules and lists of IMR/ISR registers to mask/clear
- * during twl_init_irq().
- */
-struct twl4030_mod_iregs {
-	const u8 mod_no;
-	const u8 sih_ctrl;
-	const u8 reg_cnt;
-	const u8 *imrs;
-	const u8 *isrs;
-};
-
-/* TWL4030 INT module interrupt mask registers */
-static const u8 __initconst twl4030_int_imr_regs[] = {
-	TWL4030_INT_PWR_IMR1,
-	TWL4030_INT_PWR_IMR2,
-};
-
-/* TWL4030 INT module interrupt status registers */
-static const u8 __initconst twl4030_int_isr_regs[] = {
-	TWL4030_INT_PWR_ISR1,
-	TWL4030_INT_PWR_ISR2,
-};
-
-/* TWL4030 INTERRUPTS module interrupt mask registers */
-static const u8 __initconst twl4030_interrupts_imr_regs[] = {
-	TWL4030_INTERRUPTS_BCIIMR1A,
-	TWL4030_INTERRUPTS_BCIIMR1B,
-	TWL4030_INTERRUPTS_BCIIMR2A,
-	TWL4030_INTERRUPTS_BCIIMR2B,
-};
-
-/* TWL4030 INTERRUPTS module interrupt status registers */
-static const u8 __initconst twl4030_interrupts_isr_regs[] = {
-	TWL4030_INTERRUPTS_BCIISR1A,
-	TWL4030_INTERRUPTS_BCIISR1B,
-	TWL4030_INTERRUPTS_BCIISR2A,
-	TWL4030_INTERRUPTS_BCIISR2B,
-};
-
-/* TWL4030 MADC module interrupt mask registers */
-static const u8 __initconst twl4030_madc_imr_regs[] = {
-	TWL4030_MADC_IMR1,
-	TWL4030_MADC_IMR2,
-};
-
-/* TWL4030 MADC module interrupt status registers */
-static const u8 __initconst twl4030_madc_isr_regs[] = {
-	TWL4030_MADC_ISR1,
-	TWL4030_MADC_ISR2,
-};
-
-/* TWL4030 keypad module interrupt mask registers */
-static const u8 __initconst twl4030_keypad_imr_regs[] = {
-	TWL4030_KEYPAD_KEYP_IMR1,
-	TWL4030_KEYPAD_KEYP_IMR2,
-};
-
-/* TWL4030 keypad module interrupt status registers */
-static const u8 __initconst twl4030_keypad_isr_regs[] = {
-	TWL4030_KEYPAD_KEYP_ISR1,
-	TWL4030_KEYPAD_KEYP_ISR2,
-};
-
-/* TWL4030 GPIO module interrupt mask registers */
-static const u8 __initconst twl4030_gpio_imr_regs[] = {
-	REG_GPIO_IMR1A,
-	REG_GPIO_IMR1B,
-	REG_GPIO_IMR2A,
-	REG_GPIO_IMR2B,
-	REG_GPIO_IMR3A,
-	REG_GPIO_IMR3B,
-};
-
-/* TWL4030 GPIO module interrupt status registers */
-static const u8 __initconst twl4030_gpio_isr_regs[] = {
-	REG_GPIO_ISR1A,
-	REG_GPIO_ISR1B,
-	REG_GPIO_ISR2A,
-	REG_GPIO_ISR2B,
-	REG_GPIO_ISR3A,
-	REG_GPIO_ISR3B,
-};
-
-/* TWL4030 modules that have IMR/ISR registers that must be masked/cleared */
-static const struct twl4030_mod_iregs __initconst twl4030_mod_regs[] = {
-	{
-		.mod_no	  = TWL4030_MODULE_INT,
-		.sih_ctrl = TWL4030_INT_PWR_SIH_CTRL,
-		.reg_cnt  = ARRAY_SIZE(twl4030_int_imr_regs),
-		.imrs	  = twl4030_int_imr_regs,
-		.isrs	  = twl4030_int_isr_regs,
-	},
-	{
-		.mod_no	  = TWL4030_MODULE_INTERRUPTS,
-		.sih_ctrl = TWL4030_INTERRUPTS_BCISIHCTRL,
-		.reg_cnt  = ARRAY_SIZE(twl4030_interrupts_imr_regs),
-		.imrs	  = twl4030_interrupts_imr_regs,
-		.isrs	  = twl4030_interrupts_isr_regs,
-	},
-	{
-		.mod_no	  = TWL4030_MODULE_MADC,
-		.sih_ctrl = TWL4030_MADC_SIH_CTRL,
-		.reg_cnt  = ARRAY_SIZE(twl4030_madc_imr_regs),
-		.imrs	  = twl4030_madc_imr_regs,
-		.isrs	  = twl4030_madc_isr_regs,
-	},
-	{
-		.mod_no	  = TWL4030_MODULE_KEYPAD,
-		.sih_ctrl = TWL4030_KEYPAD_KEYP_SIH_CTRL,
-		.reg_cnt  = ARRAY_SIZE(twl4030_keypad_imr_regs),
-		.imrs	  = twl4030_keypad_imr_regs,
-		.isrs	  = twl4030_keypad_isr_regs,
-	},
-	{
-		.mod_no	  = TWL4030_MODULE_GPIO,
-		.sih_ctrl = REG_GPIO_SIH_CTRL,
-		.reg_cnt  = ARRAY_SIZE(twl4030_gpio_imr_regs),
-		.imrs	  = twl4030_gpio_imr_regs,
-		.isrs	  = twl4030_gpio_isr_regs,
-	},
-};
-
-/*----------------------------------------------------------------*/
-
 /* is driver active, bound to a chip? */
 static bool inuse;
 
@@ -367,33 +211,6 @@ static struct twl4030mapping twl4030_map[TWL4030_MODULE_LAST + 1] = {
 
 /*----------------------------------------------------------------------*/
 
-/*
- * TWL4030 doesn't have PIH mask, hence dummy function for mask
- * and unmask of the (eight) interrupts reported at that level ...
- * masking is only available from SIH (secondary) modules.
- */
-
-static void twl4030_i2c_ackirq(unsigned int irq)
-{
-}
-
-static void twl4030_i2c_disableint(unsigned int irq)
-{
-}
-
-static void twl4030_i2c_enableint(unsigned int irq)
-{
-}
-
-static struct irq_chip twl4030_irq_chip = {
-	.name	= "twl4030",
-	.ack	= twl4030_i2c_ackirq,
-	.mask	= twl4030_i2c_disableint,
-	.unmask	= twl4030_i2c_enableint,
-};
-
-/*----------------------------------------------------------------------*/
-
 /* Exported Functions */
 
 /**
@@ -535,108 +352,11 @@ EXPORT_SYMBOL(twl4030_i2c_read_u8);
 
 /*----------------------------------------------------------------------*/
 
-static unsigned twl4030_irq_base;
-
-static struct completion irq_event;
-
-/*
- * This thread processes interrupts reported by the Primary Interrupt Handler.
- */
-static int twl4030_irq_thread(void *data)
-{
-	long irq = (long)data;
-	irq_desc_t *desc = irq_desc + irq;
-	static unsigned i2c_errors;
-	const static unsigned max_i2c_errors = 100;
-
-	current->flags |= PF_NOFREEZE;
-
-	while (!kthread_should_stop()) {
-		int ret;
-		int module_irq;
-		u8 pih_isr;
-
-		/* Wait for IRQ, then read PIH irq status (also blocking) */
-		wait_for_completion_interruptible(&irq_event);
-
-		ret = twl4030_i2c_read_u8(TWL4030_MODULE_PIH, &pih_isr,
-					  REG_PIH_ISR_P1);
-		if (ret) {
-			pr_warning("%s: I2C error %d reading PIH ISR\n",
-					DRIVER_NAME, ret);
-			if (++i2c_errors >= max_i2c_errors) {
-				printk(KERN_ERR "Maximum I2C error count"
-						" exceeded.  Terminating %s.\n",
-						__func__);
-				break;
-			}
-			complete(&irq_event);
-			continue;
-		}
-
-		/* these handlers deal with the relevant SIH irq status */
-		local_irq_disable();
-		for (module_irq = twl4030_irq_base;
-				pih_isr;
-				pih_isr >>= 1, module_irq++) {
-			if (pih_isr & 0x1) {
-				irq_desc_t *d = irq_desc + module_irq;
-
-				d->handle_irq(module_irq, d);
-			}
-		}
-		local_irq_enable();
-
-		desc->chip->unmask(irq);
-	}
-
-	return 0;
-}
-
 /*
- * do_twl4030_irq() is the desc->handle method for the twl4030 interrupt.
- * This is a chained interrupt, so there is no desc->action method for it.
- * Now we need to query the interrupt controller in the twl4030 to determine
- * which module is generating the interrupt request.  However, we can't do i2c
- * transactions in interrupt context, so we must defer that work to a kernel
- * thread.  All we do here is acknowledge and mask the interrupt and wakeup
- * the kernel thread.
+ * NOTE:  We know the first 8 IRQs after pdata->irq_base are
+ * for the PIH, and the next are for the PWR_INT SIH, since
+ * that's how twl_init_irq() sets things up.
  */
-static void do_twl4030_irq(unsigned int irq, irq_desc_t *desc)
-{
-	const unsigned int cpu = smp_processor_id();
-
-	/*
-	 * Earlier this was desc->triggered = 1;
-	 */
-	desc->status |= IRQ_LEVEL;
-
-	/*
-	 * Acknowledge, clear _AND_ disable the interrupt.
-	 */
-	desc->chip->ack(irq);
-
-	if (!desc->depth) {
-		kstat_cpu(cpu).irqs[irq]++;
-
-		complete(&irq_event);
-	}
-}
-
-static struct task_struct * __init start_twl4030_irq_thread(long irq)
-{
-	struct task_struct *thread;
-
-	init_completion(&irq_event);
-	thread = kthread_run(twl4030_irq_thread, (void *)irq, "twl4030-irq");
-	if (!thread)
-		pr_err("%s: could not create twl4030 irq %ld thread!\n",
-		       DRIVER_NAME, irq);
-
-	return thread;
-}
-
-/*----------------------------------------------------------------------*/
 
 static int add_children(struct twl4030_platform_data *pdata)
 {
@@ -668,7 +388,7 @@ static int add_children(struct twl4030_platform_data *pdata)
 
 		if (status == 0) {
 			struct resource r = {
-				.start = TWL4030_PWRIRQ_CHG_PRES,
+				.start = pdata->irq_base + 8 + 1,
 				.flags = IORESOURCE_IRQ,
 			};
 
@@ -817,8 +537,7 @@ static int add_children(struct twl4030_platform_data *pdata)
 		/* RTC module IRQ */
 		if (status == 0) {
 			struct resource	r = {
-				/* REVISIT don't hard-wire this stuff */
-				.start = TWL4030_PWRIRQ_RTC,
+				.start = pdata->irq_base + 8 + 3,
 				.flags = IORESOURCE_IRQ,
 			};
 
@@ -863,7 +582,7 @@ static int add_children(struct twl4030_platform_data *pdata)
 
 		if (status == 0) {
 			struct resource r = {
-				.start = TWL4030_PWRIRQ_USB_PRES,
+				.start = pdata->irq_base + 8 + 2,
 				.flags = IORESOURCE_IRQ,
 			};
 
@@ -965,123 +684,17 @@ static void __init clocks_init(void)
 
 /*----------------------------------------------------------------------*/
 
-/**
- * twl4030_i2c_clear_isr - clear TWL4030 SIH ISR regs via read + write
- * @mod_no: TWL4030 module number
- * @reg: register index to clear
- * @cor: value of the <module>_SIH_CTRL.COR bit (1 or 0)
- *
- * Either reads (cor == 1) or writes (cor == 0) to a TWL4030 interrupt
- * status register to ensure that any prior interrupts are cleared.
- * Returns the status from the I2C read operation.
- */
-static int __init twl4030_i2c_clear_isr(u8 mod_no, u8 reg, u8 cor)
-{
-	u8 tmp;
-
-	return (cor) ? twl4030_i2c_read_u8(mod_no, &tmp, reg) :
-		twl4030_i2c_write_u8(mod_no, 0xff, reg);
-}
-
-/**
- * twl4030_read_cor_bit - are TWL module ISRs cleared by reads or writes?
- * @mod_no: TWL4030 module number
- * @reg: register index to clear
- *
- * Returns 1 if the TWL4030 SIH interrupt status registers (ISRs) for
- * the specified TWL module are cleared by reads, or 0 if cleared by
- * writes.
- */
-static int twl4030_read_cor_bit(u8 mod_no, u8 reg)
-{
-	u8 tmp = 0;
-
-	WARN_ON(twl4030_i2c_read_u8(mod_no, &tmp, reg) < 0);
-
-	tmp &= TWL4030_SIH_CTRL_COR_MASK;
-	tmp >>= __ffs(TWL4030_SIH_CTRL_COR_MASK);
-
-	return tmp;
-}
-
-/**
- * twl4030_mask_clear_intrs - mask and clear all TWL4030 interrupts
- * @t: pointer to twl4030_mod_iregs array
- * @t_sz: ARRAY_SIZE(t) (starting at 1)
- *
- * Mask all TWL4030 interrupt mask registers (IMRs) and clear all
- * interrupt status registers (ISRs).  No return value, but will WARN if
- * any I2C operations fail.
- */
-static void __init twl4030_mask_clear_intrs(const struct twl4030_mod_iregs *t,
-					    const u8 t_sz)
-{
-	int i, j;
-
-	/*
-	 * N.B. - further efficiency is possible here.  Eight I2C
-	 * operations on BCI and GPIO modules are avoidable if I2C
-	 * burst read/write transactions were implemented.  Would
-	 * probably save about 1ms of boot time and a small amount of
-	 * power.
-	 */
-	for (i = 0; i < t_sz; i++) {
-		const struct twl4030_mod_iregs tmr = t[i];
-		int cor;
-
-		/* Are ISRs cleared by reads or writes? */
-		cor = twl4030_read_cor_bit(tmr.mod_no, tmr.sih_ctrl);
-
-		for (j = 0; j < tmr.reg_cnt; j++) {
-
-			/* Mask interrupts at the TWL4030 */
-			WARN_ON(twl4030_i2c_write_u8(tmr.mod_no, 0xff,
-						     tmr.imrs[j]) < 0);
-
-			/* Clear TWL4030 ISRs */
-			WARN_ON(twl4030_i2c_clear_isr(tmr.mod_no,
-						      tmr.isrs[j], cor) < 0);
-		}
-	}
-}
-
-
-static void twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
-{
-	int	i;
-
-	/*
-	 * Mask and clear all TWL4030 interrupts since initially we do
-	 * not have any TWL4030 module interrupt handlers present
-	 */
-	twl4030_mask_clear_intrs(twl4030_mod_regs,
-				 ARRAY_SIZE(twl4030_mod_regs));
-
-	twl4030_irq_base = irq_base;
-
-	/* install an irq handler for each of the PIH modules */
-	for (i = irq_base; i < irq_end; i++) {
-		set_irq_chip_and_handler(i, &twl4030_irq_chip,
-				handle_simple_irq);
-		activate_irq(i);
-	}
-
-	/* install an irq handler to demultiplex the TWL4030 interrupt */
-	set_irq_data(irq_num, start_twl4030_irq_thread(irq_num));
-	set_irq_chained_handler(irq_num, do_twl4030_irq);
-}
-
-/*----------------------------------------------------------------------*/
+int twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end);
+int twl_exit_irq(void);
 
 static int twl4030_remove(struct i2c_client *client)
 {
 	unsigned i;
+	int status;
 
-	/* FIXME undo twl_init_irq() */
-	if (twl4030_irq_base) {
-		dev_err(&client->dev, "can't yet clean up IRQs?\n");
-		return -ENOSYS;
-	}
+	status = twl_exit_irq();
+	if (status < 0)
+		return status;
 
 	for (i = 0; i < TWL4030_NUM_SLAVES; i++) {
 		struct twl4030_client	*twl = &twl4030_modules[i];
@@ -1112,7 +725,7 @@ twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id)
 		return -EIO;
 	}
 
-	if (inuse || twl4030_irq_base) {
+	if (inuse) {
 		dev_dbg(&client->dev, "driver is already in use\n");
 		return -EBUSY;
 	}
@@ -1146,9 +759,9 @@ twl4030_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	if (client->irq
 			&& pdata->irq_base
 			&& pdata->irq_end > pdata->irq_base) {
-		twl_init_irq(client->irq, pdata->irq_base, pdata->irq_end);
-		dev_info(&client->dev, "IRQ %d chains IRQs %d..%d\n",
-				client->irq, pdata->irq_base, pdata->irq_end - 1);
+		status = twl_init_irq(client->irq, pdata->irq_base, pdata->irq_end);
+		if (status < 0)
+			goto fail;
 	}
 
 	status = add_children(pdata);
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
new file mode 100644
index 000000000000..fae868a8d499
--- /dev/null
+++ b/drivers/mfd/twl4030-irq.c
@@ -0,0 +1,743 @@
+/*
+ * twl4030-irq.c - TWL4030/TPS659x0 irq support
+ *
+ * Copyright (C) 2005-2006 Texas Instruments, Inc.
+ *
+ * Modifications to defer interrupt handling to a kernel thread:
+ * Copyright (C) 2006 MontaVista Software, Inc.
+ *
+ * Based on tlv320aic23.c:
+ * Copyright (c) by Kai Svahn <kai.svahn@nokia.com>
+ *
+ * Code cleanup and modifications to IRQ handler.
+ * by syed khasim <x0khasim@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kthread.h>
+
+#include <linux/i2c/twl4030.h>
+
+
+/*
+ * TWL4030 IRQ handling has two stages in hardware, and thus in software.
+ * The Primary Interrupt Handler (PIH) stage exposes status bits saying
+ * which Secondary Interrupt Handler (SIH) stage is raising an interrupt.
+ * SIH modules are more traditional IRQ components, which support per-IRQ
+ * enable/disable and trigger controls; they do most of the work.
+ *
+ * These chips are designed to support IRQ handling from two different
+ * I2C masters.  Each has a dedicated IRQ line, and dedicated IRQ status
+ * and mask registers in the PIH and SIH modules.
+ *
+ * We set up IRQs starting at a platform-specified base, always starting
+ * with PIH and the SIH for PWR_INT and then usually adding GPIO:
+ *	base + 0  .. base + 7	PIH
+ *	base + 8  .. base + 15	SIH for PWR_INT
+ *	base + 16 .. base + 33	SIH for GPIO
+ */
+
+/* PIH register offsets */
+#define REG_PIH_ISR_P1			0x01
+#define REG_PIH_ISR_P2			0x02
+#define REG_PIH_SIR			0x03	/* for testing */
+
+
+/* Linux could (eventually) use either IRQ line */
+static int irq_line;
+
+struct sih {
+	char	name[8];
+	u8	module;			/* module id */
+	u8	control_offset;		/* for SIH_CTRL */
+	bool	set_cor;
+
+	u8	bits;			/* valid in isr/imr */
+	u8	bytes_ixr;		/* bytelen of ISR/IMR/SIR */
+
+	u8	edr_offset;
+	u8	bytes_edr;		/* bytelen of EDR */
+
+	/* SIR ignored -- set interrupt, for testing only */
+	struct irq_data {
+		u8	isr_offset;
+		u8	imr_offset;
+	} mask[2];
+	/* + 2 bytes padding */
+};
+
+#define SIH_INITIALIZER(modname, nbits) \
+	.module		= TWL4030_MODULE_ ## modname, \
+	.control_offset = TWL4030_ ## modname ## _SIH_CTRL, \
+	.bits		= nbits, \
+	.bytes_ixr	= DIV_ROUND_UP(nbits, 8), \
+	.edr_offset	= TWL4030_ ## modname ## _EDR, \
+	.bytes_edr	= DIV_ROUND_UP((2*(nbits)), 8), \
+	.mask = { { \
+		.isr_offset	= TWL4030_ ## modname ## _ISR1, \
+		.imr_offset	= TWL4030_ ## modname ## _IMR1, \
+	}, \
+	{ \
+		.isr_offset	= TWL4030_ ## modname ## _ISR2, \
+		.imr_offset	= TWL4030_ ## modname ## _IMR2, \
+	}, },
+
+/* register naming policies are inconsistent ... */
+#define TWL4030_INT_PWR_EDR		TWL4030_INT_PWR_EDR1
+#define TWL4030_MODULE_KEYPAD_KEYP	TWL4030_MODULE_KEYPAD
+#define TWL4030_MODULE_INT_PWR		TWL4030_MODULE_INT
+
+
+/* Order in this table matches order in PIH_ISR.  That is,
+ * BIT(n) in PIH_ISR is sih_modules[n].
+ */
+static const struct sih sih_modules[6] = {
+	[0] = {
+		.name		= "gpio",
+		.module		= TWL4030_MODULE_GPIO,
+		.control_offset	= REG_GPIO_SIH_CTRL,
+		.set_cor	= true,
+		.bits		= TWL4030_GPIO_MAX,
+		.bytes_ixr	= 3,
+		/* Note: *all* of these IRQs default to no-trigger */
+		.edr_offset	= REG_GPIO_EDR1,
+		.bytes_edr	= 5,
+		.mask = { {
+			.isr_offset	= REG_GPIO_ISR1A,
+			.imr_offset	= REG_GPIO_IMR1A,
+		}, {
+			.isr_offset	= REG_GPIO_ISR1B,
+			.imr_offset	= REG_GPIO_IMR1B,
+		}, },
+	},
+	[1] = {
+		.name		= "keypad",
+		.set_cor	= true,
+		SIH_INITIALIZER(KEYPAD_KEYP, 4)
+	},
+	[2] = {
+		.name		= "bci",
+		.module		= TWL4030_MODULE_INTERRUPTS,
+		.control_offset	= TWL4030_INTERRUPTS_BCISIHCTRL,
+		.bits		= 12,
+		.bytes_ixr	= 2,
+		.edr_offset	= TWL4030_INTERRUPTS_BCIEDR1,
+		/* Note: most of these IRQs default to no-trigger */
+		.bytes_edr	= 3,
+		.mask = { {
+			.isr_offset	= TWL4030_INTERRUPTS_BCIISR1A,
+			.imr_offset	= TWL4030_INTERRUPTS_BCIIMR1A,
+		}, {
+			.isr_offset	= TWL4030_INTERRUPTS_BCIISR1B,
+			.imr_offset	= TWL4030_INTERRUPTS_BCIIMR1B,
+		}, },
+	},
+	[3] = {
+		.name		= "madc",
+		SIH_INITIALIZER(MADC, 4)
+	},
+	[4] = {
+		/* USB doesn't use the same SIH organization */
+		.name		= "usb",
+	},
+	[5] = {
+		.name		= "power",
+		.set_cor	= true,
+		SIH_INITIALIZER(INT_PWR, 8)
+	},
+		/* there are no SIH modules #6 or #7 ... */
+};
+
+#undef TWL4030_MODULE_KEYPAD_KEYP
+#undef TWL4030_MODULE_INT_PWR
+#undef TWL4030_INT_PWR_EDR
+
+/*----------------------------------------------------------------------*/
+
+static unsigned twl4030_irq_base;
+
+static struct completion irq_event;
+
+/*
+ * This thread processes interrupts reported by the Primary Interrupt Handler.
+ */
+static int twl4030_irq_thread(void *data)
+{
+	long irq = (long)data;
+	irq_desc_t *desc = irq_desc + irq;
+	static unsigned i2c_errors;
+	static const unsigned max_i2c_errors = 100;
+
+	current->flags |= PF_NOFREEZE;
+
+	while (!kthread_should_stop()) {
+		int ret;
+		int module_irq;
+		u8 pih_isr;
+
+		/* Wait for IRQ, then read PIH irq status (also blocking) */
+		wait_for_completion_interruptible(&irq_event);
+
+		ret = twl4030_i2c_read_u8(TWL4030_MODULE_PIH, &pih_isr,
+					  REG_PIH_ISR_P1);
+		if (ret) {
+			pr_warning("twl4030: I2C error %d reading PIH ISR\n",
+					ret);
+			if (++i2c_errors >= max_i2c_errors) {
+				printk(KERN_ERR "Maximum I2C error count"
+						" exceeded.  Terminating %s.\n",
+						__func__);
+				break;
+			}
+			complete(&irq_event);
+			continue;
+		}
+
+		/* these handlers deal with the relevant SIH irq status */
+		local_irq_disable();
+		for (module_irq = twl4030_irq_base;
+				pih_isr;
+				pih_isr >>= 1, module_irq++) {
+			if (pih_isr & 0x1) {
+				irq_desc_t *d = irq_desc + module_irq;
+
+				/* These can't be masked ... always warn
+				 * if we get any surprises.
+				 */
+				if (d->status & IRQ_DISABLED)
+					note_interrupt(module_irq, d,
+							IRQ_NONE);
+				else
+					d->handle_irq(module_irq, d);
+			}
+		}
+		local_irq_enable();
+
+		desc->chip->unmask(irq);
+	}
+
+	return 0;
+}
+
+/*
+ * handle_twl4030_pih() is the desc->handle method for the twl4030 interrupt.
+ * This is a chained interrupt, so there is no desc->action method for it.
+ * Now we need to query the interrupt controller in the twl4030 to determine
+ * which module is generating the interrupt request.  However, we can't do i2c
+ * transactions in interrupt context, so we must defer that work to a kernel
+ * thread.  All we do here is acknowledge and mask the interrupt and wakeup
+ * the kernel thread.
+ */
+static void handle_twl4030_pih(unsigned int irq, irq_desc_t *desc)
+{
+	/* Acknowledge, clear *AND* mask the interrupt... */
+	desc->chip->ack(irq);
+	complete(&irq_event);
+}
+
+static struct task_struct *start_twl4030_irq_thread(long irq)
+{
+	struct task_struct *thread;
+
+	init_completion(&irq_event);
+	thread = kthread_run(twl4030_irq_thread, (void *)irq, "twl4030-irq");
+	if (IS_ERR(thread))
+		pr_err("twl4030: could not create irq %ld thread!\n", irq);
+	/* failure is an ERR_PTR(), but our caller tests for NULL */
+	return IS_ERR(thread) ? NULL : thread;
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * twl4030_init_sih_modules() ... start from a known state where no
+ * IRQs will be coming in, and where we can quickly enable them then
+ * handle them as they arrive.  Mask all IRQs: maybe init SIH_CTRL.
+ *
+ * NOTE:  we don't touch EDR registers here; they stay with hardware
+ * defaults or whatever the last value was.  Note that when both EDR
+ * bits for an IRQ are clear, that's as if its IMR bit is set...
+ */
+static int twl4030_init_sih_modules(unsigned line)
+{
+	const struct sih *sih;
+	u8 buf[4];
+	int i;
+	int status;
+
+	/* line 0 == int1_n signal; line 1 == int2_n signal */
+	if (line > 1)
+		return -EINVAL;
+
+	irq_line = line;
+
+	/* disable all interrupts on our line */
+	memset(buf, 0xff, sizeof buf);
+	sih = sih_modules;
+	for (i = 0; i < ARRAY_SIZE(sih_modules); i++, sih++) {
+
+		/* skip USB -- it's funky */
+		if (!sih->bytes_ixr)
+			continue;
+
+		status = twl4030_i2c_write(sih->module, buf,
+				sih->mask[line].imr_offset, sih->bytes_ixr);
+		if (status < 0)
+			pr_err("twl4030: err %d initializing %s %s\n",
+					status, sih->name, "IMR");
+
+		/* Maybe disable "exclusive" mode; buffer second pending irq;
+		 * set Clear-On-Read (COR) bit.
+		 *
+		 * NOTE that sometimes COR polarity is documented as being
+		 * inverted:  for MADC and BCI, COR=1 means "clear on write".
+		 * And for PWR_INT it's not documented...
+		 */
+		if (sih->set_cor) {
+			status = twl4030_i2c_write_u8(sih->module,
+					TWL4030_SIH_CTRL_COR_MASK,
+					sih->control_offset);
+			if (status < 0)
+				pr_err("twl4030: err %d initializing %s %s\n",
+						status, sih->name, "SIH_CTRL");
+		}
+	}
+
+	sih = sih_modules;
+	for (i = 0; i < ARRAY_SIZE(sih_modules); i++, sih++) {
+		u8 rxbuf[4];
+		int j;
+
+		/* skip USB */
+		if (!sih->bytes_ixr)
+			continue;
+
+		/* Clear pending interrupt status.  Either the read was
+		 * enough, or we need to write those bits.  Repeat, in
+		 * case an IRQ is pending (PENDDIS=0) ... that's not
+		 * uncommon with PWR_INT.PWRON.
+		 */
+		for (j = 0; j < 2; j++) {
+			status = twl4030_i2c_read(sih->module, rxbuf,
+				sih->mask[line].isr_offset, sih->bytes_ixr);
+			if (status < 0)
+				pr_err("twl4030: err %d initializing %s %s\n",
+					status, sih->name, "ISR");
+
+			if (!sih->set_cor)
+				status = twl4030_i2c_write(sih->module, buf,
+					sih->mask[line].isr_offset,
+					sih->bytes_ixr);
+			/* else COR=1 means read sufficed.
+			 * (for most SIH modules...)
+			 */
+		}
+	}
+
+	return 0;
+}
+
+static inline void activate_irq(int irq)
+{
+#ifdef CONFIG_ARM
+	/* ARM requires an extra step to clear IRQ_NOREQUEST, which it
+	 * sets on behalf of every irq_chip.  Also sets IRQ_NOPROBE.
+	 */
+	set_irq_flags(irq, IRQF_VALID);
+#else
+	/* same effect on other architectures */
+	set_irq_noprobe(irq);
+#endif
+}
+
+/*----------------------------------------------------------------------*/
+
+static DEFINE_SPINLOCK(sih_agent_lock);
+
+static struct workqueue_struct *wq;
+
+struct sih_agent {
+	int			irq_base;
+	const struct sih	*sih;
+
+	u32			imr;
+	bool			imr_change_pending;
+	struct work_struct	mask_work;
+
+	u32			edge_change;
+	struct work_struct	edge_work;
+};
+
+static void twl4030_sih_do_mask(struct work_struct *work)
+{
+	struct sih_agent	*agent;
+	const struct sih	*sih;
+	union {
+		u8	bytes[4];
+		u32	word;
+	}			imr;
+	int			status;
+
+	agent = container_of(work, struct sih_agent, mask_work);
+
+	/* see what work we have */
+	spin_lock_irq(&sih_agent_lock);
+	if (agent->imr_change_pending) {
+		sih = agent->sih;
+		/* byte[0] gets overwritten as we write ... */
+		imr.word = cpu_to_le32(agent->imr << 8);
+		agent->imr_change_pending = false;
+	} else
+		sih = NULL;
+	spin_unlock_irq(&sih_agent_lock);
+	if (!sih)
+		return;
+
+	/* write the whole mask ... simpler than subsetting it */
+	status = twl4030_i2c_write(sih->module, imr.bytes,
+			sih->mask[irq_line].imr_offset, sih->bytes_ixr);
+	if (status)
+		pr_err("twl4030: %s, %s --> %d\n", __func__,
+				"write", status);
+}
+
+static void twl4030_sih_do_edge(struct work_struct *work)
+{
+	struct sih_agent	*agent;
+	const struct sih	*sih;
+	u8			bytes[6];
+	u32			edge_change;
+	int			status;
+
+	agent = container_of(work, struct sih_agent, edge_work);
+
+	/* see what work we have */
+	spin_lock_irq(&sih_agent_lock);
+	edge_change = agent->edge_change;
+	agent->edge_change = 0;
+	sih = edge_change ? agent->sih : NULL;
+	spin_unlock_irq(&sih_agent_lock);
+	if (!sih)
+		return;
+
+	/* Read, reserving first byte for write scratch.  Yes, this
+	 * could be cached for some speedup ... but be careful about
+	 * any processor on the other IRQ line, EDR registers are
+	 * shared.
+	 */
+	status = twl4030_i2c_read(sih->module, bytes + 1,
+			sih->edr_offset, sih->bytes_edr);
+	if (status) {
+		pr_err("twl4030: %s, %s --> %d\n", __func__,
+				"read", status);
+		return;
+	}
+
+	/* Modify only the bits we know must change */
+	while (edge_change) {
+		int		i = fls(edge_change) - 1;
+		struct irq_desc	*d = irq_desc + i + agent->irq_base;
+		int		byte = 1 + (i >> 2);
+		int		off = (i & 0x3) * 2;
+
+		bytes[byte] &= ~(0x03 << off);
+
+		spin_lock_irq(&d->lock);
+		if (d->status & IRQ_TYPE_EDGE_RISING)
+			bytes[byte] |= BIT(off + 1);
+		if (d->status & IRQ_TYPE_EDGE_FALLING)
+			bytes[byte] |= BIT(off + 0);
+		spin_unlock_irq(&d->lock);
+
+		edge_change &= ~BIT(i);
+	}
+
+	/* Write back; bytes[0] is the scratch byte reserved above */
+	status = twl4030_i2c_write(sih->module, bytes,
+			sih->edr_offset, sih->bytes_edr);
+	if (status)
+		pr_err("twl4030: %s, %s --> %d\n", __func__,
+				"write", status);
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * All irq_chip methods get issued from code holding irq_desc[irq].lock,
+ * which can't perform the underlying I2C operations (because they sleep).
+ * So we must hand them off to a thread (workqueue) and cope with asynch
+ * completion, potentially including some re-ordering, of these requests.
+ */
+
+static void twl4030_sih_mask(unsigned irq)
+{
+	struct sih_agent *sih = get_irq_chip_data(irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&sih_agent_lock, flags);
+	sih->imr |= BIT(irq - sih->irq_base);
+	sih->imr_change_pending = true;
+	queue_work(wq, &sih->mask_work);
+	spin_unlock_irqrestore(&sih_agent_lock, flags);
+}
+
+static void twl4030_sih_unmask(unsigned irq)
+{
+	struct sih_agent *sih = get_irq_chip_data(irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&sih_agent_lock, flags);
+	sih->imr &= ~BIT(irq - sih->irq_base);
+	sih->imr_change_pending = true;
+	queue_work(wq, &sih->mask_work);
+	spin_unlock_irqrestore(&sih_agent_lock, flags);
+}
+
+static int twl4030_sih_set_type(unsigned irq, unsigned trigger)
+{
+	struct sih_agent *sih = get_irq_chip_data(irq);
+	struct irq_desc *desc = irq_desc + irq;
+	unsigned long flags;
+
+	if (trigger & ~(IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING))
+		return -EINVAL;
+
+	spin_lock_irqsave(&sih_agent_lock, flags);
+	if ((desc->status & IRQ_TYPE_SENSE_MASK) != trigger) {
+		desc->status &= ~IRQ_TYPE_SENSE_MASK;
+		desc->status |= trigger;
+		sih->edge_change |= BIT(irq - sih->irq_base);
+		queue_work(wq, &sih->edge_work);
+	}
+	spin_unlock_irqrestore(&sih_agent_lock, flags);
+	return 0;
+}
+
+static struct irq_chip twl4030_sih_irq_chip = {
+	.name		= "twl4030",
+	.mask		= twl4030_sih_mask,
+	.unmask		= twl4030_sih_unmask,
+	.set_type	= twl4030_sih_set_type,
+};
+
+/*----------------------------------------------------------------------*/
+
+static inline int sih_read_isr(const struct sih *sih)
+{
+	int status;
+	union {
+		u8 bytes[4];
+		u32 word;
+	} isr;
+
+	/* FIXME need retry-on-error ... */
+
+	isr.word = 0;
+	status = twl4030_i2c_read(sih->module, isr.bytes,
+			sih->mask[irq_line].isr_offset, sih->bytes_ixr);
+
+	return (status < 0) ? status : le32_to_cpu(isr.word);
+}
+
+/*
+ * Generic handler for SIH interrupts ... we "know" this is called
+ * in task context, with IRQs enabled.
+ */
+static void handle_twl4030_sih(unsigned irq, struct irq_desc *desc)
+{
+	struct sih_agent *agent = get_irq_data(irq);
+	const struct sih *sih = agent->sih;
+	int isr;
+
+	/* reading ISR acks the IRQs, using clear-on-read mode */
+	local_irq_enable();
+	isr = sih_read_isr(sih);
+	local_irq_disable();
+
+	if (isr < 0) {
+		pr_err("twl4030: %s SIH, read ISR error %d\n",
+			sih->name, isr);
+		/* REVISIT:  recover; eventually mask it all, etc */
+		return;
+	}
+
+	while (isr) {
+		irq = fls(isr);
+		irq--;
+		isr &= ~BIT(irq);
+
+		if (irq < sih->bits)
+			generic_handle_irq(agent->irq_base + irq);
+		else
+			pr_err("twl4030: %s SIH, invalid ISR bit %d\n",
+				sih->name, irq);
+	}
+}
+
+static unsigned twl4030_irq_next;
+
+/* returns the first IRQ used by this SIH bank,
+ * or negative errno
+ */
+int twl4030_sih_setup(int module)
+{
+	int			sih_mod;
+	const struct sih	*sih = NULL;
+	struct sih_agent	*agent;
+	int			i, irq;
+	int			status = -EINVAL;
+	unsigned		irq_base = twl4030_irq_next;
+
+	/* only support modules with standard clear-on-read for now */
+	for (sih_mod = 0, sih = sih_modules;
+			sih_mod < ARRAY_SIZE(sih_modules);
+			sih_mod++, sih++) {
+		if (sih->module == module && sih->set_cor) {
+			if (!WARN((irq_base + sih->bits) > NR_IRQS,
+					"irq %d for %s too big\n",
+					irq_base + sih->bits,
+					sih->name))
+				status = 0;
+			break;
+		}
+	}
+	if (status < 0)
+		return status;
+
+	agent = kzalloc(sizeof *agent, GFP_KERNEL);
+	if (!agent)
+		return -ENOMEM;
+
+	status = 0;
+
+	agent->irq_base = irq_base;
+	agent->sih = sih;
+	agent->imr = ~0;
+	INIT_WORK(&agent->mask_work, twl4030_sih_do_mask);
+	INIT_WORK(&agent->edge_work, twl4030_sih_do_edge);
+
+	for (i = 0; i < sih->bits; i++) {
+		irq = irq_base + i;
+
+		set_irq_chip_and_handler(irq, &twl4030_sih_irq_chip,
+				handle_edge_irq);
+		set_irq_chip_data(irq, agent);
+		activate_irq(irq);
+	}
+
+	status = irq_base;
+	twl4030_irq_next += i;
+
+	/* replace generic PIH handler (handle_simple_irq) */
+	irq = sih_mod + twl4030_irq_base;
+	set_irq_data(irq, agent);
+	set_irq_chained_handler(irq, handle_twl4030_sih);
+
+	pr_info("twl4030: %s (irq %d) chaining IRQs %d..%d\n", sih->name,
+			irq, irq_base, twl4030_irq_next - 1);
+
+	return status;
+}
+
+/* FIXME need a call to reverse twl4030_sih_setup() ... */
+
+
+/*----------------------------------------------------------------------*/
+
+/* FIXME pass in which interrupt line we'll use ... */
+#define twl_irq_line	0
+
+int twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
+{
+	static struct irq_chip	twl4030_irq_chip;
+
+	int			status;
+	int			i;
+	struct task_struct	*task;
+
+	/*
+	 * Mask and clear all TWL4030 interrupts since initially we do
+	 * not have any TWL4030 module interrupt handlers present
+	 */
+	status = twl4030_init_sih_modules(twl_irq_line);
+	if (status < 0)
+		return status;
+
+	wq = create_singlethread_workqueue("twl4030-irqchip");
+	if (!wq) {
+		pr_err("twl4030: workqueue FAIL\n");
+		return -ESRCH;
+	}
+
+	twl4030_irq_base = irq_base;
+
+	/* install an irq handler for each of the SIH modules;
+	 * clone dummy irq_chip since PIH can't *do* anything
+	 */
+	twl4030_irq_chip = dummy_irq_chip;
+	twl4030_irq_chip.name = "twl4030";
+
+	twl4030_sih_irq_chip.ack = dummy_irq_chip.ack;
+
+	for (i = irq_base; i < irq_end; i++) {
+		set_irq_chip_and_handler(i, &twl4030_irq_chip,
+				handle_simple_irq);
+		activate_irq(i);
+	}
+	twl4030_irq_next = i;
+	pr_info("twl4030: %s (irq %d) chaining IRQs %d..%d\n", "PIH",
+			irq_num, irq_base, twl4030_irq_next - 1);
+
+	/* ... and the PWR_INT module ... */
+	status = twl4030_sih_setup(TWL4030_MODULE_INT);
+	if (status < 0) {
+		pr_err("twl4030: sih_setup PWR INT --> %d\n", status);
+		goto fail;
+	}
+
+	/* install an irq handler to demultiplex the TWL4030 interrupt */
+	task = start_twl4030_irq_thread(irq_num);
+	if (!task) {
+		pr_err("twl4030: irq thread FAIL\n");
+		status = -ESRCH;
+		goto fail;
+	}
+
+	set_irq_data(irq_num, task);
+	set_irq_chained_handler(irq_num, handle_twl4030_pih);
+
+	return status;
+
+fail:
+	for (i = irq_base; i < irq_end; i++)
+		set_irq_chip_and_handler(i, NULL, NULL);
+	destroy_workqueue(wq);
+	wq = NULL;
+	return status;
+}
+
+int twl_exit_irq(void)
+{
+	/* FIXME undo twl_init_irq() */
+	if (twl4030_irq_base) {
+		pr_err("twl4030: can't yet clean up IRQs?\n");
+		return -ENOSYS;
+	}
+	return 0;
+}
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index cdb453162a97..fb604dcd38f1 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -228,6 +228,12 @@ struct twl4030_gpio_platform_data {
 	int		gpio_base;
 	unsigned	irq_base, irq_end;
 
+	/* package the two LED signals as output-only GPIOs? */
+	bool		use_leds;
+
+	/* gpio-n should control VMMC(n+1) if BIT(n) in mmc_cd is set */
+	u8		mmc_cd;
+
 	/* For gpio-N, bit (1 << N) in "pullups" is set if that pullup
 	 * should be enabled.  Else, if that bit is set in "pulldowns",
 	 * that pulldown is enabled.  Don't waste power by letting any
@@ -277,6 +283,8 @@ struct twl4030_platform_data {
 
 /*----------------------------------------------------------------------*/
 
+int twl4030_sih_setup(int module);
+
 /*
  * FIXME completely stop using TWL4030_IRQ_BASE ... instead, pass the
  * IRQ data to subsidiary devices using platform device resources.
@@ -291,16 +299,16 @@ struct twl4030_platform_data {
 #define TWL4030_MODIRQ_BCI		(TWL4030_IRQ_BASE + 2)
 #define TWL4030_MODIRQ_MADC		(TWL4030_IRQ_BASE + 3)
 /* #define TWL4030_MODIRQ_USB		(TWL4030_IRQ_BASE + 4) */
-#define TWL4030_MODIRQ_PWR		(TWL4030_IRQ_BASE + 5)
+/* #define TWL4030_MODIRQ_PWR		(TWL4030_IRQ_BASE + 5) */
 
 #define TWL4030_PWRIRQ_PWRBTN		(TWL4030_PWR_IRQ_BASE + 0)
-#define TWL4030_PWRIRQ_CHG_PRES		(TWL4030_PWR_IRQ_BASE + 1)
-#define TWL4030_PWRIRQ_USB_PRES		(TWL4030_PWR_IRQ_BASE + 2)
-#define TWL4030_PWRIRQ_RTC		(TWL4030_PWR_IRQ_BASE + 3)
-#define TWL4030_PWRIRQ_HOT_DIE		(TWL4030_PWR_IRQ_BASE + 4)
-#define TWL4030_PWRIRQ_PWROK_TIMEOUT	(TWL4030_PWR_IRQ_BASE + 5)
-#define TWL4030_PWRIRQ_MBCHG		(TWL4030_PWR_IRQ_BASE + 6)
-#define TWL4030_PWRIRQ_SC_DETECT	(TWL4030_PWR_IRQ_BASE + 7)
+/* #define TWL4030_PWRIRQ_CHG_PRES		(TWL4030_PWR_IRQ_BASE + 1) */
+/* #define TWL4030_PWRIRQ_USB_PRES		(TWL4030_PWR_IRQ_BASE + 2) */
+/* #define TWL4030_PWRIRQ_RTC		(TWL4030_PWR_IRQ_BASE + 3) */
+/* #define TWL4030_PWRIRQ_HOT_DIE		(TWL4030_PWR_IRQ_BASE + 4) */
+/* #define TWL4030_PWRIRQ_PWROK_TIMEOUT	(TWL4030_PWR_IRQ_BASE + 5) */
+/* #define TWL4030_PWRIRQ_MBCHG		(TWL4030_PWR_IRQ_BASE + 6) */
+/* #define TWL4030_PWRIRQ_SC_DETECT	(TWL4030_PWR_IRQ_BASE + 7) */
 
 /* Rest are unsued currently*/
 
@@ -317,17 +325,13 @@ struct twl4030_platform_data {
 /* TWL4030 GPIO interrupt definitions */
 
 #define TWL4030_GPIO_IRQ_NO(n)		(TWL4030_GPIO_IRQ_BASE + (n))
-#define TWL4030_GPIO_IS_ENABLE		1
 
 /*
  * Exported TWL4030 GPIO APIs
  *
  * WARNING -- use standard GPIO and IRQ calls instead; these will vanish.
  */
-int twl4030_get_gpio_datain(int gpio);
-int twl4030_request_gpio(int gpio);
 int twl4030_set_gpio_debounce(int gpio, int enable);
-int twl4030_free_gpio(int gpio);
 
 #if defined(CONFIG_TWL4030_BCI_BATTERY) || \
 	defined(CONFIG_TWL4030_BCI_BATTERY_MODULE)
-- 
cgit v1.2.3


From 27471fdb32e77ecb92f09d4ac5757785b4dc33bc Mon Sep 17 00:00:00 2001
From: Andy Henroid <andrew.d.henroid@intel.com>
Date: Thu, 9 Oct 2008 11:45:22 -0700
Subject: i7300_idle driver v1.55

The Intel 7300 Memory Controller supports dynamic throttling of memory which can
be used to save power when system is idle. This driver does the memory
throttling when all CPUs are idle on such a system.

Refer to "Intel 7300 Memory Controller Hub (MCH)" datasheet
for the config space description.

Signed-off-by: Andy Henroid <andrew.d.henroid@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
---
 MAINTAINERS               |   6 +
 arch/x86/Kconfig          |   2 +
 drivers/Makefile          |   1 +
 drivers/dma/ioat_dma.c    |   3 +
 drivers/idle/Kconfig      |  16 ++
 drivers/idle/Makefile     |   2 +
 drivers/idle/i7300_idle.c | 674 ++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/idle.h    |   1 +
 include/linux/pci_ids.h   |   1 +
 9 files changed, 706 insertions(+)
 create mode 100644 drivers/idle/Kconfig
 create mode 100644 drivers/idle/Makefile
 create mode 100644 drivers/idle/i7300_idle.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 8dae4555f10e..43f71b0d2a2b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2078,6 +2078,12 @@ L:	linux-ide@vger.kernel.org
 L:	linux-scsi@vger.kernel.org
 S:	Orphan
 
+IDLE-I7300
+P:	Andy Henroid
+M:	andrew.d.henroid@intel.com
+L:	linux-pm@lists.linux-foundation.org
+S:	Supported
+
 IEEE 1394 SUBSYSTEM (drivers/ieee1394)
 P:	Ben Collins
 M:	ben.collins@ubuntu.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ed92864d1325..19cdfe1f237a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1536,6 +1536,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig"
 
 source "drivers/cpuidle/Kconfig"
 
+source "drivers/idle/Kconfig"
+
 endmenu
 
 
diff --git a/drivers/Makefile b/drivers/Makefile
index 2735bde73475..f443a8a9d46e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_EISA)		+= eisa/
 obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
+obj-y				+= idle/
 obj-$(CONFIG_MMC)		+= mmc/
 obj-$(CONFIG_MEMSTICK)		+= memstick/
 obj-$(CONFIG_NEW_LEDS)		+= leds/
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index bc8c6e3470ca..f8396cafa05f 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -171,6 +171,9 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
 	xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
 	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
 
+#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
+	device->common.chancnt--;	/* last channel is left to i7300_idle */
+#endif
 	for (i = 0; i < device->common.chancnt; i++) {
 		ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
 		if (!ioat_chan) {
diff --git a/drivers/idle/Kconfig b/drivers/idle/Kconfig
new file mode 100644
index 000000000000..f5b26dd579e4
--- /dev/null
+++ b/drivers/idle/Kconfig
@@ -0,0 +1,16 @@
+
+menu "Memory power savings"
+
+config I7300_IDLE_IOAT_CHANNEL
+	bool
+
+config I7300_IDLE
+	tristate "Intel chipset idle power saving driver"
+	select I7300_IDLE_IOAT_CHANNEL
+	depends on X86_64
+	help
+	  Enable idle power savings with certain Intel server chipsets.
+	  The chipset must have I/O AT support, such as the Intel 7300.
+	  The power savings depends on the type and quantity of DRAM devices.
+
+endmenu
diff --git a/drivers/idle/Makefile b/drivers/idle/Makefile
new file mode 100644
index 000000000000..5f68fc377e21
--- /dev/null
+++ b/drivers/idle/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_I7300_IDLE)			+= i7300_idle.o
+
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
new file mode 100644
index 000000000000..59d1bbc3cd3c
--- /dev/null
+++ b/drivers/idle/i7300_idle.c
@@ -0,0 +1,674 @@
+/*
+ * (C) Copyright 2008 Intel Corporation
+ * Authors:
+ * Andy Henroid <andrew.d.henroid@intel.com>
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ */
+
+/*
+ * Save DIMM power on Intel 7300-based platforms when all CPUs/cores
+ * are idle, using the DIMM thermal throttling capability.
+ *
+ * This driver depends on the Intel integrated DMA controller (I/O AT).
+ * If the driver for I/O AT (drivers/dma/ioatdma*) is also enabled,
+ * this driver should work cooperatively.
+ */
+
+/* #define DEBUG */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sched.h>
+#include <linux/notifier.h>
+#include <linux/cpumask.h>
+#include <linux/ktime.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/stop_machine.h>
+
+#include <asm/idle.h>
+
+#include "../dma/ioatdma_hw.h"
+#include "../dma/ioatdma_registers.h"
+
+#define I7300_IDLE_DRIVER_VERSION	"1.55"
+#define I7300_PRINT			"i7300_idle:"
+
+static unsigned int debug;	/* non-zero enables dprintk() output */
+module_param_named(debug, debug, uint, 0644);
+MODULE_PARM_DESC(debug, "Enable debug printks in this driver");
+
+#define dprintk(fmt, arg...) \
+	do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0)
+
+/*
+ * Value to set THRTLOW to when initiating throttling
+ *  0 = No throttling
+ *  1 = Throttle when > 4 activations per eval window (Maximum throttling)
+ *  2 = Throttle when > 8 activations
+ *  168 = Throttle when > 168 activations (Minimum throttling)
+ */
+#define MAX_THRTLWLIMIT		168
+static uint i7300_idle_thrtlowlm = 1;
+module_param_named(thrtlwlimit, i7300_idle_thrtlowlm, uint, 0644);
+MODULE_PARM_DESC(thrtlwlimit,
+		"Value for THRTLOWLM activation field "
+		"(0 = disable throttle, 1 = Max throttle, 168 = Min throttle)");
+
+/*
+ * simple invocation and duration statistics
+ */
+static unsigned long total_starts;
+static unsigned long total_us;
+
+#ifdef DEBUG
+static unsigned long past_skip;
+#endif
+
+static struct pci_dev *fbd_dev;
+
+static spinlock_t i7300_idle_lock;
+static int i7300_idle_active;
+
+static u8 i7300_idle_thrtctl_saved;
+static u8 i7300_idle_thrtlow_saved;
+static u32 i7300_idle_mc_saved;
+
+static cpumask_t idle_cpumask;
+static ktime_t start_ktime;
+static unsigned long avg_idle_us;
+
+static struct dentry *debugfs_dir;
+
+/* Begin: I/O AT Helper routines */
+
+#define IOAT_CHANBASE(ioat_ctl, chan) (ioat_ctl + 0x80 + 0x80 * chan)
+/* Snoop control (disable snoops when coherency is not important) */
+#define IOAT_DESC_SADDR_SNP_CTL (1UL << 1)
+#define IOAT_DESC_DADDR_SNP_CTL (1UL << 2)
+
+static struct pci_dev *ioat_dev;
+static struct ioat_dma_descriptor *ioat_desc; /* I/O AT desc & data (1 page) */
+static unsigned long ioat_desc_phys;
+static u8 *ioat_iomap; /* I/O AT memory-mapped control regs (aka CB_BAR) */
+static u8 *ioat_chanbase;
+
+/* Start I/O AT memory copy */
+static int i7300_idle_ioat_start(void)
+{
+	u32 err;
+	/* Clear error (due to circular descriptor pointer) */
+	err = readl(ioat_chanbase + IOAT_CHANERR_OFFSET);
+	if (err)
+		writel(err, ioat_chanbase + IOAT_CHANERR_OFFSET);
+
+	writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+	return 0;
+}
+
+/* Stop I/O AT memory copy */
+static void i7300_idle_ioat_stop(void)
+{
+	int i;
+	u8 sts;
+
+	for (i = 0; i < 5; i++) {
+		writeb(IOAT_CHANCMD_RESET,
+			ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+
+		udelay(10);
+
+		sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
+			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+
+		if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE)
+			break;
+
+	}
+
+	if (i == 5)
+		dprintk("failed to suspend+reset I/O AT after 5 retries\n");
+
+}
+
+/* Test I/O AT by copying 1024 bytes from offset 2k to offset 1k */
+static int __init i7300_idle_ioat_selftest(u8 *ctl,
+		struct ioat_dma_descriptor *desc, unsigned long desc_phys)
+{
+	u64 chan_sts;
+
+	memset(desc, 0, 2048);
+	memset((u8 *) desc + 2048, 0xab, 1024);
+
+	desc[0].size = 1024;
+	desc[0].ctl = 0;
+	desc[0].src_addr = desc_phys + 2048;
+	desc[0].dst_addr = desc_phys + 1024;
+	desc[0].next = 0;
+
+	writeb(IOAT_CHANCMD_RESET, ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+	writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+
+	udelay(1000);
+
+	chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
+			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+
+	if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) {
+		/* Not complete, reset the channel */
+		writeb(IOAT_CHANCMD_RESET,
+		       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+		return -1;
+	}
+
+	if (*(u32 *) ((u8 *) desc + 3068) != 0xabababab ||
+	    *(u32 *) ((u8 *) desc + 2044) != 0xabababab) {
+		dprintk("Data values src 0x%x, dest 0x%x, memset 0x%x\n",
+			*(u32 *) ((u8 *) desc + 2048),
+			*(u32 *) ((u8 *) desc + 1024),
+			*(u32 *) ((u8 *) desc + 3072));
+		return -1;
+	}
+	return 0;
+}
+
+static struct device dummy_dma_dev = {
+	.bus_id = "fallback device",
+	.coherent_dma_mask = DMA_64BIT_MASK,
+	.dma_mask = &dummy_dma_dev.coherent_dma_mask,
+};
+
+/* Setup and initialize I/O AT */
+/* This driver needs I/O AT as the throttling takes effect only when there is
+ * some memory activity. We use I/O AT to set up a dummy copy, while all CPUs
+ * go idle and memory is throttled.
+ */
+static int __init i7300_idle_ioat_init(void)
+{
+	u8 ver, chan_count, ioat_chan;
+	u16 chan_ctl;
+
+	ioat_iomap = (u8 *) ioremap_nocache(pci_resource_start(ioat_dev, 0),
+					    pci_resource_len(ioat_dev, 0));
+
+	if (!ioat_iomap) {
+		printk(KERN_ERR I7300_PRINT "failed to map I/O AT registers\n");
+		goto err_ret;
+	}
+
+	ver = readb(ioat_iomap + IOAT_VER_OFFSET);
+	if (ver != IOAT_VER_1_2) {
+		printk(KERN_ERR I7300_PRINT "unknown I/O AT version (%u.%u)\n",
+			ver >> 4, ver & 0xf);
+		goto err_unmap;
+	}
+
+	chan_count = readb(ioat_iomap + IOAT_CHANCNT_OFFSET);
+	if (!chan_count) {
+		printk(KERN_ERR I7300_PRINT "unexpected # of I/O AT channels "
+			"(%u)\n",
+			chan_count);
+		goto err_unmap;
+	}
+
+	ioat_chan = chan_count - 1;	/* use the highest-numbered channel */
+	ioat_chanbase = IOAT_CHANBASE(ioat_iomap, ioat_chan);
+
+	chan_ctl = readw(ioat_chanbase + IOAT_CHANCTRL_OFFSET);
+	if (chan_ctl & IOAT_CHANCTRL_CHANNEL_IN_USE) {
+		printk(KERN_ERR I7300_PRINT "channel %d in use\n", ioat_chan);
+		goto err_unmap;
+	}
+
+	writew(IOAT_CHANCTRL_CHANNEL_IN_USE,
+		ioat_chanbase + IOAT_CHANCTRL_OFFSET);
+
+	ioat_desc = (struct ioat_dma_descriptor *)dma_alloc_coherent(
+			&dummy_dma_dev, 4096,
+			(dma_addr_t *)&ioat_desc_phys, GFP_KERNEL);
+	if (!ioat_desc) {
+		printk(KERN_ERR I7300_PRINT "failed to allocate I/O AT desc\n");
+		goto err_mark_unused;
+	}
+
+	writel(ioat_desc_phys & 0xffffffffUL,
+	       ioat_chanbase + IOAT1_CHAINADDR_OFFSET_LOW);
+	writel(ioat_desc_phys >> 32,
+	       ioat_chanbase + IOAT1_CHAINADDR_OFFSET_HIGH);
+
+	if (i7300_idle_ioat_selftest(ioat_iomap, ioat_desc, ioat_desc_phys)) {
+		printk(KERN_ERR I7300_PRINT "I/O AT self-test failed\n");
+		goto err_free;
+	}
+
+	/* Setup circular I/O AT descriptor chain */
+	ioat_desc[0].ctl = IOAT_DESC_SADDR_SNP_CTL | IOAT_DESC_DADDR_SNP_CTL;
+	ioat_desc[0].src_addr = ioat_desc_phys + 2048;
+	ioat_desc[0].dst_addr = ioat_desc_phys + 3072;
+	ioat_desc[0].size = 128;
+	ioat_desc[0].next = ioat_desc_phys + sizeof(struct ioat_dma_descriptor);
+
+	ioat_desc[1].ctl = ioat_desc[0].ctl;
+	ioat_desc[1].src_addr = ioat_desc[0].src_addr;
+	ioat_desc[1].dst_addr = ioat_desc[0].dst_addr;
+	ioat_desc[1].size = ioat_desc[0].size;
+	ioat_desc[1].next = ioat_desc_phys;	/* loop back to desc[0] */
+
+	return 0;
+
+err_free:
+	dma_free_coherent(&dummy_dma_dev, 4096, ioat_desc, ioat_desc_phys);
+err_mark_unused:
+	writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET);
+err_unmap:
+	iounmap(ioat_iomap);
+err_ret:
+	return -ENODEV;
+}
+
+/* Cleanup I/O AT */
+static void __exit i7300_idle_ioat_exit(void)
+{
+	int i;
+	u64 chan_sts;
+
+	i7300_idle_ioat_stop();
+
+	/* Wait for a while for the channel to halt before releasing */
+	for (i = 0; i < 10; i++) {
+		writeb(IOAT_CHANCMD_RESET,
+		       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
+
+		chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
+			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+
+		if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+			writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET);
+			break;
+		}
+		udelay(1000);
+	}
+
+	chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
+			IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+
+	/*
+	 * We tried to reset multiple times. If IO A/T channel is still active
+	 * flag an error and return without cleanup. Memory leak is better
+	 * than random corruption in that extreme error situation.
+	 */
+	if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+		printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels."
+			" Not freeing resources\n");
+		return;
+	}
+
+	dma_free_coherent(&dummy_dma_dev, 4096, ioat_desc, ioat_desc_phys);
+	iounmap(ioat_iomap);
+}
+
+/* End: I/O AT Helper routines */
+
+#define DIMM_THRTLOW 0x64
+#define DIMM_THRTCTL 0x67
+#define DIMM_THRTCTL_THRMHUNT (1UL << 0)
+#define DIMM_MC 0x40
+#define DIMM_GTW_MODE (1UL << 17)
+#define DIMM_GBLACT 0x60
+
+/*
+ * Keep track of an exponential-decaying average of recent idle durations.
+ * The latest duration gets DURATION_WEIGHT_PCT percentage weight
+ * in this average, with the old average getting the remaining weight.
+ *
+ * High weights emphasize recent history, low weights include long history.
+ */
+#define DURATION_WEIGHT_PCT 55
+
+/*
+ * When the decaying average of recent durations or the predicted duration
+ * of the next timer interrupt is shorter than duration_threshold, the
+ * driver will decline to throttle.
+ */
+#define DURATION_THRESHOLD_US 100
+
+
+/* Save DIMM throttle registers; set GTW_MODE, or -ENODEV if GBLACT != 0 */
+static int i7300_idle_thrt_save(void)
+{
+	u32 new_mc_val;
+	u8 gblactlm;
+
+	pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &i7300_idle_thrtctl_saved);
+	pci_read_config_byte(fbd_dev, DIMM_THRTLOW, &i7300_idle_thrtlow_saved);
+	pci_read_config_dword(fbd_dev, DIMM_MC, &i7300_idle_mc_saved);
+	/*
+	 * Make sure we have Global Throttling Window Mode set to have a
+	 * "short" window. This (mostly) works around an issue where
+	 * throttling persists until the end of the global throttling window
+	 * size. On the tested system, this was resulting in a maximum of
+	 * 64 ms to exit throttling (average 32 ms). The actual numbers
+	 * depend on system frequencies. Setting the short window reduces
+	 * this by a factor of 4096.
+	 *
+	 * We will do this only if the system is set for
+	 * unlimited-activations while in open-loop throttling (i.e., when
+	 * Global Activation Throttle Limit is zero).
+	 */
+	pci_read_config_byte(fbd_dev, DIMM_GBLACT, &gblactlm);
+	dprintk("thrtctl_saved = 0x%02x, thrtlow_saved = 0x%02x\n",
+		i7300_idle_thrtctl_saved,
+		i7300_idle_thrtlow_saved);
+	dprintk("mc_saved = 0x%08x, gblactlm = 0x%02x\n",
+		i7300_idle_mc_saved,
+		gblactlm);
+	if (gblactlm == 0) {
+		new_mc_val = i7300_idle_mc_saved | DIMM_GTW_MODE;
+		pci_write_config_dword(fbd_dev, DIMM_MC, new_mc_val);
+		return 0;
+	} else {
+		dprintk("could not set GTW_MODE = 1 (OLTT enabled)\n");
+		return -ENODEV;
+	}
+}
+
+/* Restore DIMM thermal throttle configuration */
+static void i7300_idle_thrt_restore(void)
+{
+	pci_write_config_dword(fbd_dev, DIMM_MC, i7300_idle_mc_saved);
+	pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved);
+	pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved);
+}
+
+/* Enable DIMM thermal throttling */
+static void i7300_idle_start(void)
+{
+	u8 new_ctl;
+	u8 limit;
+
+	new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT;
+	pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl);
+
+	limit = i7300_idle_thrtlowlm;
+	if (unlikely(limit > MAX_THRTLWLIMIT))
+		limit = MAX_THRTLWLIMIT;
+
+	pci_write_config_byte(fbd_dev, DIMM_THRTLOW, limit);
+
+	new_ctl = i7300_idle_thrtctl_saved | DIMM_THRTCTL_THRMHUNT;
+	pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl);
+}
+
+/* Disable DIMM thermal throttling */
+static void i7300_idle_stop(void)
+{
+	u8 new_ctl;
+	u8 got_ctl;
+
+	new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT;
+	pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl);
+
+	pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved);
+	pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved);
+	pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &got_ctl);
+	WARN_ON_ONCE(got_ctl != i7300_idle_thrtctl_saved);
+}
+
+
+/*
+ * i7300_avg_duration_check()
+ * return 0 if the decaying average of recent idle durations is
+ * at least DURATION_THRESHOLD_US, 1 (skip throttling) otherwise
+ */
+static int i7300_avg_duration_check(void)
+{
+	if (avg_idle_us >= DURATION_THRESHOLD_US)
+		return 0;
+
+#ifdef DEBUG
+	past_skip++;
+#endif
+	return 1;
+}
+
+/* Idle notifier to look at idle CPUs */
+static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	unsigned long flags;
+	ktime_t now_ktime;
+	static ktime_t idle_begin_time;
+	static int time_init = 1;
+
+	if (!i7300_idle_thrtlowlm)
+		return 0;
+
+	if (unlikely(time_init)) {
+		time_init = 0;
+		idle_begin_time = ktime_get();
+	}
+
+	spin_lock_irqsave(&i7300_idle_lock, flags);
+	if (val == IDLE_START) {
+
+		cpu_set(smp_processor_id(), idle_cpumask);
+
+		if (cpus_weight(idle_cpumask) != num_online_cpus())
+			goto end;
+
+		now_ktime = ktime_get();
+		idle_begin_time = now_ktime;
+
+		if (i7300_avg_duration_check())
+			goto end;
+
+		i7300_idle_active = 1;
+		total_starts++;
+		start_ktime = now_ktime;
+
+		i7300_idle_start();
+		i7300_idle_ioat_start();
+
+	} else if (val == IDLE_END) {
+		cpu_clear(smp_processor_id(), idle_cpumask);
+		if (cpus_weight(idle_cpumask) == (num_online_cpus() - 1)) {
+			/* First CPU coming out of idle */
+			u64 idle_duration_us;
+
+			now_ktime = ktime_get();
+
+			idle_duration_us = ktime_to_us(ktime_sub
+						(now_ktime, idle_begin_time));
+
+			avg_idle_us =
+				((100 - DURATION_WEIGHT_PCT) * avg_idle_us +
+				 DURATION_WEIGHT_PCT * idle_duration_us) / 100;
+
+			if (i7300_idle_active) {
+				ktime_t idle_ktime;
+
+				idle_ktime = ktime_sub(now_ktime, start_ktime);
+				total_us += ktime_to_us(idle_ktime);
+
+				i7300_idle_ioat_stop();
+				i7300_idle_stop();
+				i7300_idle_active = 0;
+			}
+		}
+	}
+end:
+	spin_unlock_irqrestore(&i7300_idle_lock, flags);
+	return 0;
+}
+
+static struct notifier_block i7300_idle_nb = {
+	.notifier_call = i7300_idle_notifier,
+};
+
+/*
+ * I/O AT controls (PCI bus 0 device 8 function 0)
+ * DIMM controls (PCI bus 0 device 16 function 1)
+ */
+#define IOAT_BUS 0
+#define IOAT_DEVFN PCI_DEVFN(8, 0)
+#define MEMCTL_BUS 0
+#define MEMCTL_DEVFN PCI_DEVFN(16, 1)
+
+struct fbd_ioat {
+	unsigned int vendor;
+	unsigned int ioat_dev;
+};
+
+/*
+ * The i5000 chip-set has the same hooks as the i7300
+ * but support is disabled by default because this driver
+ * has not been validated on that platform.
+ */
+#define SUPPORT_I5000 0
+
+static const struct fbd_ioat fbd_ioat_list[] = {
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB},
+#if SUPPORT_I5000
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT},
+#endif
+	{0, 0}
+};
+
+/* table of devices that work with this driver */
+static const struct pci_device_id pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) },
+#if SUPPORT_I5000
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) },
+#endif
+	{ } /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE(pci, pci_tbl);
+
+/* Check for known platforms with I/O-AT */
+static int __init i7300_idle_platform_probe(void)
+{
+	int i;
+
+	fbd_dev = pci_get_bus_and_slot(MEMCTL_BUS, MEMCTL_DEVFN);
+	if (!fbd_dev)
+		return -ENODEV;
+
+	for (i = 0; pci_tbl[i].vendor != 0; i++) {
+		if (fbd_dev->vendor == pci_tbl[i].vendor &&
+		    fbd_dev->device == pci_tbl[i].device) {
+			break;
+		}
+	}
+	if (pci_tbl[i].vendor == 0)
+		return -ENODEV;
+
+	ioat_dev = pci_get_bus_and_slot(IOAT_BUS, IOAT_DEVFN);
+	if (!ioat_dev)
+		return -ENODEV;
+
+	for (i = 0; fbd_ioat_list[i].vendor != 0; i++) {
+		if (ioat_dev->vendor == fbd_ioat_list[i].vendor &&
+		    ioat_dev->device == fbd_ioat_list[i].ioat_dev) {
+			return 0;
+		}
+	}
+	return -ENODEV;
+}
+
+static int stats_open_generic(struct inode *inode, struct file *fp)
+{
+	fp->private_data = inode->i_private;	/* read back by stats_read_ul() */
+	return 0;
+}
+
+static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count,
+				loff_t *off)
+{
+	unsigned long *p = fp->private_data;
+	char buf[32];
+	int len;
+
+	len = snprintf(buf, 32, "%lu\n", *p);
+	return simple_read_from_buffer(ubuf, count, off, buf, len);
+}
+
+static const struct file_operations idle_fops = {
+	.open	= stats_open_generic,
+	.read	= stats_read_ul,
+};
+
+static struct debugfs_file_info {
+	void *ptr;		/* counter handed to debugfs_create_file() */
+	char name[32];		/* debugfs file name */
+	struct dentry *file;	/* set by i7300_idle_init() */
+} debugfs_file_list[] = {
+				{&total_starts, "total_starts", NULL},
+				{&total_us, "total_us", NULL},
+#ifdef DEBUG
+				{&past_skip, "past_skip", NULL},
+#endif
+				{NULL, "", NULL}	/* terminator */
+			};
+
+static int __init i7300_idle_init(void)
+{
+	spin_lock_init(&i7300_idle_lock);
+	cpus_clear(idle_cpumask);
+	total_us = 0;
+
+	if (i7300_idle_platform_probe())
+		return -ENODEV;
+
+	if (i7300_idle_thrt_save())
+		return -ENODEV;
+
+	if (i7300_idle_ioat_init())
+		return -ENODEV;
+
+	debugfs_dir = debugfs_create_dir("i7300_idle", NULL);
+	if (debugfs_dir) {
+		int i = 0;
+
+		while (debugfs_file_list[i].ptr != NULL) {
+			debugfs_file_list[i].file = debugfs_create_file(
+					debugfs_file_list[i].name,
+					S_IRUSR,
+					debugfs_dir,
+					debugfs_file_list[i].ptr,
+					&idle_fops);
+			i++;
+		}
+	}
+
+	idle_notifier_register(&i7300_idle_nb);
+
+	printk(KERN_INFO "i7300_idle: loaded v%s\n", I7300_IDLE_DRIVER_VERSION);
+	return 0;
+}
+
+static void __exit i7300_idle_exit(void)
+{
+	idle_notifier_unregister(&i7300_idle_nb);
+
+	if (debugfs_dir) {
+		int i = 0;
+
+		while (debugfs_file_list[i].file != NULL) {
+			debugfs_remove(debugfs_file_list[i].file);
+			i++;
+		}
+
+		debugfs_remove(debugfs_dir);
+	}
+	i7300_idle_thrt_restore();
+	i7300_idle_ioat_exit();
+}
+
+module_init(i7300_idle_init);
+module_exit(i7300_idle_exit);
+
+MODULE_AUTHOR("Andy Henroid <andrew.d.henroid@intel.com>");
+MODULE_DESCRIPTION("Intel Chipset DIMM Idle Power Saving Driver v"
+			I7300_IDLE_DRIVER_VERSION);
+MODULE_LICENSE("GPL");
diff --git a/include/asm-x86/idle.h b/include/asm-x86/idle.h
index cbb649123612..54ce018d4b6c 100644
--- a/include/asm-x86/idle.h
+++ b/include/asm-x86/idle.h
@@ -6,6 +6,7 @@
 
 struct notifier_block;
 void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
 
 void enter_idle(void);
 void exit_idle(void);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index f1624b396754..efb786d11f2a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2422,6 +2422,7 @@
 #define PCI_DEVICE_ID_INTEL_MCH_PC1	0x359a
 #define PCI_DEVICE_ID_INTEL_E7525_MCH	0x359e
 #define PCI_DEVICE_ID_INTEL_IOAT_CNB	0x360b
+#define PCI_DEVICE_ID_INTEL_FBD_CNB	0x360c
 #define PCI_DEVICE_ID_INTEL_ICH10_0	0x3a14
 #define PCI_DEVICE_ID_INTEL_ICH10_1	0x3a16
 #define PCI_DEVICE_ID_INTEL_ICH10_2	0x3a18
-- 
cgit v1.2.3


From 4ce72a2c063a7fa8e42a9435440ae3364115a58d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 22 Oct 2008 15:25:26 +0800
Subject: sched: add CONFIG_SMP consistency

a patch from Henrik Austad did this:

>> Do not declare select_task_rq as part of sched_class when CONFIG_SMP is
>> not set.

Peter observed:

> While a proper cleanup, could you do it by re-arranging the methods so
> as to not create an additional ifdef?

Do not declare select_task_rq and some other methods as part of sched_class
when CONFIG_SMP is not set.

Also gather those methods to avoid CONFIG_SMP mess.

Idea-by: Henrik Austad <henrik.austad@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   | 12 +++++++-----
 kernel/sched_fair.c     |  5 ++---
 kernel/sched_idletask.c |  5 ++---
 kernel/sched_rt.c       |  5 ++---
 4 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f59c8e8597d..c05b45faef18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -897,7 +897,6 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
-	int  (*select_task_rq)(struct task_struct *p, int sync);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
@@ -905,6 +904,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
+	int  (*select_task_rq)(struct task_struct *p, int sync);
+
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
@@ -916,16 +917,17 @@ struct sched_class {
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
-#endif
 
-	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const cpumask_t *newmask);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+#endif
+
+	void (*set_curr_task) (struct rq *rq);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_new) (struct rq *rq, struct task_struct *p);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a0aa38b10fdd..8de48a5da354 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1593,9 +1593,6 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_fair,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -1603,6 +1600,8 @@ static const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
 #endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f5..8a21a2e28c13 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
 
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_idle,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_idle,
 
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
 	.put_prev_task		= put_prev_task_idle,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+
 	.load_balance		= load_balance_idle,
 	.move_one_task		= move_one_task_idle,
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cdf5740ab03e..c9aa5bede226 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1502,9 +1502,6 @@ static const struct sched_class rt_sched_class = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_rt,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_rt,
 
@@ -1512,6 +1509,8 @@ static const struct sched_class rt_sched_class = {
 	.put_prev_task		= put_prev_task_rt,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_rt,
+
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed       = set_cpus_allowed_rt,
-- 
cgit v1.2.3


From 319edafef64406c971035c56bd68480e5a82b581 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 20 Oct 2008 18:15:30 +0100
Subject: smc911x: Add IRQ polarity configuration

Platforms like ARM Ltd's RealView require the IRQ polarity bit to be set
for the SMC9118 chip. This patch allows the dynamic configuration via
the smc911x_platdata structure.

This patch also changes the smc91x_platdata structure name to the
correct smc911x_platdata in the smc911x_drv_probe() function.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/net/smc911x.c   | 11 ++++++++---
 include/linux/smc911x.h |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/smc911x.c b/drivers/net/smc911x.c
index ec32b5d89c92..2c78229ad04b 100644
--- a/drivers/net/smc911x.c
+++ b/drivers/net/smc911x.c
@@ -180,7 +180,7 @@ static void PRINT_PKT(u_char *buf, int length)
 static void smc911x_reset(struct net_device *dev)
 {
 	struct smc911x_local *lp = netdev_priv(dev);
-	unsigned int reg, timeout=0, resets=1;
+	unsigned int reg, timeout=0, resets=1, irq_cfg;
 	unsigned long flags;
 
 	DBG(SMC_DEBUG_FUNC, "%s: --> %s\n", dev->name, __func__);
@@ -252,7 +252,12 @@ static void smc911x_reset(struct net_device *dev)
 	 * Deassert IRQ for 1*10us for edge type interrupts
 	 * and drive IRQ pin push-pull
 	 */
-	SMC_SET_IRQ_CFG(lp, (1 << 24) | INT_CFG_IRQ_EN_ | INT_CFG_IRQ_TYPE_);
+	irq_cfg = (1 << 24) | INT_CFG_IRQ_EN_ | INT_CFG_IRQ_TYPE_;
+#ifdef SMC_DYNAMIC_BUS_CONFIG
+	if (lp->cfg.irq_polarity)
+		irq_cfg |= INT_CFG_IRQ_POL_;
+#endif
+	SMC_SET_IRQ_CFG(lp, irq_cfg);
 
 	/* clear anything saved */
 	if (lp->pending_tx_skb != NULL) {
@@ -2054,7 +2059,7 @@ err_out:
  */
 static int smc911x_drv_probe(struct platform_device *pdev)
 {
-	struct smc91x_platdata *pd = pdev->dev.platform_data;
+	struct smc911x_platdata *pd = pdev->dev.platform_data;
 	struct net_device *ndev;
 	struct resource *res;
 	struct smc911x_local *lp;
diff --git a/include/linux/smc911x.h b/include/linux/smc911x.h
index b58f54c24183..521f37143fae 100644
--- a/include/linux/smc911x.h
+++ b/include/linux/smc911x.h
@@ -7,6 +7,7 @@
 struct smc911x_platdata {
 	unsigned long flags;
 	unsigned long irq_flags; /* IRQF_... */
+	int irq_polarity;
 };
 
 #endif /* __SMC911X_H__ */
-- 
cgit v1.2.3


From a73a63701f8f23e70674b3c5e367a0a726c18468 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 22 Oct 2008 14:45:11 +0200
Subject: HID: add hid_type to general hid struct

Add a type field to the hid structure to identify which device type
(currently only mouse) we are talking to. Needed for per-device-type
ignore list support.

Note: this patch leaves the type as unknown for Bluetooth devices, as
there is no support for this in the hidp code.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/usbhid/hid-core.c | 3 +++
 include/linux/hid.h           | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 1d3b8a394d46..1dc341a04d04 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -972,6 +972,9 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	hid->vendor = le16_to_cpu(dev->descriptor.idVendor);
 	hid->product = le16_to_cpu(dev->descriptor.idProduct);
 	hid->name[0] = 0;
+	if (intf->cur_altsetting->desc.bInterfaceProtocol ==
+			USB_INTERFACE_PROTOCOL_MOUSE)
+		hid->type = HID_TYPE_USBMOUSE;
 
 	if (dev->manufacturer)
 		strlcpy(hid->name, dev->manufacturer, sizeof(hid->name));
diff --git a/include/linux/hid.h b/include/linux/hid.h
index f13bca2dd53b..5355ca4b939e 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -417,6 +417,11 @@ struct hid_input {
 	struct input_dev *input;
 };
 
+enum hid_type {
+	HID_TYPE_OTHER = 0,
+	HID_TYPE_USBMOUSE
+};
+
 struct hid_driver;
 struct hid_ll_driver;
 
@@ -431,6 +436,7 @@ struct hid_device {							/* device report descriptor */
 	__u32 vendor;							/* Vendor ID */
 	__u32 product;							/* Product ID */
 	__u32 version;							/* HID version */
+	enum hid_type type;						/* device type (mouse, kbd, ...) */
 	unsigned country;						/* HID country */
 	struct hid_report_enum report_enum[HID_REPORT_TYPES];
 
-- 
cgit v1.2.3


From 93fc9e1bb6507dde945c2eab68c93e1066ac3691 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 10:25:29 -0700
Subject: mlx4_core: Support multiple pre-reserved QP regions

For ethernet support, we need to reserve QPs for the ethernet and
fibre channel driver.  The QPs are reserved at the end of the QP
table.  (This way we ensure that they are aligned to their size.)

We need to consider these reserved ranges in bitmap creation, so we
extend the mlx4 bitmap utility functions to allow reserved ranges at
both the bottom and the top of the range.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/alloc.c    | 29 +++++++++++++--------
 drivers/net/mlx4/cq.c       |  2 +-
 drivers/net/mlx4/eq.c       |  2 +-
 drivers/net/mlx4/fw.c       |  5 ++++
 drivers/net/mlx4/fw.h       |  2 ++
 drivers/net/mlx4/main.c     | 62 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/net/mlx4/mcg.c      |  4 +--
 drivers/net/mlx4/mlx4.h     |  4 ++-
 drivers/net/mlx4/mr.c       |  2 +-
 drivers/net/mlx4/pd.c       |  4 +--
 drivers/net/mlx4/qp.c       | 36 ++++++++++++++++++++++++--
 drivers/net/mlx4/srq.c      |  2 +-
 include/linux/mlx4/device.h | 19 +++++++++++++-
 13 files changed, 144 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index e6c0d5bb5dcb..e2bc7ecf162d 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -47,13 +47,16 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
 	if (obj >= bitmap->max) {
-		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
 		obj = find_first_zero_bit(bitmap->table, bitmap->max);
 	}
 
 	if (obj < bitmap->max) {
 		set_bit(obj, bitmap->table);
-		bitmap->last = (obj + 1) & (bitmap->max - 1);
+		bitmap->last = (obj + 1);
+		if (bitmap->last == bitmap->max)
+			bitmap->last = 0;
 		obj |= bitmap->top;
 	} else
 		obj = -1;
@@ -109,9 +112,9 @@ u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
 	obj = find_aligned_range(bitmap->table, bitmap->last,
 				 bitmap->max, cnt, align);
 	if (obj >= bitmap->max) {
-		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
-		obj = find_aligned_range(bitmap->table, 0,
-					 bitmap->max,
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
 					 cnt, align);
 	}
 
@@ -136,17 +139,19 @@ void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
 {
 	u32 i;
 
-	obj &= bitmap->max - 1;
+	obj &= bitmap->max + bitmap->reserved_top - 1;
 
 	spin_lock(&bitmap->lock);
 	for (i = 0; i < cnt; i++)
 		clear_bit(obj + i, bitmap->table);
 	bitmap->last = min(bitmap->last, obj);
-	bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+	bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+			& bitmap->mask;
 	spin_unlock(&bitmap->lock);
 }
 
-int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved)
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top)
 {
 	int i;
 
@@ -156,14 +161,16 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved
 
 	bitmap->last = 0;
 	bitmap->top  = 0;
-	bitmap->max  = num;
+	bitmap->max  = num - reserved_top;
 	bitmap->mask = mask;
+	bitmap->reserved_top = reserved_top;
 	spin_lock_init(&bitmap->lock);
-	bitmap->table = kzalloc(BITS_TO_LONGS(num) * sizeof (long), GFP_KERNEL);
+	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
+				sizeof (long), GFP_KERNEL);
 	if (!bitmap->table)
 		return -ENOMEM;
 
-	for (i = 0; i < reserved; ++i)
+	for (i = 0; i < reserved_bot; ++i)
 		set_bit(i, bitmap->table);
 
 	return 0;
diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c
index 9bb50e3f8974..b7ad2829d67e 100644
--- a/drivers/net/mlx4/cq.c
+++ b/drivers/net/mlx4/cq.c
@@ -300,7 +300,7 @@ int mlx4_init_cq_table(struct mlx4_dev *dev)
 	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
 
 	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
-			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs);
+			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index 8a8b56135a58..de169338cd90 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -558,7 +558,7 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	int i;
 
 	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
-			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs);
+			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 7e32955da982..40d8142c23b2 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -357,6 +357,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_PORT_MTU_OFFSET			0x01
 #define QUERY_PORT_WIDTH_OFFSET			0x06
 #define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
 #define QUERY_PORT_MAX_VL_OFFSET		0x0b
 
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
@@ -374,6 +375,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
 			dev_cap->max_vl[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+			dev_cap->log_max_macs[i]  = field & 0xf;
+			dev_cap->log_max_vlans[i] = field >> 4;
+
 		}
 	}
 
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index decbb5c2ad41..c34e726d66e4 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -102,6 +102,8 @@ struct mlx4_dev_cap {
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
+	u8  log_max_macs[MLX4_MAX_PORTS + 1];
+	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_adapter {
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 1252a919de2e..560e1962212e 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -85,6 +85,19 @@ static struct mlx4_profile default_profile = {
 	.num_mtt	= 1 << 20,
 };
 
+static int log_num_mac = 2;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+static int log_num_vlan;
+module_param_named(log_num_vlan, log_num_vlan, int, 0444);
+MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)");
+
+static int use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
+		  "(0/1, default 0)");
+
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -134,7 +147,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
 	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
 	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
-	dev->caps.reserved_qps	     = dev_cap->reserved_qps;
 	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
 	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
 	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
@@ -163,6 +175,39 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
+	dev->caps.log_num_macs  = log_num_mac;
+	dev->caps.log_num_vlans = log_num_vlan;
+	dev->caps.log_num_prios = use_prio ? 3 : 0;
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
+			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+			mlx4_warn(dev, "Requested number of MACs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_macs);
+		}
+		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
+			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+			mlx4_warn(dev, "Requested number of VLANs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_vlans);
+		}
+	}
+
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+		(1 << dev->caps.log_num_macs) *
+		(1 << dev->caps.log_num_vlans) *
+		(1 << dev->caps.log_num_prios) *
+		dev->caps.num_ports;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
+
 	return 0;
 }
 
@@ -211,7 +256,8 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
 				  ((u64) (MLX4_CMPT_TYPE_QP *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz, dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err)
 		goto err;
 
@@ -336,7 +382,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->qpc_base,
 				  dev_cap->qpc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
 		goto err_unmap_dmpt;
@@ -346,7 +393,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->auxc_base,
 				  dev_cap->aux_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
 		goto err_unmap_qp;
@@ -356,7 +404,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->altc_base,
 				  dev_cap->altc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
 		goto err_unmap_auxc;
@@ -366,7 +415,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->rdmarc_base,
 				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
 		goto err_unmap_altc;
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index c83f88ce0736..592c01ae2c5d 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -368,8 +368,8 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
 
-	err = mlx4_bitmap_init(&priv->mcg_table.bitmap,
-			       dev->caps.num_amgms, dev->caps.num_amgms - 1, 0);
+	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
+			       dev->caps.num_amgms - 1, 0, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index b55ddab73f66..9e2f44c31810 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -111,6 +111,7 @@ struct mlx4_bitmap {
 	u32			last;
 	u32			top;
 	u32			max;
+	u32                     reserved_top;
 	u32			mask;
 	spinlock_t		lock;
 	unsigned long	       *table;
@@ -290,7 +291,8 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
 u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
 void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
-int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top);
 void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
 
 int mlx4_reset(struct mlx4_dev *dev);
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index d1dd5b48dbd1..0caf74cae8bc 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -461,7 +461,7 @@ int mlx4_init_mr_table(struct mlx4_dev *dev)
 	int err;
 
 	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
-			       ~0, dev->caps.reserved_mrws);
+			       ~0, dev->caps.reserved_mrws, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c
index aa616892d09c..26d1a7a9e375 100644
--- a/drivers/net/mlx4/pd.c
+++ b/drivers/net/mlx4/pd.c
@@ -62,7 +62,7 @@ int mlx4_init_pd_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
-				(1 << 24) - 1, dev->caps.reserved_pds);
+				(1 << 24) - 1, dev->caps.reserved_pds, 0);
 }
 
 void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
@@ -100,7 +100,7 @@ int mlx4_init_uar_table(struct mlx4_dev *dev)
 
 	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
 				dev->caps.num_uars, dev->caps.num_uars - 1,
-				max(128, dev->caps.reserved_uars));
+				max(128, dev->caps.reserved_uars), 0);
 }
 
 void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
index 98e0c40ba368..1c565ef8d179 100644
--- a/drivers/net/mlx4/qp.c
+++ b/drivers/net/mlx4/qp.c
@@ -272,6 +272,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
 	int err;
+	int reserved_from_top = 0;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
@@ -281,9 +282,40 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	 * block of special QPs must be aligned to a multiple of 8, so
 	 * round up.
 	 */
-	dev->caps.sqp_start = ALIGN(dev->caps.reserved_qps, 8);
+	dev->caps.sqp_start =
+		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+
+	{
+		int sort[MLX4_NUM_QP_REGION];
+		int i, j, tmp;
+		int last_base = dev->caps.num_qps;
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+			sort[i] = i;
+
+		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
+			for (j = 2; j < i; ++j) {
+				if (dev->caps.reserved_qps_cnt[sort[j]] >
+				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
+					tmp             = sort[j];
+					sort[j]         = sort[j - 1];
+					sort[j - 1]     = tmp;
+				}
+			}
+		}
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+			dev->caps.reserved_qps_base[sort[i]] = last_base;
+			reserved_from_top +=
+				dev->caps.reserved_qps_cnt[sort[i]];
+		}
+
+	}
+
 	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 24) - 1, dev->caps.sqp_start + 8);
+			       (1 << 23) - 1, dev->caps.sqp_start + 8,
+			       reserved_from_top);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c
index 533eb6db24b3..fe9f218691f5 100644
--- a/drivers/net/mlx4/srq.c
+++ b/drivers/net/mlx4/srq.c
@@ -245,7 +245,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev)
 	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
 
 	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
-			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs);
+			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index d21e879f3c90..693f93cd29e1 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -145,6 +145,18 @@ enum {
 	MLX4_MTT_FLAG_PRESENT		= 1
 };
 
+enum mlx4_qp_region {
+	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_ETH_ADDR,
+	MLX4_QP_REGION_FC_ADDR,
+	MLX4_QP_REGION_FC_EXCH,
+	MLX4_NUM_QP_REGION
+};
+
+enum {
+	MLX4_NUM_FEXCH          = 64 * 1024,
+};
+
 static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
 {
 	return (major << 32) | (minor << 16) | subminor;
@@ -169,7 +181,6 @@ struct mlx4_caps {
 	int			max_rq_desc_sz;
 	int			max_qp_init_rdma;
 	int			max_qp_dest_rdma;
-	int			reserved_qps;
 	int			sqp_start;
 	int			num_srqs;
 	int			max_srq_wqes;
@@ -201,6 +212,12 @@ struct mlx4_caps {
 	u16			stat_rate_support;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
 	int			max_gso_sz;
+	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
+	int			reserved_qps;
+	int                     reserved_qps_base[MLX4_NUM_QP_REGION];
+	int                     log_num_macs;
+	int                     log_num_vlans;
+	int                     log_num_prios;
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3


From b79acb49de6c2ab9ff0245f0f2b573d48b9a2d93 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 10:56:48 -0700
Subject: mlx4_core: Get ethernet MTU and default address from firmware

Get maximum ethernet MTU and default MAC address from the firmware
QUERY_DEV_CAP command.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/fw.c       | 13 ++++++++-----
 drivers/net/mlx4/fw.h       |  4 +++-
 drivers/net/mlx4/main.c     |  4 +++-
 include/linux/mlx4/device.h |  4 +++-
 4 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 40d8142c23b2..8d402db9a03d 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -346,7 +346,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
 			dev_cap->max_vl[i]	   = field >> 4;
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
-			dev_cap->max_mtu[i]	   = field >> 4;
+			dev_cap->ib_mtu[i]	   = field >> 4;
 			dev_cap->max_port_width[i] = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
 			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
@@ -355,8 +355,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	} else {
 #define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
 #define QUERY_PORT_WIDTH_OFFSET			0x06
 #define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAC_OFFSET			0x08
 #define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
 #define QUERY_PORT_MAX_VL_OFFSET		0x0b
 
@@ -367,7 +369,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 				goto out;
 
 			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
-			dev_cap->max_mtu[i]	   = field & 0xf;
+			dev_cap->ib_mtu[i]	   = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
 			dev_cap->max_port_width[i] = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
@@ -378,7 +380,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
 			dev_cap->log_max_macs[i]  = field & 0xf;
 			dev_cap->log_max_vlans[i] = field >> 4;
-
+			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
+			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
 		}
 	}
 
@@ -412,7 +415,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
 		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
 	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
-		 dev_cap->local_ca_ack_delay, 128 << dev_cap->max_mtu[1],
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
 		 dev_cap->max_port_width[1]);
 	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
 		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
@@ -824,7 +827,7 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
 		flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
 		MLX4_PUT(inbox, flags,		  INIT_PORT_FLAGS_OFFSET);
 
-		field = 128 << dev->caps.mtu_cap[port];
+		field = 128 << dev->caps.ib_mtu_cap[port];
 		MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
 		field = dev->caps.gid_table_len[port];
 		MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index c34e726d66e4..d0913d4d262a 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -66,11 +66,13 @@ struct mlx4_dev_cap {
 	int local_ca_ack_delay;
 	int num_ports;
 	u32 max_msg_sz;
-	int max_mtu[MLX4_MAX_PORTS + 1];
+	int ib_mtu[MLX4_MAX_PORTS + 1];
 	int max_port_width[MLX4_MAX_PORTS + 1];
 	int max_vl[MLX4_MAX_PORTS + 1];
 	int max_gids[MLX4_MAX_PORTS + 1];
 	int max_pkeys[MLX4_MAX_PORTS + 1];
+	u64 def_mac[MLX4_MAX_PORTS + 1];
+	u16 eth_mtu[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
 	u32 flags;
 	int reserved_uars;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 560e1962212e..28f36b88de38 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -133,10 +133,12 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.num_ports	     = dev_cap->num_ports;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
 		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
-		dev->caps.mtu_cap[i]	    = dev_cap->max_mtu[i];
+		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
 		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
 		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
 		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
+		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
+		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
 	}
 
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 693f93cd29e1..f9e73cfc540b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -166,7 +166,9 @@ struct mlx4_caps {
 	u64			fw_ver;
 	int			num_ports;
 	int			vl_cap[MLX4_MAX_PORTS + 1];
-	int			mtu_cap[MLX4_MAX_PORTS + 1];
+	int			ib_mtu_cap[MLX4_MAX_PORTS + 1];
+	u64			def_mac[MLX4_MAX_PORTS + 1];
+	int			eth_mtu_cap[MLX4_MAX_PORTS + 1];
 	int			gid_table_len[MLX4_MAX_PORTS + 1];
 	int			pkey_table_len[MLX4_MAX_PORTS + 1];
 	int			local_ca_ack_delay;
-- 
cgit v1.2.3


From 08e5338d119daeb3c7746fa80fa916b8d3d48e89 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Wed, 22 Oct 2008 20:21:29 +0200
Subject: i2c-algo-pcf: Pass adapter data into ->waitforpin() method

Pass adapter data into ->waitforpin() method.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/algos/i2c-algo-pcf.c | 2 +-
 drivers/i2c/busses/i2c-elektor.c | 3 ++-
 include/linux/i2c-algo-pcf.h     | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/algos/i2c-algo-pcf.c b/drivers/i2c/algos/i2c-algo-pcf.c
index 1e328d19cd6d..a8a5b6d1dd88 100644
--- a/drivers/i2c/algos/i2c-algo-pcf.c
+++ b/drivers/i2c/algos/i2c-algo-pcf.c
@@ -135,7 +135,7 @@ static int wait_for_pin(struct i2c_algo_pcf_data *adap, int *status) {
 	*status = get_pcf(adap, 1);
 #ifndef STUB_I2C
 	while (timeout-- && (*status & I2C_PCF_PIN)) {
-		adap->waitforpin();
+		adap->waitforpin(adap->data);
 		*status = get_pcf(adap, 1);
 	}
 	if (*status & I2C_PCF_LAB) {
diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c
index 7f38c01fb3a0..0ed3ccb81b63 100644
--- a/drivers/i2c/busses/i2c-elektor.c
+++ b/drivers/i2c/busses/i2c-elektor.c
@@ -104,7 +104,8 @@ static int pcf_isa_getclock(void *data)
 	return (clock);
 }
 
-static void pcf_isa_waitforpin(void) {
+static void pcf_isa_waitforpin(void *data)
+{
 	DEFINE_WAIT(wait);
 	int timeout = 2;
 	unsigned long flags;
diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h
index 0177d280f733..5de8a319bf14 100644
--- a/include/linux/i2c-algo-pcf.h
+++ b/include/linux/i2c-algo-pcf.h
@@ -31,7 +31,7 @@ struct i2c_algo_pcf_data {
 	int  (*getpcf) (void *data, int ctl);
 	int  (*getown) (void *data);
 	int  (*getclock) (void *data);
-	void (*waitforpin) (void);
+	void (*waitforpin) (void *data);
 
 	/* Multi-master lost arbitration back-off delay (msecs)
 	 * This should be set by the bus adapter or knowledgable client
-- 
cgit v1.2.3


From 30091404af5a7cd515e7b565df76932e295d8f6f Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Wed, 22 Oct 2008 20:21:30 +0200
Subject: i2c-algo-pcf: Add adapter hooks around xfer begin and end

Some I2C bus implementations need to synchronize with external
entities, such as system firmware, which might also be programming the
same I2C bus.

In order to facilitate this, add optional ->xfer_begin() and ->xfer_end()
hooks, which are invoked on entry to and exit from pcf_xfer().

[JD: Make these hooks optional.]

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/algos/i2c-algo-pcf.c | 17 +++++++++++++----
 include/linux/i2c-algo-pcf.h     |  3 +++
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/algos/i2c-algo-pcf.c b/drivers/i2c/algos/i2c-algo-pcf.c
index a8a5b6d1dd88..b8a6f3bcbae3 100644
--- a/drivers/i2c/algos/i2c-algo-pcf.c
+++ b/drivers/i2c/algos/i2c-algo-pcf.c
@@ -331,13 +331,16 @@ static int pcf_xfer(struct i2c_adapter *i2c_adap,
 	int i;
 	int ret=0, timeout, status;
     
+	if (adap->xfer_begin)
+		adap->xfer_begin(adap->data);
 
 	/* Check for bus busy */
 	timeout = wait_for_bb(adap);
 	if (timeout) {
 		DEB2(printk(KERN_ERR "i2c-algo-pcf.o: "
 		            "Timeout waiting for BB in pcf_xfer\n");)
-		return -EIO;
+		i = -EIO;
+		goto out;
 	}
 	
 	for (i = 0;ret >= 0 && i < num; i++) {
@@ -359,12 +362,14 @@ static int pcf_xfer(struct i2c_adapter *i2c_adap,
 		if (timeout) {
 			if (timeout == -EINTR) {
 				/* arbitration lost */
-				return (-EINTR);
+				i = -EINTR;
+				goto out;
 			}
 			i2c_stop(adap);
 			DEB2(printk(KERN_ERR "i2c-algo-pcf.o: Timeout waiting "
 				    "for PIN(1) in pcf_xfer\n");)
-			return (-EREMOTEIO);
+			i = -EREMOTEIO;
+			goto out;
 		}
     
 #ifndef STUB_I2C
@@ -372,7 +377,8 @@ static int pcf_xfer(struct i2c_adapter *i2c_adap,
 		if (status & I2C_PCF_LRB) {
 			i2c_stop(adap);
 			DEB2(printk(KERN_ERR "i2c-algo-pcf.o: No LRB(1) in pcf_xfer\n");)
-			return (-EREMOTEIO);
+			i = -EREMOTEIO;
+			goto out;
 		}
 #endif
     
@@ -404,6 +410,9 @@ static int pcf_xfer(struct i2c_adapter *i2c_adap,
 		}
 	}
 
+out:
+	if (adap->xfer_end)
+		adap->xfer_end(adap->data);
 	return (i);
 }
 
diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h
index 5de8a319bf14..0f91a957a690 100644
--- a/include/linux/i2c-algo-pcf.h
+++ b/include/linux/i2c-algo-pcf.h
@@ -33,6 +33,9 @@ struct i2c_algo_pcf_data {
 	int  (*getclock) (void *data);
 	void (*waitforpin) (void *data);
 
+	void (*xfer_begin) (void *data);
+	void (*xfer_end) (void *data);
+
 	/* Multi-master lost arbitration back-off delay (msecs)
 	 * This should be set by the bus adapter or knowledgable client
 	 * if bus is multi-mastered, else zero
-- 
cgit v1.2.3


From 14f55f7a033f86a4e8f0310dd4d54b5464322e6e Mon Sep 17 00:00:00 2001
From: Wolfram Sang <w.sang@pengutronix.de>
Date: Wed, 22 Oct 2008 20:21:30 +0200
Subject: i2c: Make clear what the class field of i2c_adapter is good for

Make clear what the class field of i2c_adapter is good for.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/i2c.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 06115128047f..4ac8ec3c7927 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -345,7 +345,7 @@ struct i2c_algorithm {
 struct i2c_adapter {
 	struct module *owner;
 	unsigned int id;
-	unsigned int class;
+	unsigned int class;		  /* classes to allow probing for */
 	const struct i2c_algorithm *algo; /* the algorithm to access the bus */
 	void *algo_data;
 
-- 
cgit v1.2.3


From 7d1d8999b4bec0ba09f935e648a688bb25596d06 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 22 Oct 2008 20:21:31 +0200
Subject: i2c: Constify i2c_get_clientdata's parameter

i2c_get_clientdata doesn't change the i2c_client it is passed as a
parameter, so it can be constified. Same for i2c_get_adapdata.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/writing-clients | 2 +-
 include/linux/device.h            | 2 +-
 include/linux/i2c.h               | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index d73ee117a8ca..3b01350c149c 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -83,7 +83,7 @@ be very useful.
 	void i2c_set_clientdata(struct i2c_client *client, void *data);
 
 	/* retrieve the value */
-	void *i2c_get_clientdata(struct i2c_client *client);
+	void *i2c_get_clientdata(const struct i2c_client *client);
 
 An example structure is below.
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 987f5912720a..1a3686d15f98 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -450,7 +450,7 @@ static inline void set_dev_node(struct device *dev, int node)
 }
 #endif
 
-static inline void *dev_get_drvdata(struct device *dev)
+static inline void *dev_get_drvdata(const struct device *dev)
 {
 	return dev->driver_data;
 }
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 4ac8ec3c7927..a411f0b70e8e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -224,7 +224,7 @@ static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj)
 	return to_i2c_client(dev);
 }
 
-static inline void *i2c_get_clientdata (struct i2c_client *dev)
+static inline void *i2c_get_clientdata(const struct i2c_client *dev)
 {
 	return dev_get_drvdata (&dev->dev);
 }
@@ -369,7 +369,7 @@ struct i2c_adapter {
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-static inline void *i2c_get_adapdata (struct i2c_adapter *dev)
+static inline void *i2c_get_adapdata(const struct i2c_adapter *dev)
 {
 	return dev_get_drvdata (&dev->dev);
 }
-- 
cgit v1.2.3


From c0589d4bc19294a49934af1be736eb6e9ad11673 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 22 Oct 2008 20:21:31 +0200
Subject: i2c: Drop 2-byte address block transfer defines

We have no users and no implementers for these transfer types so it
makes little sense to define functionality bits for them.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/i2c.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index a411f0b70e8e..7d21aba819ff 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -541,8 +541,6 @@ struct i2c_msg {
 #define I2C_FUNC_SMBUS_WRITE_BLOCK_DATA 0x02000000
 #define I2C_FUNC_SMBUS_READ_I2C_BLOCK	0x04000000 /* I2C-like block xfer  */
 #define I2C_FUNC_SMBUS_WRITE_I2C_BLOCK	0x08000000 /* w/ 1-byte reg. addr. */
-#define I2C_FUNC_SMBUS_READ_I2C_BLOCK_2	 0x10000000 /* I2C-like block xfer  */
-#define I2C_FUNC_SMBUS_WRITE_I2C_BLOCK_2 0x20000000 /* w/ 2-byte reg. addr. */
 
 #define I2C_FUNC_SMBUS_BYTE (I2C_FUNC_SMBUS_READ_BYTE | \
                              I2C_FUNC_SMBUS_WRITE_BYTE)
@@ -554,8 +552,6 @@ struct i2c_msg {
                                    I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)
 #define I2C_FUNC_SMBUS_I2C_BLOCK (I2C_FUNC_SMBUS_READ_I2C_BLOCK | \
                                   I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)
-#define I2C_FUNC_SMBUS_I2C_BLOCK_2 (I2C_FUNC_SMBUS_READ_I2C_BLOCK_2 | \
-                                    I2C_FUNC_SMBUS_WRITE_I2C_BLOCK_2)
 
 #define I2C_FUNC_SMBUS_EMUL (I2C_FUNC_SMBUS_QUICK | \
                              I2C_FUNC_SMBUS_BYTE | \
-- 
cgit v1.2.3


From 3ae70deef0a5cc34a96aa1972697d01606bc7933 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 22 Oct 2008 20:21:32 +0200
Subject: i2c: Clean up <linux/i2c.h>

Fix most checkpatch.pl errors and warnings. This includes replacing
spaces with tabs in many places, adding and removing spaces, and
folding long lines.

Also complete a couple prototypes to make it clearer what the
parameters represent.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/i2c.h | 149 ++++++++++++++++++++++++++--------------------------
 1 file changed, 75 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 7d21aba819ff..12d7364ad3f3 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -53,45 +53,44 @@ struct i2c_board_info;
  * transmit one message at a time, a more complex version can be used to
  * transmit an arbitrary number of messages without interruption.
  */
-extern int i2c_master_send(struct i2c_client *,const char* ,int);
-extern int i2c_master_recv(struct i2c_client *,char* ,int);
+extern int i2c_master_send(struct i2c_client *client, const char *buf,
+			   int count);
+extern int i2c_master_recv(struct i2c_client *client, char *buf, int count);
 
 /* Transfer num messages.
  */
-extern int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num);
-
+extern int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
+			int num);
 
 /* This is the very generalized SMBus access routine. You probably do not
    want to use this, though; one of the functions below may be much easier,
    and probably just as fast.
    Note that we use i2c_adapter here, because you do not need a specific
    smbus adapter to call this function. */
-extern s32 i2c_smbus_xfer (struct i2c_adapter * adapter, u16 addr,
-                           unsigned short flags,
-                           char read_write, u8 command, int size,
-                           union i2c_smbus_data * data);
+extern s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+			  unsigned short flags, char read_write, u8 command,
+			  int size, union i2c_smbus_data *data);
 
 /* Now follow the 'nice' access routines. These also document the calling
    conventions of i2c_smbus_xfer. */
 
-extern s32 i2c_smbus_read_byte(struct i2c_client * client);
-extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value);
-extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command);
-extern s32 i2c_smbus_write_byte_data(struct i2c_client * client,
-                                     u8 command, u8 value);
-extern s32 i2c_smbus_read_word_data(struct i2c_client * client, u8 command);
-extern s32 i2c_smbus_write_word_data(struct i2c_client * client,
-                                     u8 command, u16 value);
+extern s32 i2c_smbus_read_byte(struct i2c_client *client);
+extern s32 i2c_smbus_write_byte(struct i2c_client *client, u8 value);
+extern s32 i2c_smbus_read_byte_data(struct i2c_client *client, u8 command);
+extern s32 i2c_smbus_write_byte_data(struct i2c_client *client,
+				     u8 command, u8 value);
+extern s32 i2c_smbus_read_word_data(struct i2c_client *client, u8 command);
+extern s32 i2c_smbus_write_word_data(struct i2c_client *client,
+				     u8 command, u16 value);
 /* Returns the number of read bytes */
 extern s32 i2c_smbus_read_block_data(struct i2c_client *client,
 				     u8 command, u8 *values);
-extern s32 i2c_smbus_write_block_data(struct i2c_client * client,
-				      u8 command, u8 length,
-				      const u8 *values);
+extern s32 i2c_smbus_write_block_data(struct i2c_client *client,
+				      u8 command, u8 length, const u8 *values);
 /* Returns the number of read bytes */
-extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
+extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client,
 					 u8 command, u8 length, u8 *values);
-extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client,
+extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
 					  u8 command, u8 length,
 					  const u8 *values);
 
@@ -169,7 +168,7 @@ struct i2c_driver {
 	/* a ioctl like command that can be used to perform specific functions
 	 * with the device.
 	 */
-	int (*command)(struct i2c_client *client,unsigned int cmd, void *arg);
+	int (*command)(struct i2c_client *client, unsigned int cmd, void *arg);
 
 	struct device_driver driver;
 	const struct i2c_device_id *id_table;
@@ -226,12 +225,12 @@ static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj)
 
 static inline void *i2c_get_clientdata(const struct i2c_client *dev)
 {
-	return dev_get_drvdata (&dev->dev);
+	return dev_get_drvdata(&dev->dev);
 }
 
-static inline void i2c_set_clientdata (struct i2c_client *dev, void *data)
+static inline void i2c_set_clientdata(struct i2c_client *dev, void *data)
 {
-	dev_set_drvdata (&dev->dev, data);
+	dev_set_drvdata(&dev->dev, data);
 }
 
 /**
@@ -272,7 +271,7 @@ struct i2c_board_info {
  * fields (such as associated irq, or device-specific platform_data)
  * are provided using conventional syntax.
  */
-#define I2C_BOARD_INFO(dev_type,dev_addr) \
+#define I2C_BOARD_INFO(dev_type, dev_addr) \
 	.type = (dev_type), .addr = (dev_addr)
 
 
@@ -306,10 +305,12 @@ extern void i2c_unregister_device(struct i2c_client *);
  */
 #ifdef CONFIG_I2C_BOARDINFO
 extern int
-i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n);
+i2c_register_board_info(int busnum, struct i2c_board_info const *info,
+			unsigned n);
 #else
 static inline int
-i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n)
+i2c_register_board_info(int busnum, struct i2c_board_info const *info,
+			unsigned n)
 {
 	return 0;
 }
@@ -328,11 +329,11 @@ struct i2c_algorithm {
 	   using common I2C messages */
 	/* master_xfer should return the number of messages successfully
 	   processed, or a negative value on error */
-	int (*master_xfer)(struct i2c_adapter *adap,struct i2c_msg *msgs,
-	                   int num);
+	int (*master_xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs,
+			   int num);
 	int (*smbus_xfer) (struct i2c_adapter *adap, u16 addr,
-	                   unsigned short flags, char read_write,
-	                   u8 command, int size, union i2c_smbus_data * data);
+			   unsigned short flags, char read_write,
+			   u8 command, int size, union i2c_smbus_data *data);
 
 	/* To determine what the adapter supports */
 	u32 (*functionality) (struct i2c_adapter *);
@@ -371,12 +372,12 @@ struct i2c_adapter {
 
 static inline void *i2c_get_adapdata(const struct i2c_adapter *dev)
 {
-	return dev_get_drvdata (&dev->dev);
+	return dev_get_drvdata(&dev->dev);
 }
 
-static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
+static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
 {
-	dev_set_drvdata (&dev->dev, data);
+	dev_set_drvdata(&dev->dev, data);
 }
 
 /*flags for the client struct: */
@@ -449,7 +450,7 @@ extern int i2c_probe(struct i2c_adapter *adapter,
 		const struct i2c_client_address_data *address_data,
 		int (*found_proc) (struct i2c_adapter *, int, int));
 
-extern struct i2c_adapter* i2c_get_adapter(int id);
+extern struct i2c_adapter *i2c_get_adapter(int id);
 extern void i2c_put_adapter(struct i2c_adapter *adap);
 
 
@@ -465,7 +466,7 @@ static inline int i2c_check_functionality(struct i2c_adapter *adap, u32 func)
 	return (func & i2c_get_functionality(adap)) == func;
 }
 
-/* Return id number for a specific adapter */
+/* Return the adapter number for a specific adapter */
 static inline int i2c_adapter_id(struct i2c_adapter *adap)
 {
 	return adap->nr;
@@ -526,7 +527,7 @@ struct i2c_msg {
 
 #define I2C_FUNC_I2C			0x00000001
 #define I2C_FUNC_10BIT_ADDR		0x00000002
-#define I2C_FUNC_PROTOCOL_MANGLING	0x00000004 /* I2C_M_{REV_DIR_ADDR,NOSTART,..} */
+#define I2C_FUNC_PROTOCOL_MANGLING	0x00000004 /* I2C_M_NOSTART etc. */
 #define I2C_FUNC_SMBUS_PEC		0x00000008
 #define I2C_FUNC_SMBUS_BLOCK_PROC_CALL	0x00008000 /* SMBus 2.0 */
 #define I2C_FUNC_SMBUS_QUICK		0x00010000
@@ -542,25 +543,25 @@ struct i2c_msg {
 #define I2C_FUNC_SMBUS_READ_I2C_BLOCK	0x04000000 /* I2C-like block xfer  */
 #define I2C_FUNC_SMBUS_WRITE_I2C_BLOCK	0x08000000 /* w/ 1-byte reg. addr. */
 
-#define I2C_FUNC_SMBUS_BYTE (I2C_FUNC_SMBUS_READ_BYTE | \
-                             I2C_FUNC_SMBUS_WRITE_BYTE)
-#define I2C_FUNC_SMBUS_BYTE_DATA (I2C_FUNC_SMBUS_READ_BYTE_DATA | \
-                                  I2C_FUNC_SMBUS_WRITE_BYTE_DATA)
-#define I2C_FUNC_SMBUS_WORD_DATA (I2C_FUNC_SMBUS_READ_WORD_DATA | \
-                                  I2C_FUNC_SMBUS_WRITE_WORD_DATA)
-#define I2C_FUNC_SMBUS_BLOCK_DATA (I2C_FUNC_SMBUS_READ_BLOCK_DATA | \
-                                   I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)
-#define I2C_FUNC_SMBUS_I2C_BLOCK (I2C_FUNC_SMBUS_READ_I2C_BLOCK | \
-                                  I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)
-
-#define I2C_FUNC_SMBUS_EMUL (I2C_FUNC_SMBUS_QUICK | \
-                             I2C_FUNC_SMBUS_BYTE | \
-                             I2C_FUNC_SMBUS_BYTE_DATA | \
-                             I2C_FUNC_SMBUS_WORD_DATA | \
-                             I2C_FUNC_SMBUS_PROC_CALL | \
-                             I2C_FUNC_SMBUS_WRITE_BLOCK_DATA | \
-			     I2C_FUNC_SMBUS_I2C_BLOCK | \
-			     I2C_FUNC_SMBUS_PEC)
+#define I2C_FUNC_SMBUS_BYTE		(I2C_FUNC_SMBUS_READ_BYTE | \
+					 I2C_FUNC_SMBUS_WRITE_BYTE)
+#define I2C_FUNC_SMBUS_BYTE_DATA	(I2C_FUNC_SMBUS_READ_BYTE_DATA | \
+					 I2C_FUNC_SMBUS_WRITE_BYTE_DATA)
+#define I2C_FUNC_SMBUS_WORD_DATA	(I2C_FUNC_SMBUS_READ_WORD_DATA | \
+					 I2C_FUNC_SMBUS_WRITE_WORD_DATA)
+#define I2C_FUNC_SMBUS_BLOCK_DATA	(I2C_FUNC_SMBUS_READ_BLOCK_DATA | \
+					 I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)
+#define I2C_FUNC_SMBUS_I2C_BLOCK	(I2C_FUNC_SMBUS_READ_I2C_BLOCK | \
+					 I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)
+
+#define I2C_FUNC_SMBUS_EMUL		(I2C_FUNC_SMBUS_QUICK | \
+					 I2C_FUNC_SMBUS_BYTE | \
+					 I2C_FUNC_SMBUS_BYTE_DATA | \
+					 I2C_FUNC_SMBUS_WORD_DATA | \
+					 I2C_FUNC_SMBUS_PROC_CALL | \
+					 I2C_FUNC_SMBUS_WRITE_BLOCK_DATA | \
+					 I2C_FUNC_SMBUS_I2C_BLOCK | \
+					 I2C_FUNC_SMBUS_PEC)
 
 /*
  * Data for SMBus Messages
@@ -570,7 +571,7 @@ union i2c_smbus_data {
 	__u8 byte;
 	__u16 word;
 	__u8 block[I2C_SMBUS_BLOCK_MAX + 2]; /* block[0] is used for length */
-	                       /* and one more for user-space compatibility */
+			       /* and one more for user-space compatibility */
 };
 
 /* i2c_smbus_xfer read or write markers */
@@ -598,21 +599,21 @@ union i2c_smbus_data {
 
 /* Default fill of many variables */
 #define I2C_CLIENT_DEFAULTS {I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
-                          I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END}
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END, \
+			     I2C_CLIENT_END, I2C_CLIENT_END, I2C_CLIENT_END}
 
 /* I2C_CLIENT_MODULE_PARM creates a module parameter, and puts it in the
    module header */
@@ -621,7 +622,7 @@ union i2c_smbus_data {
   static unsigned short var[I2C_CLIENT_MAX_OPTS] = I2C_CLIENT_DEFAULTS; \
   static unsigned int var##_num; \
   module_param_array(var, short, &var##_num, 0); \
-  MODULE_PARM_DESC(var,desc)
+  MODULE_PARM_DESC(var, desc)
 
 #define I2C_CLIENT_MODULE_PARM_FORCE(name)				\
 I2C_CLIENT_MODULE_PARM(force_##name,					\
-- 
cgit v1.2.3


From 11f1f2afd6b07729b12aaba479344d7f12d88ff9 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 22 Oct 2008 20:21:33 +0200
Subject: i2c: Add info->archdata field

If present, info->archdata is copied into dev->archdata.
Some (OpenFirmware) platforms need it.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/i2c-core.c | 3 +++
 include/linux/i2c.h    | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 42e852d79ffa..5a485c22660a 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -266,6 +266,9 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 
 	client->dev.platform_data = info->platform_data;
 
+	if (info->archdata)
+		client->dev.archdata = *info->archdata;
+
 	client->flags = info->flags;
 	client->addr = info->addr;
 	client->irq = info->irq;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 12d7364ad3f3..33a5992d4936 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -239,6 +239,7 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data)
  * @flags: to initialize i2c_client.flags
  * @addr: stored in i2c_client.addr
  * @platform_data: stored in i2c_client.dev.platform_data
+ * @archdata: copied into i2c_client.dev.archdata
  * @irq: stored in i2c_client.irq
  *
  * I2C doesn't actually support hardware probing, although controllers and
@@ -258,6 +259,7 @@ struct i2c_board_info {
 	unsigned short	flags;
 	unsigned short	addr;
 	void		*platform_data;
+	struct dev_archdata	*archdata;
 	int		irq;
 };
 
-- 
cgit v1.2.3


From 2a2336f8228292b8197f4187e54b0748903e6645 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 11:44:46 -0700
Subject: mlx4_core: Ethernet MAC/VLAN management

Add support for managing MAC and VLAN filters for each port.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Oren Duer <oren@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/Makefile   |   2 +-
 drivers/net/mlx4/main.c     |  14 +++
 drivers/net/mlx4/mlx4.h     |  33 ++++++
 drivers/net/mlx4/port.c     | 259 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mlx4/cmd.h    |   9 ++
 include/linux/mlx4/device.h |  12 ++
 6 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/mlx4/port.c

(limited to 'include/linux')

diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 0952a6528f58..9f493666e27b 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
 
 mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
-		mr.o pd.o profile.o qp.o reset.o srq.o
+		mr.o pd.o port.o profile.o qp.o reset.o srq.o
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 28f36b88de38..0a5c8bfb3f1f 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -780,11 +780,22 @@ no_msi:
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
 }
 
+static void mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+
+	info->dev = dev;
+	info->port = port;
+	mlx4_init_mac_table(dev, &info->mac_table);
+	mlx4_init_vlan_table(dev, &info->vlan_table);
+}
+
 static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct mlx4_priv *priv;
 	struct mlx4_dev *dev;
 	int err;
+	int port;
 
 	printk(KERN_INFO PFX "Initializing %s\n",
 	       pci_name(pdev));
@@ -894,6 +905,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_close;
 
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		mlx4_init_port_info(dev, port);
+
 	err = mlx4_register_device(dev);
 	if (err)
 		goto err_cleanup;
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 9e2f44c31810..23309f381ee3 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -252,6 +252,35 @@ struct mlx4_catas_err {
 	struct list_head	list;
 };
 
+#define MLX4_MAX_MAC_NUM	128
+#define MLX4_MAC_TABLE_SIZE	(MLX4_MAX_MAC_NUM << 3)
+
+struct mlx4_mac_table {
+	__be64			entries[MLX4_MAX_MAC_NUM];
+	int			refs[MLX4_MAX_MAC_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define MLX4_MAX_VLAN_NUM	128
+#define MLX4_VLAN_TABLE_SIZE	(MLX4_MAX_VLAN_NUM << 2)
+
+struct mlx4_vlan_table {
+	__be32			entries[MLX4_MAX_VLAN_NUM];
+	int			refs[MLX4_MAX_VLAN_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+struct mlx4_port_info {
+	struct mlx4_dev	       *dev;
+	int			port;
+	struct mlx4_mac_table	mac_table;
+	struct mlx4_vlan_table	vlan_table;
+};
+
 struct mlx4_priv {
 	struct mlx4_dev		dev;
 
@@ -280,6 +309,7 @@ struct mlx4_priv {
 
 	struct mlx4_uar		driver_uar;
 	void __iomem	       *kar;
+	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
@@ -350,4 +380,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
 
 void mlx4_handle_catas_err(struct mlx4_dev *dev);
 
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+
 #endif /* MLX4_H */
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
new file mode 100644
index 000000000000..8644f3d978ee
--- /dev/null
+++ b/drivers/net/mlx4/port.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+#define MLX4_MAC_MASK		0xffffffffffffULL
+
+#define MLX4_VLAN_VALID		(1u << 31)
+#define MLX4_VLAN_MASK		0xfff
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_macs;
+	table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_vlans;
+	table->total = 0;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+				   __be64 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+	int i, err = 0;
+	int free = -1;
+
+	mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac);
+	mutex_lock(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) {
+		if (free < 0 && !table->refs[i]) {
+			free = i;
+			continue;
+		}
+
+		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			/* MAC already registered, increase reference count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+	mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+	if (table->total == table->max) {
+		/* No free mac entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No MAC entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {
+		mlx4_warn(dev, "Have more references for index %d,"
+			  "no need to modify MAC table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;
+	mlx4_set_port_mac_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+				    __be32 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i, err = 0;
+	int free = -1;
+
+	mutex_lock(&table->mutex);
+	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (free < 0 && (table->refs[i] == 0)) {
+			free = i;
+			continue;
+		}
+
+		if (table->refs[i] &&
+		    (vlan == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan already registered, increase reference count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new VLAN */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+	err = mlx4_set_port_vlan_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+
+	if (index < MLX4_VLAN_REGULAR) {
+		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+		return;
+	}
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No vlan entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {
+		mlx4_dbg(dev, "Have more references for index %d,"
+			 "no need to modify vlan table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;
+	mlx4_set_port_vlan_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 77323a72dd3c..cf9c679ab38b 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -132,6 +132,15 @@ enum {
 	MLX4_MAILBOX_SIZE	=  4096
 };
 
+enum {
+	/* set port opcode modifiers */
+	MLX4_SET_PORT_GENERAL   = 0x0,
+	MLX4_SET_PORT_RQP_CALC  = 0x1,
+	MLX4_SET_PORT_MAC_TABLE = 0x2,
+	MLX4_SET_PORT_VLAN_TABLE = 0x3,
+	MLX4_SET_PORT_PRIO_MAP  = 0x4,
+};
+
 struct mlx4_dev;
 
 struct mlx4_cmd_mailbox {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index f9e73cfc540b..1951fe70a251 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -153,6 +153,12 @@ enum mlx4_qp_region {
 	MLX4_NUM_QP_REGION
 };
 
+enum mlx4_special_vlan_idx {
+	MLX4_NO_VLAN_IDX        = 0,
+	MLX4_VLAN_MISS_IDX,
+	MLX4_VLAN_REGULAR
+};
+
 enum {
 	MLX4_NUM_FEXCH          = 64 * 1024,
 };
@@ -438,6 +444,12 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			  int block_mcast_loopback);
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
 
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index);
+
 int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
 		      int npages, u64 iova, u32 *lkey, u32 *rkey);
 int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
-- 
cgit v1.2.3


From 7ff93f8b7ecbc36e7ffc5c11a61643821c1bfee5 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 15:38:42 -0700
Subject: mlx4_core: Multiple port type support

Multi-protocol adapters support different port types.  Each consumer
of mlx4_core queries for supported port types; in particular mlx4_ib
can no longer assume that all physical ports belong to it.  Port type
is configured through a sysfs interface.  When the type of a port is
changed, all mlx4 interfaces are unregistered, and then registered
again with the new port types.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/mad.c     |   6 +-
 drivers/infiniband/hw/mlx4/main.c    |  11 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   1 +
 drivers/net/mlx4/fw.c                |   4 +
 drivers/net/mlx4/fw.h                |   1 +
 drivers/net/mlx4/main.c              | 211 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4.h              |   6 +
 drivers/net/mlx4/port.c              |  23 ++++
 include/linux/mlx4/device.h          |  14 +++
 9 files changed, 266 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index cdca3a511e1c..606f1e2ef284 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -298,7 +298,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
 	int p, q;
 	int ret;
 
-	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+	for (p = 0; p < dev->num_ports; ++p)
 		for (q = 0; q <= 1; ++q) {
 			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
 						      q ? IB_QPT_GSI : IB_QPT_SMI,
@@ -314,7 +314,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
 	return 0;
 
 err:
-	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+	for (p = 0; p < dev->num_ports; ++p)
 		for (q = 0; q <= 1; ++q)
 			if (dev->send_agent[p][q])
 				ib_unregister_mad_agent(dev->send_agent[p][q]);
@@ -327,7 +327,7 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
 	struct ib_mad_agent *agent;
 	int p, q;
 
-	for (p = 0; p < dev->dev->caps.num_ports; ++p) {
+	for (p = 0; p < dev->num_ports; ++p) {
 		for (q = 0; q <= 1; ++q) {
 			agent = dev->send_agent[p][q];
 			dev->send_agent[p][q] = NULL;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index a3c2851c0545..2e80f8f47b02 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -574,7 +574,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.owner		= THIS_MODULE;
 	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
-	ibdev->ib_dev.phys_port_cnt	= dev->caps.num_ports;
+	ibdev->num_ports = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ibdev->num_ports++;
+	ibdev->ib_dev.phys_port_cnt     = ibdev->num_ports;
 	ibdev->ib_dev.num_comp_vectors	= 1;
 	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
 
@@ -691,7 +694,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 	struct mlx4_ib_dev *ibdev = ibdev_ptr;
 	int p;
 
-	for (p = 1; p <= dev->caps.num_ports; ++p)
+	for (p = 1; p <= ibdev->num_ports; ++p)
 		mlx4_CLOSE_PORT(dev, p);
 
 	mlx4_ib_mad_cleanup(ibdev);
@@ -706,6 +709,10 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
 			  enum mlx4_dev_event event, int port)
 {
 	struct ib_event ibev;
+	struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+
+	if (port > ibdev->num_ports)
+		return;
 
 	switch (event) {
 	case MLX4_DEV_EVENT_PORT_UP:
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 6e2b0dc21b61..9974e886b8de 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -162,6 +162,7 @@ struct mlx4_ib_ah {
 struct mlx4_ib_dev {
 	struct ib_device	ib_dev;
 	struct mlx4_dev	       *dev;
+	int			num_ports;
 	void __iomem	       *uar_map;
 
 	struct mlx4_uar		priv_uar;
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 8d402db9a03d..be09fdb79cb8 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -88,6 +88,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
 		[ 8] = "P_Key violation counter",
 		[ 9] = "Q_Key violation counter",
 		[10] = "VMM",
+		[12] = "DPDP",
 		[16] = "MW support",
 		[17] = "APM support",
 		[18] = "Atomic ops support",
@@ -354,6 +355,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
 		}
 	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
 #define QUERY_PORT_MTU_OFFSET			0x01
 #define QUERY_PORT_ETH_MTU_OFFSET		0x02
 #define QUERY_PORT_WIDTH_OFFSET			0x06
@@ -368,6 +370,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			if (err)
 				goto out;
 
+			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+			dev_cap->supported_port_types[i] = field & 3;
 			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 			dev_cap->ib_mtu[i]	   = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index d0913d4d262a..526d7f30c041 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -104,6 +104,7 @@ struct mlx4_dev_cap {
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
+	u8  supported_port_types[MLX4_MAX_PORTS + 1];
 	u8  log_max_macs[MLX4_MAX_PORTS + 1];
 	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 };
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 0a5c8bfb3f1f..c1d447873bf1 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -98,6 +98,44 @@ module_param_named(use_prio, use_prio, bool, 0444);
 MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
 		  "(0/1, default 0)");
 
+static int mlx4_check_port_params(struct mlx4_dev *dev,
+				  enum mlx4_port_type *port_type)
+{
+	int i;
+
+	for (i = 0; i < dev->caps.num_ports - 1; i++) {
+		if (port_type[i] != port_type[i+1] &&
+		    !(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+			mlx4_err(dev, "Only same port types supported "
+				 "on this HCA, aborting.\n");
+			return -EINVAL;
+		}
+	}
+	if ((port_type[0] == MLX4_PORT_TYPE_ETH) &&
+	    (port_type[1] == MLX4_PORT_TYPE_IB)) {
+		mlx4_err(dev, "eth-ib configuration is not supported.\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (!(port_type[i] & dev->caps.supported_type[i+1])) {
+			mlx4_err(dev, "Requested port type for port %d is not "
+				      "supported on this HCA\n", i + 1);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+	int i;
+
+	dev->caps.port_mask = 0;
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_IB)
+			dev->caps.port_mask |= 1 << (i - 1);
+}
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -139,6 +177,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
 		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
 		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
+		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
 	}
 
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
@@ -182,6 +221,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.log_num_prios = use_prio ? 3 : 0;
 
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH)
+			dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+		else
+			dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+
 		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
 			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
 			mlx4_warn(dev, "Requested number of MACs is too much "
@@ -196,6 +240,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	}
 
+	mlx4_set_port_mask(dev);
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
@@ -213,6 +259,95 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	return 0;
 }
 
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ */
+static int mlx4_change_port_types(struct mlx4_dev *dev,
+				  enum mlx4_port_type *port_types)
+{
+	int err = 0;
+	int change = 0;
+	int port;
+
+	for (port = 0; port <  dev->caps.num_ports; port++) {
+		if (port_types[port] != dev->caps.port_type[port + 1]) {
+			change = 1;
+			dev->caps.port_type[port + 1] = port_types[port];
+		}
+	}
+	if (change) {
+		mlx4_unregister_device(dev);
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			mlx4_CLOSE_PORT(dev, port);
+			err = mlx4_SET_PORT(dev, port);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, "
+					      "aborting\n", port);
+				goto out;
+			}
+		}
+		mlx4_set_port_mask(dev);
+		err = mlx4_register_device(dev);
+	}
+
+out:
+	return err;
+}
+
+static ssize_t show_port_type(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	return sprintf(buf, "%s\n",
+		       mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB ?
+		       "ib" : "eth");
+}
+
+static ssize_t set_port_type(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	enum mlx4_port_type types[MLX4_MAX_PORTS];
+	int i;
+	int err = 0;
+
+	if (!strcmp(buf, "ib\n"))
+		info->tmp_type = MLX4_PORT_TYPE_IB;
+	else if (!strcmp(buf, "eth\n"))
+		info->tmp_type = MLX4_PORT_TYPE_ETH;
+	else {
+		mlx4_err(mdev, "%s is not supported port type\n", buf);
+		return -EINVAL;
+	}
+
+	mutex_lock(&priv->port_mutex);
+	for (i = 0; i < mdev->caps.num_ports; i++)
+		types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+					mdev->caps.port_type[i+1];
+
+	err = mlx4_check_port_params(mdev, types);
+	if (err)
+		goto out;
+
+	for (i = 1; i <= mdev->caps.num_ports; i++)
+		priv->port[i].tmp_type = 0;
+
+	err = mlx4_change_port_types(mdev, types);
+
+out:
+	mutex_unlock(&priv->port_mutex);
+	return err ? err : count;
+}
+
 static int mlx4_load_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -617,6 +752,7 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
+	int port;
 
 	err = mlx4_init_uar_table(dev);
 	if (err) {
@@ -715,8 +851,20 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 		goto err_qp_table_free;
 	}
 
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_SET_PORT(dev, port);
+		if (err) {
+			mlx4_err(dev, "Failed to set port %d, aborting\n",
+				port);
+			goto err_mcg_table_free;
+		}
+	}
+
 	return 0;
 
+err_mcg_table_free:
+	mlx4_cleanup_mcg_table(dev);
+
 err_qp_table_free:
 	mlx4_cleanup_qp_table(dev);
 
@@ -780,14 +928,37 @@ no_msi:
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
 }
 
-static void mlx4_init_port_info(struct mlx4_dev *dev, int port)
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 {
 	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	int err = 0;
 
 	info->dev = dev;
 	info->port = port;
 	mlx4_init_mac_table(dev, &info->mac_table);
 	mlx4_init_vlan_table(dev, &info->vlan_table);
+
+	sprintf(info->dev_name, "mlx4_port%d", port);
+	info->port_attr.attr.name = info->dev_name;
+	info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+	info->port_attr.show      = show_port_type;
+	info->port_attr.store     = set_port_type;
+
+	err = device_create_file(&dev->pdev->dev, &info->port_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create file for port %d\n", port);
+		info->port = -1;
+	}
+
+	return err;
+}
+
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+	if (info->port < 0)
+		return;
+
+	device_remove_file(&info->dev->pdev->dev, &info->port_attr);
 }
 
 static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
@@ -870,6 +1041,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
 
+	mutex_init(&priv->port_mutex);
+
 	INIT_LIST_HEAD(&priv->pgdir_list);
 	mutex_init(&priv->pgdir_mutex);
 
@@ -905,18 +1078,24 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_close;
 
-	for (port = 1; port <= dev->caps.num_ports; port++)
-		mlx4_init_port_info(dev, port);
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_init_port_info(dev, port);
+		if (err)
+			goto err_port;
+	}
 
 	err = mlx4_register_device(dev);
 	if (err)
-		goto err_cleanup;
+		goto err_port;
 
 	pci_set_drvdata(pdev, dev);
 
 	return 0;
 
-err_cleanup:
+err_port:
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		mlx4_cleanup_port_info(&priv->port[port]);
+
 	mlx4_cleanup_mcg_table(dev);
 	mlx4_cleanup_qp_table(dev);
 	mlx4_cleanup_srq_table(dev);
@@ -973,8 +1152,10 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 	if (dev) {
 		mlx4_unregister_device(dev);
 
-		for (p = 1; p <= dev->caps.num_ports; ++p)
+		for (p = 1; p <= dev->caps.num_ports; p++) {
+			mlx4_cleanup_port_info(&priv->port[p]);
 			mlx4_CLOSE_PORT(dev, p);
+		}
 
 		mlx4_cleanup_mcg_table(dev);
 		mlx4_cleanup_qp_table(dev);
@@ -1026,10 +1207,28 @@ static struct pci_driver mlx4_driver = {
 	.remove		= __devexit_p(mlx4_remove_one)
 };
 
+static int __init mlx4_verify_params(void)
+{
+	if ((log_num_mac < 0) || (log_num_mac > 7)) {
+		printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac);
+		return -1;
+	}
+
+	if ((log_num_vlan < 0) || (log_num_vlan > 7)) {
+		printk(KERN_WARNING "mlx4_core: bad num_vlan: %d\n", log_num_vlan);
+		return -1;
+	}
+
+	return 0;
+}
+
 static int __init mlx4_init(void)
 {
 	int ret;
 
+	if (mlx4_verify_params())
+		return -EINVAL;
+
 	ret = mlx4_catas_init();
 	if (ret)
 		return ret;
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 23309f381ee3..fa431fad0eec 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -277,6 +277,9 @@ struct mlx4_vlan_table {
 struct mlx4_port_info {
 	struct mlx4_dev	       *dev;
 	int			port;
+	char			dev_name[16];
+	struct device_attribute port_attr;
+	enum mlx4_port_type	tmp_type;
 	struct mlx4_mac_table	mac_table;
 	struct mlx4_vlan_table	vlan_table;
 };
@@ -310,6 +313,7 @@ struct mlx4_priv {
 	struct mlx4_uar		driver_uar;
 	void __iomem	       *kar;
 	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
+	struct mutex		port_mutex;
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
@@ -383,4 +387,6 @@ void mlx4_handle_catas_err(struct mlx4_dev *dev);
 void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
 void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
 
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
+
 #endif /* MLX4_H */
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
index 8644f3d978ee..e2fdab42c4ce 100644
--- a/drivers/net/mlx4/port.c
+++ b/drivers/net/mlx4/port.c
@@ -257,3 +257,26 @@ out:
 	mutex_unlock(&table->mutex);
 }
 EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	u8 is_eth = dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, 256);
+	if (is_eth) {
+		((u8 *) mailbox->buf)[3] = 6;
+		((__be16 *) mailbox->buf)[4] = cpu_to_be16(1 << 15);
+		((__be16 *) mailbox->buf)[6] = cpu_to_be16(1 << 15);
+	}
+	err = mlx4_cmd(dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 1951fe70a251..bd9977b89490 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -60,6 +60,7 @@ enum {
 	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
 	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
 	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1 << 12,
 	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
 	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
 	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
@@ -153,6 +154,11 @@ enum mlx4_qp_region {
 	MLX4_NUM_QP_REGION
 };
 
+enum mlx4_port_type {
+	MLX4_PORT_TYPE_IB	= 1 << 0,
+	MLX4_PORT_TYPE_ETH	= 1 << 1,
+};
+
 enum mlx4_special_vlan_idx {
 	MLX4_NO_VLAN_IDX        = 0,
 	MLX4_VLAN_MISS_IDX,
@@ -226,6 +232,9 @@ struct mlx4_caps {
 	int                     log_num_macs;
 	int                     log_num_vlans;
 	int                     log_num_prios;
+	enum mlx4_port_type	port_type[MLX4_MAX_PORTS + 1];
+	u8			supported_type[MLX4_MAX_PORTS + 1];
+	u32			port_mask;
 };
 
 struct mlx4_buf_list {
@@ -380,6 +389,11 @@ struct mlx4_init_port_param {
 	u64			si_guid;
 };
 
+#define mlx4_foreach_port(port, dev, type)				\
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
+		if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \
+		     ~(dev)->caps.port_mask) & 1 << ((port) - 1))
+
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		   struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
-- 
cgit v1.2.3


From 8dd7f8036c123296fc4214f9d8810eb485570422 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 21 Oct 2008 17:38:25 +0800
Subject: PCI: add support for function level reset

Sometimes, it's necessary to enable software's ability to quiesce and
reset endpoint hardware with function-level granularity, so provide
support for it.

This patch implements the Function Level Reset (FLR) feature following the
PCI-e spec.  This is the first step; we plan to add more generic methods,
like D0/D3, to allow more devices to support this function.

The patch contains two functions.  pci_reset_function() is the new
driver API, and contains some actions to quiesce a device.  The other
function is a helper: pci_execute_reset_function() just executes the
reset for a particular device function.

Currently the usage model is in KVM.  Function reset is necessary for
assigning a device to a guest, or moving it between partitions.

For Function Level Reset (FLR), please refer to the PCI Express spec,
chapter 6.6.2.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/pci.c        | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h      |  2 +
 include/linux/pci_regs.h |  2 +
 3 files changed, 102 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index aee73cf251b6..533aeb5fcbe4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -18,6 +18,7 @@
 #include <linux/log2.h>
 #include <linux/pci-aspm.h>
 #include <linux/pm_wakeup.h>
+#include <linux/interrupt.h>
 #include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
@@ -1745,6 +1746,103 @@ int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask)
 EXPORT_SYMBOL(pci_set_dma_seg_boundary);
 #endif
 
+/**
+ * pci_execute_reset_function() - Reset a PCI device function
+ * @dev: Device function to reset
+ *
+ * Some devices allow an individual function to be reset without affecting
+ * other functions in the same device.  The PCI device must be responsive
+ * to PCI config space in order to use this function.
+ *
+ * The device function is presumed to be unused when this function is called.
+ * Resetting the device will make the contents of PCI configuration space
+ * random, so any caller of this must be prepared to reinitialise the
+ * device including MSI, bus mastering, BARs, decoding IO and memory spaces,
+ * etc.
+ *
+ * Returns 0 if the device function was successfully reset or -ENOTTY if the
+ * device doesn't support resetting a single function.
+ */
+int pci_execute_reset_function(struct pci_dev *dev)
+{
+	u16 status;
+	u32 cap;
+	int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+
+	if (!exppos)
+		return -ENOTTY;
+	pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap);
+	if (!(cap & PCI_EXP_DEVCAP_FLR))
+		return -ENOTTY;
+
+	pci_block_user_cfg_access(dev);
+
+	/* Wait for Transaction Pending bit clean */
+	msleep(100);
+	pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
+	if (status & PCI_EXP_DEVSTA_TRPND) {
+		dev_info(&dev->dev, "Busy after 100ms while trying to reset; "
+			"sleeping for 1 second\n");
+		ssleep(1);
+		pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status);
+		if (status & PCI_EXP_DEVSTA_TRPND)
+			dev_info(&dev->dev, "Still busy after 1s; "
+				"proceeding with reset anyway\n");
+	}
+
+	pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL,
+				PCI_EXP_DEVCTL_BCR_FLR);
+	mdelay(100);
+
+	pci_unblock_user_cfg_access(dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_execute_reset_function);
+
+/**
+ * pci_reset_function() - quiesce and reset a PCI device function
+ * @dev: Device function to reset
+ *
+ * Some devices allow an individual function to be reset without affecting
+ * other functions in the same device.  The PCI device must be responsive
+ * to PCI config space in order to use this function.
+ *
+ * This function does not just reset the PCI portion of a device, but
+ * clears all the state associated with the device.  This function differs
+ * from pci_execute_reset_function in that it saves and restores device state
+ * over the reset.
+ *
+ * Returns 0 if the device function was successfully reset or -ENOTTY if the
+ * device doesn't support resetting a single function.
+ */
+int pci_reset_function(struct pci_dev *dev)
+{
+	u32 cap;
+	int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	int r;
+
+	if (!exppos)
+		return -ENOTTY;
+	pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap);
+	if (!(cap & PCI_EXP_DEVCAP_FLR))
+		return -ENOTTY;
+
+	if (!dev->msi_enabled && !dev->msix_enabled)
+		disable_irq(dev->irq);
+	pci_save_state(dev);
+
+	pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+
+	r = pci_execute_reset_function(dev);
+
+	pci_restore_state(dev);
+	if (!dev->msi_enabled && !dev->msix_enabled)
+		enable_irq(dev->irq);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(pci_reset_function);
+
 /**
  * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count
  * @dev: PCI device to query
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 085187be29c7..f6f6810296e6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -626,6 +626,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
 int pcie_get_readrq(struct pci_dev *dev);
 int pcie_set_readrq(struct pci_dev *dev, int rq);
+int pci_reset_function(struct pci_dev *dev);
+int pci_execute_reset_function(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index eb6686b88f9a..e5effd47ed74 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -377,6 +377,7 @@
 #define  PCI_EXP_DEVCAP_RBER	0x8000	/* Role-Based Error Reporting */
 #define  PCI_EXP_DEVCAP_PWR_VAL	0x3fc0000 /* Slot Power Limit Value */
 #define  PCI_EXP_DEVCAP_PWR_SCL	0xc000000 /* Slot Power Limit Scale */
+#define  PCI_EXP_DEVCAP_FLR     0x10000000 /* Function Level Reset */
 #define PCI_EXP_DEVCTL		8	/* Device Control */
 #define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
 #define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
@@ -389,6 +390,7 @@
 #define  PCI_EXP_DEVCTL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
 #define  PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800  /* Enable No Snoop */
 #define  PCI_EXP_DEVCTL_READRQ	0x7000	/* Max_Read_Request_Size */
+#define  PCI_EXP_DEVCTL_BCR_FLR 0x8000  /* Bridge Configuration Retry / FLR */
 #define PCI_EXP_DEVSTA		10	/* Device Status */
 #define  PCI_EXP_DEVSTA_CED	0x01	/* Correctable Error Detected */
 #define  PCI_EXP_DEVSTA_NFED	0x02	/* Non-Fatal Error Detected */
-- 
cgit v1.2.3


From 64c7f63c1b5c26f057c26f7920f397fed2f590d9 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Tue, 21 Oct 2008 10:09:05 -0700
Subject: PCI: include io.h in pci.h so that ioremap_nocache is defined

Ingo pointed out that the m32r build was broken by pci_ioremap.  It looks like
some files include pci.h w/o including io.h.  The latter defines ioremap_* if
present, so it makes sense to include it in pci.h now that we have pci_ioremap
there.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index f6f6810296e6..ee2fd6304e05 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -18,6 +18,7 @@
 #define LINUX_PCI_H
 
 #include <linux/pci_regs.h>	/* The pci register defines */
+#include <linux/io.h>
 
 /*
  * The PCI interface treats multi-function devices as independent
-- 
cgit v1.2.3


From 1359f2701b96abd9bb69c1273fb995a093b6409a Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 20 Oct 2008 17:40:42 -0600
Subject: PCI Hotplug core: add 'name' param pci_hp_register interface

Update pci_hp_register() to take a const char *name parameter.

The motivation for this is to clean up the individual hotplug
drivers so that each one does not have to manage its own name.
The PCI core should be the place where we manage the name.

We update the interface and all callsites first, in a
"no functional change" manner, and clean up the drivers later.

Cc: kristen.c.accardi@intel.com
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/acpiphp_core.c      |  3 ++-
 drivers/pci/hotplug/cpci_hotplug_core.c |  3 ++-
 drivers/pci/hotplug/cpqphp_core.c       |  3 ++-
 drivers/pci/hotplug/fakephp.c           |  3 ++-
 drivers/pci/hotplug/ibmphp_ebda.c       |  3 ++-
 drivers/pci/hotplug/pci_hotplug_core.c  | 15 ++++++++-------
 drivers/pci/hotplug/pciehp_core.c       |  3 ++-
 drivers/pci/hotplug/rpaphp_slot.c       |  2 +-
 drivers/pci/hotplug/sgi_hotplug.c       |  3 ++-
 drivers/pci/hotplug/shpchp_core.c       |  3 ++-
 include/linux/pci_hotplug.h             |  3 ++-
 11 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c
index 0e496e866a84..e9841765339f 100644
--- a/drivers/pci/hotplug/acpiphp_core.c
+++ b/drivers/pci/hotplug/acpiphp_core.c
@@ -340,7 +340,8 @@ int acpiphp_register_hotplug_slot(struct acpiphp_slot *acpiphp_slot)
 
 	retval = pci_hp_register(slot->hotplug_slot,
 					acpiphp_slot->bridge->pci_bus,
-					acpiphp_slot->device);
+					acpiphp_slot->device,
+					slot->name);
 	if (retval == -EBUSY)
 		goto error_hpslot;
 	if (retval) {
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index 935947991dc9..5e5dee85763c 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -285,7 +285,8 @@ cpci_hp_register_bus(struct pci_bus *bus, u8 first, u8 last)
 		info->attention_status = cpci_get_attention_status(slot);
 
 		dbg("registering slot %s", slot->hotplug_slot->name);
-		status = pci_hp_register(slot->hotplug_slot, bus, i);
+		status = pci_hp_register(slot->hotplug_slot, bus, i,
+					 slot->hotplug_slot->name);
 		if (status) {
 			err("pci_hp_register failed with error %d", status);
 			goto error_name;
diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c
index 54defec51d08..a7fe4584f00b 100644
--- a/drivers/pci/hotplug/cpqphp_core.c
+++ b/drivers/pci/hotplug/cpqphp_core.c
@@ -436,7 +436,8 @@ static int ctrl_slot_setup(struct controller *ctrl,
 				slot_number);
 		result = pci_hp_register(hotplug_slot,
 					 ctrl->pci_dev->subordinate,
-					 slot->device);
+					 slot->device,
+					 hotplug_slot->name);
 		if (result) {
 			err("pci_hp_register failed with error %d\n", result);
 			goto error_name;
diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
index 146ca9cd1567..3069f2153832 100644
--- a/drivers/pci/hotplug/fakephp.c
+++ b/drivers/pci/hotplug/fakephp.c
@@ -126,7 +126,8 @@ static int add_slot(struct pci_dev *dev)
 	slot->release = &dummy_release;
 	slot->private = dslot;
 
-	retval = pci_hp_register(slot, dev->bus, PCI_SLOT(dev->devfn));
+	retval = pci_hp_register(slot, dev->bus, PCI_SLOT(dev->devfn),
+				 slot->name);
 	if (retval) {
 		err("pci_hp_register failed with error %d\n", retval);
 		goto error_dslot;
diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c
index 8cfd1c4926c8..342d3e8f77c8 100644
--- a/drivers/pci/hotplug/ibmphp_ebda.c
+++ b/drivers/pci/hotplug/ibmphp_ebda.c
@@ -966,7 +966,8 @@ static int __init ebda_rsrc_controller (void)
 	list_for_each_entry(tmp_slot, &ibmphp_slot_head, ibm_slot_list) {
 		snprintf (tmp_slot->hotplug_slot->name, 30, "%s", create_file_name (tmp_slot));
 		pci_hp_register(tmp_slot->hotplug_slot,
-			pci_find_bus(0, tmp_slot->bus), tmp_slot->device);
+			pci_find_bus(0, tmp_slot->bus), tmp_slot->device,
+			tmp_slot->hotplug_slot->name);
 	}
 
 	print_ebda_hpc ();
diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index 2e6c4474644e..02b1ae12a2e6 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -547,13 +547,15 @@ out:
  * @bus: bus this slot is on
  * @slot: pointer to the &struct hotplug_slot to register
  * @slot_nr: slot number
+ * @name: name registered with kobject core
  *
  * Registers a hotplug slot with the pci hotplug subsystem, which will allow
  * userspace interaction to the slot.
  *
  * Returns 0 if successful, anything else for an error.
  */
-int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr)
+int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr,
+			const char *name)
 {
 	int result;
 	struct pci_slot *pci_slot;
@@ -569,7 +571,7 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr)
 	}
 
 	/* Check if we have already registered a slot with the same name. */
-	if (get_slot_from_name(slot->name))
+	if (get_slot_from_name(name))
 		return -EEXIST;
 
 	/*
@@ -577,7 +579,7 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr)
 	 * driver and call it here again. If we've already created the
 	 * pci_slot, the interface will simply bump the refcount.
 	 */
-	pci_slot = pci_create_slot(bus, slot_nr, slot->name);
+	pci_slot = pci_create_slot(bus, slot_nr, name);
 	if (IS_ERR(pci_slot))
 		return PTR_ERR(pci_slot);
 
@@ -593,8 +595,8 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr)
 	/*
 	 * Allow pcihp drivers to override the ACPI_PCI_SLOT name.
 	 */
-	if (strcmp(kobject_name(&pci_slot->kobj), slot->name)) {
-		result = kobject_rename(&pci_slot->kobj, slot->name);
+	if (strcmp(kobject_name(&pci_slot->kobj), name)) {
+		result = kobject_rename(&pci_slot->kobj, name);
 		if (result) {
 			pci_destroy_slot(pci_slot);
 			return result;
@@ -607,8 +609,7 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr)
 
 	result = fs_add_slot(pci_slot);
 	kobject_uevent(&pci_slot->kobj, KOBJ_ADD);
-	dbg("Added slot %s to the list\n", slot->name);
-
+	dbg("Added slot %s to the list\n", name);
 
 	return result;
 }
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index c748a19db89d..3ace5e057601 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -226,7 +226,8 @@ static int init_slots(struct controller *ctrl)
 duplicate_name:
 		retval = pci_hp_register(hotplug_slot,
 					 ctrl->pci_dev->subordinate,
-					 slot->device);
+					 slot->device,
+					 slot->name);
 		if (retval) {
 			/*
 			 * If slot N already exists, we'll try to create
diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c
index 50884507b8be..736d3b43ed0b 100644
--- a/drivers/pci/hotplug/rpaphp_slot.c
+++ b/drivers/pci/hotplug/rpaphp_slot.c
@@ -137,7 +137,7 @@ int rpaphp_register_slot(struct slot *slot)
 		slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
 	else
 		slotno = -1;
-	retval = pci_hp_register(php_slot, slot->bus, slotno);
+	retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
 	if (retval) {
 		err("pci_hp_register failed with error %d\n", retval);
 		return retval;
diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c
index 410fe0394a8e..6d20bbd4359a 100644
--- a/drivers/pci/hotplug/sgi_hotplug.c
+++ b/drivers/pci/hotplug/sgi_hotplug.c
@@ -653,7 +653,8 @@ static int sn_hotplug_slot_register(struct pci_bus *pci_bus)
 		bss_hotplug_slot->ops = &sn_hotplug_slot_ops;
 		bss_hotplug_slot->release = &sn_release_slot;
 
-		rc = pci_hp_register(bss_hotplug_slot, pci_bus, device);
+		rc = pci_hp_register(bss_hotplug_slot, pci_bus, device,
+				     bss_hotplug_slot->name);
 		if (rc)
 			goto register_err;
 
diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c
index cc38615395f1..bf5096612aab 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -146,7 +146,8 @@ static int init_slots(struct controller *ctrl)
 		    slot->hp_slot, slot->number, ctrl->slot_device_offset);
 duplicate_name:
 		retval = pci_hp_register(slot->hotplug_slot,
-				ctrl->pci_dev->subordinate, slot->device);
+				ctrl->pci_dev->subordinate, slot->device,
+				hotplug_slot->name);
 		if (retval) {
 			/*
 			 * If slot N already exists, we'll try to create
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index a08cd06b541a..5efba6671865 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -165,7 +165,8 @@ struct hotplug_slot {
 };
 #define to_hotplug_slot(n) container_of(n, struct hotplug_slot, kobj)
 
-extern int pci_hp_register(struct hotplug_slot *, struct pci_bus *, int nr);
+extern int pci_hp_register(struct hotplug_slot *, struct pci_bus *, int nr,
+			   const char *name);
 extern int pci_hp_deregister(struct hotplug_slot *slot);
 extern int __must_check pci_hp_change_slot_info	(struct hotplug_slot *slot,
 						 struct hotplug_slot_info *info);
-- 
cgit v1.2.3


From d25b7c8d6ba2735602003d75a28894772fe8ad6a Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 20 Oct 2008 17:40:47 -0600
Subject: PCI: rename pci_update_slot_number to pci_renumber_slot

The GPL exported symbol pci_update_slot_number has been renamed to
pci_renumber_slot. Some of the safety checks were unnecessary and
were removed.

Cc: kristen.c.accardi@intel.com
Cc: matthew@wil.cx
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/slot.c  | 15 +++++----------
 include/linux/pci.h |  2 +-
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index 0c6db03698ea..b9b90ab6b861 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -175,7 +175,7 @@ placeholder:
 EXPORT_SYMBOL_GPL(pci_create_slot);
 
 /**
- * pci_update_slot_number - update %struct pci_slot -> number
+ * pci_renumber_slot - update %struct pci_slot -> number
  * @slot - %struct pci_slot to update
  * @slot_nr - new number for slot
  *
@@ -183,27 +183,22 @@ EXPORT_SYMBOL_GPL(pci_create_slot);
  * created a placeholder slot in pci_create_slot() by passing a -1 as
  * slot_nr, to update their %struct pci_slot with the correct @slot_nr.
  */
-
-void pci_update_slot_number(struct pci_slot *slot, int slot_nr)
+void pci_renumber_slot(struct pci_slot *slot, int slot_nr)
 {
-	int name_count = 0;
 	struct pci_slot *tmp;
 
 	down_write(&pci_bus_sem);
 
 	list_for_each_entry(tmp, &slot->bus->slots, list) {
 		WARN_ON(tmp->number == slot_nr);
-		if (!strcmp(kobject_name(&tmp->kobj), kobject_name(&slot->kobj)))
-			name_count++;
+		goto out;
 	}
 
-	if (name_count > 1)
-		printk(KERN_WARNING "pci_update_slot_number found %d slots with the same name: %s\n", name_count, kobject_name(&slot->kobj));
-
 	slot->number = slot_nr;
+out:
 	up_write(&pci_bus_sem);
 }
-EXPORT_SYMBOL_GPL(pci_update_slot_number);
+EXPORT_SYMBOL_GPL(pci_renumber_slot);
 
 /**
  * pci_destroy_slot - decrement refcount for physical PCI slot
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ee2fd6304e05..41717ae9807e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -512,7 +512,7 @@ struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev,
 struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 				 const char *name);
 void pci_destroy_slot(struct pci_slot *slot);
-void pci_update_slot_number(struct pci_slot *slot, int slot_nr);
+void pci_renumber_slot(struct pci_slot *slot, int slot_nr);
 int pci_scan_slot(struct pci_bus *bus, int devfn);
 struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn);
 void pci_device_add(struct pci_dev *dev, struct pci_bus *bus);
-- 
cgit v1.2.3


From 828f37683e6d3ab5912989df0d04201db7ad798e Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 20 Oct 2008 17:40:52 -0600
Subject: PCI: update pci_create_slot() to take a 'hotplug' param

Slot detection drivers can co-exist with hotplug drivers. The names
of the detected/claimed slots may be different depending on module
load order.

For legacy reasons, we need to allow hotplug drivers to override
the slot name if a detection driver is loaded first (and they find
the same slots).

Creating and overriding slot names should be an atomic operation,
otherwise you get a locking nightmare as various drivers race to
call pci_create_slot().

pci_create_slot() is already serialized by grabbing the pci_bus_sem.

We update the API and add a 'hotplug' param, which is:

	set if the caller is a hotplug driver
	NULL if the caller is a detection driver

pci_create_slot() does not actually use the 'hotplug' parameter in this
patch. A later patch will add the logic that uses it.

Cc: kristen.c.accardi@intel.com
Cc: matthew@wil.cx
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/acpi/pci_slot.c                | 2 +-
 drivers/pci/hotplug/pci_hotplug_core.c | 2 +-
 drivers/pci/slot.c                     | 4 +++-
 include/linux/pci.h                    | 3 ++-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_slot.c b/drivers/acpi/pci_slot.c
index d5b4ef898879..8d4a568be1cc 100644
--- a/drivers/acpi/pci_slot.c
+++ b/drivers/acpi/pci_slot.c
@@ -150,7 +150,7 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv)
 	}
 
 	snprintf(name, sizeof(name), "%u", (u32)sun);
-	pci_slot = pci_create_slot(pci_bus, device, name);
+	pci_slot = pci_create_slot(pci_bus, device, name, NULL);
 	if (IS_ERR(pci_slot)) {
 		err("pci_create_slot returned %ld\n", PTR_ERR(pci_slot));
 		kfree(slot);
diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index 02b1ae12a2e6..1cdeb642fdcf 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -579,7 +579,7 @@ int pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus, int slot_nr,
 	 * driver and call it here again. If we've already created the
 	 * pci_slot, the interface will simply bump the refcount.
 	 */
-	pci_slot = pci_create_slot(bus, slot_nr, name);
+	pci_slot = pci_create_slot(bus, slot_nr, name, slot);
 	if (IS_ERR(pci_slot))
 		return PTR_ERR(pci_slot);
 
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index b9b90ab6b861..0e009c3ba5fd 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -83,6 +83,7 @@ static struct kobj_type pci_slot_ktype = {
  * @parent: struct pci_bus of parent bridge
  * @slot_nr: PCI_SLOT(pci_dev->devfn) or -1 for placeholder
  * @name: user visible string presented in /sys/bus/pci/slots/<name>
+ * @hotplug: set if caller is hotplug driver, NULL otherwise
  *
  * PCI slots have first class attributes such as address, speed, width,
  * and a &struct pci_slot is used to manage them. This interface will
@@ -111,7 +112,8 @@ static struct kobj_type pci_slot_ktype = {
  */
 
 struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
-				 const char *name)
+				 const char *name,
+				 struct hotplug_slot *hotplug)
 {
 	struct pci_dev *dev;
 	struct pci_slot *slot;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 41717ae9807e..9a8cee623301 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -510,7 +510,8 @@ struct pci_bus *pci_create_bus(struct device *parent, int bus,
 struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev,
 				int busnr);
 struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
-				 const char *name);
+				 const char *name,
+				 struct hotplug_slot *hotplug);
 void pci_destroy_slot(struct pci_slot *slot);
 void pci_renumber_slot(struct pci_slot *slot, int slot_nr);
 int pci_scan_slot(struct pci_bus *bus, int devfn);
-- 
cgit v1.2.3


From 0ad772ec464d3fcf9d210836b97e654f393606c4 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 20 Oct 2008 17:41:07 -0600
Subject: PCI, PCI Hotplug: introduce slot_name helpers

In preparation for cleaning up the various hotplug drivers
such that they don't have to manage their own 'name' parameters
anymore, we provide the following convenience functions:

	pci_slot_name()
	hotplug_slot_name()

These helpers will be used by individual hotplug drivers.

Cc: kristen.c.accardi@intel.com
Cc: matthew@wil.cx
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/slot.c          | 2 +-
 include/linux/pci.h         | 5 +++++
 include/linux/pci_hotplug.h | 5 +++++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index b6ee352ae459..4dd1c3e157ae 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -121,7 +121,7 @@ static int rename_slot(struct pci_slot *slot, const char *name)
 	int result = 0;
 	char *slot_name;
 
-	if (strcmp(kobject_name(&slot->kobj), name) == 0)
+	if (strcmp(pci_slot_name(slot), name) == 0)
 		return result;
 
 	slot_name = make_slot_name(name);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9a8cee623301..955ab705c05e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -65,6 +65,11 @@ struct pci_slot {
 	struct kobject kobj;
 };
 
+static inline const char *pci_slot_name(const struct pci_slot *slot)
+{
+	return kobject_name(&slot->kobj);
+}
+
 /* File state for mmap()s on /proc/bus/pci/X/Y */
 enum pci_mmap_state {
 	pci_mmap_io,
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index 5efba6671865..a3a3245943b1 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -165,6 +165,11 @@ struct hotplug_slot {
 };
 #define to_hotplug_slot(n) container_of(n, struct hotplug_slot, kobj)
 
+static inline const char *hotplug_slot_name(const struct hotplug_slot *slot)
+{
+	return pci_slot_name(slot->pci_slot);
+}
+
 extern int pci_hp_register(struct hotplug_slot *, struct pci_bus *, int nr,
 			   const char *name);
 extern int pci_hp_deregister(struct hotplug_slot *slot);
-- 
cgit v1.2.3


From 58319b802a614f10f1b5238fbde7a4b2e9a60069 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Mon, 20 Oct 2008 17:41:58 -0600
Subject: PCI: Hotplug core: remove 'name'

Now that the PCI core manages the 'name' for each individual
hotplug driver, and all drivers (except rpaphp) have been converted
to use hotplug_slot_name(), there is no need for the PCI hotplug
core to drag around its own copy of name either.

Cc: kristen.c.accardi@intel.com
Cc: matthew@wil.cx
Acked-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/hotplug/pci_hotplug_core.c | 6 +++---
 include/linux/pci_hotplug.h            | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index a6f1f282b683..535fce0f07f9 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -533,7 +533,7 @@ static struct hotplug_slot *get_slot_from_name (const char *name)
 
 	list_for_each (tmp, &pci_hotplug_slot_list) {
 		slot = list_entry (tmp, struct hotplug_slot, slot_list);
-		if (strcmp(slot->name, name) == 0)
+		if (strcmp(hotplug_slot_name(slot), name) == 0)
 			return slot;
 	}
 	return NULL;
@@ -611,7 +611,7 @@ int pci_hp_deregister(struct hotplug_slot *hotplug)
 		return -ENODEV;
 
 	mutex_lock(&pci_hp_mutex);
-	temp = get_slot_from_name(hotplug->name);
+	temp = get_slot_from_name(hotplug_slot_name(hotplug));
 	if (temp != hotplug) {
 		mutex_unlock(&pci_hp_mutex);
 		return -ENODEV;
@@ -621,7 +621,7 @@ int pci_hp_deregister(struct hotplug_slot *hotplug)
 
 	slot = hotplug->pci_slot;
 	fs_remove_slot(slot);
-	dbg("Removed slot %s from the list\n", hotplug->name);
+	dbg("Removed slot %s from the list\n", hotplug_slot_name(hotplug));
 
 	hotplug->release(hotplug);
 	slot->hotplug = NULL;
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index a3a3245943b1..a00bd1a0f156 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -142,8 +142,6 @@ struct hotplug_slot_info {
 
 /**
  * struct hotplug_slot - used to register a physical slot with the hotplug pci core
- * @name: the name of the slot being registered.  This string must
- * be unique amoung slots registered on this system.
  * @ops: pointer to the &struct hotplug_slot_ops to be used for this slot
  * @info: pointer to the &struct hotplug_slot_info for the initial values for
  * this slot.
@@ -153,7 +151,6 @@ struct hotplug_slot_info {
  * needs.
  */
 struct hotplug_slot {
-	char				*name;
 	struct hotplug_slot_ops		*ops;
 	struct hotplug_slot_info	*info;
 	void (*release) (struct hotplug_slot *slot);
-- 
cgit v1.2.3


From 1388cc964e680c1086ca0edae35be094cb29d51e Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 22 Oct 2008 13:39:55 +1100
Subject: PCI: don't export linux/io.h from pci.h

Move the include of io.h down into the #ifdef __KERNEL__ protected
region.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 955ab705c05e..752def8a2ef4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -18,7 +18,6 @@
 #define LINUX_PCI_H
 
 #include <linux/pci_regs.h>	/* The pci register defines */
-#include <linux/io.h>
 
 /*
  * The PCI interface treats multi-function devices as independent
@@ -52,6 +51,7 @@
 #include <linux/kobject.h>
 #include <asm/atomic.h>
 #include <linux/device.h>
+#include <linux/io.h>
 
 /* Include the ID list */
 #include <linux/pci_ids.h>
-- 
cgit v1.2.3


From 848e4c68c4695beae563f9a3d59fce596b466a74 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Oct 2008 14:26:39 +0900
Subject: libata: transfer EHI control flags to slave ehc.i

ATA_EHI_NO_AUTOPSY and ATA_EHI_QUIET are used to control the behavior
of EH.  As only the master link is visible outside EH, these flags are
set only for the master link although they should also apply to the
slave link, which causes spurious EH messages during probe and
suspend/resume.

This patch transfers those two flags to slave ehc.i before performing
slave autopsy and reporting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-eh.c | 5 +++++
 include/linux/libata.h  | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index a93247cc395a..d2409a8acece 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2010,8 +2010,13 @@ void ata_eh_autopsy(struct ata_port *ap)
 		struct ata_eh_context *mehc = &ap->link.eh_context;
 		struct ata_eh_context *sehc = &ap->slave_link->eh_context;
 
+		/* transfer control flags from master to slave */
+		sehc->i.flags |= mehc->i.flags & ATA_EHI_TO_SLAVE_MASK;
+
+		/* perform autopsy on the slave link */
 		ata_eh_link_autopsy(ap->slave_link);
 
+		/* transfer actions from slave to master and clear slave */
 		ata_eh_about_to_do(ap->slave_link, NULL, ATA_EH_ALL_ACTIONS);
 		mehc->i.action		|= sehc->i.action;
 		mehc->i.dev_action[1]	|= sehc->i.dev_action[1];
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 947cf84e555d..c261aa0584b1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -340,6 +340,9 @@ enum {
 
 	ATA_EHI_DID_RESET	= ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET,
 
+	/* mask of flags to transfer *to* the slave link */
+	ATA_EHI_TO_SLAVE_MASK	= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET,
+
 	/* max tries if error condition is still set after ->error_handler */
 	ATA_EH_MAX_TRIES	= 5,
 
-- 
cgit v1.2.3


From d181146572c4fa9af2a068b967cb53dcac7da944 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 00:49:18 -0400
Subject: [PATCH] new helper - kern_path()

Analog of path_lookup(), takes struct path *.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 10 ++++++++++
 include/linux/namei.h |  2 ++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 4ea63ed5e791..4a56f9b59e8c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1106,6 +1106,15 @@ int path_lookup(const char *name, unsigned int flags,
 	return do_path_lookup(AT_FDCWD, name, flags, nd);
 }
 
+int kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	struct nameidata nd;
+	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
+	if (!res)
+		*path = nd.path;
+	return res;
+}
+
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
  * @dentry:  pointer to dentry of the base directory
@@ -2855,6 +2864,7 @@ EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
 EXPORT_SYMBOL(vfs_permission);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 68f8c3203c89..221e8bc894ba 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -61,6 +61,8 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *);
 #define user_path_dir(name, path) \
 	user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path)
 
+extern int kern_path(const char *, unsigned, struct path *);
+
 extern int path_lookup(const char *, unsigned, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
-- 
cgit v1.2.3


From b63365a2d60268a3988285d6c3c6003d7066f93a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 23 Oct 2008 01:11:29 -0700
Subject: net: Fix disjunct computation of netdev features

My change

    commit e2a6b85247aacc52d6ba0d9b37a99b8d1a3e0d83
    net: Enable TSO if supported by at least one device

didn't do what was intended because the netdev_compute_features
function was designed for conjunctions.  So what happened was that
it would simply take the TSO status of the last constituent device.

This patch extends it to support both conjunctions and disjunctions
under the new name of netdev_increment_features.

It also adds a new function netdev_fix_features which does the
sanity checking that usually occurs upon registration.  This ensures
that the computation doesn't result in an illegal combination
since this checking is absent when the change is initiated via
ethtool.

The two users of netdev_compute_features have been converted.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  16 +++--
 include/linux/netdevice.h       |  12 +++-
 net/bridge/br_device.c          |   2 +-
 net/bridge/br_if.c              |  14 +++--
 net/core/dev.c                  | 135 +++++++++++++++++++++-------------------
 5 files changed, 104 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8e2be24f3fe4..832739f38db4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1341,18 +1341,24 @@ static int bond_compute_features(struct bonding *bond)
 	int i;
 
 	features &= ~(NETIF_F_ALL_CSUM | BOND_VLAN_FEATURES);
-	features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
-		    NETIF_F_GSO_MASK | NETIF_F_NO_CSUM;
+	features |=  NETIF_F_GSO_MASK | NETIF_F_NO_CSUM;
+
+	if (!bond->first_slave)
+		goto done;
+
+	features &= ~NETIF_F_ONE_FOR_ALL;
 
 	bond_for_each_slave(bond, slave, i) {
-		features = netdev_compute_features(features,
-						   slave->dev->features);
+		features = netdev_increment_features(features,
+						     slave->dev->features,
+						     NETIF_F_ONE_FOR_ALL);
 		if (slave->dev->hard_header_len > max_hard_header_len)
 			max_hard_header_len = slave->dev->hard_header_len;
 	}
 
+done:
 	features |= (bond_dev->features & BOND_VLAN_FEATURES);
-	bond_dev->features = features;
+	bond_dev->features = netdev_fix_features(features, NULL);
 	bond_dev->hard_header_len = max_hard_header_len;
 
 	return 0;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 64875859d654..c8bcb59adfdf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -541,6 +541,14 @@ struct net_device
 #define NETIF_F_V6_CSUM		(NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
 #define NETIF_F_ALL_CSUM	(NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
 
+	/*
+	 * If one device supports one of these features, then enable them
+	 * for all in netdev_increment_features.
+	 */
+#define NETIF_F_ONE_FOR_ALL	(NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
+				 NETIF_F_SG | NETIF_F_HIGHDMA | \
+				 NETIF_F_FRAGLIST)
+
 	/* Interface index. Unique device identifier	*/
 	int			ifindex;
 	int			iflink;
@@ -1698,7 +1706,9 @@ extern char *netdev_drivername(const struct net_device *dev, char *buffer, int l
 
 extern void linkwatch_run_queue(void);
 
-extern int netdev_compute_features(unsigned long all, unsigned long one);
+unsigned long netdev_increment_features(unsigned long all, unsigned long one,
+					unsigned long mask);
+unsigned long netdev_fix_features(unsigned long features, const char *name);
 
 static inline int net_gso_ok(int features, int gso_type)
 {
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 22ba8632196f..6c023f0f8252 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -179,5 +179,5 @@ void br_dev_setup(struct net_device *dev)
 
 	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
 			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
-			NETIF_F_NETNS_LOCAL;
+			NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 573e20f7dba4..0a09ccf68c1c 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -347,15 +347,21 @@ int br_min_mtu(const struct net_bridge *br)
 void br_features_recompute(struct net_bridge *br)
 {
 	struct net_bridge_port *p;
-	unsigned long features;
+	unsigned long features, mask;
 
-	features = br->feature_mask;
+	features = mask = br->feature_mask;
+	if (list_empty(&br->port_list))
+		goto done;
+
+	features &= ~NETIF_F_ONE_FOR_ALL;
 
 	list_for_each_entry(p, &br->port_list, list) {
-		features = netdev_compute_features(features, p->dev->features);
+		features = netdev_increment_features(features,
+						     p->dev->features, mask);
 	}
 
-	br->dev->features = features;
+done:
+	br->dev->features = netdev_fix_features(features, NULL);
 }
 
 /* called with RTNL */
diff --git a/net/core/dev.c b/net/core/dev.c
index b8a4fd0806af..d9038e328cc1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3947,6 +3947,46 @@ static void netdev_init_queue_locks(struct net_device *dev)
 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
 }
 
+unsigned long netdev_fix_features(unsigned long features, const char *name)
+{
+	/* Fix illegal SG+CSUM combinations. */
+	if ((features & NETIF_F_SG) &&
+	    !(features & NETIF_F_ALL_CSUM)) {
+		if (name)
+			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
+			       "checksum feature.\n", name);
+		features &= ~NETIF_F_SG;
+	}
+
+	/* TSO requires that SG is present as well. */
+	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
+		if (name)
+			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
+			       "SG feature.\n", name);
+		features &= ~NETIF_F_TSO;
+	}
+
+	if (features & NETIF_F_UFO) {
+		if (!(features & NETIF_F_GEN_CSUM)) {
+			if (name)
+				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
+				       "since no NETIF_F_HW_CSUM feature.\n",
+				       name);
+			features &= ~NETIF_F_UFO;
+		}
+
+		if (!(features & NETIF_F_SG)) {
+			if (name)
+				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
+				       "since no NETIF_F_SG feature.\n", name);
+			features &= ~NETIF_F_UFO;
+		}
+	}
+
+	return features;
+}
+EXPORT_SYMBOL(netdev_fix_features);
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
@@ -4032,36 +4072,7 @@ int register_netdevice(struct net_device *dev)
 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
 	}
 
-
-	/* Fix illegal SG+CSUM combinations. */
-	if ((dev->features & NETIF_F_SG) &&
-	    !(dev->features & NETIF_F_ALL_CSUM)) {
-		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
-		       dev->name);
-		dev->features &= ~NETIF_F_SG;
-	}
-
-	/* TSO requires that SG is present as well. */
-	if ((dev->features & NETIF_F_TSO) &&
-	    !(dev->features & NETIF_F_SG)) {
-		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
-		       dev->name);
-		dev->features &= ~NETIF_F_TSO;
-	}
-	if (dev->features & NETIF_F_UFO) {
-		if (!(dev->features & NETIF_F_HW_CSUM)) {
-			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
-					"NETIF_F_HW_CSUM feature.\n",
-							dev->name);
-			dev->features &= ~NETIF_F_UFO;
-		}
-		if (!(dev->features & NETIF_F_SG)) {
-			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
-					"NETIF_F_SG feature.\n",
-					dev->name);
-			dev->features &= ~NETIF_F_UFO;
-		}
-	}
+	dev->features = netdev_fix_features(dev->features, dev->name);
 
 	/* Enable software GSO if SG is supported. */
 	if (dev->features & NETIF_F_SG)
@@ -4700,49 +4711,45 @@ static int __init netdev_dma_register(void) { return -ENODEV; }
 #endif /* CONFIG_NET_DMA */
 
 /**
- *	netdev_compute_feature - compute conjunction of two feature sets
- *	@all: first feature set
- *	@one: second feature set
+ *	netdev_increment_features - increment feature set by one
+ *	@all: current feature set
+ *	@one: new feature set
+ *	@mask: mask feature set
  *
  *	Computes a new feature set after adding a device with feature set
- *	@one to the master device with current feature set @all.  Returns
- *	the new feature set.
+ *	@one to the master device with current feature set @all.  Will not
+ *	enable anything that is off in @mask. Returns the new feature set.
  */
-int netdev_compute_features(unsigned long all, unsigned long one)
-{
-	/* if device needs checksumming, downgrade to hw checksumming */
-	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
-		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
-
-	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
-	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
-		all ^= NETIF_F_HW_CSUM
-			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-
-	if (one & NETIF_F_GSO)
-		one |= NETIF_F_GSO_SOFTWARE;
-	one |= NETIF_F_GSO;
-
-	/*
-	 * If even one device supports a GSO protocol with software fallback,
-	 * enable it for all.
-	 */
-	all |= one & NETIF_F_GSO_SOFTWARE;
+unsigned long netdev_increment_features(unsigned long all, unsigned long one,
+					unsigned long mask)
+{
+	/* If device needs checksumming, downgrade to it. */
+        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
+		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
+	else if (mask & NETIF_F_ALL_CSUM) {
+		/* If one device supports v4/v6 checksumming, set for all. */
+		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
+		    !(all & NETIF_F_GEN_CSUM)) {
+			all &= ~NETIF_F_ALL_CSUM;
+			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+		}
 
-	/* If even one device supports robust GSO, enable it for all. */
-	if (one & NETIF_F_GSO_ROBUST)
-		all |= NETIF_F_GSO_ROBUST;
+		/* If one device supports hw checksumming, set for all. */
+		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
+			all &= ~NETIF_F_ALL_CSUM;
+			all |= NETIF_F_HW_CSUM;
+		}
+	}
 
-	all &= one | NETIF_F_LLTX;
+	one |= NETIF_F_ALL_CSUM;
 
-	if (!(all & NETIF_F_ALL_CSUM))
-		all &= ~NETIF_F_SG;
-	if (!(all & NETIF_F_SG))
-		all &= ~NETIF_F_GSO_MASK;
+	one |= all & NETIF_F_ONE_FOR_ALL;
+	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
+	all |= one & mask & NETIF_F_ONE_FOR_ALL;
 
 	return all;
 }
-EXPORT_SYMBOL(netdev_compute_features);
+EXPORT_SYMBOL(netdev_increment_features);
 
 static struct hlist_head *netdev_create_hash(void)
 {
-- 
cgit v1.2.3


From 3516586a424ea5727be089da6541cbd5644f0497 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 Aug 2008 03:00:49 -0400
Subject: [PATCH] make O_EXCL in nd->intent.flags visible in nd->flags

New flag: LOOKUP_EXCL.  Set before doing the final step of pathname
resolution on the paths that have LOOKUP_CREATE and O_EXCL.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/ops_inode.c   | 2 +-
 fs/namei.c            | 4 +++-
 fs/nfs/dir.c          | 6 ++----
 include/linux/namei.h | 5 +++--
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 534e1e2c65ca..d232991b9046 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 			mark_inode_dirty(inode);
 			break;
 		} else if (PTR_ERR(inode) != -EEXIST ||
-			   (nd && (nd->intent.open.flags & O_EXCL))) {
+			   (nd && nd->flags & LOOKUP_EXCL)) {
 			gfs2_holder_uninit(ghs);
 			return PTR_ERR(inode);
 		}
diff --git a/fs/namei.c b/fs/namei.c
index e584f04745b5..2b8f823eda44 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1709,6 +1709,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	dir = nd.path.dentry;
 	nd.flags &= ~LOOKUP_PARENT;
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
+	if (flag & O_EXCL)
+		nd.flags |= LOOKUP_EXCL;
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
@@ -1906,7 +1908,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 	if (nd->last_type != LAST_NORM)
 		goto fail;
 	nd->flags &= ~LOOKUP_PARENT;
-	nd->flags |= LOOKUP_CREATE;
+	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
 	nd->intent.open.flags = O_EXCL;
 
 	/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index efdba2e802d7..c216c8786c51 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -707,9 +707,7 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 {
 	if (NFS_PROTO(dir)->version == 2)
 		return 0;
-	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-		return 0;
-	return (nd->intent.open.flags & O_EXCL) != 0;
+	return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL);
 }
 
 /*
@@ -1009,7 +1007,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 
 	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
 	 * the dentry. */
-	if (nd->intent.open.flags & O_EXCL) {
+	if (nd->flags & LOOKUP_EXCL) {
 		d_instantiate(dentry, NULL);
 		goto out;
 	}
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 221e8bc894ba..6b5627afd2eb 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -51,8 +51,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 /*
  * Intent data
  */
-#define LOOKUP_OPEN		(0x0100)
-#define LOOKUP_CREATE		(0x0200)
+#define LOOKUP_OPEN		0x0100
+#define LOOKUP_CREATE		0x0200
+#define LOOKUP_EXCL		0x0400
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v1.2.3


From 4ea3ada2955e4519befa98ff55dd62d6dfbd1705 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:48:57 +0200
Subject: [PATCH] new helper: d_obtain_alias

The calling conventions of d_alloc_anon are rather unfortunate for all
users, and its name is not very descriptive either.

Add d_obtain_alias as a new exported helper that also drops the inode
reference in the failure case, and that passes through NULL and IS_ERR
inode pointers so that the export operations can tail-call it.

Incidentally this helper already existed as a private function in
libfs.c as exportfs_d_alloc so kill that one and switch the callers
to d_obtain_alias.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 35 +++++++++++++++++++++++++++++++++++
 fs/libfs.c             | 26 ++------------------------
 include/linux/dcache.h |  1 +
 3 files changed, 38 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index e7a1a99b7464..46fc78206782 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1174,6 +1174,41 @@ struct dentry * d_alloc_anon(struct inode *inode)
 	return res;
 }
 
+/**
+ * d_obtain_alias - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
+ *
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  If %NULL is returned (indicating kmalloc failure),
+ * the reference on the inode has been released.  To make it easier
+ * to use in export operations a NULL or IS_ERR inode may be passed in
+ * and will be casted to the corresponding NULL or IS_ERR dentry.
+ */
+struct dentry *d_obtain_alias(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	dentry = d_alloc_anon(inode);
+	if (!dentry) {
+		iput(inode);
+		dentry = ERR_PTR(-ENOMEM);
+	}
+	return dentry;
+}
+EXPORT_SYMBOL_GPL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
diff --git a/fs/libfs.c b/fs/libfs.c
index 1add676a19df..74688598bcf7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -732,28 +732,6 @@ out:
 	return ret;
 }
 
-/*
- * This is what d_alloc_anon should have been.  Once the exportfs
- * argument transition has been finished I will update d_alloc_anon
- * to this prototype and this wrapper will go away.   --hch
- */
-static struct dentry *exportfs_d_alloc(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_PTR(PTR_ERR(inode));
-
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		dentry = ERR_PTR(-ENOMEM);
-	}
-	return dentry;
-}
-
 /**
  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
  * @sb:		filesystem to do the file handle conversion on
@@ -782,7 +760,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
 
@@ -815,7 +793,7 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index efba1de629ac..2404257d6c67 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -231,6 +231,7 @@ extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_anon(struct inode *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
+extern struct dentry * d_obtain_alias(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
-- 
cgit v1.2.3


From 9308a6128d9074e348d9f9b5822546fe12a794a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:49:12 +0200
Subject: [PATCH] kill d_alloc_anon

Remove d_alloc_anon now that no users are left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 108 +++++++++++++++++--------------------------------
 include/linux/dcache.h |   1 -
 2 files changed, 37 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index d45ff7f5ecc2..1710d2484fd9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1110,70 +1110,6 @@ static inline struct hlist_head *d_hash(struct dentry *parent,
 	return dentry_hashtable + (hash & D_HASHMASK);
 }
 
-/**
- * d_alloc_anon - allocate an anonymous dentry
- * @inode: inode to allocate the dentry for
- *
- * This is similar to d_alloc_root.  It is used by filesystems when
- * creating a dentry for a given inode, often in the process of 
- * mapping a filehandle to a dentry.  The returned dentry may be
- * anonymous, or may have a full name (if the inode was already
- * in the cache).  The file system may need to make further
- * efforts to connect this dentry into the dcache properly.
- *
- * When called on a directory inode, we must ensure that
- * the inode only ever has one dentry.  If a dentry is
- * found, that is returned instead of allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry.  If %NULL is returned (indicating kmalloc failure),
- * the reference on the inode has not been released.
- */
-
-struct dentry * d_alloc_anon(struct inode *inode)
-{
-	static const struct qstr anonstring = { .name = "" };
-	struct dentry *tmp;
-	struct dentry *res;
-
-	if ((res = d_find_alias(inode))) {
-		iput(inode);
-		return res;
-	}
-
-	tmp = d_alloc(NULL, &anonstring);
-	if (!tmp)
-		return NULL;
-
-	tmp->d_parent = tmp; /* make sure dput doesn't croak */
-	
-	spin_lock(&dcache_lock);
-	res = __d_find_alias(inode, 0);
-	if (!res) {
-		/* attach a disconnected dentry */
-		res = tmp;
-		tmp = NULL;
-		spin_lock(&res->d_lock);
-		res->d_sb = inode->i_sb;
-		res->d_parent = res;
-		res->d_inode = inode;
-		res->d_flags |= DCACHE_DISCONNECTED;
-		res->d_flags &= ~DCACHE_UNHASHED;
-		list_add(&res->d_alias, &inode->i_dentry);
-		hlist_add_head(&res->d_hash, &inode->i_sb->s_anon);
-		spin_unlock(&res->d_lock);
-
-		inode = NULL; /* don't drop reference */
-	}
-	spin_unlock(&dcache_lock);
-
-	if (inode)
-		iput(inode);
-	if (tmp)
-		dput(tmp);
-	return res;
-}
-
 /**
  * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
@@ -1194,19 +1130,50 @@ struct dentry * d_alloc_anon(struct inode *inode)
  */
 struct dentry *d_obtain_alias(struct inode *inode)
 {
-	struct dentry *dentry;
+	static const struct qstr anonstring = { .name = "" };
+	struct dentry *tmp;
+	struct dentry *res;
 
 	if (!inode)
 		return ERR_PTR(-ESTALE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		dentry = ERR_PTR(-ENOMEM);
+	res = d_find_alias(inode);
+	if (res)
+		goto out_iput;
+
+	tmp = d_alloc(NULL, &anonstring);
+	if (!tmp) {
+		res = ERR_PTR(-ENOMEM);
+		goto out_iput;
 	}
-	return dentry;
+	tmp->d_parent = tmp; /* make sure dput doesn't croak */
+
+	spin_lock(&dcache_lock);
+	res = __d_find_alias(inode, 0);
+	if (res) {
+		spin_unlock(&dcache_lock);
+		dput(tmp);
+		goto out_iput;
+	}
+
+	/* attach a disconnected dentry */
+	spin_lock(&tmp->d_lock);
+	tmp->d_sb = inode->i_sb;
+	tmp->d_inode = inode;
+	tmp->d_flags |= DCACHE_DISCONNECTED;
+	tmp->d_flags &= ~DCACHE_UNHASHED;
+	list_add(&tmp->d_alias, &inode->i_dentry);
+	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+	spin_unlock(&tmp->d_lock);
+
+	spin_unlock(&dcache_lock);
+	return tmp;
+
+ out_iput:
+	iput(inode);
+	return res;
 }
 EXPORT_SYMBOL_GPL(d_obtain_alias);
 
@@ -2379,7 +2346,6 @@ void __init vfs_caches_init(unsigned long mempages)
 }
 
 EXPORT_SYMBOL(d_alloc);
-EXPORT_SYMBOL(d_alloc_anon);
 EXPORT_SYMBOL(d_alloc_root);
 EXPORT_SYMBOL(d_delete);
 EXPORT_SYMBOL(d_find_alias);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 2404257d6c67..74c64ae30cf0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -228,7 +228,6 @@ extern void d_delete(struct dentry *);
 
 /* allocate/de-allocate */
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
-extern struct dentry * d_alloc_anon(struct inode *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry * d_obtain_alias(struct inode *);
-- 
cgit v1.2.3


From 6de24f0ed08054b2a202902e4d63beff27654db8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 28 Aug 2008 06:25:49 +0400
Subject: [PATCH 1/2] anondev: init IDR statically

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/super.c         | 7 +------
 include/linux/fs.h | 1 -
 init/main.c        | 1 -
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index e931ae9511fe..dd23bf927fbc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -682,7 +682,7 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-static struct idr unnamed_dev_idr;
+static DEFINE_IDR(unnamed_dev_idr);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
@@ -726,11 +726,6 @@ void kill_anon_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_anon_super);
 
-void __init unnamed_dev_init(void)
-{
-	idr_init(&unnamed_dev_idr);
-}
-
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6a625be13fc..5f70aa62cf0f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1593,7 +1593,6 @@ extern int get_sb_pseudo(struct file_system_type *, char *,
 	struct vfsmount *mnt);
 extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 int __put_super_and_need_restart(struct super_block *sb);
-void unnamed_dev_init(void);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
diff --git a/init/main.c b/init/main.c
index 3e17a3bafe60..c6a1024a27a3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void)
 	fork_init(num_physpages);
 	proc_caches_init();
 	buffer_init();
-	unnamed_dev_init();
 	key_init();
 	security_init();
 	vfs_caches_init(num_physpages);
-- 
cgit v1.2.3


From e2761a1167633ed943fea29002f990194923d060 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:28 +0900
Subject: [PATCH vfs-2.6 2/6] vfs: add d_ancestor()

This adds d_ancestor() to replace d_isparent(), and uses it.

If new_dentry == old_dentry, is_subdir() returns 1, which looks strange:
a dentry is obviously not a subdirectory of itself. But I have not
checked the callers yet, so this keeps the current behavior.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/dcache.c            | 45 +++++++++++++++++++++++----------------------
 fs/namei.c             | 22 ++++++++++------------
 include/linux/dcache.h |  1 +
 3 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index c6fd1f27da57..64024005da43 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1720,18 +1720,23 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	spin_unlock(&dcache_lock);
 }
 
-/*
- * Helper that returns 1 if p1 is a parent of p2, else 0
+/**
+ * d_ancestor - search for an ancestor
+ * @p1: ancestor dentry
+ * @p2: child dentry
+ *
+ * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
+ * an ancestor of p2, else NULL.
  */
-static int d_isparent(struct dentry *p1, struct dentry *p2)
+struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
 	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p1)
-			return 1;
+			return p;
 	}
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1755,7 +1760,7 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
 
 	/* Check for loops */
 	ret = ERR_PTR(-ELOOP);
-	if (d_isparent(alias, dentry))
+	if (d_ancestor(alias, dentry))
 		goto out_err;
 
 	/* See lock_rename() */
@@ -2155,31 +2160,27 @@ out:
  * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
  */
   
-int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
+int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
 	int result;
-	struct dentry * saved = new_dentry;
 	unsigned long seq;
 
-	/* need rcu_readlock to protect against the d_parent trashing due to
-	 * d_move
+	/* FIXME: This is old behavior, needed? Please check callers. */
+	if (new_dentry == old_dentry)
+		return 1;
+
+	/*
+	 * Need rcu_readlock to protect against the d_parent trashing
+	 * due to d_move
 	 */
 	rcu_read_lock();
-        do {
+	do {
 		/* for restarting inner loop in case of seq retry */
-		new_dentry = saved;
-		result = 0;
 		seq = read_seqbegin(&rename_lock);
-		for (;;) {
-			if (new_dentry != old_dentry) {
-				if (IS_ROOT(new_dentry))
-					break;
-				new_dentry = new_dentry->d_parent;
-				continue;
-			}
+		if (d_ancestor(old_dentry, new_dentry))
 			result = 1;
-			break;
-		}
+		else
+			result = 0;
 	} while (read_seqretry(&rename_lock, seq));
 	rcu_read_unlock();
 
diff --git a/fs/namei.c b/fs/namei.c
index 068a9e50c8c0..b7cd65224d60 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1454,20 +1454,18 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 
-	for (p = p1; !IS_ROOT(p); p = p->d_parent) {
-		if (p->d_parent == p2) {
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p2, p1);
+	if (p) {
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
-	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
-		if (p->d_parent == p1) {
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p1, p2);
+	if (p) {
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 74c64ae30cf0..a37359d0bad1 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -287,6 +287,7 @@ static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *in
 
 /* used for rename() and baskets */
 extern void d_move(struct dentry *, struct dentry *);
+extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
 
 /* appendix may either be NULL or be used for transname suffixes */
 extern struct dentry * d_lookup(struct dentry *, struct qstr *);
-- 
cgit v1.2.3


From 4e9ed2f85af7adfa7c3f0efa839a53186254fdcb Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:29 +0900
Subject: [PATCH vfs-2.6 6/6] vfs: add LOOKUP_RENAME_TARGET intent

This adds the LOOKUP_RENAME_TARGET intent for lookup of the rename
destination.

LOOKUP_RENAME_TARGET is going to be used like LOOKUP_CREATE. However,
the destination of rename() can be an existing directory entry, so the
two intents differ. Although that difference doesn't matter in my usage,
the separate flag makes it visible to users of this intent.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/namei.c            | 1 +
 include/linux/namei.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 18894fdf048a..9e2a534383d9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2654,6 +2654,7 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 
 	oldnd.flags &= ~LOOKUP_PARENT;
 	newnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags |= LOOKUP_RENAME_TARGET;
 
 	trap = lock_rename(new_dir, old_dir);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 6b5627afd2eb..99eb80306dc5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -54,6 +54,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_OPEN		0x0100
 #define LOOKUP_CREATE		0x0200
 #define LOOKUP_EXCL		0x0400
+#define LOOKUP_RENAME_TARGET	0x0800
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v1.2.3


From f696a3659fc4b3a3bf4bc83d9dbec5e5a2ffd929 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 31 Jul 2008 13:41:58 +0200
Subject: [PATCH] move executable checking into ->permission()

For execute permission on a regular file we need to check whether the
file has any execute bits at all, regardless of capabilities.

This check is normally performed by generic_permission() but was also
added to the case when the filesystem defines its own ->permission()
method.  In the latter case the filesystem should be responsible for
performing this check.

Move the check from inode_permission() inside filesystems which are
not calling generic_permission().

Create a helper function execute_ok() that returns true if the inode
is a directory or if any execute bits are present in i_mode.

Also fix up the following code:

 - coda control file is never executable
 - sysctl files are never executable
 - hfs_permission seems broken on MAY_EXEC, remove
 - hfsplus_permission is equivalent to generic_permission(), remove

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/cifs/cifsfs.c      |  9 ++++++---
 fs/coda/dir.c         |  3 +++
 fs/coda/pioctl.c      |  2 +-
 fs/hfs/inode.c        |  8 --------
 fs/hfsplus/inode.c    | 13 -------------
 fs/namei.c            | 21 ++++-----------------
 fs/nfs/dir.c          |  3 +++
 fs/proc/proc_sysctl.c | 10 ++++++++--
 include/linux/fs.h    |  5 +++++
 9 files changed, 30 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 89c64a8dcb99..84cc011a16e4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -275,9 +275,12 @@ static int cifs_permission(struct inode *inode, int mask)
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
-		return 0;
-	else /* file mode might have been restricted at mount time
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
+		if ((mask & MAY_EXEC) && !execute_ok(inode))
+			return -EACCES;
+		else
+			return 0;
+	} else /* file mode might have been restricted at mount time
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index c5916228243c..75b1fa90b2cb 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -146,6 +146,9 @@ int coda_permission(struct inode *inode, int mask)
 	if (!mask)
 		return 0; 
 
+	if ((mask & MAY_EXEC) && !execute_ok(inode))
+		return -EACCES;
+
 	lock_kernel();
 
 	if (coda_cache_check(inode, mask))
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index c51365422aa8..773f2ce9aa06 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -43,7 +43,7 @@ const struct file_operations coda_ioctl_operations = {
 /* the coda pioctl inode ops */
 static int coda_ioctl_permission(struct inode *inode, int mask)
 {
-        return 0;
+	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
 static int coda_pioctl(struct inode * inode, struct file * filp, 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 7e19835efa2e..c69b7ac75bf7 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -511,13 +511,6 @@ void hfs_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfs_permission(struct inode *inode, int mask)
-{
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
 static int hfs_file_open(struct inode *inode, struct file *file)
 {
 	if (HFS_IS_RSRC(inode))
@@ -616,7 +609,6 @@ static const struct inode_operations hfs_file_inode_operations = {
 	.lookup		= hfs_file_lookup,
 	.truncate	= hfs_file_truncate,
 	.setattr	= hfs_inode_setattr,
-	.permission	= hfs_permission,
 	.setxattr	= hfs_setxattr,
 	.getxattr	= hfs_getxattr,
 	.listxattr	= hfs_listxattr,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 963be644297a..b207f0e6fc22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -238,18 +238,6 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
 }
 
-static int hfsplus_permission(struct inode *inode, int mask)
-{
-	/* MAY_EXEC is also used for lookup, if no x bit is set allow lookup,
-	 * open_exec has the same test, so it's still not executable, if a x bit
-	 * is set fall back to standard permission check.
-	 */
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111))
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
@@ -281,7 +269,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.truncate	= hfsplus_file_truncate,
-	.permission	= hfsplus_permission,
 	.setxattr	= hfsplus_setxattr,
 	.getxattr	= hfsplus_getxattr,
 	.listxattr	= hfsplus_listxattr,
diff --git a/fs/namei.c b/fs/namei.c
index 9e2a534383d9..09ce58e49e72 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -212,8 +212,7 @@ int generic_permission(struct inode *inode, int mask,
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
 	 */
-	if (!(mask & MAY_EXEC) ||
-	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+	if (!(mask & MAY_EXEC) || execute_ok(inode))
 		if (capable(CAP_DAC_OVERRIDE))
 			return 0;
 
@@ -249,23 +248,11 @@ int inode_permission(struct inode *inode, int mask)
 	}
 
 	/* Ordinary permission routines do not understand MAY_APPEND. */
-	if (inode->i_op && inode->i_op->permission) {
+	if (inode->i_op && inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
-		if (!retval) {
-			/*
-			 * Exec permission on a regular file is denied if none
-			 * of the execute bits are set.
-			 *
-			 * This check should be done by the ->permission()
-			 * method.
-			 */
-			if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) &&
-			    !(inode->i_mode & S_IXUGO))
-				return -EACCES;
-		}
-	} else {
+	else
 		retval = generic_permission(inode, mask, NULL);
-	}
+
 	if (retval)
 		return retval;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c216c8786c51..3e64b98f3a93 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1957,6 +1957,9 @@ force_lookup:
 	} else
 		res = PTR_ERR(cred);
 out:
+	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
+		res = -EACCES;
+
 	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5fe210c09171..7b997754a25e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -298,13 +298,19 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	 * sysctl entries that are not writeable,
 	 * are _NOT_ writeable, capabilities or not.
 	 */
-	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table_header *head;
+	struct ctl_table *table;
 	int error;
 
+	/* Executable files are not allowed under /proc/sys/ */
+	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
+		return -EACCES;
+
+	head = grab_header(inode);
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
+	table = PROC_I(inode)->sysctl_entry;
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5f70aa62cf0f..025a4a251b64 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1851,6 +1851,11 @@ extern int inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int,
 		int (*check_acl)(struct inode *, int));
 
+static inline bool execute_ok(struct inode *inode)
+{
+	return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
+}
+
 extern int get_write_access(struct inode *);
 extern int deny_write_access(struct file *);
 static inline void put_write_access(struct inode * inode)
-- 
cgit v1.2.3


From 08b9fe6b12d32324f311c46b88102b6b9067d434 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Mon, 13 Oct 2008 00:09:50 -0400
Subject: [PATCH] i_version: remount support

Add support for remounting a filesystem with the i_version option.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 025a4a251b64..7d719c1a18e3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -136,7 +136,7 @@ extern int dir_notify_enable;
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK)
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
 
 /*
  * Old magic mount flag and mask
-- 
cgit v1.2.3


From e1759c215bee5abbcb6cb066590ab20905154ed5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Oct 2008 23:50:22 +0400
Subject: proc: switch /proc/meminfo to seq_file

and move it to fs/proc/meminfo.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/mm/pageattr.c    |  11 ++-
 fs/proc/Makefile          |   1 +
 fs/proc/meminfo.c         | 168 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c       | 137 -------------------------------------
 include/asm-x86/pgtable.h |   3 +-
 include/linux/hugetlb.h   |   6 +-
 mm/hugetlb.c              |   5 +-
 7 files changed, 183 insertions(+), 148 deletions(-)
 create mode 100644 fs/proc/meminfo.c

(limited to 'include/linux')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 407d8784f669..f1dc1b75d166 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -65,23 +65,22 @@ static void split_page_count(int level)
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
-int arch_report_meminfo(char *page)
+void arch_report_meminfo(struct seq_file *m)
 {
-	int n = sprintf(page, "DirectMap4k:  %8lu kB\n",
+	seq_printf(m, "DirectMap4k:  %8lu kB\n",
 			direct_pages_count[PG_LEVEL_4K] << 2);
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-	n += sprintf(page + n, "DirectMap2M:  %8lu kB\n",
+	seq_printf(m, "DirectMap2M:  %8lu kB\n",
 			direct_pages_count[PG_LEVEL_2M] << 11);
 #else
-	n += sprintf(page + n, "DirectMap4M:  %8lu kB\n",
+	seq_printf(m, "DirectMap4M:  %8lu kB\n",
 			direct_pages_count[PG_LEVEL_2M] << 12);
 #endif
 #ifdef CONFIG_X86_64
 	if (direct_gbpages)
-		n += sprintf(page + n, "DirectMap1G:  %8lu kB\n",
+		seq_printf(m, "DirectMap1G:  %8lu kB\n",
 			direct_pages_count[PG_LEVEL_1G] << 20);
 #endif
-	return n;
 }
 #else
 static inline void split_page_count(int level) { }
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 27efa14963b1..70607a03839d 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 proc-y	+= loadavg.o
+proc-y	+= meminfo.o
 proc-y	+= uptime.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 000000000000..b1675c4e66da
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,168 @@
+#include <linux/fs.h>
+#include <linux/hugetlb.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/quicklist.h>
+#include <linux/seq_file.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	struct sysinfo i;
+	unsigned long committed;
+	unsigned long allowed;
+	struct vmalloc_info vmi;
+	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
+
+/*
+ * display in kilobytes.
+ */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+	si_meminfo(&i);
+	si_swapinfo(&i);
+	committed = atomic_long_read(&vm_committed_space);
+	allowed = ((totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+
+	cached = global_page_state(NR_FILE_PAGES) -
+			total_swapcache_pages - i.bufferram;
+	if (cached < 0)
+		cached = 0;
+
+	get_vmalloc_info(&vmi);
+
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+	/*
+	 * Tagged format, for easy grepping and expansion.
+	 */
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+#endif
+#ifdef CONFIG_HIGHMEM
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
+#endif
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+		"Quicklists:     %8lu kB\n"
+#endif
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
+		K(i.totalram),
+		K(i.freeram),
+		K(i.bufferram),
+		K(cached),
+		K(total_swapcache_pages),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
+#endif
+#ifdef CONFIG_HIGHMEM
+		K(i.totalhigh),
+		K(i.freehigh),
+		K(i.totalram-i.totalhigh),
+		K(i.freeram-i.freehigh),
+#endif
+		K(i.totalswap),
+		K(i.freeswap),
+		K(global_page_state(NR_FILE_DIRTY)),
+		K(global_page_state(NR_WRITEBACK)),
+		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+		K(quicklist_total_size()),
+#endif
+		K(global_page_state(NR_UNSTABLE_NFS)),
+		K(global_page_state(NR_BOUNCE)),
+		K(global_page_state(NR_WRITEBACK_TEMP)),
+		K(allowed),
+		K(committed),
+		(unsigned long)VMALLOC_TOTAL >> 10,
+		vmi.used >> 10,
+		vmi.largest_chunk >> 10
+		);
+
+	hugetlb_report_meminfo(m);
+
+	arch_report_meminfo(m);
+
+	return 0;
+#undef K
+}
+
+static int meminfo_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, meminfo_proc_show, NULL);
+}
+
+static const struct file_operations meminfo_proc_fops = {
+	.open		= meminfo_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_meminfo_init(void)
+{
+	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	return 0;
+}
+module_init(proc_meminfo_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 484b6011bf0b..1aba51b0a0c4 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -78,142 +78,6 @@ static int proc_calc_metrics(char *page, char **start, off_t off,
 	return len;
 }
 
-int __attribute__((weak)) arch_report_meminfo(char *page)
-{
-	return 0;
-}
-
-static int meminfo_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct sysinfo i;
-	int len;
-	unsigned long committed;
-	unsigned long allowed;
-	struct vmalloc_info vmi;
-	long cached;
-	unsigned long pages[NR_LRU_LISTS];
-	int lru;
-
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
-	committed = atomic_long_read(&vm_committed_space);
-	allowed = ((totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100) + total_swap_pages;
-
-	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
-	if (cached < 0)
-		cached = 0;
-
-	get_vmalloc_info(&vmi);
-
-	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
-
-	/*
-	 * Tagged format, for easy grepping and expansion.
-	 */
-	len = sprintf(page,
-		"MemTotal:       %8lu kB\n"
-		"MemFree:        %8lu kB\n"
-		"Buffers:        %8lu kB\n"
-		"Cached:         %8lu kB\n"
-		"SwapCached:     %8lu kB\n"
-		"Active:         %8lu kB\n"
-		"Inactive:       %8lu kB\n"
-		"Active(anon):   %8lu kB\n"
-		"Inactive(anon): %8lu kB\n"
-		"Active(file):   %8lu kB\n"
-		"Inactive(file): %8lu kB\n"
-#ifdef CONFIG_UNEVICTABLE_LRU
-		"Unevictable:    %8lu kB\n"
-		"Mlocked:        %8lu kB\n"
-#endif
-#ifdef CONFIG_HIGHMEM
-		"HighTotal:      %8lu kB\n"
-		"HighFree:       %8lu kB\n"
-		"LowTotal:       %8lu kB\n"
-		"LowFree:        %8lu kB\n"
-#endif
-		"SwapTotal:      %8lu kB\n"
-		"SwapFree:       %8lu kB\n"
-		"Dirty:          %8lu kB\n"
-		"Writeback:      %8lu kB\n"
-		"AnonPages:      %8lu kB\n"
-		"Mapped:         %8lu kB\n"
-		"Slab:           %8lu kB\n"
-		"SReclaimable:   %8lu kB\n"
-		"SUnreclaim:     %8lu kB\n"
-		"PageTables:     %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
-		"Quicklists:     %8lu kB\n"
-#endif
-		"NFS_Unstable:   %8lu kB\n"
-		"Bounce:         %8lu kB\n"
-		"WritebackTmp:   %8lu kB\n"
-		"CommitLimit:    %8lu kB\n"
-		"Committed_AS:   %8lu kB\n"
-		"VmallocTotal:   %8lu kB\n"
-		"VmallocUsed:    %8lu kB\n"
-		"VmallocChunk:   %8lu kB\n",
-		K(i.totalram),
-		K(i.freeram),
-		K(i.bufferram),
-		K(cached),
-		K(total_swapcache_pages),
-		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
-		K(pages[LRU_ACTIVE_ANON]),
-		K(pages[LRU_INACTIVE_ANON]),
-		K(pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_FILE]),
-#ifdef CONFIG_UNEVICTABLE_LRU
-		K(pages[LRU_UNEVICTABLE]),
-		K(global_page_state(NR_MLOCK)),
-#endif
-#ifdef CONFIG_HIGHMEM
-		K(i.totalhigh),
-		K(i.freehigh),
-		K(i.totalram-i.totalhigh),
-		K(i.freeram-i.freehigh),
-#endif
-		K(i.totalswap),
-		K(i.freeswap),
-		K(global_page_state(NR_FILE_DIRTY)),
-		K(global_page_state(NR_WRITEBACK)),
-		K(global_page_state(NR_ANON_PAGES)),
-		K(global_page_state(NR_FILE_MAPPED)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE) +
-				global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE)),
-		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_PAGETABLE)),
-#ifdef CONFIG_QUICKLIST
-		K(quicklist_total_size()),
-#endif
-		K(global_page_state(NR_UNSTABLE_NFS)),
-		K(global_page_state(NR_BOUNCE)),
-		K(global_page_state(NR_WRITEBACK_TEMP)),
-		K(allowed),
-		K(committed),
-		(unsigned long)VMALLOC_TOTAL >> 10,
-		vmi.used >> 10,
-		vmi.largest_chunk >> 10
-		);
-
-		len += hugetlb_report_meminfo(page + len);
-
-	len += arch_report_meminfo(page + len);
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-#undef K
-}
-
 static int fragmentation_open(struct inode *inode, struct file *file)
 {
 	(void)inode;
@@ -816,7 +680,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"meminfo",	meminfo_read_proc},
 		{"version",	version_read_proc},
 #ifdef CONFIG_PROC_HARDWARE
 		{"hardware",	hardware_read_proc},
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 88a53b1a17f0..a3dda6d615be 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -348,7 +348,8 @@ static inline void native_pagetable_setup_start(pgd_t *base) {}
 static inline void native_pagetable_setup_done(pgd_t *base) {}
 #endif
 
-extern int arch_report_meminfo(char *page);
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 32e0ef0f6e1f..e1c8afc002c0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -27,7 +27,7 @@ void unmap_hugepage_range(struct vm_area_struct *,
 void __unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
-int hugetlb_report_meminfo(char *);
+void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -79,7 +79,9 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end, page)	BUG()
-#define hugetlb_report_meminfo(buf)		0
+static inline void hugetlb_report_meminfo(struct seq_file *m)
+{
+}
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
 #define follow_huge_pud(mm, addr, pud, write)	NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ce8cbb29860b..421aee99b84a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
@@ -1455,10 +1456,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 #endif /* CONFIG_SYSCTL */
 
-int hugetlb_report_meminfo(char *buf)
+void hugetlb_report_meminfo(struct seq_file *m)
 {
 	struct hstate *h = &default_hstate;
-	return sprintf(buf,
+	seq_printf(m,
 			"HugePages_Total:   %5lu\n"
 			"HugePages_Free:    %5lu\n"
 			"HugePages_Rsvd:    %5lu\n"
-- 
cgit v1.2.3


From d8ba7a363393f803c93c8cffabd6d0362618bc2a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 22:34:18 +0400
Subject: proc: move rest of /proc/locks to fs/locks.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/locks.c          | 22 +++++++++++++++++++++-
 fs/proc/proc_misc.c | 17 -----------------
 include/linux/fs.h  |  1 -
 3 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 5eb259e3cd38..90e87f57b331 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2078,6 +2078,7 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 
 #ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
@@ -2183,12 +2184,31 @@ static void locks_stop(struct seq_file *f, void *v)
 	unlock_kernel();
 }
 
-struct seq_operations locks_seq_operations = {
+static const struct seq_operations locks_seq_operations = {
 	.start	= locks_start,
 	.next	= locks_next,
 	.stop	= locks_stop,
 	.show	= locks_show,
 };
+
+static int locks_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &locks_seq_operations);
+}
+
+static const struct file_operations proc_locks_operations = {
+	.open		= locks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_locks_init(void)
+{
+	proc_create("locks", 0, NULL, &proc_locks_operations);
+	return 0;
+}
+module_init(proc_locks_init);
 #endif
 
 /**
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fcac25edaef7..fea7d658fff6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -453,20 +453,6 @@ static const struct file_operations proc_interrupts_operations = {
 	.release	= seq_release,
 };
 
-#ifdef CONFIG_FILE_LOCKING
-static int locks_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &locks_seq_operations);
-}
-
-static const struct file_operations proc_locks_operations = {
-	.open		= locks_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif /* CONFIG_FILE_LOCKING */
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -605,9 +591,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_FILE_LOCKING
-	proc_create("locks", 0, NULL, &proc_locks_operations);
-#endif
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6a625be13fc..024049543ae1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1037,7 +1037,6 @@ extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
-extern struct seq_operations locks_seq_operations;
 #else /* !CONFIG_FILE_LOCKING */
 #define fcntl_getlk(a, b) ({ -EINVAL; })
 #define fcntl_setlk(a, b, c, d) ({ -EACCES; })
-- 
cgit v1.2.3


From f500975a3f3ecf3611d79f1d933906753460b9f2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 23:53:21 +0400
Subject: proc: move rest of /proc/partitions code to block/genhd.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 22 +++++++++++++++++++++-
 fs/proc/proc_misc.c   | 14 --------------
 include/linux/genhd.h |  1 -
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 646e1d2507c7..15f4d2b12c48 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -10,6 +10,7 @@
 #include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
@@ -727,12 +728,24 @@ static int show_partition(struct seq_file *seqf, void *v)
 	return 0;
 }
 
-const struct seq_operations partitions_op = {
+static const struct seq_operations partitions_op = {
 	.start	= show_partition_start,
 	.next	= disk_seqf_next,
 	.stop	= disk_seqf_stop,
 	.show	= show_partition
 };
+
+static int partitions_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &partitions_op);
+}
+
+static const struct file_operations proc_partitions_operations = {
+	.open		= partitions_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
 #endif
 
 
@@ -998,6 +1011,13 @@ const struct seq_operations diskstats_op = {
 	.stop	= disk_seqf_stop,
 	.show	= diskstats_show
 };
+
+static int __init proc_genhd_init(void)
+{
+	proc_create("partitions", 0, NULL, &proc_partitions_operations);
+	return 0;
+}
+module_init(proc_genhd_init);
 #endif /* CONFIG_PROC_FS */
 
 static void media_change_notify_thread(struct work_struct *work)
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8974809be5f6..253ea50c4393 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -106,17 +106,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 };
 
 #ifdef CONFIG_BLOCK
-static int partitions_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &partitions_op);
-}
-static const struct file_operations proc_partitions_operations = {
-	.open		= partitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int diskstats_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &diskstats_op);
@@ -519,9 +508,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_BLOCK
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
-#endif
 	proc_create("stat", 0, NULL, &proc_stat_operations);
 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 #ifdef CONFIG_SLABINFO
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 206cdf96c3a7..074a4fdf4365 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -25,7 +25,6 @@ extern struct device_type part_type;
 extern struct kobject *block_depr;
 extern struct class block_class;
 
-extern const struct seq_operations partitions_op;
 extern const struct seq_operations diskstats_op;
 
 enum {
-- 
cgit v1.2.3


From 7b3c3a50a3e0ea46815150d420fa276ac254572b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 02:42:17 +0400
Subject: proc: move /proc/slabinfo boilerplate to mm/slub.c, mm/slab.c

Drop the dummy ->write hook for SLUB; this is possible now that each allocator defines its own file_operations.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 fs/proc/proc_misc.c  | 17 -----------------
 include/linux/slab.h |  5 -----
 mm/slab.c            | 16 +++++++++++++++-
 mm/slub.c            | 29 ++++++++++++++++++++---------
 4 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bca02842d07..1d6d5c5cc2a8 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -132,20 +132,6 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
 
-#ifdef CONFIG_SLABINFO
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slabinfo_op);
-}
-static const struct file_operations proc_slabinfo_operations = {
-	.open		= slabinfo_open,
-	.read		= seq_read,
-	.write		= slabinfo_write,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 #ifdef CONFIG_MMU
 static int vmalloc_open(struct inode *inode, struct file *file)
 {
@@ -309,9 +295,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_SLABINFO
-	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
-#endif
 #ifdef CONFIG_MMU
 	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
 #endif
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 5ff9676c1e2c..ba965c84ae06 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -288,9 +288,4 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
-#ifdef CONFIG_SLABINFO
-extern const struct seq_operations slabinfo_op;
-ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
-#endif
-
 #endif	/* _LINUX_SLAB_H */
diff --git a/mm/slab.c b/mm/slab.c
index d53ac9c26ab7..09187517f9dc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4259,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 
-const struct seq_operations slabinfo_op = {
+static const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -4316,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 	return res;
 }
 
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+	.open		= slabinfo_open,
+	.read		= seq_read,
+	.write		= slabinfo_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 
 static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4478,6 +4491,7 @@ static const struct file_operations proc_slabstats_operations = {
 
 static int __init slab_proc_init(void)
 {
+	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
 #endif
diff --git a/mm/slub.c b/mm/slub.c
index 0c83e6afe7b2..7ad489af9561 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -14,6 +14,7 @@
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
 #include <linux/slab.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -4417,14 +4418,6 @@ __initcall(slab_sysfs_init);
  * The /proc/slabinfo ABI
  */
 #ifdef CONFIG_SLABINFO
-
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-		       size_t count, loff_t *ppos)
-{
-	return -EINVAL;
-}
-
-
 static void print_slabinfo_header(struct seq_file *m)
 {
 	seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4492,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-const struct seq_operations slabinfo_op = {
+static const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
 	.show = s_show,
 };
 
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+	.open		= slabinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+	proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); /* no ->write hook */
+	return 0;
+}
+module_init(slab_proc_init);
 #endif /* CONFIG_SLABINFO */
-- 
cgit v1.2.3


From 5f6a6a9c4e4d790aae55cb412a7643329057c5e0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 03:50:47 +0400
Subject: proc: move /proc/vmallocinfo to mm/vmalloc.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c     | 28 ----------------------------
 include/linux/vmalloc.h |  2 --
 mm/vmalloc.c            | 33 ++++++++++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1d6d5c5cc2a8..fd41a032456b 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -132,31 +132,6 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
 
-#ifdef CONFIG_MMU
-static int vmalloc_open(struct inode *inode, struct file *file)
-{
-	unsigned int *ptr = NULL;
-	int ret;
-
-	if (NUMA_BUILD)
-		ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-	ret = seq_open(file, &vmalloc_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = ptr;
-	} else
-		kfree(ptr);
-	return ret;
-}
-
-static const struct file_operations proc_vmalloc_operations = {
-	.open		= vmalloc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-#endif
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -295,9 +270,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_MMU
-	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
-#endif
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4c28c4d564e2..307b88577eaa 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -103,6 +103,4 @@ extern void free_vm_area(struct vm_struct *area);
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
 
-extern const struct seq_operations vmalloc_op;
-
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 65ae576030da..036536945dd9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
@@ -1718,11 +1719,41 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-const struct seq_operations vmalloc_op = {
+static const struct seq_operations vmalloc_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
 	.show = s_show,
 };
+
+static int vmalloc_open(struct inode *inode, struct file *file)
+{
+	unsigned int *ptr = NULL;
+	int ret;
+
+	if (NUMA_BUILD)
+		ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+	ret = seq_open(file, &vmalloc_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = ptr;
+	} else
+		kfree(ptr);
+	return ret;
+}
+
+static const struct file_operations proc_vmalloc_operations = {
+	.open		= vmalloc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int __init proc_vmalloc_init(void)
+{
+	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
+	return 0;
+}
+module_init(proc_vmalloc_init);
 #endif
 
-- 
cgit v1.2.3


From 8f32f7e5ac2ed11b0659b6b55af926f3d58ffd9d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:13:52 +0400
Subject: proc: move /proc/buddyinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c    | 14 --------------
 include/linux/vmstat.h |  1 -
 mm/vmstat.c            | 25 +++++++++++++++++++++----
 3 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fd41a032456b..a35e50659b8d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,19 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int fragmentation_open(struct inode *inode, struct file *file)
-{
-	(void)inode;
-	return seq_open(file, &fragmentation_op);
-}
-
-static const struct file_operations fragmentation_file_operations = {
-	.open		= fragmentation_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int pagetypeinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &pagetypeinfo_op);
@@ -270,7 +257,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 9cd3ab0f554d..d4551f206409 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -54,7 +54,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NR_VM_EVENT_ITEMS
 };
 
-extern const struct seq_operations fragmentation_op;
 extern const struct seq_operations pagetypeinfo_op;
 extern const struct seq_operations zoneinfo_op;
 extern const struct seq_operations vmstat_op;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9343227c5c60..f45d7245a282 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -8,7 +8,7 @@
  *  Copyright (C) 2006 Silicon Graphics, Inc.,
  *		Christoph Lameter <christoph@lameter.com>
  */
-
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -384,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #endif
 
 #ifdef CONFIG_PROC_FS
-
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
 static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -581,13 +581,25 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-const struct seq_operations fragmentation_op = {
+static const struct seq_operations fragmentation_op = {
 	.start	= frag_start,
 	.next	= frag_next,
 	.stop	= frag_stop,
 	.show	= frag_show,
 };
 
+static int fragmentation_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &fragmentation_op);
+}
+
+static const struct file_operations fragmentation_file_operations = {
+	.open		= fragmentation_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 const struct seq_operations pagetypeinfo_op = {
 	.start	= frag_start,
 	.next	= frag_next,
@@ -898,9 +910,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 
 static struct notifier_block __cpuinitdata vmstat_notifier =
 	{ &vmstat_cpuup_callback, NULL, 0 };
+#endif
 
 static int __init setup_vmstat(void)
 {
+#ifdef CONFIG_SMP
 	int cpu;
 
 	refresh_zone_stat_thresholds();
@@ -908,7 +922,10 @@ static int __init setup_vmstat(void)
 
 	for_each_online_cpu(cpu)
 		start_cpu_timer(cpu);
+#endif
+#ifdef CONFIG_PROC_FS
+	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+#endif
 	return 0;
 }
 module_init(setup_vmstat)
-#endif
-- 
cgit v1.2.3


From 74e2e8e8ce7b3c0f878a349f9fa6cf2831548eef Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:15:36 +0400
Subject: proc: move /proc/pagetypeinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c    | 13 -------------
 include/linux/vmstat.h |  1 -
 mm/vmstat.c            | 15 ++++++++++++++-
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a35e50659b8d..900331a634ef 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,18 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int pagetypeinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &pagetypeinfo_op);
-}
-
-static const struct file_operations pagetypeinfo_file_ops = {
-	.open		= pagetypeinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int zoneinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &zoneinfo_op);
@@ -257,7 +245,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d4551f206409..33ffd89a88ac 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -54,7 +54,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NR_VM_EVENT_ITEMS
 };
 
-extern const struct seq_operations pagetypeinfo_op;
 extern const struct seq_operations zoneinfo_op;
 extern const struct seq_operations vmstat_op;
 extern int sysctl_stat_interval;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f45d7245a282..d624d251946d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -600,13 +600,25 @@ static const struct file_operations fragmentation_file_operations = {
 	.release	= seq_release,
 };
 
-const struct seq_operations pagetypeinfo_op = {
+static const struct seq_operations pagetypeinfo_op = {
 	.start	= frag_start,
 	.next	= frag_next,
 	.stop	= frag_stop,
 	.show	= pagetypeinfo_show,
 };
 
+static int pagetypeinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &pagetypeinfo_op);
+}
+
+static const struct file_operations pagetypeinfo_file_ops = {
+	.open		= pagetypeinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 #ifdef CONFIG_ZONE_DMA
 #define TEXT_FOR_DMA(xx) xx "_dma",
 #else
@@ -925,6 +937,7 @@ static int __init setup_vmstat(void)
 #endif
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 #endif
 	return 0;
 }
-- 
cgit v1.2.3


From b6aa44ab698c7df9d951d3eb45c4fcb8ba68fb25 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:17:48 +0400
Subject: proc: move /proc/vmstat boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c    | 12 ------------
 include/linux/vmstat.h |  1 -
 mm/vmstat.c            | 14 +++++++++++++-
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 900331a634ef..e7a301d5d432 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -69,17 +69,6 @@ static const struct file_operations proc_zoneinfo_file_operations = {
 	.release	= seq_release,
 };
 
-static int vmstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &vmstat_op);
-}
-static const struct file_operations proc_vmstat_file_operations = {
-	.open		= vmstat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_BLOCK
 static int diskstats_open(struct inode *inode, struct file *file)
 {
@@ -245,7 +234,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 33ffd89a88ac..7b68c4c1e19c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -55,7 +55,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 };
 
 extern const struct seq_operations zoneinfo_op;
-extern const struct seq_operations vmstat_op;
 extern int sysctl_stat_interval;
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d624d251946d..7e1854b81868 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -858,13 +858,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
 	m->private = NULL;
 }
 
-const struct seq_operations vmstat_op = {
+static const struct seq_operations vmstat_op = {
 	.start	= vmstat_start,
 	.next	= vmstat_next,
 	.stop	= vmstat_stop,
 	.show	= vmstat_show,
 };
 
+static int vmstat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vmstat_op);
+}
+
+static const struct file_operations proc_vmstat_file_operations = {
+	.open		= vmstat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
@@ -938,6 +949,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
+	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 #endif
 	return 0;
 }
-- 
cgit v1.2.3


From 5c9fe6281b75832e8d2555ec8700ea763d9a865e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:19:42 +0400
Subject: proc: move /proc/zoneinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c    | 13 -------------
 include/linux/vmstat.h |  1 -
 mm/vmstat.c            | 15 ++++++++++++++-
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e7a301d5d432..8f3a6f085c5f 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,18 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int zoneinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &zoneinfo_op);
-}
-
-static const struct file_operations proc_zoneinfo_file_operations = {
-	.open		= zoneinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_BLOCK
 static int diskstats_open(struct inode *inode, struct file *file)
 {
@@ -234,7 +222,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
 #endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 7b68c4c1e19c..524cd1b28ecb 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -54,7 +54,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NR_VM_EVENT_ITEMS
 };
 
-extern const struct seq_operations zoneinfo_op;
 extern int sysctl_stat_interval;
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7e1854b81868..c3ccfda23adc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -795,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-const struct seq_operations zoneinfo_op = {
+static const struct seq_operations zoneinfo_op = {
 	.start	= frag_start, /* iterate over all zones. The same as in
 			       * fragmentation. */
 	.next	= frag_next,
@@ -803,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
 	.show	= zoneinfo_show,
 };
 
+static int zoneinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &zoneinfo_op);
+}
+
+static const struct file_operations proc_zoneinfo_file_operations = {
+	.open		= zoneinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *v;
@@ -950,6 +962,7 @@ static int __init setup_vmstat(void)
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #endif
 	return 0;
 }
-- 
cgit v1.2.3


From 31d85ab28e71b0c938e0ef48af45747e80d99b53 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 12:55:38 +0400
Subject: proc: move /proc/diskstats boilerplate to block/genhd.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 15 ++++++++++++++-
 fs/proc/proc_misc.c   | 16 ----------------
 include/linux/genhd.h |  2 --
 3 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 15f4d2b12c48..4e5e7493f676 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1005,15 +1005,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	return 0;
 }
 
-const struct seq_operations diskstats_op = {
+static const struct seq_operations diskstats_op = {
 	.start	= disk_seqf_start,
 	.next	= disk_seqf_next,
 	.stop	= disk_seqf_stop,
 	.show	= diskstats_show
 };
 
+static int diskstats_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &diskstats_op);
+}
+
+static const struct file_operations proc_diskstats_operations = {
+	.open		= diskstats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static int __init proc_genhd_init(void)
 {
+	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
 	proc_create("partitions", 0, NULL, &proc_partitions_operations);
 	return 0;
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8f3a6f085c5f..7c22831efd94 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,19 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-#ifdef CONFIG_BLOCK
-static int diskstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &diskstats_op);
-}
-static const struct file_operations proc_diskstats_operations = {
-	.open		= diskstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 #ifdef CONFIG_MODULES
 extern const struct seq_operations modules_op;
 static int modules_open(struct inode *inode, struct file *file)
@@ -222,9 +209,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_BLOCK
-	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-#endif
 #ifdef CONFIG_MODULES
 	proc_create("modules", 0, NULL, &proc_modules_operations);
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 074a4fdf4365..e439e6aed832 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -25,8 +25,6 @@ extern struct device_type part_type;
 extern struct kobject *block_depr;
 extern struct class block_class;
 
-extern const struct seq_operations diskstats_op;
-
 enum {
 /* These three have identical behaviour; use the second one if DOS FDISK gets
    confused about extended/logical partitions starting past cylinder 1023. */
-- 
cgit v1.2.3


From 593eb8a2d63e95772a5f22d746f18a997c5ee463 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 09:32:59 -0400
Subject: ftrace: return error on failed modified text.

Have ftrace_modify_code() return error values:

  -EFAULT on error reading the address

  -EINVAL if what is read does not match what was expected

  -EPERM  if the write fails to update after a successful match.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c | 14 +++++++-------
 include/linux/ftrace.h   | 24 ++++++++++++++++++++++--
 kernel/trace/ftrace.c    | 21 +++++++++++++++------
 3 files changed, 44 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8821ceabf51d..428291581cb2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -62,7 +62,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
 	unsigned char replaced[MCOUNT_INSN_SIZE];
-	int ret;
 
 	/*
 	 * Note: Due to modules and __init, code can
@@ -72,15 +71,16 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	 * No real locking needed, this code is run through
 	 * kstop_machine, or before SMP starts.
 	 */
-	if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
-		return 1;
+	if (__copy_from_user_inatomic(replaced, (char __user *)ip,
+				      MCOUNT_INSN_SIZE))
+		return -EFAULT;
 
 	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
-		return 2;
+		return -EINVAL;
 
-	ret = __copy_to_user_inatomic((char __user *)ip, new_code,
-					MCOUNT_INSN_SIZE);
-	WARN_ON_ONCE(ret);
+	if (__copy_to_user_inatomic((char __user *)ip, new_code,
+				    MCOUNT_INSN_SIZE))
+		return -EPERM;
 
 	sync_core();
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0e9529589151..79fa10cbdcfb 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -72,13 +72,33 @@ extern unsigned char *ftrace_nop_replace(void);
 extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_mcount_set(unsigned long *data);
-extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-			      unsigned char *new_code);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
 
+/**
+ * ftrace_modify_code - modify code segment
+ * @ip: the address of the code segment
+ * @old_code: the contents of what is expected to be there
+ * @new_code: the code to patch in
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+			      unsigned char *new_code);
+
 extern int skip_trace(unsigned long ip);
 
 extern void ftrace_release(void *start, unsigned long size);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1f54a94189fe..b2de8de77356 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -596,22 +596,22 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 {
 	unsigned long ip;
 	unsigned char *nop, *call;
-	int failed;
+	int ret;
 
 	ip = rec->ip;
 
 	nop = ftrace_nop_replace();
 	call = ftrace_call_replace(ip, mcount_addr);
 
-	failed = ftrace_modify_code(ip, call, nop);
-	if (failed) {
-		switch (failed) {
-		case 1:
+	ret = ftrace_modify_code(ip, call, nop);
+	if (ret) {
+		switch (ret) {
+		case -EFAULT:
 			WARN_ON_ONCE(1);
 			pr_info("ftrace faulted on modifying ");
 			print_ip_sym(ip);
 			break;
-		case 2:
+		case -EINVAL:
 			WARN_ON_ONCE(1);
 			pr_info("ftrace failed to modify ");
 			print_ip_sym(ip);
@@ -620,6 +620,15 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 			print_ip_ins(" replace: ", nop);
 			printk(KERN_CONT "\n");
 			break;
+		case -EPERM:
+			WARN_ON_ONCE(1);
+			pr_info("ftrace faulted on writing ");
+			print_ip_sym(ip);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			pr_info("ftrace faulted on unknown error ");
+			print_ip_sym(ip);
 		}
 
 		rec->flags |= FTRACE_FL_FAILED;
-- 
cgit v1.2.3


From 81adbdc029ecc416d56563e7f159100181dd711d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 09:33:02 -0400
Subject: ftrace: only have ftrace_kill atomic

When an anomaly is detected, we need a way to completely disable
ftrace. Right now we have two functions: ftrace_kill and ftrace_kill_atomic.
ftrace_kill tries to do it in a "nice" way by converting everything
back to a nop.

The "nice" way is itself dangerous, so this patch removes it and keeps
only the "atomic" version (now named ftrace_kill), which is all that is
needed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  3 +--
 kernel/trace/ftrace.c  | 42 ++----------------------------------------
 kernel/trace/trace.c   |  2 +-
 3 files changed, 4 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 79fa10cbdcfb..ac58e94668b7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -40,7 +40,7 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define register_ftrace_function(ops) do { } while (0)
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
-static inline void ftrace_kill_atomic(void) { }
+static inline void ftrace_kill(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -117,7 +117,6 @@ static inline void ftrace_release(void *start, unsigned long size) { }
 
 /* totally disable ftrace - can not re-enable after this */
 void ftrace_kill(void);
-void ftrace_kill_atomic(void);
 
 static inline void tracer_disable(void)
 {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2de8de77356..93245ae046e1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1549,22 +1549,6 @@ int ftrace_force_update(void)
 	return ret;
 }
 
-static void ftrace_force_shutdown(void)
-{
-	struct task_struct *task;
-	int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
-
-	mutex_lock(&ftraced_lock);
-	task = ftraced_task;
-	ftraced_task = NULL;
-	ftraced_suspend = -1;
-	ftrace_run_update_code(command);
-	mutex_unlock(&ftraced_lock);
-
-	if (task)
-		kthread_stop(task);
-}
-
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -1795,17 +1779,16 @@ core_initcall(ftrace_dynamic_init);
 # define ftrace_shutdown()		do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
-# define ftrace_force_shutdown()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /**
- * ftrace_kill_atomic - kill ftrace from critical sections
+ * ftrace_kill - kill ftrace
  *
  * This function should be used by panic code. It stops ftrace
  * but in a not so nice way. If you need to simply kill ftrace
  * from a non-atomic section, use ftrace_kill.
  */
-void ftrace_kill_atomic(void)
+void ftrace_kill(void)
 {
 	ftrace_disabled = 1;
 	ftrace_enabled = 0;
@@ -1815,27 +1798,6 @@ void ftrace_kill_atomic(void)
 	clear_ftrace_function();
 }
 
-/**
- * ftrace_kill - totally shutdown ftrace
- *
- * This is a safety measure. If something was detected that seems
- * wrong, calling this function will keep ftrace from doing
- * any more modifications, and updates.
- * used when something went wrong.
- */
-void ftrace_kill(void)
-{
-	mutex_lock(&ftrace_sysctl_lock);
-	ftrace_disabled = 1;
-	ftrace_enabled = 0;
-
-	clear_ftrace_function();
-	mutex_unlock(&ftrace_sysctl_lock);
-
-	/* Try to totally disable ftrace */
-	ftrace_force_shutdown();
-}
-
 /**
  * register_ftrace_function - register a function for profiling
  * @ops - ops structure that holds the function for profiling.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aeb2f2505bc5..333a5162149b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3097,7 +3097,7 @@ void ftrace_dump(void)
 	dump_ran = 1;
 
 	/* No turning back! */
-	ftrace_kill_atomic();
+	ftrace_kill();
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&global_trace.data[cpu]->disabled);
-- 
cgit v1.2.3


From 4d296c24326783bff1282ac72f310d8bac8df413 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 09:33:06 -0400
Subject: ftrace: remove mcount set

The arch dependent function ftrace_mcount_set was only used by the daemon
start up code. This patch removes it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/ftrace.c     | 13 -------------
 arch/powerpc/kernel/ftrace.c | 17 -----------------
 arch/sparc64/kernel/ftrace.c | 18 ------------------
 arch/x86/kernel/ftrace.c     |  7 -------
 include/linux/ftrace.h       |  1 -
 kernel/trace/ftrace.c        |  9 ---------
 6 files changed, 65 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 76d50e6091bc..6c90479e8974 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -95,19 +95,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
-int ftrace_mcount_set(unsigned long *data)
-{
-	unsigned long pc, old;
-	unsigned long *addr = data;
-	unsigned char *new;
-
-	pc = (unsigned long)&mcount_call;
-	memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(pc, *addr);
-	*addr = ftrace_modify_code(pc, (unsigned char *)&old, new);
-	return 0;
-}
-
 /* run from kstop_machine */
 int __init ftrace_dyn_arch_init(void *data)
 {
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 3855ceb937b0..6b75522e8b34 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -126,23 +126,6 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
-notrace int ftrace_mcount_set(unsigned long *data)
-{
-	unsigned long ip = (long)(&mcount_call);
-	unsigned long *addr = data;
-	unsigned char old[MCOUNT_INSN_SIZE], *new;
-
-	/*
-	 * Replace the mcount stub with a pointer to the
-	 * ip recorder function.
-	 */
-	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(ip, *addr);
-	*addr = ftrace_modify_code(ip, old, new);
-
-	return 0;
-}
-
 int __init ftrace_dyn_arch_init(void *data)
 {
 	/* This is running in kstop_machine */
diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c
index 4298d0aee713..447942041a7c 100644
--- a/arch/sparc64/kernel/ftrace.c
+++ b/arch/sparc64/kernel/ftrace.c
@@ -69,24 +69,6 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ftrace_modify_code(ip, old, new);
 }
 
-notrace int ftrace_mcount_set(unsigned long *data)
-{
-	unsigned long ip = (long)(&mcount_call);
-	unsigned long *addr = data;
-	unsigned char old[MCOUNT_INSN_SIZE], *new;
-
-	/*
-	 * Replace the mcount stub with a pointer to the
-	 * ip recorder function.
-	 */
-	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(ip, *addr);
-	*addr = ftrace_modify_code(ip, old, new);
-
-	return 0;
-}
-
-
 int __init ftrace_dyn_arch_init(void *data)
 {
 	ftrace_mcount_set(data);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index da4fb0deecf7..b399eed23538 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -103,13 +103,6 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
-notrace int ftrace_mcount_set(unsigned long *data)
-{
-	/* mcount is initialized as a nop */
-	*data = 0;
-	return 0;
-}
-
 int __init ftrace_dyn_arch_init(void *data)
 {
 	extern const unsigned char ftrace_test_p6nop[];
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ac58e94668b7..1c4835f86911 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -71,7 +71,6 @@ extern int ftrace_ip_converted(unsigned long ip);
 extern unsigned char *ftrace_nop_replace(void);
 extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
-extern int ftrace_mcount_set(unsigned long *data);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e758cab0836f..226fd9132d53 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -620,7 +620,6 @@ static int ftrace_update_code(void *ignore);
 
 static int __ftrace_modify_code(void *data)
 {
-	unsigned long addr;
 	int *command = data;
 
 	if (*command & FTRACE_ENABLE_CALLS) {
@@ -639,14 +638,6 @@ static int __ftrace_modify_code(void *data)
 	if (*command & FTRACE_UPDATE_TRACE_FUNC)
 		ftrace_update_ftrace_func(ftrace_trace_function);
 
-	if (*command & FTRACE_ENABLE_MCOUNT) {
-		addr = (unsigned long)ftrace_record_ip;
-		ftrace_mcount_set(&addr);
-	} else if (*command & FTRACE_DISABLE_MCOUNT) {
-		addr = (unsigned long)ftrace_stub;
-		ftrace_mcount_set(&addr);
-	}
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 08f5ac906d2c0faf96d608c54a0b03177376da8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 09:33:07 -0400
Subject: ftrace: remove ftrace hash

The ftrace hash was used by the ftrace_daemon code. The record ip function
would place the calling address (ip) into the hash. The daemon would later
read the hash and modify that code.

The hash complicates the code. This patch removes it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |   8 +-
 kernel/trace/ftrace.c  | 243 +++++++------------------------------------------
 kernel/trace/trace.c   |   3 -
 3 files changed, 38 insertions(+), 216 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1c4835f86911..703eb53cfa2b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -44,8 +44,6 @@ static inline void ftrace_kill(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-# define FTRACE_HASHBITS	10
-# define FTRACE_HASHSIZE	(1<<FTRACE_HASHBITS)
 
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
@@ -58,9 +56,9 @@ enum {
 };
 
 struct dyn_ftrace {
-	struct hlist_node node;
-	unsigned long	  ip; /* address of mcount call-site */
-	unsigned long	  flags;
+	struct list_head	list;
+	unsigned long		ip; /* address of mcount call-site */
+	unsigned long		flags;
 };
 
 int ftrace_force_update(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 226fd9132d53..07762c08a944 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -25,7 +25,6 @@
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
-#include <linux/hash.h>
 #include <linux/list.h>
 
 #include <asm/ftrace.h>
@@ -189,9 +188,7 @@ static int ftrace_filtered;
 static int tracing_on;
 static int frozen_record_count;
 
-static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
-
-static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+static LIST_HEAD(ftrace_new_addrs);
 
 static DEFINE_MUTEX(ftrace_regex_lock);
 
@@ -210,8 +207,6 @@ struct ftrace_page {
 static struct ftrace_page	*ftrace_pages_start;
 static struct ftrace_page	*ftrace_pages;
 
-static int ftrace_record_suspend;
-
 static struct dyn_ftrace *ftrace_free_records;
 
 
@@ -242,72 +237,6 @@ static inline int record_frozen(struct dyn_ftrace *rec)
 # define record_frozen(rec)			({ 0; })
 #endif /* CONFIG_KPROBES */
 
-int skip_trace(unsigned long ip)
-{
-	unsigned long fl;
-	struct dyn_ftrace *rec;
-	struct hlist_node *t;
-	struct hlist_head *head;
-
-	if (frozen_record_count == 0)
-		return 0;
-
-	head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
-	hlist_for_each_entry_rcu(rec, t, head, node) {
-		if (rec->ip == ip) {
-			if (record_frozen(rec)) {
-				if (rec->flags & FTRACE_FL_FAILED)
-					return 1;
-
-				if (!(rec->flags & FTRACE_FL_CONVERTED))
-					return 1;
-
-				if (!tracing_on || !ftrace_enabled)
-					return 1;
-
-				if (ftrace_filtered) {
-					fl = rec->flags & (FTRACE_FL_FILTER |
-							   FTRACE_FL_NOTRACE);
-					if (!fl || (fl & FTRACE_FL_NOTRACE))
-						return 1;
-				}
-			}
-			break;
-		}
-	}
-
-	return 0;
-}
-
-static inline int
-ftrace_ip_in_hash(unsigned long ip, unsigned long key)
-{
-	struct dyn_ftrace *p;
-	struct hlist_node *t;
-	int found = 0;
-
-	hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
-		if (p->ip == ip) {
-			found = 1;
-			break;
-		}
-	}
-
-	return found;
-}
-
-static inline void
-ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
-{
-	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
-}
-
-/* called from kstop_machine */
-static inline void ftrace_del_hash(struct dyn_ftrace *node)
-{
-	hlist_del(&node->node);
-}
-
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
 	rec->ip = (unsigned long)ftrace_free_records;
@@ -362,69 +291,36 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 	}
 
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
-		if (!ftrace_pages->next)
-			return NULL;
+		if (!ftrace_pages->next) {
+			/* allocate another page */
+			ftrace_pages->next =
+				(void *)get_zeroed_page(GFP_KERNEL);
+			if (!ftrace_pages->next)
+				return NULL;
+		}
 		ftrace_pages = ftrace_pages->next;
 	}
 
 	return &ftrace_pages->records[ftrace_pages->index++];
 }
 
-static void
+static struct dyn_ftrace *
 ftrace_record_ip(unsigned long ip)
 {
-	struct dyn_ftrace *node;
-	unsigned long key;
-	int resched;
-	int cpu;
+	struct dyn_ftrace *rec;
 
 	if (!ftrace_enabled || ftrace_disabled)
-		return;
-
-	resched = need_resched();
-	preempt_disable_notrace();
-
-	/*
-	 * We simply need to protect against recursion.
-	 * Use the the raw version of smp_processor_id and not
-	 * __get_cpu_var which can call debug hooks that can
-	 * cause a recursive crash here.
-	 */
-	cpu = raw_smp_processor_id();
-	per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
-	if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
-		goto out;
-
-	if (unlikely(ftrace_record_suspend))
-		goto out;
-
-	key = hash_long(ip, FTRACE_HASHBITS);
-
-	FTRACE_WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
-
-	if (ftrace_ip_in_hash(ip, key))
-		goto out;
-
-	/* This ip may have hit the hash before the lock */
-	if (ftrace_ip_in_hash(ip, key))
-		goto out;
-
-	node = ftrace_alloc_dyn_node(ip);
-	if (!node)
-		goto out;
+		return NULL;
 
-	node->ip = ip;
+	rec = ftrace_alloc_dyn_node(ip);
+	if (!rec)
+		return NULL;
 
-	ftrace_add_hash(node, key);
+	rec->ip = ip;
 
- out:
-	per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+	list_add(&rec->list, &ftrace_new_addrs);
 
-	/* prevent recursion with scheduler */
-	if (resched)
-		preempt_enable_no_resched_notrace();
-	else
-		preempt_enable_notrace();
+	return rec;
 }
 
 #define FTRACE_ADDR ((long)(ftrace_caller))
@@ -543,7 +439,6 @@ static void ftrace_replace_code(int enable)
 				rec->flags |= FTRACE_FL_FAILED;
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
-					ftrace_del_hash(rec);
 					ftrace_free_rec(rec);
 				}
 			}
@@ -551,15 +446,6 @@ static void ftrace_replace_code(int enable)
 	}
 }
 
-static void ftrace_shutdown_replenish(void)
-{
-	if (ftrace_pages->next)
-		return;
-
-	/* allocate another page */
-	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
-}
-
 static void print_ip_ins(const char *fmt, unsigned char *p)
 {
 	int i;
@@ -616,18 +502,11 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	return 1;
 }
 
-static int ftrace_update_code(void *ignore);
-
 static int __ftrace_modify_code(void *data)
 {
 	int *command = data;
 
 	if (*command & FTRACE_ENABLE_CALLS) {
-		/*
-		 * Update any recorded ips now that we have the
-		 * machine stopped
-		 */
-		ftrace_update_code(NULL);
 		ftrace_replace_code(1);
 		tracing_on = 1;
 	} else if (*command & FTRACE_DISABLE_CALLS) {
@@ -738,84 +617,34 @@ static cycle_t		ftrace_update_time;
 static unsigned long	ftrace_update_cnt;
 unsigned long		ftrace_update_tot_cnt;
 
-static int ftrace_update_code(void *ignore)
+static int ftrace_update_code(void)
 {
-	int i, save_ftrace_enabled;
+	struct dyn_ftrace *p, *t;
 	cycle_t start, stop;
-	struct dyn_ftrace *p;
-	struct hlist_node *t, *n;
-	struct hlist_head *head, temp_list;
-
-	/* Don't be recording funcs now */
-	ftrace_record_suspend++;
-	save_ftrace_enabled = ftrace_enabled;
-	ftrace_enabled = 0;
 
 	start = ftrace_now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
 
-	/* No locks needed, the machine is stopped! */
-	for (i = 0; i < FTRACE_HASHSIZE; i++) {
-		INIT_HLIST_HEAD(&temp_list);
-		head = &ftrace_hash[i];
+	list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
 
-		/* all CPUS are stopped, we are safe to modify code */
-		hlist_for_each_entry_safe(p, t, n, head, node) {
-			/* Skip over failed records which have not been
-			 * freed. */
-			if (p->flags & FTRACE_FL_FAILED)
-				continue;
+		/* If something went wrong, bail without enabling anything */
+		if (unlikely(ftrace_disabled))
+			return -1;
 
-			/* Unconverted records are always at the head of the
-			 * hash bucket. Once we encounter a converted record,
-			 * simply skip over to the next bucket. Saves ftraced
-			 * some processor cycles (ftrace does its bid for
-			 * global warming :-p ). */
-			if (p->flags & (FTRACE_FL_CONVERTED))
-				break;
+		list_del_init(&p->list);
 
-			/* Ignore updates to this record's mcount site.
-			 * Reintroduce this record at the head of this
-			 * bucket to attempt to "convert" it again if
-			 * the kprobe on it is unregistered before the
-			 * next run. */
-			if (get_kprobe((void *)p->ip)) {
-				ftrace_del_hash(p);
-				INIT_HLIST_NODE(&p->node);
-				hlist_add_head(&p->node, &temp_list);
-				freeze_record(p);
-				continue;
-			} else {
-				unfreeze_record(p);
-			}
-
-			/* convert record (i.e, patch mcount-call with NOP) */
-			if (ftrace_code_disable(p)) {
-				p->flags |= FTRACE_FL_CONVERTED;
-				ftrace_update_cnt++;
-			} else {
-				if ((system_state == SYSTEM_BOOTING) ||
-				    !core_kernel_text(p->ip)) {
-					ftrace_del_hash(p);
-					ftrace_free_rec(p);
-				}
-			}
-		}
-
-		hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
-			hlist_del(&p->node);
-			INIT_HLIST_NODE(&p->node);
-			hlist_add_head(&p->node, head);
-		}
+		/* convert record (i.e, patch mcount-call with NOP) */
+		if (ftrace_code_disable(p)) {
+			p->flags |= FTRACE_FL_CONVERTED;
+			ftrace_update_cnt++;
+		} else
+			ftrace_free_rec(p);
 	}
 
 	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
 
-	ftrace_enabled = save_ftrace_enabled;
-	ftrace_record_suspend--;
-
 	return 0;
 }
 
@@ -847,7 +676,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
 	pg = ftrace_pages = ftrace_pages_start;
 
 	cnt = num_to_init / ENTRIES_PER_PAGE;
-	pr_info("ftrace: allocating %ld hash entries in %d pages\n",
+	pr_info("ftrace: allocating %ld entries in %d pages\n",
 		num_to_init, cnt);
 
 	for (i = 0; i < cnt; i++) {
@@ -1451,20 +1280,18 @@ static int ftrace_convert_nops(unsigned long *start,
 	unsigned long addr;
 	unsigned long flags;
 
+	mutex_lock(&ftrace_start_lock);
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
-		/* should not be called from interrupt context */
-		spin_lock(&ftrace_lock);
 		ftrace_record_ip(addr);
-		spin_unlock(&ftrace_lock);
-		ftrace_shutdown_replenish();
 	}
 
-	/* p is ignored */
+	/* disable interrupts to prevent kstop machine */
 	local_irq_save(flags);
-	ftrace_update_code(p);
+	ftrace_update_code();
 	local_irq_restore(flags);
+	mutex_unlock(&ftrace_start_lock);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 333a5162149b..06951e229443 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -865,9 +865,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	if (unlikely(!ftrace_function_enabled))
 		return;
 
-	if (skip_trace(ip))
-		return;
-
 	pc = preempt_count();
 	resched = need_resched();
 	preempt_disable_notrace();
-- 
cgit v1.2.3


From b5aadf7f14c1acc94956aa257e018e9de3881f41 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 13:23:43 +0400
Subject: proc: move /proc/schedstat boilerplate to kernel/sched_stats.h

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c   | 3 ---
 include/linux/sched.h | 4 ----
 kernel/sched.c        | 1 +
 kernel/sched_stats.h  | 9 ++++++++-
 4 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index f6d25db98922..4a768ed5da2e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -195,9 +195,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_SCHEDSTATS
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
 	if (proc_root_kcore)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c38db536e07..7f60cb9b53cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -681,10 +681,6 @@ struct sched_info {
 };
 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
 
-#ifdef CONFIG_SCHEDSTATS
-extern const struct file_operations proc_schedstat_operations;
-#endif /* CONFIG_SCHEDSTATS */
-
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
 	spinlock_t	lock;
diff --git a/kernel/sched.c b/kernel/sched.c
index d906f72b42d2..5a70189d5051 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index b8c156979cf2..3d14ce273902 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-const struct file_operations proc_schedstat_operations = {
+static const struct file_operations proc_schedstat_operations = {
 	.open    = schedstat_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = single_release,
 };
 
+static int __init proc_schedstat_init(void)
+{
+	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+	return 0;
+}
+module_init(proc_schedstat_init);
+
 /*
  * Expects runqueue lock to be held for atomicity of update
  */
-- 
cgit v1.2.3


From 97ce5d6dcb07c403c0fc6001b755aacc38b5d7ff Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:14:19 +0400
Subject: proc: move all /proc/kcore stuff to fs/proc/kcore.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/kcore.c         | 14 +++++++++++++-
 fs/proc/proc_misc.c     |  8 --------
 include/linux/proc_fs.h |  4 ----
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c2370c76fb71..59b43a068872 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -27,6 +27,8 @@
 #define ELF_CORE_EFLAGS	0
 #endif
 
+static struct proc_dir_entry *proc_root_kcore;
+
 static int open_kcore(struct inode * inode, struct file * filp)
 {
 	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
@@ -34,7 +36,7 @@ static int open_kcore(struct inode * inode, struct file * filp)
 
 static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
 
-const struct file_operations proc_kcore_operations = {
+static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
 };
@@ -399,3 +401,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 	return acc;
 }
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
+	if (proc_root_kcore)
+		proc_root_kcore->size =
+				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
+	return 0;
+}
+module_init(proc_kcore_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 4a768ed5da2e..5ed15ff8fd1d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -188,19 +188,11 @@ static struct file_operations proc_kpageflags_operations = {
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
-struct proc_dir_entry *proc_root_kcore;
-
 void __init proc_misc_init(void)
 {
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PROC_KCORE
-	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
-	if (proc_root_kcore)
-		proc_root_kcore->size =
-				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
-#endif
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 27d534f4470d..9d8308905053 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -97,8 +97,6 @@ struct vmcore {
 
 #ifdef CONFIG_PROC_FS
 
-extern struct proc_dir_entry *proc_root_kcore;
-
 extern spinlock_t proc_subdir_lock;
 
 extern void proc_root_init(void);
@@ -138,8 +136,6 @@ extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct p
 extern int proc_readdir(struct file *, void *, filldir_t);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
-extern const struct file_operations proc_kcore_operations;
-
 extern int pid_ns_prepare_proc(struct pid_namespace *ns);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
 
-- 
cgit v1.2.3


From 5aa140c2deca3701238d5acddf436ad7b02664c7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:36:31 +0400
Subject: proc: move /proc/vmcore creation to fs/proc/vmcore.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c        | 3 ---
 fs/proc/vmcore.c           | 6 +++---
 include/linux/crash_dump.h | 2 --
 3 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 2ef9ef9bc8c1..e2db35006c05 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -62,7 +62,4 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PROC_VMCORE
-	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
-#endif
 }
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd9ca67f841b..03ec59504906 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,7 +32,7 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-struct proc_dir_entry *proc_vmcore = NULL;
+static struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
@@ -162,7 +162,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 	return acc;
 }
 
-const struct file_operations proc_vmcore_operations = {
+static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
 };
 
@@ -652,7 +652,7 @@ static int __init vmcore_init(void)
 		return rc;
 	}
 
-	/* Initialize /proc/vmcore size if proc is already up. */
+	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
 	if (proc_vmcore)
 		proc_vmcore->size = vmcore_size;
 	return 0;
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 0acf3b737e2e..2dac064d8359 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -14,8 +14,6 @@ extern unsigned long long elfcorehdr_addr;
 
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
-extern const struct file_operations proc_vmcore_operations;
-extern struct proc_dir_entry *proc_vmcore;
 
 /* Architecture code defines this if there are other possible ELF
  * machine types, e.g. on bi-arch capable hardware. */
-- 
cgit v1.2.3


From 59c7572e82d69483a66eaa67b46548baeb69ecf4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:49:39 +0400
Subject: proc: remove fs/proc/proc_misc.c

Now that everything was moved to their more or less expected places,
apply rm(1).

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile        |  2 +-
 fs/proc/proc_misc.c     | 65 -------------------------------------------------
 fs/proc/root.c          |  2 +-
 include/linux/proc_fs.h |  1 -
 4 files changed, 2 insertions(+), 68 deletions(-)
 delete mode 100644 fs/proc/proc_misc.c

(limited to 'include/linux')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index fef524410e86..63d965193b22 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o proc_misc.o
+		proc_tty.o
 proc-y	+= cmdline.o
 proc-y	+= cpuinfo.o
 proc-y	+= devices.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
deleted file mode 100644
index e2db35006c05..000000000000
--- a/fs/proc/proc_misc.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  linux/fs/proc/proc_misc.c
- *
- *  linux/fs/proc/array.c
- *  Copyright (C) 1992  by Linus Torvalds
- *  based on ideas by Darren Senn
- *
- *  This used to be the part of array.c. See the rest of history and credits
- *  there. I took this into a separate file and switched the thing to generic
- *  proc_file_inode_operations, leaving in array.c only per-process stuff.
- *  Inumbers allocation made dynamic (via create_proc_entry()).  AV, May 1999.
- *
- * Changes:
- * Fulton Green      :  Encapsulated position metric calculations.
- *			<kernel@FultonGreen.com>
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/fs.h>
-#include <linux/tty.h>
-#include <linux/string.h>
-#include <linux/mman.h>
-#include <linux/quicklist.h>
-#include <linux/proc_fs.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/pagemap.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/swap.h>
-#include <linux/slab.h>
-#include <linux/genhd.h>
-#include <linux/smp.h>
-#include <linux/signal.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/times.h>
-#include <linux/profile.h>
-#include <linux/utsname.h>
-#include <linux/blkdev.h>
-#include <linux/hugetlb.h>
-#include <linux/jiffies.h>
-#include <linux/vmalloc.h>
-#include <linux/crash_dump.h>
-#include <linux/pid_namespace.h>
-#include <linux/bootmem.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-#include <asm/tlb.h>
-#include <asm/div64.h>
-#include "internal.h"
-
-void __init proc_misc_init(void)
-{
-	proc_symlink("mounts", NULL, "self/mounts");
-
-	/* And now for trickier ones */
-}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 2a3abd25b30b..7761602af9de 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -117,7 +117,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
-	proc_misc_init();
+	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 9d8308905053..b8bdb96eff78 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -100,7 +100,6 @@ struct vmcore {
 extern spinlock_t proc_subdir_lock;
 
 extern void proc_root_init(void);
-extern void proc_misc_init(void);
 
 void proc_flush_task(struct task_struct *task);
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
-- 
cgit v1.2.3


From 66f50ee3cee4c9d98eea0add6f439e6e5e0ca4a5 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 22 Oct 2008 14:14:59 -0700
Subject: profiling: fix up CONFIG_PROC_FS=n build

In the case where procfs is disabled, create_proc_profile() does not
exist. Stub it in with the others.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/profile.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index 570045053ce9..a0fc32279fc0 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -19,10 +19,16 @@ struct notifier_block;
 
 #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS)
 void create_prof_cpu_mask(struct proc_dir_entry *de);
+int create_proc_profile(void);
 #else
 static inline void create_prof_cpu_mask(struct proc_dir_entry *de)
 {
 }
+
+static inline int create_proc_profile(void)
+{
+	return 0;
+}
 #endif
 
 enum profile_type {
@@ -37,7 +43,6 @@ extern int prof_on __read_mostly;
 /* init basic kernel profiler */
 int profile_init(void);
 int profile_setup(char *str);
-int create_proc_profile(void);
 void profile_tick(int type);
 
 /*
-- 
cgit v1.2.3


From 4afe978530702c934dfdb11f54073136818b2119 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Wed, 22 Oct 2008 14:15:00 -0700
Subject: jbd: fix error handling for checkpoint io

When a checkpointing IO fails, the current JBD code doesn't check the error
and continues journaling.  This means the latest metadata can be lost from
both the journal and the filesystem.

This patch leaves the failed metadata blocks in the journal space and
aborts journaling in the case of log_do_checkpoint().  To achieve this, we
need to do:

1. in the case of __try_to_free_cp_buf(), don't remove the failed
   buffer from the checkpoint list, because it may be released or
   overwritten by a later transaction
2. log_do_checkpoint() is the last chance, remove the failed buffer
   from the checkpoint list and abort the journal
3. when checkpointing fails, don't update the journal super block to
   prevent the journaled contents from being cleaned.  For safety,
   don't update j_tail and j_tail_sequence either
4. when checkpointing fails, report this error to the ext3 layer so
   that ext3 doesn't clear the needs_recovery flag; otherwise the
   journaled contents are ignored and cleaned in the recovery phase
5. if the recovery fails, keep the needs_recovery flag
6. prevent cleanup_journal_tail() from being called between
   __journal_drop_transaction() and journal_abort() (a race issue
   between journal_flush() and __log_wait_for_space())

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/checkpoint.c | 49 +++++++++++++++++++++++++++++++++++++------------
 fs/jbd/journal.c    | 28 ++++++++++++++++++++++------
 fs/jbd/recovery.c   |  7 +++++--
 include/linux/jbd.h |  2 +-
 4 files changed, 65 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..e29293501d42 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && buffer_uptodate(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -160,21 +161,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +199,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +211,8 @@ restart:
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +236,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +267,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +277,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +308,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +332,7 @@ int log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +349,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +362,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -371,14 +388,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -394,8 +415,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +426,9 @@ int cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index aa7143a8349b..9e4fa52d7dc8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1121,9 +1121,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1146,11 +1149,16 @@ void journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1160,6 +1168,8 @@ void journal_destroy(journal_t *journal)
 		journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1359,10 +1369,16 @@ int journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1384,7 +1400,7 @@ int journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed064..db5e982c5ddf 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do {									\
  */
 int journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 35d4f6342fac..346e2b80be7d 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -911,7 +911,7 @@ extern int	   journal_set_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern int	   journal_create     (journal_t *);
 extern int	   journal_load       (journal_t *journal);
-extern void	   journal_destroy    (journal_t *);
+extern int	   journal_destroy    (journal_t *);
 extern int	   journal_recover    (journal_t *journal);
 extern int	   journal_wipe       (journal_t *, int);
 extern int	   journal_skip_recovery	(journal_t *);
-- 
cgit v1.2.3


From 94b6da5ab8293b04a300ba35c72eddfa94db8b02 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 22 Oct 2008 14:15:05 -0700
Subject: memcg: fix page_cgroup allocation

page_cgroup_init() is called from mem_cgroup_create(). But at this
point, we cannot call alloc_bootmem().
(and this caused a panic at boot.)

This patch moves page_cgroup_init() to init/main.c.

The timetable is as follows:
==
  parse_args(). # we can trust mem_cgroup_subsys.disabled bit after this.
  ....
  cgroup_init_early()  # "early" init of cgroup.
  ....
  setup_arch()         # memmap is allocated.
  ...
  page_cgroup_init();
  mem_init();   # we cannot call alloc_bootmem after this.
  ....
  cgroup_init() # mem_cgroup is initialized.
==

Before page_cgroup_init(), mem_map must be initialized. So,
I added page_cgroup_init() to init/main.c directly.

(*) maybe this is not very clean but
    - cgroup_init_early() is too early
    - in cgroup_init(), we have to use vmalloc instead of alloc_bootmem().
    use of vmalloc area in x86-32 is important and we should avoid very large
    vmalloc() in x86-32. So, we want to use alloc_bootmem() and added page_cgroup_init()
    directly to init/main.c

[akpm@linux-foundation.org: remove unneeded/bad mem_cgroup_subsys declaration]
[akpm@linux-foundation.org: fix build]
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_cgroup.h |  5 +++++
 init/main.c                 |  2 ++
 mm/memcontrol.c             |  1 -
 mm/page_cgroup.c            | 32 +++++++++++++++++++++++++-------
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 0fd39f2231ec..f546ad6fc028 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -99,5 +99,10 @@ static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
 {
 	return NULL;
 }
+
+static inline void page_cgroup_init(void)
+{
+}
+
 #endif
 #endif
diff --git a/init/main.c b/init/main.c
index 3e17a3bafe60..672ae75b2059 100644
--- a/init/main.c
+++ b/init/main.c
@@ -52,6 +52,7 @@
 #include <linux/key.h>
 #include <linux/unwind.h>
 #include <linux/buffer_head.h>
+#include <linux/page_cgroup.h>
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
@@ -647,6 +648,7 @@ asmlinkage void __init start_kernel(void)
 	vmalloc_init();
 	vfs_caches_init_early();
 	cpuset_init_early();
+	page_cgroup_init();
 	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d4a92b63e98e..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1088,7 +1088,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
-		page_cgroup_init();
 		mem = &init_mem_cgroup;
 	} else {
 		mem = mem_cgroup_alloc();
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 78242b4d7edf..f59d797dc5a9 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -4,8 +4,10 @@
 #include <linux/bit_spinlock.h>
 #include <linux/page_cgroup.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
+#include <linux/cgroup.h>
 
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -67,6 +69,9 @@ void __init page_cgroup_init(void)
 
 	int nid, fail;
 
+	if (mem_cgroup_subsys.disabled)
+		return;
+
 	for_each_online_node(nid)  {
 		fail = alloc_node_page_cgroup(nid);
 		if (fail)
@@ -107,9 +112,14 @@ int __meminit init_section_page_cgroup(unsigned long pfn)
 	nid = page_to_nid(pfn_to_page(pfn));
 
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-	base = kmalloc_node(table_size, GFP_KERNEL, nid);
-	if (!base)
-		base = vmalloc_node(table_size, nid);
+	if (slab_is_available()) {
+		base = kmalloc_node(table_size, GFP_KERNEL, nid);
+		if (!base)
+			base = vmalloc_node(table_size, nid);
+	} else {
+		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
+				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	}
 
 	if (!base) {
 		printk(KERN_ERR "page cgroup allocation failure\n");
@@ -136,11 +146,16 @@ void __free_page_cgroup(unsigned long pfn)
 	if (!ms || !ms->page_cgroup)
 		return;
 	base = ms->page_cgroup + pfn;
-	ms->page_cgroup = NULL;
-	if (is_vmalloc_addr(base))
+	if (is_vmalloc_addr(base)) {
 		vfree(base);
-	else
-		kfree(base);
+		ms->page_cgroup = NULL;
+	} else {
+		struct page *page = virt_to_page(base);
+		if (!PageReserved(page)) { /* Is bootmem ? */
+			kfree(base);
+			ms->page_cgroup = NULL;
+		}
+	}
 }
 
 int online_page_cgroup(unsigned long start_pfn,
@@ -214,6 +229,9 @@ void __init page_cgroup_init(void)
 	unsigned long pfn;
 	int fail = 0;
 
+	if (mem_cgroup_subsys.disabled)
+		return;
+
 	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-- 
cgit v1.2.3


From 7106b4e333baeaf3c596e4d240438059b8a7616d Mon Sep 17 00:00:00 2001
From: Lee Howard <lee.howard@mainpine.com>
Date: Tue, 21 Oct 2008 13:48:58 +0100
Subject: 8250: Oxford Semiconductor Devices

Add support for the OxSemi 'Tornado' devices.

Reformatted and reworked a bit by Alan Cox

Signed-off-by: Lee Howard <lee.howard@mainpine.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c |  20 +++-
 drivers/serial/8250_pci.c    | 211 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h      |   8 ++
 3 files changed, 238 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 8a846adf1dcf..96f3bdf0ec4b 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -2791,6 +2791,7 @@ enum parport_pc_pci_cards {
 	oxsemi_952,
 	oxsemi_954,
 	oxsemi_840,
+	oxsemi_pcie_pport,
 	aks_0100,
 	mobility_pp,
 	netmos_9705,
@@ -2868,6 +2869,7 @@ static struct parport_pc_pci {
 	/* oxsemi_952 */		{ 1, { { 0, 1 }, } },
 	/* oxsemi_954 */		{ 1, { { 0, -1 }, } },
 	/* oxsemi_840 */		{ 1, { { 0, 1 }, } },
+	/* oxsemi_pcie_pport */		{ 1, { { 0, 1 }, } },
 	/* aks_0100 */                  { 1, { { 0, -1 }, } },
 	/* mobility_pp */		{ 1, { { 0, 1 }, } },
 	/* netmos_9705 */               { 1, { { 0, -1 }, } }, /* untested */
@@ -2928,7 +2930,6 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	{ 0x1409, 0x7268, 0x1409, 0x0103, 0, 0, timedia_4008a },
 	{ 0x1409, 0x7268, 0x1409, 0x0104, 0, 0, timedia_4018 },
 	{ 0x1409, 0x7268, 0x1409, 0x9018, 0, 0, timedia_9018a },
-	{ 0x14f2, 0x0121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, mobility_pp },
 	{ PCI_VENDOR_ID_SYBA, PCI_DEVICE_ID_SYBA_2P_EPP,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, syba_2p_epp },
 	{ PCI_VENDOR_ID_SYBA, PCI_DEVICE_ID_SYBA_1P_ECP,
@@ -2946,8 +2947,25 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_954 },
 	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_12PCI840,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_840 },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe840,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe840_G,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_0,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_0_G,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_1,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_1_G,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_1_U,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
+	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_pcie_pport },
 	{ PCI_VENDOR_ID_AKS, PCI_DEVICE_ID_AKS_ALADDINCARD,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, aks_0100 },
+	{ 0x14f2, 0x0121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, mobility_pp },
 	/* NetMos communication controllers */
 	{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9705,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, netmos_9705 },
diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index c014ffb110e9..1bdb08b41f73 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -1100,6 +1100,8 @@ enum pci_board_num_t {
 	pbn_b0_4_1843200_200,
 	pbn_b0_8_1843200_200,
 
+	pbn_b0_1_4000000,
+
 	pbn_b0_bt_1_115200,
 	pbn_b0_bt_2_115200,
 	pbn_b0_bt_8_115200,
@@ -1167,6 +1169,10 @@ enum pci_board_num_t {
 	pbn_exsys_4055,
 	pbn_plx_romulus,
 	pbn_oxsemi,
+	pbn_oxsemi_1_4000000,
+	pbn_oxsemi_2_4000000,
+	pbn_oxsemi_4_4000000,
+	pbn_oxsemi_8_4000000,
 	pbn_intel_i960,
 	pbn_sgi_ioc3,
 	pbn_computone_4,
@@ -1290,6 +1296,12 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.base_baud	= 1843200,
 		.uart_offset	= 0x200,
 	},
+	[pbn_b0_1_4000000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 1,
+		.base_baud	= 4000000,
+		.uart_offset	= 8,
+	},
 
 	[pbn_b0_bt_1_115200] = {
 		.flags		= FL_BASE0|FL_BASE_BARS,
@@ -1625,6 +1637,35 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.base_baud	= 115200,
 		.uart_offset	= 8,
 	},
+	[pbn_oxsemi_1_4000000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 1,
+		.base_baud	= 4000000,
+		.uart_offset	= 0x200,
+		.first_offset	= 0x1000,
+	},
+	[pbn_oxsemi_2_4000000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 2,
+		.base_baud	= 4000000,
+		.uart_offset	= 0x200,
+		.first_offset	= 0x1000,
+	},
+	[pbn_oxsemi_4_4000000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 4,
+		.base_baud	= 4000000,
+		.uart_offset	= 0x200,
+		.first_offset	= 0x1000,
+	},
+	[pbn_oxsemi_8_4000000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 8,
+		.base_baud	= 4000000,
+		.uart_offset	= 0x200,
+		.first_offset	= 0x1000,
+	},
+
 
 	/*
 	 * EKF addition for i960 Boards form EKF with serial port.
@@ -1813,6 +1854,34 @@ serial_pci_matches(struct pciserial_board *board,
 	    board->first_offset == guessed->first_offset;
 }
 
+/*
+ * Oxford Semiconductor Inc.
+ * Check that device is part of the Tornado range of devices, then determine
+ * the number of ports available on the device.
+ */
+static int pci_oxsemi_tornado_init(struct pci_dev *dev, struct pciserial_board *board)
+{
+	u8 __iomem *p;
+	unsigned long deviceID;
+	unsigned int  number_uarts;
+
+	p = pci_iomap(dev, 0, 5);
+	if (p == NULL)
+		return -ENOMEM;
+
+	deviceID = ioread32(p);
+	/* Tornado device */
+	if (deviceID == 0x07000200) {
+		number_uarts = ioread8(p + 4);
+		board->num_ports = number_uarts;
+		printk(KERN_DEBUG
+			"%d ports detected on Oxford PCI Express device\n",
+								number_uarts);
+	}
+	pci_iounmap(dev, p);
+	return 0;
+}
+
 struct serial_private *
 pciserial_init_ports(struct pci_dev *dev, struct pciserial_board *board)
 {
@@ -1821,6 +1890,12 @@ pciserial_init_ports(struct pci_dev *dev, struct pciserial_board *board)
 	struct pci_serial_quirk *quirk;
 	int rc, nr_ports, i;
 
+	/*
+	 * Find number of ports on board
+	 */
+	if (dev->vendor == PCI_VENDOR_ID_OXSEMI)
+		pci_oxsemi_tornado_init(dev, board);
+
 	nr_ports = board->num_ports;
 
 	/*
@@ -2300,6 +2375,142 @@ static struct pci_device_id serial_pci_tbl[] = {
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_bt_2_921600 },
 
+	/*
+	 * Oxford Semiconductor Inc. Tornado PCI express device range.
+	 */
+	{	PCI_VENDOR_ID_OXSEMI, 0xc101,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc105,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc11b,    /* OXPCIe952 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc11f,    /* OXPCIe952 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc120,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc124,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc138,    /* OXPCIe952 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc13d,    /* OXPCIe952 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc140,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc141,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc144,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc145,    /* OXPCIe952 1 Legacy UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b0_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc158,    /* OXPCIe952 2 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_2_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc15d,    /* OXPCIe952 2 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_2_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc208,    /* OXPCIe954 4 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_4_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc20d,    /* OXPCIe954 4 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_4_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc308,    /* OXPCIe958 8 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_8_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc30d,    /* OXPCIe958 8 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_8_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc40b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc40f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc41b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc41f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc42b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc42f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc43b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc43f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc44b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc44f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc45b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc45f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc46b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc46f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc47b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc47f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc48b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc48f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc49b,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc49f,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4ab,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4af,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4bb,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4bf,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4cb,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+	{	PCI_VENDOR_ID_OXSEMI, 0xc4cf,    /* OXPCIe200 1 Native UART */
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_oxsemi_1_4000000 },
+
 	/*
 	 * SBS Technologies, Inc. P-Octal and PMC-OCTPRO cards,
 	 * from skokodyn@yahoo.com
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e5d344bfcb7e..369f44286353 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1944,6 +1944,14 @@
 
 #define PCI_VENDOR_ID_OXSEMI		0x1415
 #define PCI_DEVICE_ID_OXSEMI_12PCI840	0x8403
+#define PCI_DEVICE_ID_OXSEMI_PCIe840		0xC000
+#define PCI_DEVICE_ID_OXSEMI_PCIe840_G		0xC004
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_0		0xC100
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_0_G	0xC104
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_1		0xC110
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_1_G	0xC114
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_1_U	0xC118
+#define PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU	0xC11C
 #define PCI_DEVICE_ID_OXSEMI_16PCI954	0x9501
 #define PCI_DEVICE_ID_OXSEMI_16PCI95N	0x9511
 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP	0x9513
-- 
cgit v1.2.3


From 388c8c16abafc2e74dff173b5de9ee519ea8d32f Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sun, 3 Aug 2008 13:02:12 -0500
Subject: PCI: add routines for debugging and handling lost interrupts

We're getting a lot of storage drivers blamed for interrupt misrouting
issues.  This patch provides a standard way of reporting the problem
... and, if possible, correcting it.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/Makefile |  3 ++-
 drivers/pci/irq.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h  |  7 ++++++
 3 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 drivers/pci/irq.c

(limited to 'include/linux')

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 4b47f4ece5b7..af3bfe22847b 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -3,7 +3,8 @@
 #
 
 obj-y		+= access.o bus.o probe.o remove.o pci.o quirks.o slot.o \
-			pci-driver.o search.o pci-sysfs.o rom.o setup-res.o
+			pci-driver.o search.o pci-sysfs.o rom.o setup-res.o \
+			irq.o
 obj-$(CONFIG_PROC_FS) += proc.o
 
 # Build PCI Express stuff if needed
diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c
new file mode 100644
index 000000000000..6441dfa969a3
--- /dev/null
+++ b/drivers/pci/irq.c
@@ -0,0 +1,60 @@
+/*
+ * PCI IRQ failure handling code
+ *
+ * Copyright (c) 2008 James Bottomley <James.Bottomley@HansenPartnership.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+static void pci_note_irq_problem(struct pci_dev *pdev, const char *reason)
+{
+	struct pci_dev *parent = to_pci_dev(pdev->dev.parent);
+
+	dev_printk(KERN_ERR, &pdev->dev,
+		   "Potentially misrouted IRQ (Bridge %s %04x:%04x)\n",
+		   parent->dev.bus_id, parent->vendor, parent->device);
+	dev_printk(KERN_ERR, &pdev->dev, "%s\n", reason);
+	dev_printk(KERN_ERR, &pdev->dev, "Please report to linux-kernel@vger.kernel.org\n");
+	WARN_ON(1);
+}
+
+/**
+ * pci_lost_interrupt - reports a lost PCI interrupt
+ * @pdev:	device whose interrupt is lost
+ * 
+ * The primary function of this routine is to report a lost interrupt
+ * in a standard way which users can recognise (instead of blaming the
+ * driver).
+ *
+ * Returns:
+ *  a suggestion for fixing it (although the driver is not required to
+ * act on this).
+ */
+enum pci_lost_interrupt_reason pci_lost_interrupt(struct pci_dev *pdev)
+{
+	if (pdev->msi_enabled || pdev->msix_enabled) {
+		enum pci_lost_interrupt_reason ret;
+
+		if (pdev->msix_enabled) {
+			pci_note_irq_problem(pdev, "MSIX routing failure");
+			ret = PCI_LOST_IRQ_DISABLE_MSIX;
+		} else {
+			pci_note_irq_problem(pdev, "MSI routing failure");
+			ret = PCI_LOST_IRQ_DISABLE_MSI;
+		}
+		return ret;
+	}
+#ifdef CONFIG_ACPI
+	if (!(acpi_disabled || acpi_noirq)) {
+		pci_note_irq_problem(pdev, "Potential ACPI misrouting please reboot with acpi=noirq");
+		/* currently no way to fix acpi on the fly */
+		return PCI_LOST_IRQ_DISABLE_ACPI;
+	}
+#endif
+	pci_note_irq_problem(pdev, "unknown cause (not MSI or ACPI)");
+	return PCI_LOST_IRQ_NO_INFORMATION;
+}
+EXPORT_SYMBOL(pci_lost_interrupt);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 752def8a2ef4..c75b82bda327 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -546,6 +546,13 @@ struct pci_dev __deprecated *pci_find_slot(unsigned int bus,
 					   unsigned int devfn);
 #endif /* CONFIG_PCI_LEGACY */
 
+enum pci_lost_interrupt_reason {
+	PCI_LOST_IRQ_NO_INFORMATION = 0,
+	PCI_LOST_IRQ_DISABLE_MSI,
+	PCI_LOST_IRQ_DISABLE_MSIX,
+	PCI_LOST_IRQ_DISABLE_ACPI,
+};
+enum pci_lost_interrupt_reason pci_lost_interrupt(struct pci_dev *dev);
 int pci_find_capability(struct pci_dev *dev, int cap);
 int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap);
 int pci_find_ext_capability(struct pci_dev *dev, int cap);
-- 
cgit v1.2.3


From 2fca5ccf97d2c28bcfce44f5b07d85e74e3cd18e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 22 Oct 2008 09:34:49 +0200
Subject: libata: switch to using block layer tagging support

libata currently has a pretty dumb ATA_MAX_QUEUE loop for finding
a free tag to use. Instead of fixing that up, convert libata to
using block layer tagging - gets rid of code in libata, and is also
much faster.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/ata/libata-core.c | 66 +++++------------------------------------------
 drivers/ata/libata-scsi.c | 10 +++++--
 drivers/ata/libata.h      | 19 ++++++++++++--
 include/linux/libata.h    |  1 -
 4 files changed, 31 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index bbb3cae57492..8cb0b360bfd8 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1713,8 +1713,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
 	else
 		tag = 0;
 
-	if (test_and_set_bit(tag, &ap->qc_allocated))
-		BUG();
 	qc = __ata_qc_from_tag(ap, tag);
 
 	qc->tag = tag;
@@ -4552,37 +4550,6 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
 #endif /* __BIG_ENDIAN */
 }
 
-/**
- *	ata_qc_new - Request an available ATA command, for queueing
- *	@ap: Port associated with device @dev
- *	@dev: Device from whom we request an available command structure
- *
- *	LOCKING:
- *	None.
- */
-
-static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
-{
-	struct ata_queued_cmd *qc = NULL;
-	unsigned int i;
-
-	/* no command while frozen */
-	if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
-		return NULL;
-
-	/* the last tag is reserved for internal command. */
-	for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
-		if (!test_and_set_bit(i, &ap->qc_allocated)) {
-			qc = __ata_qc_from_tag(ap, i);
-			break;
-		}
-
-	if (qc)
-		qc->tag = i;
-
-	return qc;
-}
-
 /**
  *	ata_qc_new_init - Request an available ATA command, and initialize it
  *	@dev: Device from whom we request an available command structure
@@ -4591,16 +4558,20 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
  *	None.
  */
 
-struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
 {
 	struct ata_port *ap = dev->link->ap;
 	struct ata_queued_cmd *qc;
 
-	qc = ata_qc_new(ap);
+	if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
+		return NULL;
+
+	qc = __ata_qc_from_tag(ap, tag);
 	if (qc) {
 		qc->scsicmd = NULL;
 		qc->ap = ap;
 		qc->dev = dev;
+		qc->tag = tag;
 
 		ata_qc_reinit(qc);
 	}
@@ -4608,31 +4579,6 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
 	return qc;
 }
 
-/**
- *	ata_qc_free - free unused ata_queued_cmd
- *	@qc: Command to complete
- *
- *	Designed to free unused ata_queued_cmd object
- *	in case something prevents using it.
- *
- *	LOCKING:
- *	spin_lock_irqsave(host lock)
- */
-void ata_qc_free(struct ata_queued_cmd *qc)
-{
-	struct ata_port *ap = qc->ap;
-	unsigned int tag;
-
-	WARN_ON(qc == NULL);	/* ata_qc_from_tag _might_ return NULL */
-
-	qc->flags = 0;
-	tag = qc->tag;
-	if (likely(ata_tag_valid(tag))) {
-		qc->tag = ATA_TAG_POISON;
-		clear_bit(tag, &ap->qc_allocated);
-	}
-}
-
 void __ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 5d312dc9be9f..d5b9b7266c8b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -708,7 +708,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
 {
 	struct ata_queued_cmd *qc;
 
-	qc = ata_qc_new_init(dev);
+	qc = ata_qc_new_init(dev, cmd->request->tag);
 	if (qc) {
 		qc->scsicmd = cmd;
 		qc->scsidone = done;
@@ -1103,7 +1103,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
 
 		depth = min(sdev->host->can_queue, ata_id_queue_depth(dev->id));
 		depth = min(ATA_MAX_QUEUE - 1, depth);
-		scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
+		scsi_set_tag_type(sdev, MSG_SIMPLE_TAG);
+		scsi_activate_tcq(sdev, depth);
 	}
 
 	return 0;
@@ -1943,6 +1944,11 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
 		hdr[1] |= (1 << 7);
 
 	memcpy(rbuf, hdr, sizeof(hdr));
+
+	/* if ncq, set tags supported */
+	if (ata_id_has_ncq(args->id))
+		rbuf[7] |= (1 << 1);
+
 	memcpy(&rbuf[8], "ATA     ", 8);
 	ata_id_string(args->id, &rbuf[16], ATA_ID_PROD, 16);
 	ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV, 4);
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index fe2839e58774..d3831d39bdaa 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -74,7 +74,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
 extern void ata_force_cbl(struct ata_port *ap);
 extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
 extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
 extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
 			   u64 block, u32 n_block, unsigned int tf_flags,
 			   unsigned int tag);
@@ -103,7 +103,6 @@ extern int ata_dev_configure(struct ata_device *dev);
 extern int sata_down_spd_limit(struct ata_link *link);
 extern int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel);
 extern void ata_sg_clean(struct ata_queued_cmd *qc);
-extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern void ata_qc_issue(struct ata_queued_cmd *qc);
 extern void __ata_qc_complete(struct ata_queued_cmd *qc);
 extern int atapi_check_dma(struct ata_queued_cmd *qc);
@@ -119,6 +118,22 @@ extern struct ata_port *ata_port_alloc(struct ata_host *host);
 extern void ata_dev_enable_pm(struct ata_device *dev, enum link_pm policy);
 extern void ata_lpm_schedule(struct ata_port *ap, enum link_pm);
 
+/**
+ *	ata_qc_free - free unused ata_queued_cmd
+ *	@qc: Command to complete
+ *
+ *	Designed to free unused ata_queued_cmd object
+ *	in case something prevents using it.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host lock)
+ */
+static inline void ata_qc_free(struct ata_queued_cmd *qc)
+{
+	qc->flags = 0;
+	qc->tag = ATA_TAG_POISON;
+}
+
 /* libata-acpi.c */
 #ifdef CONFIG_ATA_ACPI
 extern void ata_acpi_associate_sata_port(struct ata_port *ap);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c261aa0584b1..507f53ef8038 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -695,7 +695,6 @@ struct ata_port {
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
 
 	struct ata_queued_cmd	qcmd[ATA_MAX_QUEUE];
-	unsigned long		qc_allocated;
 	unsigned int		qc_active;
 	int			nr_active_links; /* #links with active qcs */
 
-- 
cgit v1.2.3


From 3ad0b02e4c1d5feba44b8ff48dccd1ba61a826b0 Mon Sep 17 00:00:00 2001
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Wed, 22 Oct 2008 16:34:52 -0700
Subject: i7300_idle: Disable ioat channel only on platforms where idle driver
 can load

Based on input from Andi Kleen:
share the platform detection code with ioat_dma and disable the channel in
dma engine only for specific platforms.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/dma/ioat_dma.c     |  5 ++-
 drivers/idle/i7300_idle.c  | 72 ++--------------------------------------
 include/linux/i7300_idle.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 71 deletions(-)
 create mode 100644 include/linux/i7300_idle.h

(limited to 'include/linux')

diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
index f8396cafa05f..fd1631d0a795 100644
--- a/drivers/dma/ioat_dma.c
+++ b/drivers/dma/ioat_dma.c
@@ -33,6 +33,7 @@
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/workqueue.h>
+#include <linux/i7300_idle.h>
 #include "ioatdma.h"
 #include "ioatdma_registers.h"
 #include "ioatdma_hw.h"
@@ -172,7 +173,9 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
 	xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
 
 #if CONFIG_I7300_IDLE_IOAT_CHANNEL
-	device->common.chancnt--;
+	if (i7300_idle_platform_probe(NULL, NULL) == 0) {
+		device->common.chancnt--;
+	}
 #endif
 	for (i = 0; i < device->common.chancnt; i++) {
 		ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index 59d1bbc3cd3c..79d47f284cd6 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -25,6 +25,7 @@
 #include <linux/delay.h>
 #include <linux/debugfs.h>
 #include <linux/stop_machine.h>
+#include <linux/i7300_idle.h>
 
 #include <asm/idle.h>
 
@@ -505,77 +506,8 @@ static struct notifier_block i7300_idle_nb = {
 	.notifier_call = i7300_idle_notifier,
 };
 
-/*
- * I/O AT controls (PCI bus 0 device 8 function 0)
- * DIMM controls (PCI bus 0 device 16 function 1)
- */
-#define IOAT_BUS 0
-#define IOAT_DEVFN PCI_DEVFN(8, 0)
-#define MEMCTL_BUS 0
-#define MEMCTL_DEVFN PCI_DEVFN(16, 1)
-
-struct fbd_ioat {
-	unsigned int vendor;
-	unsigned int ioat_dev;
-};
-
-/*
- * The i5000 chip-set has the same hooks as the i7300
- * but support is disabled by default because this driver
- * has not been validated on that platform.
- */
-#define SUPPORT_I5000 0
-
-static const struct fbd_ioat fbd_ioat_list[] = {
-	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB},
-#if SUPPORT_I5000
-	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT},
-#endif
-	{0, 0}
-};
-
-/* table of devices that work with this driver */
-static const struct pci_device_id pci_tbl[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) },
-#if SUPPORT_I5000
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) },
-#endif
-	{ } /* Terminating entry */
-};
-
 MODULE_DEVICE_TABLE(pci, pci_tbl);
 
-/* Check for known platforms with I/O-AT */
-static int __init i7300_idle_platform_probe(void)
-{
-	int i;
-
-	fbd_dev = pci_get_bus_and_slot(MEMCTL_BUS, MEMCTL_DEVFN);
-	if (!fbd_dev)
-		return -ENODEV;
-
-	for (i = 0; pci_tbl[i].vendor != 0; i++) {
-		if (fbd_dev->vendor == pci_tbl[i].vendor &&
-		    fbd_dev->device == pci_tbl[i].device) {
-			break;
-		}
-	}
-	if (pci_tbl[i].vendor == 0)
-		return -ENODEV;
-
-	ioat_dev = pci_get_bus_and_slot(IOAT_BUS, IOAT_DEVFN);
-	if (!ioat_dev)
-		return -ENODEV;
-
-	for (i = 0; fbd_ioat_list[i].vendor != 0; i++) {
-		if (ioat_dev->vendor == fbd_ioat_list[i].vendor &&
-		    ioat_dev->device == fbd_ioat_list[i].ioat_dev) {
-			return 0;
-		}
-	}
-	return -ENODEV;
-}
-
 int stats_open_generic(struct inode *inode, struct file *fp)
 {
 	fp->private_data = inode->i_private;
@@ -617,7 +549,7 @@ static int __init i7300_idle_init(void)
 	cpus_clear(idle_cpumask);
 	total_us = 0;
 
-	if (i7300_idle_platform_probe())
+	if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev))
 		return -ENODEV;
 
 	if (i7300_idle_thrt_save())
diff --git a/include/linux/i7300_idle.h b/include/linux/i7300_idle.h
new file mode 100644
index 000000000000..05a80c44513c
--- /dev/null
+++ b/include/linux/i7300_idle.h
@@ -0,0 +1,83 @@
+
+#ifndef I7300_IDLE_H
+#define I7300_IDLE_H
+
+#include <linux/pci.h>
+
+/*
+ * I/O AT controls (PCI bus 0 device 8 function 0)
+ * DIMM controls (PCI bus 0 device 16 function 1)
+ */
+#define IOAT_BUS 0
+#define IOAT_DEVFN PCI_DEVFN(8, 0)
+#define MEMCTL_BUS 0
+#define MEMCTL_DEVFN PCI_DEVFN(16, 1)
+
+struct fbd_ioat {
+	unsigned int vendor;
+	unsigned int ioat_dev;
+};
+
+/*
+ * The i5000 chip-set has the same hooks as the i7300
+ * but support is disabled by default because this driver
+ * has not been validated on that platform.
+ */
+#define SUPPORT_I5000 0
+
+static const struct fbd_ioat fbd_ioat_list[] = {
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB},
+#if SUPPORT_I5000
+	{PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT},
+#endif
+	{0, 0}
+};
+
+/* table of devices that work with this driver */
+static const struct pci_device_id pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) },
+#if SUPPORT_I5000
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) },
+#endif
+	{ } /* Terminating entry */
+};
+
+/* Check for known platforms with I/O-AT */
+static inline int i7300_idle_platform_probe(struct pci_dev **fbd_dev,
+						struct pci_dev **ioat_dev)
+{
+	int i;
+	struct pci_dev *memdev, *dmadev;
+
+	memdev = pci_get_bus_and_slot(MEMCTL_BUS, MEMCTL_DEVFN);
+	if (!memdev)
+		return -ENODEV;
+
+	for (i = 0; pci_tbl[i].vendor != 0; i++) {
+		if (memdev->vendor == pci_tbl[i].vendor &&
+		    memdev->device == pci_tbl[i].device) {
+			break;
+		}
+	}
+	if (pci_tbl[i].vendor == 0)
+		return -ENODEV;
+
+	dmadev = pci_get_bus_and_slot(IOAT_BUS, IOAT_DEVFN);
+	if (!dmadev)
+		return -ENODEV;
+
+	for (i = 0; fbd_ioat_list[i].vendor != 0; i++) {
+		if (dmadev->vendor == fbd_ioat_list[i].vendor &&
+		    dmadev->device == fbd_ioat_list[i].ioat_dev) {
+			if (fbd_dev)
+				*fbd_dev = memdev;
+			if (ioat_dev)
+				*ioat_dev = dmadev;
+
+			return 0;
+		}
+	}
+	return -ENODEV;
+}
+
+#endif
-- 
cgit v1.2.3


From c3a90c788b743303c4d824780a3a7271693fb64a Mon Sep 17 00:00:00 2001
From: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Sun, 26 Oct 2008 23:07:25 -0700
Subject: Phonet: do not reply to indication reset packets

This fixes a potential error packet loop.

Signed-off-by: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phonet.h | 1 +
 net/phonet/af_phonet.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index c9609f9aedac..4157faa857b6 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -72,6 +72,7 @@ struct phonetmsg {
 	} pn_msg_u;
 };
 #define PN_COMMON_MESSAGE	0xF0
+#define PN_COMMGR		0x10
 #define PN_PREFIX		0xE0 /* resource for extended messages */
 #define pn_submsg_id		pn_msg_u.base.pn_submsg_id
 #define pn_e_submsg_id		pn_msg_u.ext.pn_e_submsg_id
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index b9d97effebe3..defeb7a0d502 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -261,6 +261,8 @@ static inline int can_respond(struct sk_buff *skb)
 		return 0; /* we are not the destination */
 	if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5))
 		return 0;
+	if (ph->pn_res == PN_COMMGR) /* indications */
+		return 0;
 
 	ph = pn_hdr(skb); /* re-acquires the pointer */
 	pm = pn_msg(skb);
@@ -309,7 +311,8 @@ static int send_reset_indications(struct sk_buff *rskb)
 
 	return pn_raw_send(data, sizeof(data), rskb->dev,
 				pn_object(oph->pn_sdev, 0x00),
-				pn_object(oph->pn_rdev, oph->pn_robj), 0x10);
+				pn_object(oph->pn_rdev, oph->pn_robj),
+				PN_COMMGR);
 }
 
 
-- 
cgit v1.2.3


From 3d5afd324a4bf9f64f59599bf1e93cd7dd1dc97a Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 27 Oct 2008 12:16:15 +0100
Subject: HID: fix oops during suspend of unbound HID devices

Usbhid structure is allocated on start invoked only from probe
of some driver. When there is no driver, the structure is null
and causes null-dereference oopses.

Fix it by allocating the structure on probe and freeing it on
disconnect of the device itself. Also make sure we won't race between
start and resume or stop and suspend respectively.

References: http://bugzilla.kernel.org/show_bug.cgi?id=11827

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Andreas Schwab <schwab@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/usbhid/hid-core.c | 58 ++++++++++++++++++++++++++++++-------------
 drivers/hid/usbhid/usbhid.h   |  2 ++
 include/linux/hid.h           |  1 +
 3 files changed, 44 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 42bdd83444c1..3b1c489998c3 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <asm/unaligned.h>
@@ -776,21 +777,10 @@ static int usbhid_start(struct hid_device *hid)
 	struct usb_interface *intf = to_usb_interface(hid->dev.parent);
 	struct usb_host_interface *interface = intf->cur_altsetting;
 	struct usb_device *dev = interface_to_usbdev(intf);
-	struct usbhid_device *usbhid;
+	struct usbhid_device *usbhid = hid->driver_data;
 	unsigned int n, insize = 0;
 	int ret;
 
-	WARN_ON(hid->driver_data);
-
-	usbhid = kzalloc(sizeof(struct usbhid_device), GFP_KERNEL);
-	if (usbhid == NULL) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	hid->driver_data = usbhid;
-	usbhid->hid = hid;
-
 	usbhid->bufsize = HID_MIN_BUFFER_SIZE;
 	hid_find_max_report(hid, HID_INPUT_REPORT, &usbhid->bufsize);
 	hid_find_max_report(hid, HID_OUTPUT_REPORT, &usbhid->bufsize);
@@ -804,6 +794,7 @@ static int usbhid_start(struct hid_device *hid)
 	if (insize > HID_MAX_BUFFER_SIZE)
 		insize = HID_MAX_BUFFER_SIZE;
 
+	mutex_lock(&usbhid->setup);
 	if (hid_alloc_buffers(dev, hid)) {
 		ret = -ENOMEM;
 		goto fail;
@@ -888,6 +879,9 @@ static int usbhid_start(struct hid_device *hid)
 	usbhid_init_reports(hid);
 	hid_dump_device(hid);
 
+	set_bit(HID_STARTED, &usbhid->iofl);
+	mutex_unlock(&usbhid->setup);
+
 	return 0;
 
 fail:
@@ -895,8 +889,7 @@ fail:
 	usb_free_urb(usbhid->urbout);
 	usb_free_urb(usbhid->urbctrl);
 	hid_free_buffers(dev, hid);
-	kfree(usbhid);
-err:
+	mutex_unlock(&usbhid->setup);
 	return ret;
 }
 
@@ -907,6 +900,8 @@ static void usbhid_stop(struct hid_device *hid)
 	if (WARN_ON(!usbhid))
 		return;
 
+	mutex_lock(&usbhid->setup);
+	clear_bit(HID_STARTED, &usbhid->iofl);
 	spin_lock_irq(&usbhid->inlock);	/* Sync with error handler */
 	set_bit(HID_DISCONNECTED, &usbhid->iofl);
 	spin_unlock_irq(&usbhid->inlock);
@@ -931,8 +926,7 @@ static void usbhid_stop(struct hid_device *hid)
 	usb_free_urb(usbhid->urbout);
 
 	hid_free_buffers(hid_to_usb_dev(hid), hid);
-	kfree(usbhid);
-	hid->driver_data = NULL;
+	mutex_unlock(&usbhid->setup);
 }
 
 static struct hid_ll_driver usb_hid_driver = {
@@ -947,6 +941,7 @@ static struct hid_ll_driver usb_hid_driver = {
 static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id)
 {
 	struct usb_device *dev = interface_to_usbdev(intf);
+	struct usbhid_device *usbhid;
 	struct hid_device *hid;
 	size_t len;
 	int ret;
@@ -1000,14 +995,26 @@ static int hid_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	if (usb_string(dev, dev->descriptor.iSerialNumber, hid->uniq, 64) <= 0)
 		hid->uniq[0] = 0;
 
+	usbhid = kzalloc(sizeof(*usbhid), GFP_KERNEL);
+	if (usbhid == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	hid->driver_data = usbhid;
+	usbhid->hid = hid;
+	mutex_init(&usbhid->setup); /* needed on suspend/resume */
+
 	ret = hid_add_device(hid);
 	if (ret) {
 		if (ret != -ENODEV)
 			dev_err(&intf->dev, "can't add hid device: %d\n", ret);
-		goto err;
+		goto err_free;
 	}
 
 	return 0;
+err_free:
+	kfree(usbhid);
 err:
 	hid_destroy_device(hid);
 	return ret;
@@ -1016,11 +1023,14 @@ err:
 static void hid_disconnect(struct usb_interface *intf)
 {
 	struct hid_device *hid = usb_get_intfdata(intf);
+	struct usbhid_device *usbhid;
 
 	if (WARN_ON(!hid))
 		return;
 
+	usbhid = hid->driver_data;
 	hid_destroy_device(hid);
+	kfree(usbhid);
 }
 
 static int hid_suspend(struct usb_interface *intf, pm_message_t message)
@@ -1028,11 +1038,18 @@ static int hid_suspend(struct usb_interface *intf, pm_message_t message)
 	struct hid_device *hid = usb_get_intfdata (intf);
 	struct usbhid_device *usbhid = hid->driver_data;
 
+	mutex_lock(&usbhid->setup);
+	if (!test_bit(HID_STARTED, &usbhid->iofl)) {
+		mutex_unlock(&usbhid->setup);
+		return 0;
+	}
+
 	spin_lock_irq(&usbhid->inlock);	/* Sync with error handler */
 	set_bit(HID_SUSPENDED, &usbhid->iofl);
 	spin_unlock_irq(&usbhid->inlock);
 	del_timer(&usbhid->io_retry);
 	usb_kill_urb(usbhid->urbin);
+	mutex_unlock(&usbhid->setup);
 	dev_dbg(&intf->dev, "suspend\n");
 	return 0;
 }
@@ -1043,9 +1060,16 @@ static int hid_resume(struct usb_interface *intf)
 	struct usbhid_device *usbhid = hid->driver_data;
 	int status;
 
+	mutex_lock(&usbhid->setup);
+	if (!test_bit(HID_STARTED, &usbhid->iofl)) {
+		mutex_unlock(&usbhid->setup);
+		return 0;
+	}
+
 	clear_bit(HID_SUSPENDED, &usbhid->iofl);
 	usbhid->retry_delay = 0;
 	status = hid_start_in(hid);
+	mutex_unlock(&usbhid->setup);
 	dev_dbg(&intf->dev, "resume status %d\n", status);
 	return status;
 }
diff --git a/drivers/hid/usbhid/usbhid.h b/drivers/hid/usbhid/usbhid.h
index abedb13c623e..55973ff54008 100644
--- a/drivers/hid/usbhid/usbhid.h
+++ b/drivers/hid/usbhid/usbhid.h
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
@@ -73,6 +74,7 @@ struct usbhid_device {
 	dma_addr_t outbuf_dma;                                          /* Output buffer dma */
 	spinlock_t outlock;                                             /* Output fifo spinlock */
 
+	struct mutex setup;
 	unsigned long iofl;                                             /* I/O flags (CTRL_RUNNING, OUT_RUNNING) */
 	struct timer_list io_retry;                                     /* Retry timer */
 	unsigned long stop_retry;                                       /* Time to give up, in jiffies */
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 5355ca4b939e..e5780f8c934a 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -410,6 +410,7 @@ struct hid_output_fifo {
 #define HID_SUSPENDED		5
 #define HID_CLEAR_HALT		6
 #define HID_DISCONNECTED	7
+#define HID_STARTED		8
 
 struct hid_input {
 	struct list_head list;
-- 
cgit v1.2.3


From 5550af4df179e52753d3a43a788a113ad8cd95cd Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 15 Oct 2008 20:15:06 +0800
Subject: KVM: Fix guest shared interrupt with in-kernel irqchip

Every call of kvm_set_irq() should offer an irq_source_id, which is
allocated by kvm_request_irq_source_id(). Based on irq_source_id, we
identify the irq source and implement logical OR for shared level
interrupts.

The allocated irq_source_id can be freed by kvm_free_irq_source_id().

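Currently, we support at most sizeof(unsigned long) different irq sources
(that is, the size of the bitmap word in bytes, which is the bound the
allocator checks, not the number of bits in the word).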

[Amit: - rebase to kvm.git HEAD
       - move definition of KVM_USERSPACE_IRQ_SOURCE_ID to common file
       - move kvm_request_irq_source_id to the update_irq ioctl]

[Xiantao: - Add kvm/ia64 stuff and make it work for kvm/ia64 guests]

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  3 +++
 arch/ia64/kvm/kvm-ia64.c         |  8 +++++---
 arch/x86/include/asm/kvm_host.h  |  3 +++
 arch/x86/kvm/i8254.c             | 11 +++++++++--
 arch/x86/kvm/i8254.h             |  1 +
 arch/x86/kvm/x86.c               |  6 +++++-
 include/linux/kvm_host.h         |  7 ++++++-
 virt/kvm/irq_comm.c              | 42 +++++++++++++++++++++++++++++++++++++---
 virt/kvm/kvm_main.c              | 12 ++++++++----
 9 files changed, 79 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 85db124d37f6..04c0b88f7b3a 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -417,6 +417,9 @@ struct kvm_arch {
 	struct list_head assigned_dev_head;
 	struct dmar_domain *intel_iommu_domain;
 	struct hlist_head irq_ack_notifier_list;
+
+	unsigned long irq_sources_bitmap;
+	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
 };
 
 union cpuid3_t {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index a312c9e9b9ef..8a2b13ff0aff 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -778,6 +778,9 @@ static void kvm_init_vm(struct kvm *kvm)
 	kvm_build_io_pmt(kvm);
 
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+
+	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 }
 
 struct  kvm *kvm_arch_create_vm(void)
@@ -941,9 +944,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			mutex_lock(&kvm->lock);
-			kvm_ioapic_set_irq(kvm->arch.vioapic,
-						irq_event.irq,
-						irq_event.level);
+			kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+				    irq_event.irq, irq_event.level);
 			mutex_unlock(&kvm->lock);
 			r = 0;
 		}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 65679d006337..8346be87cfa1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -364,6 +364,9 @@ struct kvm_arch{
 
 	struct page *ept_identity_pagetable;
 	bool ept_identity_pagetable_done;
+
+	unsigned long irq_sources_bitmap;
+	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11c6725fb798..8772dc946823 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -545,6 +545,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	if (!pit)
 		return NULL;
 
+	mutex_lock(&kvm->lock);
+	pit->irq_source_id = kvm_request_irq_source_id(kvm);
+	mutex_unlock(&kvm->lock);
+	if (pit->irq_source_id < 0)
+		return NULL;
+
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
 	spin_lock_init(&pit->pit_state.inject_lock);
@@ -587,6 +593,7 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
+		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 		kfree(kvm->arch.vpit);
 	}
@@ -595,8 +602,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
 	mutex_lock(&kvm->lock);
-	kvm_set_irq(kvm, 0, 1);
-	kvm_set_irq(kvm, 0, 0);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
 	mutex_unlock(&kvm->lock);
 }
 
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index e436d4983aa1..4178022b97aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -44,6 +44,7 @@ struct kvm_pit {
 	struct kvm_io_device speaker_dev;
 	struct kvm *kvm;
 	struct kvm_kpit_state pit_state;
+	int irq_source_id;
 };
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f0677d1eae8..f1f8ff2f1fa2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1742,7 +1742,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			mutex_lock(&kvm->lock);
-			kvm_set_irq(kvm, irq_event.irq, irq_event.level);
+			kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+				    irq_event.irq, irq_event.level);
 			mutex_unlock(&kvm->lock);
 			r = 0;
 		}
@@ -4013,6 +4014,9 @@ struct  kvm *kvm_arch_create_vm(void)
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
+	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+
 	return kvm;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3833c48fae3a..bb92be2153bc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -37,6 +37,8 @@
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
 
+#define KVM_USERSPACE_IRQ_SOURCE_ID	0
+
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
@@ -306,15 +308,18 @@ struct kvm_assigned_dev_kernel {
 	int host_irq;
 	int guest_irq;
 	int irq_requested;
+	int irq_source_id;
 	struct pci_dev *dev;
 	struct kvm *kvm;
 };
-void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 				     struct kvm_irq_ack_notifier *kian);
+int kvm_request_irq_source_id(struct kvm *kvm);
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
 #ifdef CONFIG_DMAR
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d0169f5e6047..55ad76ee2d09 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -25,15 +25,23 @@
 #include "ioapic.h"
 
 /* This should be called with the kvm->lock mutex held */
-void kvm_set_irq(struct kvm *kvm, int irq, int level)
+void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
 {
+	unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
+
+	/* Logical OR for level trig interrupt */
+	if (level)
+		set_bit(irq_source_id, irq_state);
+	else
+		clear_bit(irq_source_id, irq_state);
+
 	/* Not possible to detect if the guest uses the PIC or the
 	 * IOAPIC.  So set the bit in both. The guest will ignore
 	 * writes to the unused one.
 	 */
-	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
+	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, !!(*irq_state));
 #ifdef CONFIG_X86
-	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
+	kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state));
 #endif
 }
 
@@ -58,3 +66,31 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 {
 	hlist_del(&kian->link);
 }
+
+/* The caller must hold kvm->lock mutex */
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+	int irq_source_id = find_first_zero_bit(bitmap,
+				sizeof(kvm->arch.irq_sources_bitmap));
+	if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+		printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+		irq_source_id = -EFAULT;
+	} else
+		set_bit(irq_source_id, bitmap);
+	return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+	int i;
+
+	if (irq_source_id <= 0 ||
+	    irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+		printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+		return;
+	}
+	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+		clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
+	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cf0ab8ed3845..a87f45edfae8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -105,14 +105,12 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 */
 	mutex_lock(&assigned_dev->kvm->lock);
 	kvm_set_irq(assigned_dev->kvm,
+		    assigned_dev->irq_source_id,
 		    assigned_dev->guest_irq, 1);
 	mutex_unlock(&assigned_dev->kvm->lock);
 	kvm_put_kvm(assigned_dev->kvm);
 }
 
-/* FIXME: Implement the OR logic needed to make shared interrupts on
- * this line behave properly
- */
 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev =
@@ -134,7 +132,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 	dev = container_of(kian, struct kvm_assigned_dev_kernel,
 			   ack_notifier);
-	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
+	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 	enable_irq(dev->host_irq);
 }
 
@@ -146,6 +144,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
 	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 
 	if (cancel_work_sync(&assigned_dev->interrupt_work))
 		/* We had pending work. That means we will have to take
@@ -215,6 +214,11 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		match->ack_notifier.gsi = assigned_irq->guest_irq;
 		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
 		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+		r = kvm_request_irq_source_id(kvm);
+		if (r < 0)
+			goto out_release;
+		else
+			match->irq_source_id = r;
 
 		/* Even though this is PCI, we don't want to use shared
 		 * interrupts. Sharing host devices with guest-assigned devices
-- 
cgit v1.2.3


From bb45e202e695dea8657bb03a01d1522c37558672 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 19 Oct 2008 16:39:45 +0200
Subject: KVM: Future-proof device assignment ABI

Reserve some space so we can add more data.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 797fcd781242..f18b86fa8655 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -489,6 +489,9 @@ struct kvm_assigned_pci_dev {
 	__u32 busnr;
 	__u32 devfn;
 	__u32 flags;
+	union {
+		__u32 reserved[12];
+	};
 };
 
 struct kvm_assigned_irq {
@@ -496,6 +499,9 @@ struct kvm_assigned_irq {
 	__u32 host_irq;
 	__u32 guest_irq;
 	__u32 flags;
+	union {
+		__u32 reserved[12];
+	};
 };
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-- 
cgit v1.2.3


From 8175fe2dda1c93a9c596921c8ed4a0b4baccdefe Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@suse.de>
Date: Sun, 26 Oct 2008 00:30:18 +0200
Subject: HID: fix hid_device_id for cross compiling

struct hid_device_id contains hidden padding which is bad for cross
compiling.  Make the padding explicit and consistent across
architectures.

Signed-off-by: Andreas Schwab <schwab@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/mod_devicetable.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index eb71b45fdf5a..97b91d1abb43 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -135,6 +135,7 @@ struct usb_device_id {
 
 struct hid_device_id {
 	__u16 bus;
+	__u16 pad1;
 	__u32 vendor;
 	__u32 product;
 	kernel_ulong_t driver_data
-- 
cgit v1.2.3


From 3f5e26cee443eb4d3900cd3085664c3e51b72135 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sat, 25 Oct 2008 15:02:51 -0700
Subject: adjust init section definitions

Add rodata equivalents for assembly use, and fix the section attributes
used by __REFCONST.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 include/linux/init.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 0c1264668be0..68cb0265d009 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -112,21 +112,25 @@
 #define __FINIT		.previous
 
 #define __INITDATA	.section	".init.data","aw"
+#define __INITRODATA	.section	".init.rodata","a"
 #define __FINITDATA	.previous
 
 #define __DEVINIT        .section	".devinit.text", "ax"
 #define __DEVINITDATA    .section	".devinit.data", "aw"
+#define __DEVINITRODATA  .section	".devinit.rodata", "a"
 
 #define __CPUINIT        .section	".cpuinit.text", "ax"
 #define __CPUINITDATA    .section	".cpuinit.data", "aw"
+#define __CPUINITRODATA  .section	".cpuinit.rodata", "a"
 
 #define __MEMINIT        .section	".meminit.text", "ax"
 #define __MEMINITDATA    .section	".meminit.data", "aw"
+#define __MEMINITRODATA  .section	".meminit.rodata", "a"
 
 /* silence warnings when references are OK */
 #define __REF            .section       ".ref.text", "ax"
 #define __REFDATA        .section       ".ref.data", "aw"
-#define __REFCONST       .section       ".ref.rodata", "aw"
+#define __REFCONST       .section       ".ref.rodata", "a"
 
 #ifndef __ASSEMBLY__
 /*
-- 
cgit v1.2.3


From 0833422274ff00729a603b020fac297e69a03e40 Mon Sep 17 00:00:00 2001
From: Kurt Garloff <garloff@suse.de>
Date: Wed, 29 Oct 2008 14:00:48 -0700
Subject: mm: increase the default mlock limit from 32k to 64k

By default, non-privileged tasks can only mlock() a small amount of
memory to avoid a DoS attack by ordinary users.  The Linux kernel
defaulted to 32k (on a 4k page size system) to accommodate the needs of
gpg.

However, newer gpg2 needs 64k in various circumstances and otherwise
fails miserably, see bnc#329675.

Change the default to 64k, and make it more agnostic to PAGE_SIZE.

Signed-off-by: Kurt Garloff <garloff@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/resource.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index aaa423a6f3d9..40fc7e626082 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -59,10 +59,10 @@ struct rlimit {
 #define _STK_LIM	(8*1024*1024)
 
 /*
- * GPG wants 32kB of mlocked memory, to make sure pass phrases
+ * GPG2 wants 64kB of mlocked memory, to make sure pass phrases
  * and other sensitive information are never written to disk.
  */
-#define MLOCK_LIMIT	(8 * PAGE_SIZE)
+#define MLOCK_LIMIT	((PAGE_SIZE > 64*1024) ? PAGE_SIZE : 64*1024)
 
 /*
  * Due to binary compatibility, the actual resource numbers
-- 
cgit v1.2.3


From 00c2e63c31d0f431952ff2a671c5c6997dd4f8b2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 29 Oct 2008 14:00:53 -0700
Subject: freezer_cg: use thaw_process() in unfreeze_cgroup()

Don't duplicate the implementation of thaw_process().

[akpm@linux-foundation.org: make __thaw_process() static]
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/freezer.h |  5 -----
 kernel/cgroup_freezer.c | 15 ++++-----------
 kernel/freezer.c        | 20 ++++++++++----------
 3 files changed, 14 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 8f225339eee9..5a361f85cfec 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -44,11 +44,6 @@ static inline bool should_send_signal(struct task_struct *p)
 	return !(p->flags & PF_FREEZER_NOSIG);
 }
 
-/*
- * Wake up a frozen process
- */
-extern int __thaw_process(struct task_struct *p);
-
 /* Takes and releases task alloc lock using task_lock() */
 extern int thaw_process(struct task_struct *p);
 
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e9c856a265c9..5e6d26b66e88 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -275,25 +275,18 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	return num_cant_freeze_now ? -EBUSY : 0;
 }
 
-static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 {
 	struct cgroup_iter it;
 	struct task_struct *task;
 
 	cgroup_iter_start(cgroup, &it);
 	while ((task = cgroup_iter_next(cgroup, &it))) {
-		int do_wake;
-
-		task_lock(task);
-		do_wake = __thaw_process(task);
-		task_unlock(task);
-		if (do_wake)
-			wake_up_process(task);
+		thaw_process(task);
 	}
 	cgroup_iter_end(cgroup, &it);
-	freezer->state = CGROUP_THAWED;
 
-	return 0;
+	freezer->state = CGROUP_THAWED;
 }
 
 static int freezer_change_state(struct cgroup *cgroup,
@@ -320,7 +313,7 @@ static int freezer_change_state(struct cgroup *cgroup,
 		}
 		/* state == FREEZING and goal_state == THAWED, so unfreeze */
 	case CGROUP_FROZEN:
-		retval = unfreeze_cgroup(cgroup, freezer);
+		unfreeze_cgroup(cgroup, freezer);
 		break;
 	default:
 		break;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index ba6248b323ef..2f4936cf7083 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -121,16 +121,7 @@ void cancel_freezing(struct task_struct *p)
 	}
 }
 
-/*
- * Wake up a frozen process
- *
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails.  Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
- */
-int __thaw_process(struct task_struct *p)
+static int __thaw_process(struct task_struct *p)
 {
 	if (frozen(p)) {
 		p->flags &= ~PF_FROZEN;
@@ -140,6 +131,15 @@ int __thaw_process(struct task_struct *p)
 	return 0;
 }
 
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails.  Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
 int thaw_process(struct task_struct *p)
 {
 	task_lock(p);
-- 
cgit v1.2.3


From 9b913735e53ab0da4a792bac0de8e178cc13dcfb Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 29 Oct 2008 14:00:54 -0700
Subject: cgroups: tiny cleanups

- remove 'private' field from struct subsys
- remove cgroup_init_smp()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8b00f6643e93..1164963c3a85 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,7 +25,6 @@ struct cgroup;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
-extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
 extern void cgroup_unlock(void);
@@ -348,8 +347,6 @@ struct cgroup_subsys {
 	struct cgroupfs_root *root;
 
 	struct list_head sibling;
-
-	void *private;
 };
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
@@ -410,7 +407,6 @@ void cgroup_mm_owner_callbacks(struct task_struct *old,
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
-static inline void cgroup_init_smp(void) {}
 static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
-- 
cgit v1.2.3


From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 29 Oct 2008 14:00:55 -0700
Subject: fs: remove prepare_write/commit_write

Nothing uses prepare_write or commit_write. Remove them from the tree
completely.

[akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |  12 +-
 Documentation/filesystems/vfs.txt |  39 +-----
 drivers/block/loop.c              |   5 +-
 fs/fat/inode.c                    |   2 +-
 fs/libfs.c                        |   2 +-
 fs/ocfs2/file.c                   |   3 +-
 fs/splice.c                       |   4 +-
 include/linux/fs.h                |   7 --
 mm/filemap.c                      | 242 +-------------------------------------
 9 files changed, 23 insertions(+), 293 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 8362860e21a7..23d2f4460deb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -161,8 +161,12 @@ prototypes:
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -180,8 +184,6 @@ sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes			yes
-commit_write:		no	yes			yes
 write_begin:		no	locks the page		yes
 write_end:		no	yes, unlocks		yes
 perform_write:		no	n/a			yes
@@ -191,7 +193,7 @@ releasepage:		no	yes
 direct_IO:		no
 launder_page:		no	yes
 
-	->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 may be called from the request handler (/dev/loop).
 
 	->readpage() unlocks the page, either synchronously or via I/O
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c4d348dabe94..5579bda58a6d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -492,7 +492,7 @@ written-back to storage typically in whole pages, however the
 address_space has finer control of write sizes.
 
 The read process essentially only requires 'readpage'.  The write
-process is more complicated and uses prepare_write/commit_write or
+process is more complicated and uses write_begin/write_end or
 set_page_dirty to write data into the address_space, and writepage,
 sync_page, and writepages to writeback data to storage.
 
@@ -521,8 +521,6 @@ struct address_space_operations {
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
@@ -598,37 +596,7 @@ struct address_space_operations {
 	readpages is only used for read-ahead, so read errors are
   	ignored.  If anything goes wrong, feel free to give up.
 
-  prepare_write: called by the generic write path in VM to set up a write
-  	request for a page.  This indicates to the address space that
-  	the given range of bytes is about to be written.  The
-  	address_space should check that the write will be able to
-  	complete, by allocating space if necessary and doing any other
-  	internal housekeeping.  If the write will update parts of
-  	any basic-blocks on storage, then those blocks should be
-  	pre-read (if they haven't been read already) so that the
-  	updated blocks can be written out properly.
-	The page will be locked.
-
-	Note: the page _must not_ be marked uptodate in this function
-	(or anywhere else) unless it actually is uptodate right now. As
-	soon as a page is marked uptodate, it is possible for a concurrent
-	read(2) to copy it to userspace.
-
-  commit_write: If prepare_write succeeds, new data will be copied
-        into the page and then commit_write will be called.  It will
-        typically update the size of the file (if appropriate) and
-        mark the inode as dirty, and do any other related housekeeping
-        operations.  It should avoid returning an error if possible -
-        errors should have been handled by prepare_write.
-
-  write_begin: This is intended as a replacement for prepare_write. The
-	key differences being that:
-		- it returns a locked page (in *pagep) rather than being
-		  given a pre locked page;
-		- it must be able to cope with short writes (where the
-		  length passed to write_begin is greater than the number
-		  of bytes copied into the page).
-
+  write_begin:
 	Called by the generic buffered write code to ask the filesystem to
 	prepare to write len bytes at the given offset in the file. The
 	address_space should check that the write will be able to complete,
@@ -640,6 +608,9 @@ struct address_space_operations {
         The filesystem must return the locked pagecache page for the specified
 	offset, in *pagep, for the caller to write into.
 
+	It must be able to cope with short writes (where the length passed to
+	write_begin is greater than the number of bytes copied into the page).
+
 	flags is a field for AOP_FLAG_xxx flags, described in
 	include/linux/fs.h.
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3f09cd8bcc38..5c4ee70d5cf3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -40,8 +40,7 @@
  * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  *
  * Support for falling back on the write file operation when the address space
- * operations prepare_write and/or commit_write are not available on the
- * backing filesystem.
+ * operation write_begin is not available on the backing filesystem.
  * Anton Altaparmakov, 16 Feb 2005
  *
  * Still To Fix:
@@ -765,7 +764,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		 */
 		if (!file->f_op->splice_read)
 			goto out_putf;
-		if (aops->prepare_write || aops->write_begin)
+		if (aops->write_begin)
 			lo_flags |= LO_FLAGS_USE_AOPS;
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 			lo_flags |= LO_FLAGS_READ_ONLY;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c379..2b2eec1283bf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 	if (rw == WRITE) {
 		/*
-		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
 		 * so we need to update the ->mmu_private to block boundary.
 		 *
 		 * But we must fill the remaining area or hole by nul for
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf7..e960a8321902 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a78073..7efe937a415f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -679,8 +679,7 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c27156..1abab5cee4ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	};
 
 	/*
-	 * The actor worker might be calling ->prepare_write and
-	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * The actor worker might be calling ->write_begin and
+	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
 	 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b248d61430c..0dcdd9458f4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -489,13 +489,6 @@ struct address_space_operations {
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 
-	/*
-	 * ext3 requires that a successful prepare_write() call be followed
-	 * by a commit_write() call - they must be balanced
-	 */
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
-
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
diff --git a/mm/filemap.c b/mm/filemap.c
index ab8553658af3..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 				struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
 
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
@@ -2226,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-								offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock).  So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-								offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
-- 
cgit v1.2.3


From 7106a27b52940085c2c3f6e42742d3a2a84d872a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 29 Oct 2008 14:01:15 -0700
Subject: kernel.h: fix might_sleep kernel-doc

Put the kernel-doc for might_sleep() _immediately_ before the macro
(no intervening lines).  Otherwise kernel-doc complains like so:

Warning(linux-2.6.27-rc3-git2//include/linux/kernel.h:129): No description found for parameter 'file'
Warning(linux-2.6.27-rc3-git2//include/linux/kernel.h:129): No description found for parameter 'line'

because kernel-doc is looking at the wrong function prototype (i.e.,
__might_sleep).  [Yes, I have a todo note to myself to check/warn for that
inconsistency in scripts/kernel-doc.]

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: <Uwe.Kleine-Koenig@digi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 396a350b87a6..fba141d3ca07 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -116,6 +116,8 @@ extern int _cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+  void __might_sleep(char *file, int line);
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@ -126,8 +128,6 @@ extern int _cond_resched(void);
  * be bitten later when the calling function happens to sleep when it is not
  * supposed to.
  */
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-  void __might_sleep(char *file, int line);
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
 #else
-- 
cgit v1.2.3


From 731572d39fcd3498702eda4600db4c43d51e0b26 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Wed, 29 Oct 2008 14:01:20 -0700
Subject: nfsd: fix vm overcommit crash

Junjiro R.  Okajima reported a problem where knfsd crashes if you are
using it to export shmemfs objects and run strict overcommit.  In this
situation the current->mm based modifier to the overcommit goes through a
NULL pointer.

We could simply check for NULL and skip the modifier but we've caught
other real bugs in the past from mm being NULL here - cases where we did
need a valid mm set up (eg the exec bug about a year ago).

To preserve the checks and get the logic we want shuffle the checking
around and add a new helper to the vm_ security wrappers

Also fix a current->mm reference in nommu that should use the passed mm

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix build]
Reported-by: Junjiro R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/security.h | 6 ++++++
 mm/mmap.c                | 3 ++-
 mm/nommu.c               | 3 ++-
 mm/shmem.c               | 8 ++++----
 security/security.c      | 9 +++++++++
 5 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index f5c4a51eb42e..c13f1cec9abb 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1585,6 +1585,7 @@ int security_syslog(int type);
 int security_settime(struct timespec *ts, struct timezone *tz);
 int security_vm_enough_memory(long pages);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
+int security_vm_enough_memory_kern(long pages);
 int security_bprm_alloc(struct linux_binprm *bprm);
 void security_bprm_free(struct linux_binprm *bprm);
 void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
@@ -1820,6 +1821,11 @@ static inline int security_vm_enough_memory(long pages)
 	return cap_vm_enough_memory(current->mm, pages);
 }
 
+static inline int security_vm_enough_memory_kern(long pages)
+{
+	return cap_vm_enough_memory(current->mm, pages);
+}
+
 static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
 	return cap_vm_enough_memory(mm, pages);
diff --git a/mm/mmap.c b/mm/mmap.c
index 74f4d158022e..de14ac21e5b5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -175,7 +175,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 
 	/* Don't let a single process grow too big:
 	   leave 3% of the size of this process for other processes */
-	allowed -= mm->total_vm / 32;
+	if (mm)
+		allowed -= mm->total_vm / 32;
 
 	/*
 	 * cast `allowed' as a signed long because vm_committed_space
diff --git a/mm/nommu.c b/mm/nommu.c
index 2696b24f2bb3..7695dc850785 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1454,7 +1454,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 
 	/* Don't let a single process grow too big:
 	   leave 3% of the size of this process for other processes */
-	allowed -= current->mm->total_vm / 32;
+	if (mm)
+		allowed -= mm->total_vm / 32;
 
 	/*
 	 * cast `allowed' as a signed long because vm_committed_space
diff --git a/mm/shmem.c b/mm/shmem.c
index d38d7e61fcd0..0ed075215e5f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -161,8 +161,8 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  */
 static inline int shmem_acct_size(unsigned long flags, loff_t size)
 {
-	return (flags & VM_ACCOUNT)?
-		security_vm_enough_memory(VM_ACCT(size)): 0;
+	return (flags & VM_ACCOUNT) ?
+		security_vm_enough_memory_kern(VM_ACCT(size)) : 0;
 }
 
 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -179,8 +179,8 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
  */
 static inline int shmem_acct_block(unsigned long flags)
 {
-	return (flags & VM_ACCOUNT)?
-		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
+	return (flags & VM_ACCOUNT) ?
+		0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE));
 }
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
diff --git a/security/security.c b/security/security.c
index 255b08559b2b..c0acfa7177e5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -198,14 +198,23 @@ int security_settime(struct timespec *ts, struct timezone *tz)
 
 int security_vm_enough_memory(long pages)
 {
+	WARN_ON(current->mm == NULL);
 	return security_ops->vm_enough_memory(current->mm, pages);
 }
 
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
+	WARN_ON(mm == NULL);
 	return security_ops->vm_enough_memory(mm, pages);
 }
 
+int security_vm_enough_memory_kern(long pages)
+{
+	/* If current is a kernel thread then current->mm is NULL and we pass
+	   NULL; for this specific case that is fine */
+	return security_ops->vm_enough_memory(current->mm, pages);
+}
+
 int security_bprm_alloc(struct linux_binprm *bprm)
 {
 	return security_ops->bprm_alloc_security(bprm);
-- 
cgit v1.2.3


From effdb9492de01a51f8123e62e87e3330688f9bf1 Mon Sep 17 00:00:00 2001
From: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Date: Wed, 29 Oct 2008 14:01:21 -0700
Subject: spi: fix compile error

Fix compile error below:

     LD      drivers/spi/built-in.o
     CC [M]  drivers/spi/spi_gpio.o
   In file included from drivers/spi/spi_gpio.c:26:
   include/linux/spi/spi_bitbang.h:23: error: field `work' has incomplete type
   make[2]: *** [drivers/spi/spi_gpio.o] Error 1
   make[1]: *** [drivers/spi] Error 2
   make: *** [drivers] Error 2

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spi/spi_bitbang.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index b8db32cea1de..bf8de281b4ed 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -18,6 +18,9 @@
  *	duplex (MicroWire) controllers.  Provide chipslect() and txrx_bufs(),
  *	and custom setup()/cleanup() methods.
  */
+
+#include <linux/workqueue.h>
+
 struct spi_bitbang {
 	struct workqueue_struct	*workqueue;
 	struct work_struct	work;
-- 
cgit v1.2.3


From c132419e560a2ecd3c8cf77f9c37e103e74b3754 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@freescale.com>
Date: Thu, 30 Oct 2008 18:17:06 -0700
Subject: gianfar: Fix race in TBI/SerDes configuration

The init_phy() function attaches to the PHY, then configures the
SerDes<->TBI link (in SGMII mode).  The TBI is on the MDIO bus with the PHY
(sort of) and is accessed via the gianfar's MDIO registers, using the
functions gfar_local_mdio_read/write(), which don't do any locking.

The previously attached PHY will start a work-queue on a timer, and
probably an irq handler as well, which will talk to the PHY and thus use
the MDIO bus.  This uses phy_read/write(), which have locking, but not
against the gfar_local_mdio versions.

The result is that PHY code will try to use the MDIO bus at the same time
as the SerDes setup code, corrupting the transfers.

Setting up the SerDes before attaching to the PHY will ensure that there is
no race between the SerDes code and *our* PHY, but doesn't fix everything.
Typically the PHYs for all gianfar devices are on the same MDIO bus, which
is associated with the first gianfar device.  This means that the first
gianfar's SerDes code could corrupt the MDIO transfers for a different
gianfar's PHY.

The lock used by phy_read/write() is contained in the mii_bus structure,
which is pointed to by the PHY.  This is difficult to access from the
gianfar drivers, as there is no link between a gianfar device and the
mii_bus which shares the same MDIO registers.  As far as the device layer
and drivers are concerned they are two unrelated devices (which happen to
share registers).

Generally all gianfar devices' PHYs will be on the bus associated with the
first gianfar.  But this might not be the case, so simply locking the
gianfar's PHY's mii bus might not lock the mii bus that the SerDes setup
code is going to use.

We solve this by having the code that creates the gianfar platform device
look in the device tree for an mdio device that shares the gianfar's
registers.  If one is found the ID of its platform device is saved in the
gianfar's platform data.

A new function in the gianfar mii code, gfar_get_miibus(), can use the bus
ID to search through the platform devices for a gianfar_mdio device with
the right ID.  The platform device's driver data is the mii_bus structure,
which the SerDes setup code can use to lock the current bus.

Signed-off-by: Trent Piepho <tpiepho@freescale.com>
CC: Andy Fleming <afleming@freescale.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 arch/powerpc/sysdev/fsl_soc.c | 26 ++++++++++++++++++++++++++
 drivers/net/gianfar.c         |  7 +++++++
 drivers/net/gianfar_mii.c     | 21 +++++++++++++++++++++
 drivers/net/gianfar_mii.h     |  3 +++
 include/linux/fsl_devices.h   |  3 ++-
 5 files changed, 59 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index 01b884b25696..26ecb96f9731 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -223,6 +223,8 @@ static int gfar_mdio_of_init_one(struct device_node *np)
 	if (ret)
 		return ret;
 
+	/* The gianfar device will try to use the same ID created below to find
+	 * this bus, to coordinate register access (since they share).  */
 	mdio_dev = platform_device_register_simple("fsl-gianfar_mdio",
 			res.start&0xfffff, &res, 1);
 	if (IS_ERR(mdio_dev))
@@ -394,6 +396,30 @@ static int __init gfar_of_init(void)
 			of_node_put(mdio);
 		}
 
+		/* Get MDIO bus controlled by this eTSEC, if any.  Normally only
+		 * eTSEC 1 will control an MDIO bus, not necessarily the same
+		 * bus that its PHY is on ('mdio' above), so we can't just use
+		 * that.  What we do is look for a gianfar mdio device that has
+		 * overlapping registers with this device.  That's really the
+		 * whole point, to find the device sharing our registers to
+		 * coordinate access with it.
+		 */
+		for_each_compatible_node(mdio, NULL, "fsl,gianfar-mdio") {
+			if (of_address_to_resource(mdio, 0, &res))
+				continue;
+
+			if (res.start >= r[0].start && res.end <= r[0].end) {
+				/* Get the ID the mdio bus platform device was
+				 * registered with.  gfar_data.bus_id is
+				 * different because it's for finding a PHY,
+				 * while this is for finding a MII bus.
+				 */
+				gfar_data.mdio_bus = res.start&0xfffff;
+				of_node_put(mdio);
+				break;
+			}
+		}
+
 		ret =
 		    platform_device_add_data(gfar_dev, &gfar_data,
 					     sizeof(struct
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 64b201134fdb..249541a1814b 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -586,6 +586,10 @@ static void gfar_configure_serdes(struct net_device *dev)
 	struct gfar_mii __iomem *regs =
 			(void __iomem *)&priv->regs->gfar_mii_regs;
 	int tbipa = gfar_read(&priv->regs->tbipa);
+	struct mii_bus *bus = gfar_get_miibus(priv);
+
+	if (bus)
+		mutex_lock(&bus->mdio_lock);
 
 	/* Single clk mode, mii mode off(for serdes communication) */
 	gfar_local_mdio_write(regs, tbipa, MII_TBICON, TBICON_CLK_SELECT);
@@ -596,6 +600,9 @@ static void gfar_configure_serdes(struct net_device *dev)
 
 	gfar_local_mdio_write(regs, tbipa, MII_BMCR, BMCR_ANENABLE |
 			BMCR_ANRESTART | BMCR_FULLDPLX | BMCR_SPEED1000);
+
+	if (bus)
+		mutex_unlock(&bus->mdio_lock);
 }
 
 static void init_registers(struct net_device *dev)
diff --git a/drivers/net/gianfar_mii.c b/drivers/net/gianfar_mii.c
index bf73eea98010..0e2595d24933 100644
--- a/drivers/net/gianfar_mii.c
+++ b/drivers/net/gianfar_mii.c
@@ -269,6 +269,27 @@ static struct device_driver gianfar_mdio_driver = {
 	.remove = gfar_mdio_remove,
 };
 
+static int match_mdio_bus(struct device *dev, void *data)
+{
+	const struct gfar_private *priv = data;
+	const struct platform_device *pdev = to_platform_device(dev);
+
+	return !strcmp(pdev->name, gianfar_mdio_driver.name) &&
+		pdev->id == priv->einfo->mdio_bus;
+}
+
+/* Given a gfar_priv structure, find the mii_bus controlled by this device (not
+ * necessarily the same as the bus the gfar's PHY is on), if one exists.
+ * Normally only the first gianfar controls a mii_bus.  */
+struct mii_bus *gfar_get_miibus(const struct gfar_private *priv)
+{
+	/*const*/ struct device *d;
+
+	d = bus_find_device(gianfar_mdio_driver.bus, NULL, (void *)priv,
+			    match_mdio_bus);
+	return d ? dev_get_drvdata(d) : NULL;
+}
+
 int __init gfar_mdio_init(void)
 {
 	return driver_register(&gianfar_mdio_driver);
diff --git a/drivers/net/gianfar_mii.h b/drivers/net/gianfar_mii.h
index 2af28b16a0e2..02dc970ca1ff 100644
--- a/drivers/net/gianfar_mii.h
+++ b/drivers/net/gianfar_mii.h
@@ -18,6 +18,8 @@
 #ifndef __GIANFAR_MII_H
 #define __GIANFAR_MII_H
 
+struct gfar_private; /* forward ref */
+
 #define MIIMIND_BUSY            0x00000001
 #define MIIMIND_NOTVALID        0x00000004
 
@@ -44,6 +46,7 @@ int gfar_mdio_write(struct mii_bus *bus, int mii_id, int regnum, u16 value);
 int gfar_local_mdio_write(struct gfar_mii __iomem *regs, int mii_id,
 			  int regnum, u16 value);
 int gfar_local_mdio_read(struct gfar_mii __iomem *regs, int mii_id, int regnum);
+struct mii_bus *gfar_get_miibus(const struct gfar_private *priv);
 int __init gfar_mdio_init(void);
 void gfar_mdio_exit(void);
 #endif /* GIANFAR_PHY_H */
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 4e625e0094c8..708bab58d8d0 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -49,7 +49,8 @@ struct gianfar_platform_data {
 	u32	device_flags;
 	/* board specific information */
 	u32	board_flags;
-	char	bus_id[MII_BUS_ID_SIZE];
+	int	mdio_bus;			/* Bus controlled by us */
+	char	bus_id[MII_BUS_ID_SIZE];	/* Bus PHY is on */
 	u32	phy_id;
 	u8	mac_addr[6];
 	phy_interface_t interface;
-- 
cgit v1.2.3


From 9ce8e3073d9cfd6f859c22a25441db41b85cbf6e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 27 Aug 2008 15:23:18 +0200
Subject: libata: add whitelist for devices with known good pata-sata bridges

libata currently imposes a UDMA5 max transfer rate and 200 sector max
transfer size for SATA devices that sit behind a pata-sata bridge. Lots
of devices have known good bridges that don't need this limit applied.
The MTRON SSD disks are such devices. Transfer rates are increased by
20-30% with the restriction removed.

So add a "blacklist" entry for the MTRON devices, with a flag indicating
that the bridge is known good.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c | 7 +++++++
 include/linux/libata.h    | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 8824c8da3f2f..82af7011f2dd 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2158,6 +2158,10 @@ retry:
 static inline u8 ata_dev_knobble(struct ata_device *dev)
 {
 	struct ata_port *ap = dev->link->ap;
+
+	if (ata_dev_blacklisted(dev) & ATA_HORKAGE_BRIDGE_OK)
+		return 0;
+
 	return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
 }
 
@@ -4062,6 +4066,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "TSSTcorp CDDVDW SH-S202N", "SB00",	  ATA_HORKAGE_IVB, },
 	{ "TSSTcorp CDDVDW SH-S202N", "SB01",	  ATA_HORKAGE_IVB, },
 
+	/* Devices that do not need bridging limits applied */
+	{ "MTRON MSP-SATA*",		NULL,	ATA_HORKAGE_BRIDGE_OK, },
+
 	/* End Marker */
 	{ }
 };
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 507f53ef8038..f5441edee55f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -372,6 +372,7 @@ enum {
 	ATA_HORKAGE_IPM		= (1 << 7),	/* Link PM problems */
 	ATA_HORKAGE_IVB		= (1 << 8),	/* cbl det validity bit bugs */
 	ATA_HORKAGE_STUCK_ERR	= (1 << 9),	/* stuck ERR on next PACKET */
+	ATA_HORKAGE_BRIDGE_OK	= (1 << 10),	/* no bridge limits */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From ad1d967c88e349c7e822ad75dd3247a2a50d2ea3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 30 Oct 2008 23:54:35 -0700
Subject: net: delete excess kernel-doc notation

Remove excess kernel-doc function parameters from networking header
& driver files:

Warning(include/net/sock.h:946): Excess function parameter or struct member 'sk' description in 'sk_filter_release'
Warning(include/linux/netdevice.h:1545): Excess function parameter or struct member 'cpu' description in 'netif_tx_lock'
Warning(drivers/net/wan/z85230.c:712): Excess function parameter or struct member 'regs' description in 'z8530_interrupt'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/z85230.c  | 1 -
 include/linux/netdevice.h | 1 -
 include/net/sock.h        | 1 -
 3 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wan/z85230.c b/drivers/net/wan/z85230.c
index ccd9cd35ecbe..5bf7e01ef0e9 100644
--- a/drivers/net/wan/z85230.c
+++ b/drivers/net/wan/z85230.c
@@ -695,7 +695,6 @@ EXPORT_SYMBOL(z8530_nop);
  *	z8530_interrupt - Handle an interrupt from a Z8530
  *	@irq: 	Interrupt number
  *	@dev_id: The Z8530 device that is interrupting.
- *	@regs: unused
  *
  *	A Z85[2]30 device has stuck its hand in the air for attention.
  *	We scan both the channels on the chip for events and then call
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8bcb59adfdf..9d77b1d7dca8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1537,7 +1537,6 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 /**
  *	netif_tx_lock - grab network device transmit lock
  *	@dev: network device
- *	@cpu: cpu number of lock owner
  *
  * Get network device transmit lock
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index ada50c04d09f..c04f9e18ea22 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -936,7 +936,6 @@ extern void sock_init_data(struct socket *sock, struct sock *sk);
 
 /**
  *	sk_filter_release: Release a socket filter
- *	@sk: socket
  *	@fp: filter to remove
  *
  *	Remove a filter from a socket and release its resources.
-- 
cgit v1.2.3


From 9663f2e6a6cf3f82b06d8fb699b11b80f92553ba Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Thu, 30 Oct 2008 19:38:18 -0700
Subject: resources: add io-mapping functions to dynamically map large device
 apertures

Impact: add new generic io_map_*() APIs

Graphics devices have large PCI apertures which would consume a significant
fraction of a 32-bit address space if mapped during driver initialization.
Using ioremap at runtime is impractical as it is too slow.

This new set of interfaces uses atomic mappings on 32-bit processors and a
large static mapping on 64-bit processors to provide reasonable 32-bit
performance and optimal 64-bit performance.

The current implementation sits atop the io_map_atomic fixmap-based
mechanism for 32-bit processors.

This includes some editorial suggestions from Randy Dunlap for
Documentation/io-mapping.txt

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/io-mapping.txt |  76 ++++++++++++++++++++++++++++
 include/linux/io-mapping.h   | 118 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 Documentation/io-mapping.txt
 create mode 100644 include/linux/io-mapping.h

(limited to 'include/linux')

diff --git a/Documentation/io-mapping.txt b/Documentation/io-mapping.txt
new file mode 100644
index 000000000000..cd2f726becc8
--- /dev/null
+++ b/Documentation/io-mapping.txt
@@ -0,0 +1,76 @@
+The io_mapping functions in linux/io-mapping.h provide an abstraction for
+efficiently mapping small regions of an I/O device to the CPU. The initial
+usage is to support the large graphics aperture on 32-bit processors where
+ioremap_wc cannot be used to statically map the entire aperture to the CPU
+as it would consume too much of the kernel address space.
+
+A mapping object is created during driver initialization using
+
+	struct io_mapping *io_mapping_create_wc(unsigned long base,
+						unsigned long size)
+
+		'base' is the bus address of the region to be made
+		mappable, while 'size' indicates how large a mapping region to
+		enable. Both are in bytes.
+
+		This _wc variant provides a mapping which may only be used
+		with the io_mapping_map_atomic_wc or io_mapping_map_wc.
+
+With this mapping object, individual pages can be mapped either atomically
+or not, depending on the necessary scheduling environment. Of course, atomic
+maps are more efficient:
+
+	void *io_mapping_map_atomic_wc(struct io_mapping *mapping,
+				       unsigned long offset)
+
+		'offset' is the offset within the defined mapping region.
+		Accessing addresses beyond the region specified in the
+		creation function yields undefined results. Using an offset
+		which is not page aligned yields an undefined result. The
+		return value points to a single page in CPU address space.
+
+		This _wc variant returns a write-combining map to the
+		page and may only be used with mappings created by
+		io_mapping_create_wc
+
+		Note that the task may not sleep while holding this page
+		mapped.
+
+	void io_mapping_unmap_atomic(void *vaddr)
+
+		'vaddr' must be the value returned by the last
+		io_mapping_map_atomic_wc call. This unmaps the specified
+		page and allows the task to sleep once again.
+
+If you need to sleep while holding the page mapped, you can use the non-atomic
+variant, although it may be significantly slower.
+
+	void *io_mapping_map_wc(struct io_mapping *mapping,
+				unsigned long offset)
+
+		This works like io_mapping_map_atomic_wc except it allows
+		the task to sleep while holding the page mapped.
+
+	void io_mapping_unmap(void *vaddr)
+
+		This works like io_mapping_unmap_atomic, except it is used
+		for pages mapped with io_mapping_map_wc.
+
+At driver close time, the io_mapping object must be freed:
+
+	void io_mapping_free(struct io_mapping *mapping)
+
+Current Implementation:
+
+The initial implementation of these functions uses existing mapping
+mechanisms and so provides only an abstraction layer and no new
+functionality.
+
+On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole
+range, creating a permanent kernel-visible mapping to the resource. The
+map_atomic and map functions add the requested offset to the base of the
+virtual address returned by ioremap_wc.
+
+On 32-bit processors, io_mapping_map_atomic_wc uses iomap_atomic_prot_pfn,
+which uses the fixmaps to get us a mapping to a page in an atomic fashion.
+For io_mapping_map_wc, ioremap_wc() is used to get a mapping of the region.
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
new file mode 100644
index 000000000000..1b566993db6e
--- /dev/null
+++ b/include/linux/io-mapping.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2008 Keith Packard <keithp@keithp.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_IO_MAPPING_H
+#define _LINUX_IO_MAPPING_H
+
+#include <linux/types.h>
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/iomap.h>
+
+/*
+ * The io_mapping mechanism provides an abstraction for mapping
+ * individual pages from an io device to the CPU in an efficient fashion.
+ *
+ * See Documentation/io-mapping.txt
+ */
+
+/* this struct isn't actually defined anywhere */
+struct io_mapping;
+
+#ifdef CONFIG_X86_64
+
+/* Create the io_mapping object*/
+static inline struct io_mapping *
+io_mapping_create_wc(unsigned long base, unsigned long size)
+{
+	return (struct io_mapping *) ioremap_wc(base, size);
+}
+
+static inline void
+io_mapping_free(struct io_mapping *mapping)
+{
+	iounmap(mapping);
+}
+
+/* Atomic map/unmap */
+static inline void *
+io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	return ((char *) mapping) + offset;
+}
+
+static inline void
+io_mapping_unmap_atomic(void *vaddr)
+{
+}
+
+/* Non-atomic map/unmap */
+static inline void *
+io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	return ((char *) mapping) + offset;
+}
+
+static inline void
+io_mapping_unmap(void *vaddr)
+{
+}
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_32
+static inline struct io_mapping *
+io_mapping_create_wc(unsigned long base, unsigned long size)
+{
+	return (struct io_mapping *) base;
+}
+
+static inline void
+io_mapping_free(struct io_mapping *mapping)
+{
+}
+
+/* Atomic map/unmap */
+static inline void *
+io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	offset += (unsigned long) mapping;
+	return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0,
+				     __pgprot(__PAGE_KERNEL_WC));
+}
+
+static inline void
+io_mapping_unmap_atomic(void *vaddr)
+{
+	iounmap_atomic(vaddr, KM_USER0);
+}
+
+static inline void *
+io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	offset += (unsigned long) mapping;
+	return ioremap_wc(offset, PAGE_SIZE);
+}
+
+static inline void
+io_mapping_unmap(void *vaddr)
+{
+	iounmap(vaddr);
+}
+#endif /* CONFIG_X86_32 */
+
+#endif /* _LINUX_IO_MAPPING_H */
-- 
cgit v1.2.3


From 4ac96572f1f6abe44b5e02e80fdfb5a990129613 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Sun, 2 Nov 2008 09:51:27 -0500
Subject: linux/string.h: fix comment typo

s/user/used/

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 810d80df0a1d..d18fc198aa2f 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_STRING_H_
 #define _LINUX_STRING_H_
 
-/* We don't want strings.h stuff being user by user stuff by accident */
+/* We don't want strings.h stuff being used by user stuff by accident */
 
 #ifndef __KERNEL__
 #include <string.h>
-- 
cgit v1.2.3


From e5beae16901795223d677f15aa2fe192976278ee Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Mon, 3 Nov 2008 18:21:45 +0100
Subject: io mapping: clean up #ifdefs

Impact: cleanup

clean up ifdefs: change #ifdef CONFIG_X86_32/64 to
CONFIG_HAVE_ATOMIC_IOMAP.

flip around the #ifdef sections to clean up the structure.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig           |  4 ++++
 include/linux/io-mapping.h | 43 +++++++++++++++++++++++++------------------
 2 files changed, 29 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6f20718d3156..e60c59b81bdd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1894,6 +1894,10 @@ config SYSVIPC_COMPAT
 endmenu
 
 
+config HAVE_ATOMIC_IOMAP
+	def_bool y
+	depends on X86_32
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 1b566993db6e..82df31726a54 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -33,86 +33,93 @@
 /* this struct isn't actually defined anywhere */
 struct io_mapping;
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_HAVE_ATOMIC_IOMAP
+
+/*
+ * For small address space machines, mapping large objects
+ * into the kernel virtual space isn't practical. Where
+ * available, use fixmap support to dynamically map pages
+ * of the object at run time.
+ */
 
-/* Create the io_mapping object*/
 static inline struct io_mapping *
 io_mapping_create_wc(unsigned long base, unsigned long size)
 {
-	return (struct io_mapping *) ioremap_wc(base, size);
+	return (struct io_mapping *) base;
 }
 
 static inline void
 io_mapping_free(struct io_mapping *mapping)
 {
-	iounmap(mapping);
 }
 
 /* Atomic map/unmap */
 static inline void *
 io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	return ((char *) mapping) + offset;
+	offset += (unsigned long) mapping;
+	return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0,
+				     __pgprot(__PAGE_KERNEL_WC));
 }
 
 static inline void
 io_mapping_unmap_atomic(void *vaddr)
 {
+	iounmap_atomic(vaddr, KM_USER0);
 }
 
-/* Non-atomic map/unmap */
 static inline void *
 io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	return ((char *) mapping) + offset;
+	offset += (unsigned long) mapping;
+	return ioremap_wc(offset, PAGE_SIZE);
 }
 
 static inline void
 io_mapping_unmap(void *vaddr)
 {
+	iounmap(vaddr);
 }
 
-#endif /* CONFIG_X86_64 */
+#else
 
-#ifdef CONFIG_X86_32
+/* Create the io_mapping object*/
 static inline struct io_mapping *
 io_mapping_create_wc(unsigned long base, unsigned long size)
 {
-	return (struct io_mapping *) base;
+	return (struct io_mapping *) ioremap_wc(base, size);
 }
 
 static inline void
 io_mapping_free(struct io_mapping *mapping)
 {
+	iounmap(mapping);
 }
 
 /* Atomic map/unmap */
 static inline void *
 io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	offset += (unsigned long) mapping;
-	return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0,
-				     __pgprot(__PAGE_KERNEL_WC));
+	return ((char *) mapping) + offset;
 }
 
 static inline void
 io_mapping_unmap_atomic(void *vaddr)
 {
-	iounmap_atomic(vaddr, KM_USER0);
 }
 
+/* Non-atomic map/unmap */
 static inline void *
 io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	offset += (unsigned long) mapping;
-	return ioremap_wc(offset, PAGE_SIZE);
+	return ((char *) mapping) + offset;
 }
 
 static inline void
 io_mapping_unmap(void *vaddr)
 {
-	iounmap(vaddr);
 }
-#endif /* CONFIG_X86_32 */
+
+#endif /* HAVE_ATOMIC_IOMAP */
 
 #endif /* _LINUX_IO_MAPPING_H */
-- 
cgit v1.2.3


From a7b930cdf8ec790c85f81416c87f7c066679d373 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sun, 2 Nov 2008 13:32:43 -0800
Subject: PCI: annotate return value of pci_ioremap_bar with __iomem

Was missing from the initial patch.

Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c75b82bda327..feb4657bb043 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1136,7 +1136,7 @@ static inline void pci_mmcfg_late_init(void) { }
 #endif
 
 #ifdef CONFIG_HAS_IOMEM
-static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
+static inline void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar)
 {
 	/*
 	 * Make sure the BAR is actually a memory resource, not an IO resource
-- 
cgit v1.2.3


From 6a87e42e955ff27e07a77f65f8f077dc7c4171e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Nov 2008 19:01:09 +0900
Subject: libata: implement ATA_HORKAGE_ATAPI_MOD16_DMA and apply it

libata always uses PIO for ATAPI commands when the number of bytes to
transfer isn't multiple of 16 but quantum DAT72 chokes on odd bytes
PIO transfers.  Implement a horkage to skip the mod16 check and apply
it to the quantum device.

This is reported by John Clark in the following thread.

  http://thread.gmane.org/gmane.linux.ide/34748

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: John Clark <clarkjc@runbox.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c | 4 +++-
 include/linux/libata.h    | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 82af7011f2dd..91b478f20557 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4024,6 +4024,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 
 	/* Weird ATAPI devices */
 	{ "TORiSAN DVD-ROM DRD-N216", NULL,	ATA_HORKAGE_MAX_SEC_128 },
+	{ "QUANTUM DAT    DAT72-000", NULL,	ATA_HORKAGE_ATAPI_MOD16_DMA },
 
 	/* Devices we expect to fail diagnostics */
 
@@ -4444,7 +4445,8 @@ int atapi_check_dma(struct ata_queued_cmd *qc)
 	/* Don't allow DMA if it isn't multiple of 16 bytes.  Quite a
 	 * few ATAPI devices choke on such DMA requests.
 	 */
-	if (unlikely(qc->nbytes & 15))
+	if (!(qc->dev->horkage & ATA_HORKAGE_ATAPI_MOD16_DMA) &&
+	    unlikely(qc->nbytes & 15))
 		return 1;
 
 	if (ap->ops->check_atapi_dma)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f5441edee55f..c7665a4134c5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -373,6 +373,8 @@ enum {
 	ATA_HORKAGE_IVB		= (1 << 8),	/* cbl det validity bit bugs */
 	ATA_HORKAGE_STUCK_ERR	= (1 << 9),	/* stuck ERR on next PACKET */
 	ATA_HORKAGE_BRIDGE_OK	= (1 << 10),	/* no bridge limits */
+	ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands
+						    not multiple of 16 bytes */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From 9b22ea560957de1484e6b3e8538f7eef202e3596 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 4 Nov 2008 14:49:57 -0800
Subject: net: fix packet socket delivery in rx irq handler

The changes to deliver hardware accelerated VLAN packets to packet
sockets (commit bc1d0411) caused a warning for non-NAPI drivers.
The __vlan_hwaccel_rx() function is called directly from the drivers
RX function, for non-NAPI drivers that means its still in RX IRQ
context:

[   27.779463] ------------[ cut here ]------------
[   27.779509] WARNING: at kernel/softirq.c:136 local_bh_enable+0x37/0x81()
...
[   27.782520]  [<c0264755>] netif_nit_deliver+0x5b/0x75
[   27.782590]  [<c02bba83>] __vlan_hwaccel_rx+0x79/0x162
[   27.782664]  [<f8851c1d>] atl1_intr+0x9a9/0xa7c [atl1]
[   27.782738]  [<c0155b17>] handle_IRQ_event+0x23/0x51
[   27.782808]  [<c015692e>] handle_edge_irq+0xc2/0x102
[   27.782878]  [<c0105fd5>] do_IRQ+0x4d/0x64

Split hardware accelerated VLAN reception into two parts to fix this:

- __vlan_hwaccel_rx just stores the VLAN TCI and performs the VLAN
  device lookup, then calls netif_receive_skb()/netif_rx()

- vlan_hwaccel_do_receive(), which is invoked by netif_receive_skb()
  in softirq context, performs the real reception and delivery to
  packet sockets.

Reported-and-tested-by: Ramon Casellas <ramon.casellas@cttc.es>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  7 +++++++
 net/8021q/vlan_core.c   | 46 +++++++++++++++++++++++++++++++++-------------
 net/core/dev.c          |  3 +++
 3 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 9e7b49b8062d..a5cb0c3f6dcf 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -114,6 +114,8 @@ extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 
 extern int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 			     u16 vlan_tci, int polling);
+extern int vlan_hwaccel_do_receive(struct sk_buff *skb);
+
 #else
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
@@ -133,6 +135,11 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 	BUG();
 	return NET_XMIT_SUCCESS;
 }
+
+static inline int vlan_hwaccel_do_receive(struct sk_buff *skb)
+{
+	return 0;
+}
 #endif
 
 /**
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 916061f681b6..68ced4bf158c 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -3,11 +3,20 @@
 #include <linux/if_vlan.h>
 #include "vlan.h"
 
+struct vlan_hwaccel_cb {
+	struct net_device	*dev;
+};
+
+static inline struct vlan_hwaccel_cb *vlan_hwaccel_cb(struct sk_buff *skb)
+{
+	return (struct vlan_hwaccel_cb *)skb->cb;
+}
+
 /* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
 int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 		      u16 vlan_tci, int polling)
 {
-	struct net_device_stats *stats;
+	struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb);
 
 	if (skb_bond_should_drop(skb)) {
 		dev_kfree_skb_any(skb);
@@ -15,23 +24,35 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 	}
 
 	skb->vlan_tci = vlan_tci;
+	cb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
+
+	return (polling ? netif_receive_skb(skb) : netif_rx(skb));
+}
+EXPORT_SYMBOL(__vlan_hwaccel_rx);
+
+int vlan_hwaccel_do_receive(struct sk_buff *skb)
+{
+	struct vlan_hwaccel_cb *cb = vlan_hwaccel_cb(skb);
+	struct net_device *dev = cb->dev;
+	struct net_device_stats *stats;
+
 	netif_nit_deliver(skb);
 
-	skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
-	if (skb->dev == NULL) {
-		dev_kfree_skb_any(skb);
-		/* Not NET_RX_DROP, this is not being dropped
-		 * due to congestion. */
-		return NET_RX_SUCCESS;
+	if (dev == NULL) {
+		kfree_skb(skb);
+		return -1;
 	}
-	skb->dev->last_rx = jiffies;
+
+	skb->dev = dev;
+	skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
-	stats = &skb->dev->stats;
+	dev->last_rx = jiffies;
+
+	stats = &dev->stats;
 	stats->rx_packets++;
 	stats->rx_bytes += skb->len;
 
-	skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci);
 	switch (skb->pkt_type) {
 	case PACKET_BROADCAST:
 		break;
@@ -43,13 +64,12 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 		 * This allows the VLAN to have a different MAC than the
 		 * underlying device, and still route correctly. */
 		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
-					skb->dev->dev_addr))
+					dev->dev_addr))
 			skb->pkt_type = PACKET_HOST;
 		break;
 	};
-	return (polling ? netif_receive_skb(skb) : netif_rx(skb));
+	return 0;
 }
-EXPORT_SYMBOL(__vlan_hwaccel_rx);
 
 struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index d9038e328cc1..9174c77d3112 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2218,6 +2218,9 @@ int netif_receive_skb(struct sk_buff *skb)
 	int ret = NET_RX_DROP;
 	__be16 type;
 
+	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+		return NET_RX_SUCCESS;
+
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
 		return NET_RX_DROP;
-- 
cgit v1.2.3


From 467622ef2acb01986eab37ef96c3632b3ea35999 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 1 Nov 2008 04:19:11 -0700
Subject: [MTD] [NOR] Fix cfi_send_gen_cmd handling of x16 devices in x8 mode
 (v4)

For "unlock" cycles to 16bit devices in 8bit compatibility mode we need
to use the byte addresses 0xaaa and 0x555. These effectively match
the word address 0x555 and 0x2aa, except the latter has its low bit set.

Most chips don't care about the value of the 'A-1' pin in x8 mode,
but some -- like the ST M29W320D -- do. So we need to be careful to
set it where appropriate.

cfi_send_gen_cmd is only ever passed addresses where the low byte
is 0x00, 0x55 or 0xaa. Of those, only addresses ending 0xaa are
affected by this patch, by masking in the extra low bit when the device
is known to be in compatibility mode.

[dwmw2: Do it only when (cmd_ofs & 0xff) == 0xaa]
v4: Fix  stupid typo in cfi_build_cmd_addr that failed to compile
    I'm writing this patch way to late at night.
v3: Bring all of the work back into cfi_build_cmd_addr
    including calling of map_bankwidth(map) and cfi_interleave(cfi)
    So every caller doesn't need to.
v2: Only modified the address if we our device_type is larger than our
    bus width.

Cc: stable@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_cmdset_0002.c | 13 -------------
 drivers/mtd/chips/jedec_probe.c     | 10 ++++------
 include/linux/mtd/cfi.h             | 22 +++++++++++++++++++---
 3 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 3e6f5d8609e8..d74ec46aa032 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -406,19 +406,6 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary)
 		/* Set the default CFI lock/unlock addresses */
 		cfi->addr_unlock1 = 0x555;
 		cfi->addr_unlock2 = 0x2aa;
-		/* Modify the unlock address if we are in compatibility mode */
-		if (	/* x16 in x8 mode */
-			((cfi->device_type == CFI_DEVICETYPE_X8) &&
-				(cfi->cfiq->InterfaceDesc ==
-					CFI_INTERFACE_X8_BY_X16_ASYNC)) ||
-			/* x32 in x16 mode */
-			((cfi->device_type == CFI_DEVICETYPE_X16) &&
-				(cfi->cfiq->InterfaceDesc ==
-					CFI_INTERFACE_X16_BY_X32_ASYNC)))
-		{
-			cfi->addr_unlock1 = 0xaaa;
-			cfi->addr_unlock2 = 0x555;
-		}
 
 	} /* CFI mode */
 	else if (cfi->cfi_mode == CFI_MODE_JEDEC) {
diff --git a/drivers/mtd/chips/jedec_probe.c b/drivers/mtd/chips/jedec_probe.c
index f84ab6182148..2f3f2f719ba4 100644
--- a/drivers/mtd/chips/jedec_probe.c
+++ b/drivers/mtd/chips/jedec_probe.c
@@ -1808,9 +1808,7 @@ static inline u32 jedec_read_mfr(struct map_info *map, uint32_t base,
 	 * several first banks can contain 0x7f instead of actual ID
 	 */
 	do {
-		uint32_t ofs = cfi_build_cmd_addr(0 + (bank << 8),
-						  cfi_interleave(cfi),
-						  cfi->device_type);
+		uint32_t ofs = cfi_build_cmd_addr(0 + (bank << 8), map, cfi);
 		mask = (1 << (cfi->device_type * 8)) - 1;
 		result = map_read(map, base + ofs);
 		bank++;
@@ -1824,7 +1822,7 @@ static inline u32 jedec_read_id(struct map_info *map, uint32_t base,
 {
 	map_word result;
 	unsigned long mask;
-	u32 ofs = cfi_build_cmd_addr(1, cfi_interleave(cfi), cfi->device_type);
+	u32 ofs = cfi_build_cmd_addr(1, map, cfi);
 	mask = (1 << (cfi->device_type * 8)) -1;
 	result = map_read(map, base + ofs);
 	return result.x[0] & mask;
@@ -2067,8 +2065,8 @@ static int jedec_probe_chip(struct map_info *map, __u32 base,
 
 	}
 	/* Ensure the unlock addresses we try stay inside the map */
-	probe_offset1 = cfi_build_cmd_addr(cfi->addr_unlock1, cfi_interleave(cfi), cfi->device_type);
-	probe_offset2 = cfi_build_cmd_addr(cfi->addr_unlock2, cfi_interleave(cfi), cfi->device_type);
+	probe_offset1 = cfi_build_cmd_addr(cfi->addr_unlock1, map, cfi);
+	probe_offset2 = cfi_build_cmd_addr(cfi->addr_unlock2, map, cfi);
 	if (	((base + probe_offset1 + map_bankwidth(map)) >= map->size) ||
 		((base + probe_offset2 + map_bankwidth(map)) >= map->size))
 		goto retry;
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index ee5124ec319e..00e2b575021f 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -282,9 +282,25 @@ struct cfi_private {
 /*
  * Returns the command address according to the given geometry.
  */
-static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs, int interleave, int type)
+static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
+				struct map_info *map, struct cfi_private *cfi)
 {
-	return (cmd_ofs * type) * interleave;
+	unsigned bankwidth = map_bankwidth(map);
+	unsigned interleave = cfi_interleave(cfi);
+	unsigned type = cfi->device_type;
+	uint32_t addr;
+	
+	addr = (cmd_ofs * type) * interleave;
+
+	/* Modify the unlock address if we are in compatibility mode.
+	 * For 16bit devices on 8 bit busses
+	 * and 32bit devices on 16 bit busses
+	 * set the low bit of the alternating bit sequence of the address.
+	 */
+	if (((type * interleave) > bankwidth) && ((uint8_t)cmd_ofs == 0xaa))
+		addr |= (type >> 1)*interleave;
+
+	return  addr;
 }
 
 /*
@@ -430,7 +446,7 @@ static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t
 				int type, map_word *prev_val)
 {
 	map_word val;
-	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, cfi_interleave(cfi), type);
+	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi);
 	val = cfi_build_cmd(cmd, map, cfi);
 
 	if (prev_val)
-- 
cgit v1.2.3


From 9fcd18c9e63e325dbd2b4c726623f760788d5aa8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 5 Nov 2008 16:52:08 +0100
Subject: sched: re-tune balancing

Impact: improve wakeup affinity on NUMA systems, tweak SMP systems

Given the fixes+tweaks to the wakeup-buddy code, re-tweak the domain
balancing defaults on NUMA and SMP systems.

Turn on SD_WAKE_AFFINE which was off on x86 NUMA - there's no reason
why we would not want to have wakeup affinity across nodes as well.
(we already do this in the standard NUMA template.)

lat_ctx on a NUMA box is particularly happy about this change:

before:

 |   phoenix:~/l> ./lat_ctx -s 0 2
 |   "size=0k ovr=2.60
 |   2 5.70

after:

 |   phoenix:~/l> ./lat_ctx -s 0 2
 |   "size=0k ovr=2.65
 |   2 2.07

a 2.75x speedup.

pipe-test is similarly happy about it too:

 |  phoenix:~/sched-tests> ./pipe-test
 |   18.26 usecs/loop.
 |   14.70 usecs/loop.
 |   14.38 usecs/loop.
 |   10.55 usecs/loop.              # +WAKE_AFFINE on domain0+domain1
 |   8.63 usecs/loop.
 |   8.59 usecs/loop.
 |   9.03 usecs/loop.
 |   8.94 usecs/loop.
 |   8.96 usecs/loop.
 |   8.63 usecs/loop.

Also:

 - disable SD_BALANCE_NEWIDLE on NUMA and SMP domains (keep it for siblings)
 - enable SD_WAKE_BALANCE on SMP domains

Sysbench+postgresql improves all around the board, quite significantly:

           .28-rc3-11474e2c  .28-rc3-11474e2c-tune
-------------------------------------------------
    1:             571              688    +17.08%
    2:            1236             1206    -2.55%
    4:            2381             2642    +9.89%
    8:            4958             5164    +3.99%
   16:            9580             9574    -0.07%
   32:            7128             8118    +12.20%
   64:            7342             8266    +11.18%
  128:            7342             8064    +8.95%
  256:            7519             7884    +4.62%
  512:            7350             7731    +4.93%
-------------------------------------------------
  SUM:           55412            59341    +6.62%

So it's a win both for the runup portion, the peak area and the tail.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/topology.h | 7 ++++---
 include/linux/topology.h        | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 90ac7718469a..4850e4b02b61 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -154,7 +154,7 @@ extern unsigned long node_remap_size[];
 
 #endif
 
-/* sched_domains SD_NODE_INIT for NUMAQ machines */
+/* sched_domains SD_NODE_INIT for NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.min_interval		= 8,			\
 	.max_interval		= 32,			\
@@ -169,8 +169,9 @@ extern unsigned long node_remap_size[];
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_WAKE_AFFINE	\
+				| SD_WAKE_BALANCE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2158fc0d5a56..34a7ee0ebed2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -146,10 +146,10 @@ void arch_update_cpu_topology(void);
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
+				| SD_BALANCE_FORK	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_BALANCE	\
 				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
-- 
cgit v1.2.3


From f92131c3dd567fc6df18ce3f46fcf57ecbdefbe0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 29 Oct 2008 14:10:51 +0100
Subject: bio: define __BIOVEC_PHYS_MERGEABLE

Define __BIOVEC_PHYS_MERGEABLE as the default implementation of
BIOVEC_PHYS_MERGEABLE, so that its available for reuse within an
arch-specific definition of BIOVEC_PHYS_MERGEABLE.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1c91a176b9ae..6a642098e5c3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -236,12 +236,16 @@ static inline void *bio_data(struct bio *bio)
 #define __BVEC_END(bio)		bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
 #define __BVEC_START(bio)	bio_iovec_idx((bio), (bio)->bi_idx)
 
+/* Default implementation of BIOVEC_PHYS_MERGEABLE */
+#define __BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
+	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
+
 /*
  * allow arch override, for eg virtualized architectures (put in asm/io.h)
  */
 #ifndef BIOVEC_PHYS_MERGEABLE
 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
-	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
+	__BIOVEC_PHYS_MERGEABLE(vec1, vec2)
 #endif
 
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
-- 
cgit v1.2.3


From 9c133c469d38043d5aadaa03f2fb840d88d1cf4f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 6 Nov 2008 08:42:48 +0100
Subject: Add round_jiffies_up and related routines

This patch (as1158b) adds round_jiffies_up() and friends.  These
routines work like the analogous round_jiffies() functions, except
that they will never round down.

The new routines will be useful for timeouts where we don't care
exactly when the timer expires, provided it doesn't expire too soon.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/timer.h |   5 ++
 kernel/timer.c        | 129 ++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 104 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index d4ba79248a27..daf9685b861c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -186,4 +186,9 @@ unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
 unsigned long round_jiffies_relative(unsigned long j);
 
+unsigned long __round_jiffies_up(unsigned long j, int cpu);
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
+unsigned long round_jiffies_up(unsigned long j);
+unsigned long round_jiffies_up_relative(unsigned long j);
+
 #endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c58..dbd50fabe4c7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
 				      tbase_get_deferrable(timer->base));
 }
 
-/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+		bool force_up)
 {
 	int rem;
 	unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
 	 * due to delays of the timer irq, long irq off times etc etc) then
 	 * we should round down to the whole second, not up. Use 1/4th second
 	 * as cutoff for this rounding as an extreme upper bound for this.
+	 * But never round down if @force_up is set.
 	 */
-	if (rem < HZ/4) /* round down */
+	if (rem < HZ/4 && !force_up) /* round down */
 		j = j - rem;
 	else /* round up */
 		j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
 		return original;
 	return j;
 }
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+	return round_jiffies_common(j, cpu, false);
+}
 EXPORT_SYMBOL_GPL(__round_jiffies);
 
 /**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
  */
 unsigned long __round_jiffies_relative(unsigned long j, int cpu)
 {
-	/*
-	 * In theory the following code can skip a jiffy in case jiffies
-	 * increments right between the addition and the later subtraction.
-	 * However since the entire point of this function is to use approximate
-	 * timeouts, it's entirely ok to not handle that.
-	 */
-	return  __round_jiffies(j + jiffies, cpu) - jiffies;
+	unsigned long j0 = jiffies;
+
+	/* Use j0 because jiffies might change while we run */
+	return round_jiffies_common(j + j0, cpu, false) - j0;
 }
 EXPORT_SYMBOL_GPL(__round_jiffies_relative);
 
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
  */
 unsigned long round_jiffies(unsigned long j)
 {
-	return __round_jiffies(j, raw_smp_processor_id());
+	return round_jiffies_common(j, raw_smp_processor_id(), false);
 }
 EXPORT_SYMBOL_GPL(round_jiffies);
 
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
 }
 EXPORT_SYMBOL_GPL(round_jiffies_relative);
 
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+	return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+	unsigned long j0 = jiffies;
+
+	/* Use j0 because jiffies might change while we run */
+	return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+	return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+	return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
 
 static inline void set_running_timer(struct tvec_base *base,
 					struct timer_list *timer)
-- 
cgit v1.2.3


From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 5 Nov 2008 13:39:10 +1100
Subject: cpumask: introduce new API, without changing anything

Impact: introduce new APIs

We want to deprecate cpumasks on the stack, as we are headed for
gynormous numbers of CPUs.  Eventually, we want to head towards an
undefined 'struct cpumask' so they can never be declared on stack.

1) New cpumask functions which take pointers instead of copies.
   (cpus_* -> cpumask_*)

2) Several new helpers to reduce requirements for temporary cpumasks
   (cpumask_first_and, cpumask_next_and, cpumask_any_and)

3) Helpers for declaring cpumasks on or offstack for large NR_CPUS
   (cpumask_var_t, alloc_cpumask_var and free_cpumask_var)

4) 'struct cpumask' for explicitness and to mark new-style code.

5) Make iterator functions stop at nr_cpu_ids (a runtime constant),
   not NR_CPUS for time efficiency and for smaller dynamic allocations
   in future.

6) cpumask_copy() so we can allocate less than a full cpumask eventually
   (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask'
   definition eventually.

7) work_on_cpu() helper for doing task on a CPU, rather than saving old
   cpumask for current thread and manipulating it.

8) smp_call_function_many() which is smp_call_function_mask() except
   taking a cpumask pointer.

Note that this patch simply introduces the new functions and leaves
the obsolescent ones in place.  This is to simplify the transition
patches.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpumask.h   | 502 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/smp.h       |   9 +
 include/linux/workqueue.h |   8 +
 kernel/cpu.c              |   3 +
 kernel/workqueue.c        |  45 +++++
 lib/cpumask.c             |  73 +++++++
 6 files changed, 638 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d3219d73f8e6..c8e66619097b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -5,6 +5,9 @@
  * Cpumasks provide a bitmap suitable for representing the
  * set of CPU's in a system, one bit position per CPU number.
  *
+ * The new cpumask_ ops take a "struct cpumask *"; the old ones
+ * use cpumask_t.
+ *
  * See detailed comments in the file linux/bitmap.h describing the
  * data type on which these cpumasks are based.
  *
@@ -31,7 +34,7 @@
  *       will span the entire range of NR_CPUS.
  * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
  *
- * The available cpumask operations are:
+ * The obsolescent cpumask operations are:
  *
  * void cpu_set(cpu, mask)		turn on bit 'cpu' in mask
  * void cpu_clear(cpu, mask)		turn off bit 'cpu' in mask
@@ -138,7 +141,7 @@
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 
-typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
+typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
 
 #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
@@ -527,4 +530,499 @@ extern cpumask_t cpu_active_map;
 #define for_each_online_cpu(cpu)   for_each_cpu_mask_nr((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu)  for_each_cpu_mask_nr((cpu), cpu_present_map)
 
+/* These are the new versions of the cpumask operators: passed by pointer.
+ * The older versions will be implemented in terms of these, then deleted. */
+#define cpumask_bits(maskp) ((maskp)->bits)
+
+#if NR_CPUS <= BITS_PER_LONG
+#define CPU_BITS_ALL						\
+{								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
+}
+
+/* This produces more efficient code. */
+#define nr_cpumask_bits	NR_CPUS
+
+#else /* NR_CPUS > BITS_PER_LONG */
+
+#define CPU_BITS_ALL						\
+{								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
+}
+
+#define nr_cpumask_bits	nr_cpu_ids
+#endif /* NR_CPUS > BITS_PER_LONG */
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+	return cpu;
+}
+
+#if NR_CPUS == 1
+/* Uniprocessor. */
+#define cpumask_first(src)		({ (void)(src); 0; })
+#define cpumask_next(n, src)		({ (void)(src); 1; })
+#define cpumask_next_zero(n, src)	({ (void)(src); 1; })
+#define cpumask_next_and(n, srcp, andp)	({ (void)(srcp), (void)(andp); 1; })
+#define cpumask_any_but(mask, cpu)	({ (void)(mask); (void)(cpu); 0; })
+
+#define for_each_cpu(cpu, mask)			\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define for_each_cpu_and(cpu, mask, and)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
+#else
+/**
+ * cpumask_first - get the first cpu in a cpumask
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+static inline unsigned int cpumask_first(const struct cpumask *srcp)
+{
+	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_next - get the next cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set.
+ */
+static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
+}
+
+/**
+ * cpumask_next_zero - get the next unset cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus unset.
+ */
+static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
+}
+
+int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+
+#define for_each_cpu(cpu, mask)				\
+	for ((cpu) = -1;				\
+		(cpu) = cpumask_next((cpu), (mask)),	\
+		(cpu) < nr_cpu_ids;)
+#define for_each_cpu_and(cpu, mask, and)				\
+	for ((cpu) = -1;						\
+		(cpu) = cpumask_next_and((cpu), (mask), (and)),		\
+		(cpu) < nr_cpu_ids;)
+#endif /* NR_CPUS == 1 */
+
+#define CPU_BITS_NONE						\
+{								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL			\
+}
+
+#define CPU_BITS_CPU0						\
+{								\
+	[0] =  1UL						\
+}
+
+/**
+ * cpumask_set_cpu - set a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @dstp: the cpumask pointer
+ */
+static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+	set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+/**
+ * cpumask_clear_cpu - clear a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @dstp: the cpumask pointer
+ */
+static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+	clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+/**
+ * cpumask_test_cpu - test for a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @cpumask: the cpumask pointer
+ *
+ * No static inline type checking - see Subtlety (1) above.
+ */
+#define cpumask_test_cpu(cpu, cpumask) \
+	test_bit(cpumask_check(cpu), (cpumask)->bits)
+
+/**
+ * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @cpumask: the cpumask pointer
+ *
+ * test_and_set_bit wrapper for cpumasks.
+ */
+static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
+{
+	return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
+}
+
+/**
+ * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
+ * @dstp: the cpumask pointer
+ */
+static inline void cpumask_setall(struct cpumask *dstp)
+{
+	bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
+ * @dstp: the cpumask pointer
+ */
+static inline void cpumask_clear(struct cpumask *dstp)
+{
+	bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_and - *dstp = *src1p & *src2p
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline void cpumask_and(struct cpumask *dstp,
+			       const struct cpumask *src1p,
+			       const struct cpumask *src2p)
+{
+	bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
+				       cpumask_bits(src2p), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_or - *dstp = *src1p | *src2p
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
+			      const struct cpumask *src2p)
+{
+	bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
+				      cpumask_bits(src2p), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_xor - *dstp = *src1p ^ *src2p
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline void cpumask_xor(struct cpumask *dstp,
+			       const struct cpumask *src1p,
+			       const struct cpumask *src2p)
+{
+	bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
+				       cpumask_bits(src2p), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_andnot - *dstp = *src1p & ~*src2p
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline void cpumask_andnot(struct cpumask *dstp,
+				  const struct cpumask *src1p,
+				  const struct cpumask *src2p)
+{
+	bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
+					  cpumask_bits(src2p), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_complement - *dstp = ~*srcp
+ * @dstp: the cpumask result
+ * @srcp: the input to invert
+ */
+static inline void cpumask_complement(struct cpumask *dstp,
+				      const struct cpumask *srcp)
+{
+	bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
+					      nr_cpumask_bits);
+}
+
+/**
+ * cpumask_equal - *src1p == *src2p
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline bool cpumask_equal(const struct cpumask *src1p,
+				const struct cpumask *src2p)
+{
+	return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
+						 nr_cpumask_bits);
+}
+
+/**
+ * cpumask_intersects - (*src1p & *src2p) != 0
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline bool cpumask_intersects(const struct cpumask *src1p,
+				     const struct cpumask *src2p)
+{
+	return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
+						      nr_cpumask_bits);
+}
+
+/**
+ * cpumask_subset - (*src1p & ~*src2p) == 0
+ * @src1p: the first input
+ * @src2p: the second input
+ */
+static inline int cpumask_subset(const struct cpumask *src1p,
+				 const struct cpumask *src2p)
+{
+	return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
+						  nr_cpumask_bits);
+}
+
+/**
+ * cpumask_empty - *srcp == 0
+ * @srcp: the cpumask to check that all cpus < nr_cpu_ids are clear.
+ */
+static inline bool cpumask_empty(const struct cpumask *srcp)
+{
+	return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_full - *srcp == 0xFFFFFFFF...
+ * @srcp: the cpumask to check that all cpus < nr_cpu_ids are set.
+ */
+static inline bool cpumask_full(const struct cpumask *srcp)
+{
+	return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_weight - Count of bits in *srcp
+ * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
+ */
+static inline unsigned int cpumask_weight(const struct cpumask *srcp)
+{
+	return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_shift_right - *dstp = *srcp >> n
+ * @dstp: the cpumask result
+ * @srcp: the input to shift
+ * @n: the number of bits to shift by
+ */
+static inline void cpumask_shift_right(struct cpumask *dstp,
+				       const struct cpumask *srcp, int n)
+{
+	bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
+					       nr_cpumask_bits);
+}
+
+/**
+ * cpumask_shift_left - *dstp = *srcp << n
+ * @dstp: the cpumask result
+ * @srcp: the input to shift
+ * @n: the number of bits to shift by
+ */
+static inline void cpumask_shift_left(struct cpumask *dstp,
+				      const struct cpumask *srcp, int n)
+{
+	bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n,
+					      nr_cpumask_bits);
+}
+
+/**
+ * cpumask_copy - *dstp = *srcp
+ * @dstp: the result
+ * @srcp: the input cpumask
+ */
+static inline void cpumask_copy(struct cpumask *dstp,
+				const struct cpumask *srcp)
+{
+	bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
+ * cpumask_any - pick a "random" cpu from *srcp
+ * @srcp: the input cpumask
+ *
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+#define cpumask_any(srcp) cpumask_first(srcp)
+
+/**
+ * cpumask_first_and - return the first cpu from *src1p & *src2p
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
+ */
+#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))
+
+/**
+ * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
+ * @mask1: the first input cpumask
+ * @mask2: the second input cpumask
+ *
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))
+
+/**
+ * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * @bitmap: the bitmap
+ *
+ * There are a few places where cpumask_var_t isn't appropriate and
+ * static cpumasks must be used (eg. very early boot), yet we don't
+ * expose the definition of 'struct cpumask'.
+ *
+ * This does the conversion, and can be used as a constant initializer.
+ */
+#define to_cpumask(bitmap)						\
+	((struct cpumask *)(1 ? (bitmap)				\
+			    : (void *)sizeof(__check_is_bitmap(bitmap))))
+
+static inline int __check_is_bitmap(const unsigned long *bitmap)
+{
+	return 1;
+}
+
+/**
+ * cpumask_size - size to allocate for a 'struct cpumask' in bytes
+ *
+ * This will eventually be a runtime variable, depending on nr_cpu_ids.
+ */
+static inline size_t cpumask_size(void)
+{
+	/* FIXME: Once all cpumask assignments are eliminated, this
+	 * can be nr_cpumask_bits */
+	return BITS_TO_LONGS(NR_CPUS) * sizeof(long);
+}
+
+/*
+ * cpumask_var_t: struct cpumask for stack usage.
+ *
+ * Oh, the wicked games we play!  In order to make kernel coding a
+ * little more difficult, we typedef cpumask_var_t to an array or a
+ * pointer: doing &mask on an array is a noop, so it still works.
+ *
+ * ie.
+ *	cpumask_var_t tmpmask;
+ *	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ *		return -ENOMEM;
+ *
+ *	  ... use 'tmpmask' like a normal struct cpumask * ...
+ *
+ *	free_cpumask_var(tmpmask);
+ */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+typedef struct cpumask *cpumask_var_t;
+
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
+void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
+void free_cpumask_var(cpumask_var_t mask);
+
+#else
+typedef struct cpumask cpumask_var_t[1];
+
+static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return true;
+}
+
+static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
+{
+}
+
+static inline void free_cpumask_var(cpumask_var_t mask)
+{
+}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
+
+/* The pointer versions of the maps, these will become the primary versions. */
+#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map)
+#define cpu_online_mask ((const struct cpumask *)&cpu_online_map)
+#define cpu_present_mask ((const struct cpumask *)&cpu_present_map)
+#define cpu_active_mask ((const struct cpumask *)&cpu_active_map)
+
+/* It's common to want to use cpu_all_mask in struct member initializers,
+ * so it has to refer to an address rather than a pointer. */
+extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
+#define cpu_all_mask to_cpumask(cpu_all_bits)
+
+/* First bits of cpu_bit_bitmap are in fact unset. */
+#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
+
+/* Wrappers for arch boot code to manipulate normally-constant masks */
+static inline void set_cpu_possible(unsigned int cpu, bool possible)
+{
+	if (possible)
+		cpumask_set_cpu(cpu, &cpu_possible_map);
+	else
+		cpumask_clear_cpu(cpu, &cpu_possible_map);
+}
+
+static inline void set_cpu_present(unsigned int cpu, bool present)
+{
+	if (present)
+		cpumask_set_cpu(cpu, &cpu_present_map);
+	else
+		cpumask_clear_cpu(cpu, &cpu_present_map);
+}
+
+static inline void set_cpu_online(unsigned int cpu, bool online)
+{
+	if (online)
+		cpumask_set_cpu(cpu, &cpu_online_map);
+	else
+		cpumask_clear_cpu(cpu, &cpu_online_map);
+}
+
+static inline void set_cpu_active(unsigned int cpu, bool active)
+{
+	if (active)
+		cpumask_set_cpu(cpu, &cpu_active_map);
+	else
+		cpumask_clear_cpu(cpu, &cpu_active_map);
+}
+
+static inline void init_cpu_present(const struct cpumask *src)
+{
+	cpumask_copy(&cpu_present_map, src);
+}
+
+static inline void init_cpu_possible(const struct cpumask *src)
+{
+	cpumask_copy(&cpu_possible_map, src);
+}
+
+static inline void init_cpu_online(const struct cpumask *src)
+{
+	cpumask_copy(&cpu_online_map, src);
+}
 #endif /* __LINUX_CPUMASK_H */
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 2e4d58b26c06..3f9a60043a97 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -64,8 +64,17 @@ extern void smp_cpus_done(unsigned int max_cpus);
  * Call a function on all other processors
  */
 int smp_call_function(void(*func)(void *info), void *info, int wait);
+/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. */
 int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
 				int wait);
+
+static inline void smp_call_function_many(const struct cpumask *mask,
+					  void (*func)(void *info), void *info,
+					  int wait)
+{
+	smp_call_function_mask(*mask, func, info, wait);
+}
+
 int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
 				int wait);
 void __smp_call_function_single(int cpuid, struct call_single_data *data);
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 89a5a1231ffb..b36291130f22 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -240,4 +240,12 @@ void cancel_rearming_delayed_work(struct delayed_work *work)
 	cancel_delayed_work_sync(work);
 }
 
+#ifndef CONFIG_SMP
+static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+	return fn(arg);
+}
+#else
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg);
+#endif /* CONFIG_SMP */
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045daed..5a732c5ef08b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
 #endif
 };
 EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9b..d4dc69ddebd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
 	return ret;
 }
 
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+	struct work_struct work;
+	long (*fn)(void *);
+	void *arg;
+	long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+	struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+	wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL if the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+	struct work_for_cpu wfc;
+
+	INIT_WORK(&wfc.work, do_work_for_cpu);
+	wfc.fn = fn;
+	wfc.arg = arg;
+	get_online_cpus();
+	if (unlikely(!cpu_online(cpu)))
+		wfc.ret = -EINVAL;
+	else {
+		schedule_work_on(cpu, &wfc.work);
+		flush_work(&wfc.work);
+	}
+	put_online_cpus();
+
+	return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
 void __init init_workqueues(void)
 {
 	cpu_populated_map = cpu_online_map;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 5f97dc25ef9c..5ceb4211c834 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -2,6 +2,7 @@
 #include <linux/bitops.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
 
 int __first_cpu(const cpumask_t *srcp)
 {
@@ -35,3 +36,75 @@ int __any_online_cpu(const cpumask_t *mask)
 	return cpu;
 }
 EXPORT_SYMBOL(__any_online_cpu);
+
+/**
+ * cpumask_next_and - get the next cpu in *src1p & *src2p
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set in both.
+ */
+int cpumask_next_and(int n, const struct cpumask *src1p,
+		     const struct cpumask *src2p)
+{
+	while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
+		if (cpumask_test_cpu(n, src2p))
+			break;
+	return n;
+}
+EXPORT_SYMBOL(cpumask_next_and);
+
+/**
+ * cpumask_any_but - return a "random" cpu in a cpumask, but not this one.
+ * @mask: the cpumask to search
+ * @cpu: the cpu to ignore.
+ *
+ * Often used to find any cpu but smp_processor_id() in a mask.
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+{
+	unsigned int i;
+
+	for_each_cpu(i, mask)
+		if (i != cpu)
+			break;
+	return i;
+}
+
+/* These are not inline because of header tangles. */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	if (likely(slab_is_available()))
+		*mask = kmalloc(cpumask_size(), flags);
+	else {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+		printk(KERN_ERR
+			"=> alloc_cpumask_var: kmalloc not available!\n");
+		dump_stack();
+#endif
+		*mask = NULL;
+	}
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (!*mask) {
+		printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
+		dump_stack();
+	}
+#endif
+	return *mask != NULL;
+}
+EXPORT_SYMBOL(alloc_cpumask_var);
+
+void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
+{
+	*mask = alloc_bootmem(cpumask_size());
+}
+
+void free_cpumask_var(cpumask_var_t mask)
+{
+	kfree(mask);
+}
+EXPORT_SYMBOL(free_cpumask_var);
+#endif
-- 
cgit v1.2.3


From f8d570a4745835f2238a33b537218a1bb03fc671 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Thu, 6 Nov 2008 00:37:40 -0800
Subject: net: Fix recursive descent in __scm_destroy().

__scm_destroy() walks the list of file descriptors in the scm_fp_list
pointed to by the scm_cookie argument.

Those, in turn, can close sockets and invoke __scm_destroy() again.

There is nothing which limits how deeply this can occur.

The idea for how to fix this is from Linus.  Basically, we do all of
the fput()s at the top level by collecting all of the scm_fp_list
objects hit by an fput().  Inside of the initial __scm_destroy() we
keep running the list until it is empty.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  2 ++
 include/net/scm.h     |  5 +++--
 net/core/scm.c        | 24 +++++++++++++++++++++---
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39a7112..295b7c756ca6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1349,6 +1349,8 @@ struct task_struct {
 	 */
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
+
+	struct list_head	*scm_work_list;
 };
 
 /*
diff --git a/include/net/scm.h b/include/net/scm.h
index 06df126103ca..33e9986beb86 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -14,8 +14,9 @@
 
 struct scm_fp_list
 {
-	int		count;
-	struct file	*fp[SCM_MAX_FD];
+	struct list_head	list;
+	int			count;
+	struct file		*fp[SCM_MAX_FD];
 };
 
 struct scm_cookie
diff --git a/net/core/scm.c b/net/core/scm.c
index 10f5c65f6a47..ab242cc1acca 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -75,6 +75,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
+		INIT_LIST_HEAD(&fpl->list);
 		fpl->count = 0;
 	}
 	fpp = &fpl->fp[fpl->count];
@@ -106,9 +107,25 @@ void __scm_destroy(struct scm_cookie *scm)
 
 	if (fpl) {
 		scm->fp = NULL;
-		for (i=fpl->count-1; i>=0; i--)
-			fput(fpl->fp[i]);
-		kfree(fpl);
+		if (current->scm_work_list) {
+			list_add_tail(&fpl->list, current->scm_work_list);
+		} else {
+			LIST_HEAD(work_list);
+
+			current->scm_work_list = &work_list;
+
+			list_add(&fpl->list, &work_list);
+			while (!list_empty(&work_list)) {
+				fpl = list_first_entry(&work_list, struct scm_fp_list, list);
+
+				list_del(&fpl->list);
+				for (i=fpl->count-1; i>=0; i--)
+					fput(fpl->fp[i]);
+				kfree(fpl);
+			}
+
+			current->scm_work_list = NULL;
+		}
 	}
 }
 
@@ -284,6 +301,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 
 	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
 	if (new_fpl) {
+		INIT_LIST_HEAD(&new_fpl->list);
 		for (i=fpl->count-1; i>=0; i--)
 			get_file(fpl->fp[i]);
 		memcpy(new_fpl, fpl, sizeof(*fpl));
-- 
cgit v1.2.3


From 9e975dae2970d22557662761c8505ce9fd165684 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:46 -0800
Subject: fat: split include/msdos_fs.h

This splits the __KERNEL__ stuff in include/linux/msdos_fs.h out into
fs/fat/fat.h.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c           |   2 +-
 fs/fat/dir.c             |   2 +-
 fs/fat/fat.h             | 274 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/fatent.c          |   1 +
 fs/fat/file.c            |   2 +-
 fs/fat/inode.c           |   2 +-
 fs/fat/misc.c            |   2 +-
 fs/fat/namei_msdos.c     |   2 +-
 fs/fat/namei_vfat.c      |   3 +-
 include/linux/msdos_fs.h | 276 +----------------------------------------------
 10 files changed, 284 insertions(+), 282 deletions(-)
 create mode 100644 fs/fat/fat.h

(limited to 'include/linux')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3222f51c41cf..589edde9053c 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /* this must be > 0. */
 #define FAT_MAX_CACHE	8
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bae1c3292522..08b23ad25f1c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,11 +16,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include "fat.h"
 
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
new file mode 100644
index 000000000000..51f1c42ca5e3
--- /dev/null
+++ b/fs/fat/fat.h
@@ -0,0 +1,274 @@
+#ifndef _FAT_H
+#define _FAT_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/msdos_fs.h>
+
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
+
+struct fat_mount_options {
+	uid_t fs_uid;
+	gid_t fs_gid;
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short codepage;  /* Codepage for shortname conversions */
+	char *iocharset;          /* Charset used for filename input/display */
+	unsigned short shortname; /* flags for shortname display/create rule */
+	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+	unsigned short allow_utime;/* permission for setting the [am]time */
+	unsigned quiet:1,         /* set = fake successful chmods and chowns */
+		 showexec:1,      /* set = only set x bit for com/exe/bat */
+		 sys_immutable:1, /* set = system files are immutable */
+		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
+		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
+		 utf8:1,	  /* Use of UTF-8 character set (Default) */
+		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
+		 flush:1,	  /* write things quickly */
+		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
+		 usefree:1,	  /* Use free_clusters for FAT32 */
+		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+};
+
+#define FAT_HASH_BITS	8
+#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
+#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
+
+/*
+ * MS-DOS file system in-core superblock data
+ */
+struct msdos_sb_info {
+	unsigned short sec_per_clus; /* sectors/cluster */
+	unsigned short cluster_bits; /* log2(cluster_size) */
+	unsigned int cluster_size;   /* cluster size */
+	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+	unsigned short fat_start;
+	unsigned long fat_length;    /* FAT start & length (sec.) */
+	unsigned long dir_start;
+	unsigned short dir_entries;  /* root dir start & entries */
+	unsigned long data_start;    /* first data sector */
+	unsigned long max_cluster;   /* maximum cluster number */
+	unsigned long root_cluster;  /* first cluster of the root directory */
+	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+	struct mutex fat_lock;
+	unsigned int prev_free;      /* previously allocated cluster number */
+	unsigned int free_clusters;  /* -1 if undefined */
+	unsigned int free_clus_valid; /* is free_clusters valid? */
+	struct fat_mount_options options;
+	struct nls_table *nls_disk;  /* Codepage used on disk */
+	struct nls_table *nls_io;    /* Charset used for input and display */
+	const void *dir_ops;		     /* Opaque; default directory operations */
+	int dir_per_block;	     /* dir entries per block */
+	int dir_per_block_bits;	     /* log2(dir_per_block) */
+
+	int fatent_shift;
+	struct fatent_operations *fatent_ops;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+};
+
+#define FAT_CACHE_VALID	0	/* special case for valid cache */
+
+/*
+ * MS-DOS file system inode data in memory
+ */
+struct msdos_inode_info {
+	spinlock_t cache_lru_lock;
+	struct list_head cache_lru;
+	int nr_caches;
+	/* for avoiding the race between fat_free() and fat_get_cluster() */
+	unsigned int cache_valid_id;
+
+	loff_t mmu_private;
+	int i_start;		/* first cluster or 0 */
+	int i_logstart;		/* logical first cluster */
+	int i_attrs;		/* unused attribute bits */
+	loff_t i_pos;		/* on-disk position of directory entry or 0 */
+	struct hlist_node i_fat_hash;	/* hash by i_location */
+	struct inode vfs_inode;
+};
+
+struct fat_slot_info {
+	loff_t i_pos;		/* on-disk position of directory entry */
+	loff_t slot_off;	/* offset for slot or de start */
+	int nr_slots;		/* number of slots + 1(de) in filename */
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+};
+
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{
+	return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u8 fat_attr(struct inode *inode)
+{
+	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
+		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
+		MSDOS_I(inode)->i_attrs;
+}
+
+static inline unsigned char fat_checksum(const __u8 *name)
+{
+	unsigned char s = name[0];
+	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
+	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
+	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
+	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
+	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
+	return s;
+}
+
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{
+	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
+		+ sbi->data_start;
+}
+
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		*dst++ = src[0] | (src[1] << 8);
+		src += 2;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		dst[0] = *src & 0x00FF;
+		dst[1] = (*src & 0xFF00) >> 8;
+		dst += 2;
+		src++;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+			   int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
+
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+			   int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+		    struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+				struct msdos_dir_entry **de, loff_t *i_pos);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+			   struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+
+/* fat/fatent.c */
+struct fat_entry {
+	int entry;
+	union {
+		u8 *ent12_p[2];
+		__le16 *ent16_p;
+		__le32 *ent32_p;
+	} u;
+	int nr_bhs;
+	struct buffer_head *bhs[2];
+};
+
+static inline void fatent_init(struct fat_entry *fatent)
+{
+	fatent->nr_bhs = 0;
+	fatent->entry = 0;
+	fatent->u.ent32_p = NULL;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{
+	fatent->entry = entry;
+	fatent->u.ent32_p = NULL;
+}
+
+static inline void fatent_brelse(struct fat_entry *fatent)
+{
+	int i;
+	fatent->u.ent32_p = NULL;
+	for (i = 0; i < fatent->nr_bhs; i++)
+		brelse(fatent->bhs[i]);
+	fatent->nr_bhs = 0;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+			int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+			 int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+			      int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+
+/* fat/file.c */
+extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+			     unsigned int cmd, unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern void fat_truncate(struct inode *inode);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+
+/* fat/inode.c */
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+			const struct inode_operations *fs_dir_inode_ops, int isvfat);
+
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+		            struct inode *i2);
+/* fat/misc.c */
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
+extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
+			      int tz_utc);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
+#endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index fb98b3d847ed..5b5f49061b7c 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
 #include <linux/blkdev.h>
+#include "fat.h"
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index ddde37025ca6..b21973f266a1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,13 +10,13 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include "fat.h"
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b2eec1283bf..3921de2013a4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
-#include <linux/msdos_fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
@@ -28,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/log2.h>
 #include <asm/unaligned.h>
+#include "fat.h"
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
 /* if user don't select VFAT, this is undefined. */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 79fb98ad36d4..91ad9be18ff9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -8,8 +8,8 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /*
  * fat_fs_panic reports a severe file system problem and sets the file system
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e844b9809d27..c0a4d5cd99b2 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 155c10b4adbd..facf3bf0211a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,14 +16,13 @@
  */
 
 #include <linux/module.h>
-
 #include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include "fat.h"
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index ba63858056c7..0982fb47a90d 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -167,282 +167,10 @@ struct msdos_dir_slot {
 };
 
 #ifdef __KERNEL__
-
-#include <linux/buffer_head.h>
-#include <linux/string.h>
-#include <linux/nls.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-
-/*
- * vfat shortname flags
- */
-#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
-#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
-#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
-#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
-#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
-
-struct fat_mount_options {
-	uid_t fs_uid;
-	gid_t fs_gid;
-	unsigned short fs_fmask;
-	unsigned short fs_dmask;
-	unsigned short codepage;  /* Codepage for shortname conversions */
-	char *iocharset;          /* Charset used for filename input/display */
-	unsigned short shortname; /* flags for shortname display/create rule */
-	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
-	unsigned short allow_utime;/* permission for setting the [am]time */
-	unsigned quiet:1,         /* set = fake successful chmods and chowns */
-		 showexec:1,      /* set = only set x bit for com/exe/bat */
-		 sys_immutable:1, /* set = system files are immutable */
-		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
-		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
-		 utf8:1,	  /* Use of UTF-8 character set (Default) */
-		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
-		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
-		 flush:1,	  /* write things quickly */
-		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
-		 usefree:1,	  /* Use free_clusters for FAT32 */
-		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
-};
-
-#define FAT_HASH_BITS	8
-#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
-#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
-
-/*
- * MS-DOS file system in-core superblock data
- */
-struct msdos_sb_info {
-	unsigned short sec_per_clus; /* sectors/cluster */
-	unsigned short cluster_bits; /* log2(cluster_size) */
-	unsigned int cluster_size;   /* cluster size */
-	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
-	unsigned short fat_start;
-	unsigned long fat_length;    /* FAT start & length (sec.) */
-	unsigned long dir_start;
-	unsigned short dir_entries;  /* root dir start & entries */
-	unsigned long data_start;    /* first data sector */
-	unsigned long max_cluster;   /* maximum cluster number */
-	unsigned long root_cluster;  /* first cluster of the root directory */
-	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
-	struct mutex fat_lock;
-	unsigned int prev_free;      /* previously allocated cluster number */
-	unsigned int free_clusters;  /* -1 if undefined */
-	unsigned int free_clus_valid; /* is free_clusters valid? */
-	struct fat_mount_options options;
-	struct nls_table *nls_disk;  /* Codepage used on disk */
-	struct nls_table *nls_io;    /* Charset used for input and display */
-	const void *dir_ops;		     /* Opaque; default directory operations */
-	int dir_per_block;	     /* dir entries per block */
-	int dir_per_block_bits;	     /* log2(dir_per_block) */
-
-	int fatent_shift;
-	struct fatent_operations *fatent_ops;
-
-	spinlock_t inode_hash_lock;
-	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
-};
-
-#define FAT_CACHE_VALID	0	/* special case for valid cache */
-
-/*
- * MS-DOS file system inode data in memory
- */
-struct msdos_inode_info {
-	spinlock_t cache_lru_lock;
-	struct list_head cache_lru;
-	int nr_caches;
-	/* for avoiding the race between fat_free() and fat_get_cluster() */
-	unsigned int cache_valid_id;
-
-	loff_t mmu_private;
-	int i_start;		/* first cluster or 0 */
-	int i_logstart;		/* logical first cluster */
-	int i_attrs;		/* unused attribute bits */
-	loff_t i_pos;		/* on-disk position of directory entry or 0 */
-	struct hlist_node i_fat_hash;	/* hash by i_location */
-	struct inode vfs_inode;
-};
-
-struct fat_slot_info {
-	loff_t i_pos;		/* on-disk position of directory entry */
-	loff_t slot_off;	/* offset for slot or de start */
-	int nr_slots;		/* number of slots + 1(de) in filename */
-	struct msdos_dir_entry *de;
-	struct buffer_head *bh;
-};
-
-static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
-{
-	return container_of(inode, struct msdos_inode_info, vfs_inode);
-}
-
-/* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
-{
-	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
-		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
-		MSDOS_I(inode)->i_attrs;
-}
-
-static inline unsigned char fat_checksum(const __u8 *name)
-{
-	unsigned char s = name[0];
-	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
-	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
-	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
-	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
-	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
-	return s;
-}
-
-static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
-{
-	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
-		+ sbi->data_start;
-}
-
-static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
-{
-#ifdef __BIG_ENDIAN
-	while (len--) {
-		*dst++ = src[0] | (src[1] << 8);
-		src += 2;
-	}
-#else
-	memcpy(dst, src, len * 2);
-#endif
-}
-
-static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
-{
-#ifdef __BIG_ENDIAN
-	while (len--) {
-		dst[0] = *src & 0x00FF;
-		dst[1] = (*src & 0xFF00) >> 8;
-		dst += 2;
-		src++;
-	}
-#else
-	memcpy(dst, src, len * 2);
-#endif
-}
-
 /* media of boot sector */
 static inline int fat_valid_media(u8 media)
 {
 	return 0xf8 <= media || media == 0xf0;
 }
-
-/* fat/cache.c */
-extern void fat_cache_inval_inode(struct inode *inode);
-extern int fat_get_cluster(struct inode *inode, int cluster,
-			   int *fclus, int *dclus);
-extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks);
-
-/* fat/dir.c */
-extern const struct file_operations fat_dir_operations;
-extern int fat_search_long(struct inode *inode, const unsigned char *name,
-			   int name_len, struct fat_slot_info *sinfo);
-extern int fat_dir_empty(struct inode *dir);
-extern int fat_subdirs(struct inode *dir);
-extern int fat_scan(struct inode *dir, const unsigned char *name,
-		    struct fat_slot_info *sinfo);
-extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-				struct msdos_dir_entry **de, loff_t *i_pos);
-extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
-extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
-			   struct fat_slot_info *sinfo);
-extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
-
-/* fat/fatent.c */
-struct fat_entry {
-	int entry;
-	union {
-		u8 *ent12_p[2];
-		__le16 *ent16_p;
-		__le32 *ent32_p;
-	} u;
-	int nr_bhs;
-	struct buffer_head *bhs[2];
-};
-
-static inline void fatent_init(struct fat_entry *fatent)
-{
-	fatent->nr_bhs = 0;
-	fatent->entry = 0;
-	fatent->u.ent32_p = NULL;
-	fatent->bhs[0] = fatent->bhs[1] = NULL;
-}
-
-static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
-{
-	fatent->entry = entry;
-	fatent->u.ent32_p = NULL;
-}
-
-static inline void fatent_brelse(struct fat_entry *fatent)
-{
-	int i;
-	fatent->u.ent32_p = NULL;
-	for (i = 0; i < fatent->nr_bhs; i++)
-		brelse(fatent->bhs[i]);
-	fatent->nr_bhs = 0;
-	fatent->bhs[0] = fatent->bhs[1] = NULL;
-}
-
-extern void fat_ent_access_init(struct super_block *sb);
-extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
-			int entry);
-extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
-			 int new, int wait);
-extern int fat_alloc_clusters(struct inode *inode, int *cluster,
-			      int nr_cluster);
-extern int fat_free_clusters(struct inode *inode, int cluster);
-extern int fat_count_free_clusters(struct super_block *sb);
-
-/* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
-extern const struct file_operations fat_file_operations;
-extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
-extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		       struct kstat *stat);
-
-/* fat/inode.c */
-extern void fat_attach(struct inode *inode, loff_t i_pos);
-extern void fat_detach(struct inode *inode);
-extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
-extern struct inode *fat_build_inode(struct super_block *sb,
-			struct msdos_dir_entry *de, loff_t i_pos);
-extern int fat_sync_inode(struct inode *inode);
-extern int fat_fill_super(struct super_block *sb, void *data, int silent,
-			const struct inode_operations *fs_dir_inode_ops, int isvfat);
-
-extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
-		            struct inode *i2);
-/* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
-extern void fat_clusters_flush(struct super_block *sb);
-extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
-extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
-			      int tz_utc);
-extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
-
-int fat_cache_init(void);
-void fat_cache_destroy(void);
-
-#endif /* __KERNEL__ */
-
-#endif
+#endif /* !__KERNEL__ */
+#endif /* !_LINUX_MSDOS_FS_H */
-- 
cgit v1.2.3


From 9c0aa1b87bf541affef519eb4879ce7c5a5941ae Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Cleanup FAT attribute stuff

This adds three helpers:

fat_make_attrs() - makes FAT attributes from inode.
fat_make_mode()  - makes mode_t from FAT attributes.
fat_save_attrs() - saves FAT attributes to inode.

Then this replaces: MSDOS_MKMODE() by fat_make_mode(), fat_attr() by
fat_make_attrs(), ->i_attrs = attr & ATTR_UNUSED by fat_save_attrs().
For the root inode, these helpers are used with ATTR_DIR instead of the
bogus ATTR_NONE.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h             | 20 +++++++++++++++++++-
 fs/fat/file.c            | 32 ++++++++++++--------------------
 fs/fat/inode.c           | 19 +++++++++----------
 include/linux/msdos_fs.h |  5 -----
 4 files changed, 40 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 2b8e94c3eef4..3b4753a024e3 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,14 +117,32 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
+				   u8 attrs, mode_t mode)
+{
+	if (attrs & ATTR_RO)
+		mode &= ~S_IWUGO;
+
+	if (attrs & ATTR_DIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
 /* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
+static inline u8 fat_make_attrs(struct inode *inode)
 {
 	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
 		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
 		MSDOS_I(inode)->i_attrs;
 }
 
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{
+	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+}
+
 static inline unsigned char fat_checksum(const __u8 *name)
 {
 	unsigned char s = name[0];
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b21973f266a1..f5a7e907a8fa 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,13 +27,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr;
-
-		if (inode->i_ino == MSDOS_ROOT_INO)
-			attr = ATTR_DIR;
-		else
-			attr = fat_attr(inode);
-
+		u32 attr = fat_make_attrs(inode);
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
@@ -62,20 +56,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		/* Merge in ATTR_VOLUME and ATTR_DIR */
 		attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
 			(is_dir ? ATTR_DIR : 0);
-		oldattr = fat_attr(inode);
+		oldattr = fat_make_attrs(inode);
 
 		/* Equivalent to a chmod() */
 		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
 		ia.ia_ctime = current_fs_time(inode->i_sb);
-		if (is_dir) {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				S_IRWXUGO & ~sbi->options.fs_dmask)
-				| S_IFDIR;
-		} else {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				(S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO))
-				& ~sbi->options.fs_fmask)
-				| S_IFREG;
+		if (is_dir)
+			ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+		else {
+			ia.ia_mode = fat_make_mode(sbi, attr,
+				S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
 		}
 
 		/* The root directory has no attributes */
@@ -115,7 +105,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 				inode->i_flags &= S_IMMUTABLE;
 		}
 
-		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
+		fat_save_attrs(inode, attr);
 		mark_inode_dirty(inode);
 up:
 		mnt_drop_write(filp->f_path.mnt);
@@ -274,7 +264,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 
 	/*
 	 * Note, the basic check is already done by a caller of
-	 * (attr->ia_mode & ~MSDOS_VALID_MODE)
+	 * (attr->ia_mode & ~FAT_VALID_MODE)
 	 */
 
 	if (S_ISREG(inode->i_mode))
@@ -314,6 +304,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 }
 
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -356,7 +348,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	    ((attr->ia_valid & ATTR_GID) &&
 	     (attr->ia_gid != sbi->options.fs_gid)) ||
 	    ((attr->ia_valid & ATTR_MODE) &&
-	     (attr->ia_mode & ~MSDOS_VALID_MODE)))
+	     (attr->ia_mode & ~FAT_VALID_MODE)))
 		error = -EPERM;
 
 	if (error) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8e1b75c63c7f..7aaa21cf019a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -337,8 +337,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 
 	if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
 		inode->i_generation &= ~1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-			S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+		inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
 		inode->i_op = sbi->dir_ops;
 		inode->i_fop = &fat_dir_operations;
 
@@ -355,10 +354,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		inode->i_nlink = fat_subdirs(inode);
 	} else { /* not a directory */
 		inode->i_generation |= 1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-		    ((sbi->options.showexec && !is_exec(de->name + 8))
-			? S_IRUGO|S_IWUGO : S_IRWXUGO)
-		    & ~sbi->options.fs_fmask) | S_IFREG;
+		inode->i_mode = fat_make_mode(sbi, de->attr,
+			((sbi->options.showexec && !is_exec(de->name + 8))
+			 ? S_IRUGO|S_IWUGO : S_IRWXUGO));
 		MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
 		if (sbi->fat_bits == 32)
 			MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16);
@@ -374,7 +372,8 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		if (sbi->options.sys_immutable)
 			inode->i_flags |= S_IMMUTABLE;
 	}
-	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
+	fat_save_attrs(inode, de->attr);
+
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
@@ -569,7 +568,7 @@ retry:
 		raw_entry->size = 0;
 	else
 		raw_entry->size = cpu_to_le32(inode->i_size);
-	raw_entry->attr = fat_attr(inode);
+	raw_entry->attr = fat_make_attrs(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
 	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
@@ -1105,7 +1104,7 @@ static int fat_read_root(struct inode *inode)
 	inode->i_gid = sbi->options.fs_gid;
 	inode->i_version++;
 	inode->i_generation = 0;
-	inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
 	inode->i_op = sbi->dir_ops;
 	inode->i_fop = &fat_dir_operations;
 	if (sbi->fat_bits == 32) {
@@ -1122,7 +1121,7 @@ static int fat_read_root(struct inode *inode)
 	MSDOS_I(inode)->i_logstart = 0;
 	MSDOS_I(inode)->mmu_private = inode->i_size;
 
-	MSDOS_I(inode)->i_attrs = ATTR_NONE;
+	fat_save_attrs(inode, ATTR_DIR);
 	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
 	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
 	inode->i_nlink = fat_subdirs(inode)+2;
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 0982fb47a90d..e0a9b207920d 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -46,11 +46,6 @@
 #define DELETED_FLAG	0xe5	/* marks file as deleted when in name[0] */
 #define IS_FREE(n)	(!*(n) || *(n) == DELETED_FLAG)
 
-/* valid file mode bits */
-#define MSDOS_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO)
-/* Convert attribute bits and a mask to the UNIX mode. */
-#define MSDOS_MKMODE(a, m) (m & (a & ATTR_RO ? S_IRUGO|S_IXUGO : S_IRWXUGO))
-
 #define MSDOS_NAME	11	/* maximum name length */
 #define MSDOS_LONGNAME	256	/* maximum name length */
 #define MSDOS_SLOTS	21	/* max # of slots for short and long names */
-- 
cgit v1.2.3


From 7597bc94d6f3bdccb086ac7f2ad91292fdaee2a4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Nov 2008 17:38:47 +0000
Subject: Fix accidental implicit cast in HR-timer conversion

Fix the hrtimer_add_expires_ns() function.  It should take a 'u64 ns' argument,
but instead takes an 'unsigned long ns' argument - which might be only 32 bits.

On FRV, this results in the kernel locking up because hrtimer_forward() passes
the result of a 64-bit multiplication to this function, for which the compiler
discards the top 32 bits - something that didn't happen when ktime_add_ns() was
called directly.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hrtimer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2b3645b1acf4..07e510a3b00a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -239,7 +239,7 @@ static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 	timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
 }
 
-static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
 {
 	timer->_expires = ktime_add_ns(timer->_expires, ns);
 	timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
-- 
cgit v1.2.3


From 3b53fbf4314594fa04544b02b2fc6e607912da18 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 6 Nov 2008 15:45:32 -0800
Subject: net: Fix recursive descent in __scm_destroy().

__scm_destroy() walks the list of file descriptors in the scm_fp_list
pointed to by the scm_cookie argument.

Those, in turn, can close sockets and invoke __scm_destroy() again.

There is nothing which limits how deeply this can occur.

The idea for how to fix this is from Linus.  Basically, we do all of
the fput()s at the top level by collecting all of the scm_fp_list
objects hit by an fput().  Inside of the initial __scm_destroy() we
keep running the list until it is empty.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sched.h |  2 ++
 include/net/scm.h     |  5 +++--
 net/core/scm.c        | 24 +++++++++++++++++++++---
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39a7112..295b7c756ca6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1349,6 +1349,8 @@ struct task_struct {
 	 */
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
+
+	struct list_head	*scm_work_list;
 };
 
 /*
diff --git a/include/net/scm.h b/include/net/scm.h
index 06df126103ca..33e9986beb86 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -14,8 +14,9 @@
 
 struct scm_fp_list
 {
-	int		count;
-	struct file	*fp[SCM_MAX_FD];
+	struct list_head	list;
+	int			count;
+	struct file		*fp[SCM_MAX_FD];
 };
 
 struct scm_cookie
diff --git a/net/core/scm.c b/net/core/scm.c
index 10f5c65f6a47..ab242cc1acca 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -75,6 +75,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
+		INIT_LIST_HEAD(&fpl->list);
 		fpl->count = 0;
 	}
 	fpp = &fpl->fp[fpl->count];
@@ -106,9 +107,25 @@ void __scm_destroy(struct scm_cookie *scm)
 
 	if (fpl) {
 		scm->fp = NULL;
-		for (i=fpl->count-1; i>=0; i--)
-			fput(fpl->fp[i]);
-		kfree(fpl);
+		if (current->scm_work_list) {
+			list_add_tail(&fpl->list, current->scm_work_list);
+		} else {
+			LIST_HEAD(work_list);
+
+			current->scm_work_list = &work_list;
+
+			list_add(&fpl->list, &work_list);
+			while (!list_empty(&work_list)) {
+				fpl = list_first_entry(&work_list, struct scm_fp_list, list);
+
+				list_del(&fpl->list);
+				for (i=fpl->count-1; i>=0; i--)
+					fput(fpl->fp[i]);
+				kfree(fpl);
+			}
+
+			current->scm_work_list = NULL;
+		}
 	}
 }
 
@@ -284,6 +301,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 
 	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
 	if (new_fpl) {
+		INIT_LIST_HEAD(&new_fpl->list);
 		for (i=fpl->count-1; i>=0; i--)
 			get_file(fpl->fp[i]);
 		memcpy(new_fpl, fpl, sizeof(*fpl));
-- 
cgit v1.2.3


From cd83e42c6b0413dcbb548c2ead799111ff7e6a13 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 7 Nov 2008 11:12:29 +1100
Subject: cpumask: new API, v2

- add cpumask_of()
- add free_bootmem_cpumask_var()

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpumask.h | 11 +++++++++++
 lib/cpumask.c           |  5 +++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c8e66619097b..31caa1bc620a 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -893,6 +893,12 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))
 
+/**
+ * cpumask_of - the cpumask containing just a given cpu
+ * @cpu: the cpu (<= nr_cpu_ids)
+ */
+#define cpumask_of(cpu) (get_cpu_mask(cpu))
+
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
@@ -946,6 +952,7 @@ typedef struct cpumask *cpumask_var_t;
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
+void free_bootmem_cpumask_var(cpumask_var_t mask);
 
 #else
 typedef struct cpumask cpumask_var_t[1];
@@ -962,6 +969,10 @@ static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 static inline void free_cpumask_var(cpumask_var_t mask)
 {
 }
+
+static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
+{
+}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
 /* The pointer versions of the maps, these will become the primary versions. */
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 5ceb4211c834..2ebc3a9a7465 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -107,4 +107,9 @@ void free_cpumask_var(cpumask_var_t mask)
 	kfree(mask);
 }
 EXPORT_SYMBOL(free_cpumask_var);
+
+void free_bootmem_cpumask_var(cpumask_var_t mask)
+{
+	free_bootmem((unsigned long)mask, cpumask_size());
+}
 #endif
-- 
cgit v1.2.3


From 14800984706bf6936bbec5187f736e928be5c218 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 7 Nov 2008 15:26:50 +0100
Subject: sched: fine-tune SD_MC_INIT

Tune SD_MC_INIT the same way as SD_CPU_INIT:
unset SD_BALANCE_NEWIDLE, and set SD_WAKE_BALANCE.

This improves vmark by 5%:

vmark         132102 125968 125497 messages/sec    avg 127855.66    .984
vmark         139404 131719 131272 messages/sec    avg 134131.66   1.033

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 # *DOCUMENTATION*
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 34a7ee0ebed2..a8d840595b7e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -120,10 +120,10 @@ void arch_update_cpu_topology(void);
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_BALANCE	\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
-- 
cgit v1.2.3


From 52c642f33b14bfa1b00ef2b68296effb34a573f3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 7 Nov 2008 16:09:23 +0100
Subject: sched: fine-tune SD_SIBLING_INIT

Fine-tune the HT sched-domains parameters (SD_SIBLING_INIT) as well:
replace SD_WAKE_IDLE with SD_WAKE_BALANCE.

On a HT capable box, this increases lat_ctx performance from 23.87
usecs to 1.49 usecs:

 # before

 $ ./lat_ctx -s 0 2

   "size=0k ovr=1.89
    2 23.87

 # after

 $ ./lat_ctx -s 0 2

   "size=0k ovr=1.84
     2 1.49

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index a8d840595b7e..117f1b7405cf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -99,7 +99,7 @@ void arch_update_cpu_topology(void);
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
+				| SD_WAKE_BALANCE	\
 				| SD_SHARE_CPUPOWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
-- 
cgit v1.2.3


From d1b268630875a7713b5d468a0c03403c5b721c8e Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sat, 8 Nov 2008 21:37:46 +0100
Subject: mmc: struct device - replace bus_id with dev_name(), dev_set_name()

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
---
 drivers/mmc/core/bus.c        |  3 +--
 drivers/mmc/core/host.c       |  5 ++---
 drivers/mmc/core/sdio_bus.c   |  3 +--
 drivers/mmc/host/mmc_spi.c    |  2 +-
 drivers/mmc/host/sdhci.c      |  2 +-
 drivers/mmc/host/tifm_sd.c    | 16 ++++++++--------
 include/linux/mmc/card.h      |  2 +-
 include/linux/mmc/host.h      |  2 +-
 include/linux/mmc/sdio_func.h |  2 +-
 9 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 0d9b2d6f9ebf..f210a8ee6861 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -216,8 +216,7 @@ int mmc_add_card(struct mmc_card *card)
 	int ret;
 	const char *type;
 
-	snprintf(card->dev.bus_id, sizeof(card->dev.bus_id),
-		 "%s:%04x", mmc_hostname(card->host), card->rca);
+	dev_set_name(&card->dev, "%s:%04x", mmc_hostname(card->host), card->rca);
 
 	switch (card->type) {
 	case MMC_TYPE_MMC:
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 6da80fd4d974..5e945e64ead7 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -73,8 +73,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	if (err)
 		goto free;
 
-	snprintf(host->class_dev.bus_id, BUS_ID_SIZE,
-		 "mmc%d", host->index);
+	dev_set_name(&host->class_dev, "mmc%d", host->index);
 
 	host->parent = dev;
 	host->class_dev.parent = dev;
@@ -121,7 +120,7 @@ int mmc_add_host(struct mmc_host *host)
 	WARN_ON((host->caps & MMC_CAP_SDIO_IRQ) &&
 		!host->ops->enable_sdio_irq);
 
-	led_trigger_register_simple(host->class_dev.bus_id, &host->led);
+	led_trigger_register_simple(dev_name(&host->class_dev), &host->led);
 
 	err = device_add(&host->class_dev);
 	if (err)
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 233d0f9b3c4b..46284b527397 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -239,8 +239,7 @@ int sdio_add_func(struct sdio_func *func)
 {
 	int ret;
 
-	snprintf(func->dev.bus_id, sizeof(func->dev.bus_id),
-		 "%s:%d", mmc_card_id(func->card), func->num);
+	dev_set_name(&func->dev, "%s:%d", mmc_card_id(func->card), func->num);
 
 	ret = device_add(&func->dev);
 	if (ret == 0)
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index 07faf5412a1f..ad00e1632317 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1348,7 +1348,7 @@ static int mmc_spi_probe(struct spi_device *spi)
 		goto fail_add_host;
 
 	dev_info(&spi->dev, "SD/MMC host %s%s%s%s%s\n",
-			mmc->class_dev.bus_id,
+			dev_name(&mmc->class_dev),
 			host->dma_dev ? "" : ", no DMA",
 			(host->pdata && host->pdata->get_ro)
 				? "" : ", no WP",
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 30f64b1f2354..4d010a984bed 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -1733,7 +1733,7 @@ int sdhci_add_host(struct sdhci_host *host)
 	mmc_add_host(mmc);
 
 	printk(KERN_INFO "%s: SDHCI controller on %s [%s] using %s%s\n",
-		mmc_hostname(mmc), host->hw_name, mmc_dev(mmc)->bus_id,
+		mmc_hostname(mmc), host->hw_name, dev_name(mmc_dev(mmc)),
 		(host->flags & SDHCI_USE_ADMA)?"A":"",
 		(host->flags & SDHCI_USE_DMA)?"DMA":"PIO");
 
diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c
index 13844843e8de..82554ddec6b3 100644
--- a/drivers/mmc/host/tifm_sd.c
+++ b/drivers/mmc/host/tifm_sd.c
@@ -632,7 +632,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq)
 
 	if (host->req) {
 		printk(KERN_ERR "%s : unfinished request detected\n",
-		       sock->dev.bus_id);
+		       dev_name(&sock->dev));
 		mrq->cmd->error = -ETIMEDOUT;
 		goto err_out;
 	}
@@ -672,7 +672,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq)
 					    ? PCI_DMA_TODEVICE
 					    : PCI_DMA_FROMDEVICE)) {
 				printk(KERN_ERR "%s : scatterlist map failed\n",
-				       sock->dev.bus_id);
+				       dev_name(&sock->dev));
 				mrq->cmd->error = -ENOMEM;
 				goto err_out;
 			}
@@ -684,7 +684,7 @@ static void tifm_sd_request(struct mmc_host *mmc, struct mmc_request *mrq)
 						   : PCI_DMA_FROMDEVICE);
 			if (host->sg_len < 1) {
 				printk(KERN_ERR "%s : scatterlist map failed\n",
-				       sock->dev.bus_id);
+				       dev_name(&sock->dev));
 				tifm_unmap_sg(sock, &host->bounce_buf, 1,
 					      r_data->flags & MMC_DATA_WRITE
 					      ? PCI_DMA_TODEVICE
@@ -748,7 +748,7 @@ static void tifm_sd_end_cmd(unsigned long data)
 
 	if (!mrq) {
 		printk(KERN_ERR " %s : no request to complete?\n",
-		       sock->dev.bus_id);
+		       dev_name(&sock->dev));
 		spin_unlock_irqrestore(&sock->lock, flags);
 		return;
 	}
@@ -789,7 +789,7 @@ static void tifm_sd_abort(unsigned long data)
 	printk(KERN_ERR
 	       "%s : card failed to respond for a long period of time "
 	       "(%x, %x)\n",
-	       host->dev->dev.bus_id, host->req->cmd->opcode, host->cmd_flags);
+	       dev_name(&host->dev->dev), host->req->cmd->opcode, host->cmd_flags);
 
 	tifm_eject(host->dev);
 }
@@ -906,7 +906,7 @@ static int tifm_sd_initialize_host(struct tifm_sd *host)
 
 	if (rc) {
 		printk(KERN_ERR "%s : controller failed to reset\n",
-		       sock->dev.bus_id);
+		       dev_name(&sock->dev));
 		return -ENODEV;
 	}
 
@@ -933,7 +933,7 @@ static int tifm_sd_initialize_host(struct tifm_sd *host)
 	if (rc) {
 		printk(KERN_ERR
 		       "%s : card not ready - probe failed on initialization\n",
-		       sock->dev.bus_id);
+		       dev_name(&sock->dev));
 		return -ENODEV;
 	}
 
@@ -954,7 +954,7 @@ static int tifm_sd_probe(struct tifm_dev *sock)
 	if (!(TIFM_SOCK_STATE_OCCUPIED
 	      & readl(sock->addr + SOCK_PRESENT_STATE))) {
 		printk(KERN_WARNING "%s : card gone, unexpectedly\n",
-		       sock->dev.bus_id);
+		       dev_name(&sock->dev));
 		return rc;
 	}
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index ee6e822d5994..403aa505f27e 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -130,7 +130,7 @@ struct mmc_card {
 #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR)
 
 #define mmc_card_name(c)	((c)->cid.prod_name)
-#define mmc_card_id(c)		((c)->dev.bus_id)
+#define mmc_card_id(c)		(dev_name(&(c)->dev))
 
 #define mmc_list_to_card(l)	container_of(l, struct mmc_card, node)
 #define mmc_get_drvdata(c)	dev_get_drvdata(&(c)->dev)
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index bde891f64591..f842f234e44f 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -176,7 +176,7 @@ static inline void *mmc_priv(struct mmc_host *host)
 
 #define mmc_dev(x)	((x)->parent)
 #define mmc_classdev(x)	(&(x)->class_dev)
-#define mmc_hostname(x)	((x)->class_dev.bus_id)
+#define mmc_hostname(x)	(dev_name(&(x)->class_dev))
 
 extern int mmc_suspend_host(struct mmc_host *, pm_message_t);
 extern int mmc_resume_host(struct mmc_host *);
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 07bee4a0d457..451bdfc85830 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -63,7 +63,7 @@ struct sdio_func {
 
 #define sdio_func_set_present(f) ((f)->state |= SDIO_STATE_PRESENT)
 
-#define sdio_func_id(f)		((f)->dev.bus_id)
+#define sdio_func_id(f)		(dev_name(&(f)->dev))
 
 #define sdio_get_drvdata(f)	dev_get_drvdata(&(f)->dev)
 #define sdio_set_drvdata(f,d)	dev_set_drvdata(&(f)->dev, d)
-- 
cgit v1.2.3


From 058e3739f6b0753696db1952378de9e8d2a11735 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Sun, 9 Nov 2008 00:27:53 -0500
Subject: clarify usage expectations for cnt32_to_63()

Currently, all existing users of cnt32_to_63() are fine since the CPU
architectures where it is used don't do read access reordering, and user
mode preemption is disabled already.  It is nevertheless a good idea to
better elaborate usage requirements wrt preemption, and use an explicit
memory barrier on SMP to avoid different CPUs accessing the counter
value in the wrong order.  On UP a simple compiler barrier is
sufficient.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cnt32_to_63.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cnt32_to_63.h b/include/linux/cnt32_to_63.h
index 8c0f9505b48c..7605fdd1eb65 100644
--- a/include/linux/cnt32_to_63.h
+++ b/include/linux/cnt32_to_63.h
@@ -16,6 +16,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <asm/byteorder.h>
+#include <asm/system.h>
 
 /* this is used only to give gcc a clue about good code generation */
 union cnt32_to_63 {
@@ -53,11 +54,19 @@ union cnt32_to_63 {
  * needed increment.  And any race in updating the value in memory is harmless
  * as the same value would simply be stored more than once.
  *
- * The only restriction for the algorithm to work properly is that this
- * code must be executed at least once per each half period of the 32-bit
- * counter to properly update the state bit in memory. This is usually not a
- * problem in practice, but if it is then a kernel timer could be scheduled
- * to manage for this code to be executed often enough.
+ * The restrictions for the algorithm to work properly are:
+ *
+ * 1) this code must be called at least once per each half period of the
+ *    32-bit counter;
+ *
+ * 2) this code must not be preempted for a duration longer than the
+ *    32-bit counter half period minus the longest period between two
+ *    calls to this code.
+ *
+ * Those requirements ensure proper update to the state bit in memory.
+ * This is usually not a problem in practice, but if it is then a kernel
+ * timer should be scheduled to manage for this code to be executed often
+ * enough.
  *
  * Note that the top bit (bit 63) in the returned value should be considered
  * as garbage.  It is not cleared here because callers are likely to use a
@@ -68,9 +77,10 @@ union cnt32_to_63 {
  */
 #define cnt32_to_63(cnt_lo) \
 ({ \
-	static volatile u32 __m_cnt_hi; \
+	static u32 __m_cnt_hi; \
 	union cnt32_to_63 __x; \
 	__x.hi = __m_cnt_hi; \
+ 	smp_rmb(); \
 	__x.lo = (cnt_lo); \
 	if (unlikely((s32)(__x.hi ^ __x.lo) < 0)) \
 		__m_cnt_hi = __x.hi = (__x.hi ^ 0x80000000) + (__x.hi >> 31); \
-- 
cgit v1.2.3


From 984f2f377fdfd098f5ae58d09ee04d5e29e6112b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 8 Nov 2008 20:24:19 +1100
Subject: cpumask: introduce new API, without changing anything, v3

Impact: cleanup

Clean up based on feedback from Andrew Morton and others:

 - change to inline functions instead of macros
 - add __init to bootmem method
 - add a missing debug check

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpumask.h | 58 ++++++++++++++++++++++++++++++++++++++++++++-----
 lib/cpumask.c           |  3 ++-
 2 files changed, 54 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 31caa1bc620a..21e1dd43e52a 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -564,12 +564,36 @@ static inline unsigned int cpumask_check(unsigned int cpu)
 }
 
 #if NR_CPUS == 1
-/* Uniprocesor. */
-#define cpumask_first(src)		({ (void)(src); 0; })
-#define cpumask_next(n, src)		({ (void)(src); 1; })
-#define cpumask_next_zero(n, src)	({ (void)(src); 1; })
-#define cpumask_next_and(n, srcp, andp)	({ (void)(srcp), (void)(andp); 1; })
-#define cpumask_any_but(mask, cpu)	({ (void)(mask); (void)(cpu); 0; })
+/* Uniprocessor.  Assume all masks are "1". */
+static inline unsigned int cpumask_first(const struct cpumask *srcp)
+{
+	return 0;
+}
+
+/* Valid inputs for n are -1 and 0. */
+static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+	return n+1;
+}
+
+static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
+{
+	return n+1;
+}
+
+static inline unsigned int cpumask_next_and(int n,
+					    const struct cpumask *srcp,
+					    const struct cpumask *andp)
+{
+	return n+1;
+}
+
+/* cpu must be a valid cpu, ie 0, so there's no other choice. */
+static inline unsigned int cpumask_any_but(const struct cpumask *mask,
+					   unsigned int cpu)
+{
+	return 1;
+}
 
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
@@ -620,10 +644,32 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 
+/**
+ * for_each_cpu - iterate over every cpu in a mask
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
 #define for_each_cpu(cpu, mask)				\
 	for ((cpu) = -1;				\
 		(cpu) = cpumask_next((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
+
+/**
+ * for_each_cpu_and - iterate over every cpu in both masks
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the first cpumask pointer
+ * @and: the second cpumask pointer
+ *
+ * This saves a temporary CPU mask in many places.  It is equivalent to:
+ *	struct cpumask tmp;
+ *	cpumask_and(&tmp, &mask, &and);
+ *	for_each_cpu(cpu, &tmp)
+ *		...
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
 #define for_each_cpu_and(cpu, mask, and)				\
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask), (and)),		\
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 2ebc3a9a7465..8d03f22c6ced 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -67,6 +67,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 {
 	unsigned int i;
 
+	cpumask_check(cpu);
 	for_each_cpu(i, mask)
 		if (i != cpu)
 			break;
@@ -108,7 +109,7 @@ void free_cpumask_var(cpumask_var_t mask)
 }
 EXPORT_SYMBOL(free_cpumask_var);
 
-void free_bootmem_cpumask_var(cpumask_var_t mask)
+void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 	free_bootmem((unsigned long)mask, cpumask_size());
 }
-- 
cgit v1.2.3