From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/nmi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b752e807adde..a42ff0bef708 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -47,4 +47,8 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif +#ifdef CONFIG_NMI_WATCHDOG +int hw_nmi_is_cpu_stuck(struct pt_regs *); +#endif + #endif -- cgit v1.2.3 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. Tested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++----- include/linux/nmi.h | 9 ++++++++ kernel/nmi_watchdog.c | 52 ++++++++++++++++++++++++++++++++++--------- kernel/sysctl.c | 15 ++++++++++++- 5 files changed, 82 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 93da9c3f3341..5b41b0feb6db 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_NMI_WATCHDOG) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d05..312d772c5c35 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a42ff0bef708..794e7354c5bf 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,10 +20,14 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else +#ifndef CONFIG_NMI_WATCHDOG static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); } +#else +extern void touch_nmi_watchdog(void); +#endif static inline void acpi_nmi_disable(void) { } static inline void acpi_nmi_enable(void) { } #endif @@ -49,6 +53,11 @@ static inline bool trigger_all_cpu_backtrace(void) #ifdef CONFIG_NMI_WATCHDOG int hw_nmi_is_cpu_stuck(struct pt_regs *); +u64 hw_nmi_get_sample_period(void); +extern int nmi_watchdog_enabled; +struct ctl_table; +extern int proc_nmi_enabled(struct ctl_table *, int , + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 36817b214d69..73c1954a97bb 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -30,6 +30,8 @@ static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev); static DEFINE_PER_CPU(int, nmi_watchdog_touch); static DEFINE_PER_CPU(long, alert_counter); +static int panic_on_timeout; + void touch_nmi_watchdog(void) { __raw_get_cpu_var(nmi_watchdog_touch) = 1; @@ -46,19 +48,49 @@ void touch_all_nmi_watchdog(void) touch_softlockup_watchdog(); } +static int __init setup_nmi_watchdog(char *str) +{ + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + #ifdef CONFIG_SYSCTL /* * proc handler for /proc/sys/kernel/nmi_watchdog */ +int nmi_watchdog_enabled; + int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int cpu; - if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) + if (!write) { + struct perf_event *event; + for_each_online_cpu(cpu) { + event = per_cpu(nmi_watchdog_ev, cpu); + if (event->state > PERF_EVENT_STATE_OFF) { + nmi_watchdog_enabled = 1; + break; + } + } + proc_dointvec(table, write, buffer, length, ppos); + return 0; + } + + if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) { nmi_watchdog_enabled = 0; - else - nmi_watchdog_enabled = 1; + proc_dointvec(table, write, buffer, length, ppos); + printk("NMI watchdog failed configuration, can not be enabled\n"); + return 0; + } touch_all_nmi_watchdog(); proc_dointvec(table, write, buffer, length, ppos); @@ -81,8 +113,6 @@ struct perf_event_attr wd_attr = { .disabled = 1, }; -static int panic_on_timeout; - void wd_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -103,11 +133,11 @@ void wd_overflow(struct perf_event *event, int nmi, */ per_cpu(alert_counter,cpu) += 1; if (per_cpu(alert_counter,cpu) == 5) { - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. 
- */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); + if (panic_on_timeout) { + panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); + } else { + WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); + } } } else { per_cpu(alert_counter,cpu) = 0; @@ -133,7 +163,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: case CPU_ONLINE_FROZEN: /* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */ - wd_attr.sample_period = cpu_khz * 1000; + wd_attr.sample_period = hw_nmi_get_sample_period(); event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow); if (IS_ERR(event)) { printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..ac72c9e6bd9b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -60,6 +60,10 @@ #include #endif +#ifdef CONFIG_NMI_WATCHDOG +#include +#endif + #if defined(CONFIG_SYSCTL) @@ -692,7 +696,16 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_NMI_WATCHDOG) + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_enabled, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, -- cgit v1.2.3 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. 
Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 14 ++++++------ arch/x86/kernel/traps.c | 6 ++--- include/linux/nmi.h | 2 +- kernel/nmi_watchdog.c | 51 +++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8e..e8b78a0be5de 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044f..bdc7fab3ef3e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 794e7354c5bf..22cc7960b649 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,7 +57,7 @@ u64 hw_nmi_get_sample_period(void); extern int nmi_watchdog_enabled; struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); #endif #endif diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c index 3c75cbf3acb8..0a6f57f537a7 100644 --- a/kernel/nmi_watchdog.c +++ b/kernel/nmi_watchdog.c @@ -50,31 +50,31 @@ void touch_all_nmi_watchdog(void) static int __init setup_nmi_watchdog(char *str) { - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - return 1; + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type =
PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; struct perf_event_attr wd_sw_attr = { - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, }; void wd_overflow(struct perf_event *event, int nmi, @@ -95,16 +95,15 @@ void wd_overflow(struct perf_event *event, int nmi, * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - per_cpu(alert_counter,cpu) += 1; - if (per_cpu(alert_counter,cpu) == 5) { - if (panic_on_timeout) { + per_cpu(alert_counter, cpu) += 1; + if (per_cpu(alert_counter, cpu) == 5) { + if (panic_on_timeout) panic("NMI Watchdog detected LOCKUP on cpu %d", cpu); - } else { + else WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu); - } } } else { - per_cpu(alert_counter,cpu) = 0; + per_cpu(alert_counter, cpu) = 0; } return; @@ -126,7 +125,7 @@ static int enable_nmi_watchdog(int cpu) event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); if (IS_ERR(event)) { /* hardware doesn't exist or not supported, fallback to software events */ - printk("nmi_watchdog: hardware not available, trying software events\n"); + printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n"); wd_attr = &wd_sw_attr; wd_attr->sample_period = NSEC_PER_SEC; event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow); @@ -182,7 +181,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, if (nmi_watchdog_enabled) { for_each_online_cpu(cpu) if (enable_nmi_watchdog(cpu)) { - printk("NMI watchdog failed configuration, " + printk(KERN_ERR "NMI watchdog failed configuration, " " can not be enabled\n"); } } else { -- cgit v1.2.3 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. 
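In outline, the two checks reduce to watching counters that should keep advancing. The following is a self-contained sketch, not part of the patch: the function names mirror the kernel/watchdog.c diff below, but the per-cpu state, the perf/hrtimer plumbing and the main() harness are simplified stand-ins.

#include <stdio.h>

static unsigned long hrtimer_interrupts;       /* bumped by each hrtimer tick */
static unsigned long hrtimer_interrupts_saved; /* snapshot from the last NMI  */

/* Run from the perf/NMI overflow path: did the hrtimer make progress? */
static int is_hardlockup(void)
{
	if (hrtimer_interrupts == hrtimer_interrupts_saved)
		return 1;	/* interrupts stalled: hard lockup */
	hrtimer_interrupts_saved = hrtimer_interrupts;
	return 0;
}

/* Run from the hrtimer path: did the watchdog kthread touch its stamp? */
static unsigned long is_softlockup(unsigned long now, unsigned long touch_ts,
				   unsigned long thresh)
{
	if (now > touch_ts + thresh)
		return now - touch_ts;	/* seconds the kthread was starved */
	return 0;
}

int main(void)
{
	hrtimer_interrupts = 2;				/* timer ticked twice  */
	printf("hardlockup? %d\n", is_hardlockup());	/* 0: progress seen    */
	printf("hardlockup? %d\n", is_hardlockup());	/* 1: no new ticks     */
	printf("softlockup: %lus\n", is_softlockup(100, 30, 60)); /* 70s stall */
	return 0;
}

The real code keeps both counters per-cpu and clears the warn flags again once progress resumes.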
V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - separated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 2 + arch/x86/include/asm/nmi.h | 2 +- arch/x86/kernel/apic/Makefile | 4 +- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 +- include/linux/nmi.h | 8 +- include/linux/sched.h | 6 + init/Kconfig | 5 +- kernel/Makefile | 3 +- kernel/sysctl.c | 21 +- kernel/watchdog.c | 592 ++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 30 +- 12 files changed, 650 insertions(+), 29 deletions(-) create mode 100644 kernel/watchdog.c (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 839b21b0699a..dfe8d1c226c6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file nousb [USB] Disable the USB subsystem + nowatchdog [KNL] Disable the lockup detector. + nowb [ARM] nox2apic [X86-64,APIC] Do not enable x2APIC mode.
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5b41b0feb6db..932f0f86b4b7 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); -#if !defined(CONFIG_NMI_WATCHDOG) +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; #endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d24..52f32e0ea194 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5de..79425f96fcee 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3e..bd347c2b34dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. 
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 22cc7960b649..abd48aacaf79 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); @@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void) } #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(void); -extern int nmi_watchdog_enabled; +extern int watchdog_enabled; struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , +extern int proc_dowatchdog_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..37efe8fa5306 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_LOCKUP_DETECTOR +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/init/Kconfig b/init/Kconfig index c6c8903cb534..e44e25422f22 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -944,8 +944,11 @@ config PERF_USE_VMALLOC config PERF_EVENTS_NMI bool + depends on PERF_EVENTS help - Arch has support for nmi_watchdog + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. 
menu "Kernel Performance Events And Counters" diff --git a/kernel/Makefile b/kernel/Makefile index d5c30060ac14..6adeafc3e259 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o -obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a38af430f0d8..0f9adda85f97 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -74,7 +74,7 @@ #include #endif -#ifdef CONFIG_NMI_WATCHDOG +#ifdef CONFIG_LOCKUP_DETECTOR #include #endif @@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_LOCKUP_DETECTOR) { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, + .procname = "watchdog", + .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_nmi_enabled, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, }, #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..6b7fad8497af --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,592 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_PERF_EVENTS_NMI +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_PERF_EVENTS_NMI +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = raw_smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_watchdog(void) +{ + __get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_watchdog); + +void touch_all_watchdog(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? 
+ * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +void touch_nmi_watchdog(void) +{ + touch_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +void touch_all_nmi_watchdog(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog(void) +{ + touch_watchdog(); +} + +void touch_all_softlockup_watchdogs(void) +{ + touch_all_watchdog(); +} + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +void softlockup_tick(void) +{ +} + +#ifdef CONFIG_PERF_EVENTS_NMI +/* watchdog detector functions */ +static int is_hardlockup(int cpu) +{ + unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return 1; + + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts, int cpu) +{ + unsigned long now = get_timestamp(cpu); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_PERF_EVENTS_NMI +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu); + + if (touch_ts == 0) { + __touch_watchdog(); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup(this_cpu)) { + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + int this_cpu = smp_processor_id(); + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. 
+ */ + per_cpu(softlockup_touch_sync, this_cpu) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts, this_cpu); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + this_cpu, duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *__bind_cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in softlockup_tick(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_PERF_EVENTS_NMI +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? 
*/ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_PERF_EVENTS_NMI */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void 
__user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +/* stub functions */ +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); +} +/* end of stub functions */ +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 220ae6063b6f..49e285dcaf57 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -153,7 +153,7 @@ config DEBUG_SHIRQ points; some don't and need to be caught. config DETECT_SOFTLOCKUP - bool "Detect Soft Lockups" + bool depends on DEBUG_KERNEL && !S390 default y help @@ -171,17 +171,27 @@ config DETECT_SOFTLOCKUP can be detected via the NMI-watchdog, on platforms that support it.) -config NMI_WATCHDOG - bool "Detect Hard Lockups with an NMI Watchdog" - depends on DEBUG_KERNEL && PERF_EVENTS && PERF_EVENTS_NMI +config LOCKUP_DETECTOR + bool "Detect Hard and Soft Lockups" + depends on DEBUG_KERNEL + default DETECT_SOFTLOCKUP help - Say Y here to enable the kernel to use the NMI as a watchdog - to detect hard lockups. This is useful when a cpu hangs for no - reason but can still respond to NMIs. A backtrace is displayed - for reviewing and reporting. + Say Y here to enable the kernel to act as a watchdog to detect + hard and soft lockups. + + Softlockups are bugs that cause the kernel to loop in kernel + mode for more than 60 seconds, without giving other tasks a + chance to run. The current stack trace is displayed upon + detection and the system will stay locked up. + + Hardlockups are bugs that cause the CPU to loop in kernel mode + for more than 60 seconds, without letting other interrupts have a + chance to run. The current stack trace is displayed upon detection + and the system will stay locked up. - The overhead should be minimal, just an extra NMI every few - seconds. + The overhead should be minimal. A periodic hrtimer runs to + generate interrupts and kick the watchdog task every 10-12 seconds. + An NMI is generated every 60 seconds or so to check for hardlockups. 
config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" -- cgit v1.2.3 From 332fbdbca3f7716c5620970755ae054d213bcc4e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:45 -0400 Subject: lockup_detector: Touch_softlockup cleanups and softlockup_tick removal Just some code cleanup to make touch_softlockup clearer and remove the softlockup_tick function as it is no longer needed. Also remove the /proc softlockup_thres call as it has been changed to watchdog_thres. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-3-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 16 +++------------- kernel/sysctl.c | 9 --------- kernel/timer.c | 1 - kernel/watchdog.c | 35 +++-------------------------------- 4 files changed, 6 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 37efe8fa5306..33f9b2ad0bbb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -312,19 +312,15 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_DETECT_SOFTLOCKUP -extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; #else -static inline void softlockup_tick(void) -{ -} static inline void touch_softlockup_watchdog(void) { } @@ -346,12 +342,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); #endif -#ifdef CONFIG_LOCKUP_DETECTOR -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#endif - /* Attach to any functions which should be ignored in wchan output. 
*/ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f9adda85f97..999bc3fccf47 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -817,15 +817,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { diff --git a/kernel/timer.c b/kernel/timer.c index aeb6a54f2771..e8de5eb07a02 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1225,7 +1225,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6b7fad8497af..f1541b7e3244 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -119,13 +119,12 @@ static void __touch_watchdog(void) __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); } -void touch_watchdog(void) +void touch_softlockup_watchdog(void) { __get_cpu_var(watchdog_touch_ts) = 0; } -EXPORT_SYMBOL(touch_watchdog); -void touch_all_watchdog(void) +void touch_all_softlockup_watchdogs(void) { int cpu; @@ -140,35 +139,16 @@ void touch_all_watchdog(void) void touch_nmi_watchdog(void) { - touch_watchdog(); + touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -void touch_all_nmi_watchdog(void) -{ - touch_all_watchdog(); -} - -void touch_softlockup_watchdog(void) -{ - touch_watchdog(); -} - -void touch_all_softlockup_watchdogs(void) -{ - touch_all_watchdog(); -} - void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; __raw_get_cpu_var(watchdog_touch_ts) = 0; } -void softlockup_tick(void) -{ -} - #ifdef CONFIG_PERF_EVENTS_NMI /* watchdog detector functions */ static int is_hardlockup(int cpu) @@ -522,15 +502,6 @@ int proc_dowatchdog_thresh(struct ctl_table *table, int write, { return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } - -/* stub functions */ -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos); -} -/* end of stub functions */ #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 19cc36c0f0457e5c6629ec24036fbbe8255c88ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 13 May 2010 02:30:49 +0200 Subject: lockup_detector: Fix forgotten config conversion Fix forgotten CONFIG_DETECT_SOFTLOCKUP -> CONFIG_LOCKUP_DETECTOR in sched.h Fixes: arch/x86/built-in.o: In function `touch_nmi_watchdog': (.text+0x1bd59): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `show_state_filter': (.text+0x10d01): undefined reference to `touch_all_softlockup_watchdogs' kernel/built-in.o: In function `sched_clock_idle_wakeup_event': (.text+0x362f9): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `timekeeping_resume': timekeeping.c:(.text+0x38757): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_nohz_handler': tick-sched.c:(.text+0x3e5b9): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_sched_timer': tick-sched.c:(.text+0x3e671): undefined reference to `touch_softlockup_watchdog' kernel/built-in.o: In function `tick_check_idle': (.text+0x3e90b): undefined reference to `touch_softlockup_watchdog' Signed-off-by: 
Frederic Weisbecker Cc: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33f9b2ad0bbb..3958e0cd24f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -311,7 +311,7 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -- cgit v1.2.3 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- include/linux/nmi.h | 2 +- kernel/watchdog.c | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea194..910f20b457c4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/include/linux/nmi.h b/include/linux/nmi.h index abd48aacaf79..06aab5eee134 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void); extern void acpi_nmi_disable(void); extern void acpi_nmi_enable(void); #else -#ifndef CONFIG_LOCKUP_DETECTOR +#ifndef CONFIG_HARDLOCKUP_DETECTOR static inline void touch_nmi_watchdog(void) { touch_softlockup_watchdog(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 83fb63155cbc..e53622c1465e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,13 +31,13 @@ int watchdog_enabled; int __read_mostly softlockup_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, soft_watchdog_warn); #ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static 
DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); @@ -139,6 +139,7 @@ void touch_all_softlockup_watchdogs(void) per_cpu(watchdog_touch_ts, cpu) = 0; } +#ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { __get_cpu_var(watchdog_nmi_touch) = true; @@ -146,6 +147,8 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); +#endif + void touch_softlockup_watchdog_sync(void) { __raw_get_cpu_var(softlockup_touch_sync) = true; -- cgit v1.2.3 From 5360bd776f73d0a7da571d72a09a03f237e99900 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 28 May 2010 23:01:00 -0400 Subject: Fix up the "generic" unistd.h ABI to be more useful. Reserve 16 "architecture-specific" syscall numbers starting at 244. Allow use of the sys_sync_file_range2() API with the generic unistd.h by specifying __ARCH_WANT_SYNC_FILE_RANGE2 before including it. Allow using the generic unistd.h to create the "compat" syscall table by specifying __SYSCALL_COMPAT before including it. Use sys_fadvise64_64 for __NR3264_fadvise64 in both 32- and 64-bit mode. Request the appropriate __ARCH_WANT_COMPAT_SYS_xxx values when some deprecated syscall modes are selected. As part of this change to fix up the syscalls, also provide a couple of missing signal-related syscall prototypes in . Signed-off-by: Chris Metcalf Acked-by: Arnd Bergmann --- include/asm-generic/unistd.h | 26 ++++++++++++++++++++------ include/linux/syscalls.h | 4 ++++ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 6a0b30f78a62..30218b4fa4e0 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -18,7 +18,7 @@ #define __SYSCALL(x, y) #endif -#if __BITS_PER_LONG == 32 +#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT) #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32) #else #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) @@ -241,8 +241,13 @@ __SYSCALL(__NR_sync, sys_sync) __SYSCALL(__NR_fsync, sys_fsync) #define __NR_fdatasync 83 __SYSCALL(__NR_fdatasync, sys_fdatasync) +#ifdef __ARCH_WANT_SYNC_FILE_RANGE2 +#define __NR_sync_file_range2 84 +__SYSCALL(__NR_sync_file_range2, sys_sync_file_range2) +#else #define __NR_sync_file_range 84 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) /* .long sys_sync_file_range2, */ +__SYSCALL(__NR_sync_file_range, sys_sync_file_range) +#endif /* fs/timerfd.c */ #define __NR_timerfd_create 85 @@ -580,7 +585,7 @@ __SYSCALL(__NR_execve, sys_execve) /* .long sys_execve_wrapper */ __SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) /* mm/fadvise.c */ #define __NR3264_fadvise64 223 -__SC_3264(__NR3264_fadvise64, sys_fadvise64_64, sys_fadvise64) +__SYSCALL(__NR3264_fadvise64, sys_fadvise64_64) /* mm/, CONFIG_MMU only */ #ifndef __ARCH_NOMMU @@ -627,8 +632,14 @@ __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +/* + * Architectures may provide up to 16 syscalls of their own + * starting with this value. 
+ */ +#define __NR_arch_specific_syscall 244 + #undef __NR_syscalls -#define __NR_syscalls 244 +#define __NR_syscalls 260 /* * All syscalls below here should go away really, @@ -694,7 +705,8 @@ __SYSCALL(__NR_signalfd, sys_signalfd) #define __NR_syscalls (__NR_signalfd+1) #endif /* __ARCH_WANT_SYSCALL_NO_FLAGS */ -#if __BITS_PER_LONG == 32 && defined(__ARCH_WANT_SYSCALL_OFF_T) +#if (__BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)) && \ + defined(__ARCH_WANT_SYSCALL_OFF_T) #define __NR_sendfile 1046 __SYSCALL(__NR_sendfile, sys_sendfile) #define __NR_ftruncate 1047 @@ -740,6 +752,7 @@ __SYSCALL(__NR_getpgrp, sys_getpgrp) __SYSCALL(__NR_pause, sys_pause) #define __NR_time 1062 #define __ARCH_WANT_SYS_TIME +#define __ARCH_WANT_COMPAT_SYS_TIME __SYSCALL(__NR_time, sys_time) #define __NR_utime 1063 #define __ARCH_WANT_SYS_UTIME @@ -801,7 +814,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) * Here we map the numbers so that both versions * use the same syscall table layout. */ -#if __BITS_PER_LONG == 64 +#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT) #define __NR_fcntl __NR3264_fcntl #define __NR_statfs __NR3264_statfs #define __NR_fstatfs __NR3264_fstatfs @@ -848,6 +861,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) #endif #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND /* * "Conditional" syscalls diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a1a86a53bc73..4a19d9bb8368 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -364,9 +364,13 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len, asmlinkage long sys_delete_module(const char __user *name_user, unsigned int flags); +asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, + struct sigaction __user *oact, + size_t sigsetsize); asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize); asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); +asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct timespec __user *uts, -- cgit v1.2.3 From 139ef32b0e6b88b00b5e3e74d052d938f178dc9b Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 7 Jun 2010 08:48:13 -0400 Subject: Revert adding some arch-specific signal syscalls to . It turns out there is some variance on the calling conventions for these syscalls, and is already the mechanism used to handle this. Switch arch/tile over to using that mechanism and tweak the calling conventions for a couple of tile syscalls to match . 
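Both of these commits revolve around the same X-macro pattern: the including site defines __SYSCALL (plus feature macros such as __ARCH_WANT_SYNC_FILE_RANGE2 or __SYSCALL_COMPAT from the patch above) before pulling in the generic header, which then stamps out a table. A minimal self-contained sketch of that pattern follows; MINI_UNISTD, the stub entry points and the table size are invented for illustration, while the real list lives in include/asm-generic/unistd.h.

#include <stdio.h>

/* Stand-in entry points (hypothetical; real ones live in fs/, mm/, ...). */
static long sys_fsync(void)            { return 0; }
static long sys_sync_file_range(void)  { return 0; }
static long sys_sync_file_range2(void) { return 0; }

/* Slot 84 is ABI-dependent, exactly as in the unistd.h hunk above. */
#ifdef __ARCH_WANT_SYNC_FILE_RANGE2
#define SLOT_84 __SYSCALL(84, sys_sync_file_range2)
#else
#define SLOT_84 __SYSCALL(84, sys_sync_file_range)
#endif

/* Miniature stand-in for asm-generic/unistd.h: one __SYSCALL per entry. */
#define MINI_UNISTD			\
	__SYSCALL(82, sys_fsync)	\
	SLOT_84

/* The including site decides what __SYSCALL expands to; here it becomes
 * a designated initializer filling a function-pointer table. */
typedef long (*sys_call_ptr_t)(void);
#define __SYSCALL(nr, entry) [nr] = entry,
static const sys_call_ptr_t sys_call_table[128] = {
	MINI_UNISTD
};
#undef __SYSCALL

int main(void)
{
	printf("syscall 82 returns %ld\n", sys_call_table[82]());
	printf("slot 84 is %s\n", sys_call_table[84] == sys_sync_file_range2 ?
	       "sync_file_range2" : "sync_file_range");
	return 0;
}

Redefining __SYSCALL before each inclusion is what lets one list generate native and compat tables alike, which is the mechanism the revert above leans on.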
Acked-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- arch/tile/include/asm/syscalls.h | 22 +--------------------- arch/tile/kernel/process.c | 2 +- arch/tile/kernel/signal.c | 4 ++-- arch/tile/kernel/sys.c | 2 +- include/linux/syscalls.h | 4 ---- 5 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h index e1be54d1a7d8..9f2b8e2f69d5 100644 --- a/arch/tile/include/asm/syscalls.h +++ b/arch/tile/include/asm/syscalls.h @@ -22,21 +22,7 @@ #include #include #include - -/* kernel/process.c */ -int sys_fork(struct pt_regs *); -int sys_vfork(struct pt_regs *); -int sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tidptr, int __user *child_tidptr, - struct pt_regs *); -int sys_execve(char __user *path, char __user *__user *argv, - char __user *__user *envp, struct pt_regs *); - -/* kernel/signal.c */ -int sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); -int sys_raise_fpe(int code, unsigned long addr, struct pt_regs*); +#include /* kernel/sys.c */ ssize_t sys32_readahead(int fd, u32 offset_lo, u32 offset_hi, u32 count); @@ -45,12 +31,6 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi, int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi, u32 len_lo, u32 len_hi, int advice); long sys_flush_cache(void); -long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); #ifndef __tilegx__ /* mm/fault.c */ diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 824f230e6d1a..c70ff14a48e4 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -502,7 +502,7 @@ int _sys_fork(struct pt_regs *regs) } int _sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tidptr, int __user *child_tidptr, + void __user *parent_tidptr, void __user *child_tidptr, struct pt_regs *regs) { if (!newsp) diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 7ea85eb85242..45835cfad407 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -43,8 +43,8 @@ /* Caller before callee in this file; other callee is in assembler */ void do_signal(struct pt_regs *regs); -int _sys_sigaltstack(const stack_t __user *uss, - stack_t __user *uoss, struct pt_regs *regs) +long _sys_sigaltstack(const stack_t __user *uss, + stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c index a3d982b212b4..0427978cea0a 100644 --- a/arch/tile/kernel/sys.c +++ b/arch/tile/kernel/sys.c @@ -95,7 +95,7 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, */ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, offset) + unsigned long, fd, off_t, offset) { if (offset & ((1 << PAGE_SHIFT) - 1)) return -EINVAL; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1e3cd5fec7ed..7f614ce274a9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -364,13 +364,9 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len, asmlinkage long sys_delete_module(const char __user *name_user, 
unsigned int flags); -asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize); asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct timespec __user *uts, -- cgit v1.2.3 From 50a323b73069b169385a8ac65633dee837a7d13f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: define and use CPU_PRI_* enums for cpu notifier priorities Instead of hardcoding priority 10 and 20 in sched and perf, collect them into CPU_PRI_* enums. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/cpu.h | 9 +++++++++ include/linux/perf_event.h | 2 +- kernel/sched.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e287863ac053..2d9073883ea9 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,6 +48,15 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +/* + * CPU notifier priorities. + */ +enum { + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, +}; + #ifdef CONFIG_SMP /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d94985..469e03e96fe7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event) { } #define perf_cpu_notifier(fn) \ do { \ static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = 20 }; \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ (void *)(unsigned long)smp_processor_id()); \ fn(&fn##_nb, (unsigned long)CPU_STARTING, \ diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228dd..552faf8d358c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5801,7 +5801,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; static int __init migration_init(void) -- cgit v1.2.3 From 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:36 +0200 Subject: sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining Currently, when a cpu goes down, cpu_active is cleared before CPU_DOWN_PREPARE starts and cpuset configuration is updated from a default priority cpu notifier. When a cpu is coming up, it's set before CPU_ONLINE but cpuset configuration again is updated from the same cpu notifier. For cpu notifiers, this presents an inconsistent state. Threads which a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be migrated to other cpus because the cpu is no more inactive. 
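Concretely, the breakage looks like this (a sketch; the callback and example_worker_fn() are hypothetical, but the pattern is the cmwq one described further down):

#include <linux/cpu.h>
#include <linux/kthread.h>
#include <linux/err.h>

static int example_worker_fn(void *unused);	/* hypothetical */

/* A CPU_DOWN_PREPARE callback that expects the dying cpu to remain
 * schedulable until it actually goes offline.  Before this fix,
 * cpu_active was already cleared at this point, so the freshly bound
 * kthread could be migrated off @cpu before ever running there.
 */
static int example_down_prepare(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	struct task_struct *t;

	if ((action & ~CPU_TASKS_FROZEN) != CPU_DOWN_PREPARE)
		return NOTIFY_DONE;

	t = kthread_create(example_worker_fn, NULL, "example/%d", cpu);
	if (IS_ERR(t))
		return notifier_from_errno(PTR_ERR(t));
	kthread_bind(t, cpu);	/* only sticks while cpu_active is set */
	wake_up_process(t);
	return NOTIFY_OK;
}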
Fix it by updating cpu_active in the highest priority cpu notifier and cpuset configuration in the second highest when a cpu is coming up. Down path is updated similarly. This guarantees that all other cpu notifiers see consistent cpu_active and cpuset configuration. cpuset_track_online_cpus() notifier is converted to cpuset_update_active_cpus() which just updates the configuration and now called from cpuset_cpu_[in]active() notifiers registered from sched_init_smp(). If cpuset is disabled, cpuset_update_active_cpus() degenerates into partition_sched_domains() making separate notifier for !CONFIG_CPUSETS unnecessary. This problem is triggered by cmwq. During CPU_DOWN_PREPARE, hotplug callback creates a kthread and kthread_bind()s it to the target cpu, and the thread is expected to run on that cpu. * Ingo's test discovered __cpuinit/exit markups were incorrect. Fixed. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Rusty Russell Cc: Ingo Molnar Cc: Paul Menage --- include/linux/cpu.h | 16 ++++++++++++ include/linux/cpuset.h | 6 +++++ kernel/cpu.c | 6 ----- kernel/cpuset.c | 21 ++-------------- kernel/sched.c | 67 +++++++++++++++++++++++++++++++++++++------------- 5 files changed, 74 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2d9073883ea9..de6b1722cdca 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -52,6 +52,22 @@ struct notifier_block; * CPU notifier priorities. */ enum { + /* + * SCHED_ACTIVE marks a cpu which is coming up active during + * CPU_ONLINE and CPU_DOWN_FAILED and must be the first + * notifier. CPUSET_ACTIVE adjusts cpuset according to + * cpu_active mask right after SCHED_ACTIVE. During + * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are + * ordered in the similar way. + * + * This ordering guarantees consistent cpu_active mask and + * migration behavior to all cpu notifiers. + */ + CPU_PRI_SCHED_ACTIVE = INT_MAX, + CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, + CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, + CPU_PRI_CPUSET_INACTIVE = INT_MIN, + /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 457ed765a116..f20eb8f16025 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? 
*/ extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_update_active_cpus(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadde..05727dcaa80d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). 
*/ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void __cpuexit cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/sched.c b/kernel/sched.c index 552faf8d358c..2b942e49d0fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5804,17 +5804,46 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -7273,29 +7302,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). 
*/ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7341,10 +7376,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); -- cgit v1.2.3 From 21aa9af03d06cb1d19a3738e5cf12acff984e69b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Jun 2010 21:40:37 +0200 Subject: sched: add hooks for workqueue Concurrency managed workqueue needs to know when workers are going to sleep and waking up. Using these two hooks, cmwq keeps track of the current concurrency level and throttles execution of new works if it's too high and wakes up another worker from the sleep hook if it becomes too low. This patch introduces PF_WQ_WORKER to identify workqueue workers and adds the following two hooks. * wq_worker_waking_up(): called when a worker is woken up. * wq_worker_sleeping(): called when a worker is going to sleep and may return a pointer to a local task which should be woken up. The returned task is woken up using try_to_wake_up_local() which is simplified ttwu which is called under rq lock and can only wake up local tasks. Both hooks are currently defined as noop in kernel/workqueue_sched.h. Later cmwq implementation will replace them with proper implementation. These hooks are hard coded as they'll always be enabled. 
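To make the hook contract concrete, a minimal sketch of what a non-noop wq_worker_sleeping() might look like; the per-cpu counter and the idle-worker lookup are hypothetical stand-ins (the real cmwq implementation arrived in later patches):

#include <linux/sched.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(atomic_t, example_nr_running);			/* hypothetical */
static struct task_struct *example_first_idle_worker(unsigned int cpu);	/* hypothetical */

/*
 * Called from schedule() with this_rq() locked, so whatever is
 * returned must be bound to the same cpu: it is woken with
 * try_to_wake_up_local() without dropping the rq lock.
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task,
				       unsigned int cpu)
{
	if (!(task->flags & PF_WQ_WORKER))
		return NULL;

	/* Last running worker on this cpu is about to sleep: hand back
	 * an idle worker (may be NULL) so the cpu keeps executing the
	 * pending works.
	 */
	if (atomic_dec_and_test(&per_cpu(example_nr_running, cpu)))
		return example_first_idle_worker(cpu);

	return NULL;
}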
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Ingo Molnar --- include/linux/sched.h | 1 + kernel/fork.c | 2 +- kernel/sched.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/workqueue_sched.h | 16 +++++++++++++++ 4 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 kernel/workqueue_sched.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953f..edc3dd168d87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1696,6 +1696,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..a82a65cef741 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/sched.c b/kernel/sched.c index 96eafd5f345f..edd5a54b95da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include @@ -2306,6 +2307,9 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); } /** @@ -2413,6 +2417,37 @@ out: return success; } +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. @@ -3618,10 +3653,24 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. 
+ */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..af040babb742 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,16 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +static inline void wq_worker_waking_up(struct task_struct *task, + unsigned int cpu) +{ +} + +static inline struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + return NULL; +} -- cgit v1.2.3 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. It means frame pointers are not necessary anymore to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implentation. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++ arch/powerpc/kernel/misc.S | 26 -------------------------- arch/sparc/include/asm/perf_event.h | 8 ++++++++ arch/sparc/kernel/helpers.S | 6 +++--- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ arch/x86/include/asm/stacktrace.h | 7 ++----- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- include/linux/perf_event.h | 32 +++++++------------------------- include/trace/ftrace.h | 2 +- kernel/perf_event.c | 5 ----- kernel/trace/trace_event_perf.c | 2 -- 11 files changed, 46 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index e6d4ce69b126..5c16b891d501 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -21,3 +21,15 @@ #ifdef CONFIG_FSL_EMB_PERF_EVENT #include #endif + +#ifdef CONFIG_PERF_EVENTS +#include +#include + +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) +#endif diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 22e507c8a556..2d29752cbe16 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7) _GLOBAL(__restore_cpu_power7) /* place holder */ blr - -/* - * Get a minimal set of registers for our caller's nth caller. - * r3 = regs pointer, r5 = n. - * - * We only get R1 (stack pointer), NIP (next instruction pointer) - * and LR (link register). These are all we can get in the - * general case without doing complicated stack unwinding, but - * fortunately they are enough to do a stack backtrace, which - * is all we need them for. 
- */ -_GLOBAL(perf_arch_fetch_caller_regs) - mr r6,r1 - cmpwi r5,0 - mflr r4 - ble 2f - mtctr r5 -1: PPC_LL r6,0(r6) - bdnz 1b - PPC_LL r4,PPC_LR_STKOFF(r6) -2: PPC_LL r7,0(r6) - PPC_LL r7,PPC_LR_STKOFF(r7) - PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3) - PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3) - blr diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h index 7e2669894ce8..74c4e0cd889c 100644 --- a/arch/sparc/include/asm/perf_event.h +++ b/arch/sparc/include/asm/perf_event.h @@ -6,7 +6,15 @@ extern void set_perf_event_pending(void); #define PERF_EVENT_INDEX_OFFSET 0 #ifdef CONFIG_PERF_EVENTS +#include + extern void init_hw_perf_events(void); + +extern void +__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); + +#define perf_arch_fetch_caller_regs(pt_regs, ip) \ + __perf_arch_fetch_caller_regs(pt_regs, ip, 1); #else static inline void init_hw_perf_events(void) { } #endif diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S index 92090cc9e829..682fee06a16b 100644 --- a/arch/sparc/kernel/helpers.S +++ b/arch/sparc/kernel/helpers.S @@ -47,9 +47,9 @@ stack_trace_flush: .size stack_trace_flush,.-stack_trace_flush #ifdef CONFIG_PERF_EVENTS - .globl perf_arch_fetch_caller_regs - .type perf_arch_fetch_caller_regs,#function -perf_arch_fetch_caller_regs: + .globl __perf_arch_fetch_caller_regs + .type __perf_arch_fetch_caller_regs,#function +__perf_arch_fetch_caller_regs: /* We always read the %pstate into %o5 since we will use * that to construct a fake %tstate to store into the regs. */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 254883d0c7e0..02de29830ffe 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. 
+ */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ +} + #else static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a957463d3c7a..2b16a2ad23dc 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,17 +78,14 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long rewind_frame_pointer(int n) +static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; get_bp(frame); #ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } + frame = frame->next_frame; #endif return (unsigned long)frame; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f9..2c075fe573d0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb6c91eac7e3..bea785cef493 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -905,8 +905,10 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); -extern void -perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); +#ifndef perf_arch_fetch_caller_regs +static inline void +perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { } +#endif /* * Take a snapshot of the regs. 
Skip ip and frame pointer to @@ -916,31 +918,11 @@ perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip); * - bp for callchains * - eflags, for future purposes, just in case */ -static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip) +static inline void perf_fetch_caller_regs(struct pt_regs *regs) { - unsigned long ip; - memset(regs, 0, sizeof(*regs)); - switch (skip) { - case 1 : - ip = CALLER_ADDR0; - break; - case 2 : - ip = CALLER_ADDR1; - break; - case 3 : - ip = CALLER_ADDR2; - break; - case 4: - ip = CALLER_ADDR3; - break; - /* No need to support further for now */ - default: - ip = 0; - } - - return perf_arch_fetch_caller_regs(regs, ip, skip); + perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static inline void @@ -950,7 +932,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) struct pt_regs hot_regs; if (!regs) { - perf_fetch_caller_regs(&hot_regs, 1); + perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; } __perf_sw_event(event_id, nr, nmi, regs, addr); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3d685d1f2a03..8ee8b6e6b25e 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -705,7 +705,7 @@ perf_trace_##call(void *__data, proto) \ int __data_size; \ int rctx; \ \ - perf_fetch_caller_regs(&__regs, 1); \ + perf_fetch_caller_regs(&__regs); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e099650cd249..9ae4dbcdf469 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2851,11 +2851,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - /* * We assume there is only KVM supporting the callbacks. diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365016e4..21db1d3a48d0 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,8 +9,6 @@ #include #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - static char *perf_trace_buf[4]; /* -- cgit v1.2.3 From c676329abb2b8359d9a5d734dec0c81779823fd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 10:48:51 +0200 Subject: sched_clock: Add local_clock() API and improve documentation For people who otherwise get to write: cpu_clock(smp_processor_id()), there is now: local_clock(). Also, as per suggestion from Andrew, provide some documentation on the various clock interfaces, and minimize the unsigned long long vs u64 mess. 
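The caller-side effect of the new API, as a sketch (example_measure() is illustrative):

#include <linux/sched.h>

static void example_measure(void)
{
	u64 t0, t1;

	/* before: every caller open-coded the current-cpu lookup */
	t0 = cpu_clock(raw_smp_processor_id());

	/* after: same per-cpu nanosecond clock, usable from any
	 * context including NMI, with less boilerplate */
	t1 = local_clock();

	(void)t0; (void)t1;
}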
Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Jens Axboe LKML-Reference: <1275052414.1645.52.camel@laptop> Signed-off-by: Ingo Molnar --- arch/parisc/kernel/ftrace.c | 4 +- include/linux/sched.h | 37 ++++++++++-------- kernel/lockdep.c | 2 +- kernel/perf_event.c | 2 +- kernel/rcutorture.c | 3 +- kernel/sched.c | 2 +- kernel/sched_clock.c | 95 ++++++++++++++++++++++++++++++++++++++++----- kernel/trace/trace_clock.c | 2 +- 8 files changed, 113 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 9877372ffdba..5beb97bafbb1 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0, unsigned long ret; pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = local_clock(); ftrace_graph_return(&trace); if (unlikely(!ret)) { @@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = local_clock(); if (push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/sched.h b/include/linux/sched.h index edc3dd168d87..c2d4316a04bb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1791,20 +1791,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) #endif /* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: + * Do not use outside of architecture code which knows its limitations. + * + * sched_clock() has no promise of monotonicity or bounded drift between + * CPUs, use (which you should not) requires disabling IRQs. + * + * Please use one of the three interfaces below. 
*/ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -extern int sched_clock_stable; -#endif - -/* ftrace calls sched_clock() directly */ extern unsigned long long notrace sched_clock(void); +/* + * See the comment in kernel/sched_clock.c + */ +extern u64 cpu_clock(int cpu); +extern u64 local_clock(void); +extern u64 sched_clock_cpu(int cpu); + extern void sched_clock_init(void); -extern u64 sched_clock_cpu(int cpu); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline void sched_clock_tick(void) @@ -1819,17 +1822,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } #else +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +extern int sched_clock_stable; + extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #endif -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -extern unsigned long long cpu_clock(int cpu); - extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..f2852a510232 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 31d6afe92594..109c5ec88933 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..2e2726d790b9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/sched.c b/kernel/sched.c index 8f351c56567f..3abd8f780dae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1647,7 +1647,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar * Guillaume Chazarain * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. 
+ * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include #include @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). 
+ */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..1723e2b8c589 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -56,7 +56,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } -- cgit v1.2.3 From 83cd4fe27ad8446619b2e030b171b858501de87d Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 21 May 2010 17:09:41 -0700 Subject: sched: Change nohz idle load balancing logic to push model In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of idle load balancer (performing the load balancing on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kickee CPU does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance. This addresses the below two problems with the current nohz ilb logic: * the idle load balancer continued to have periodic ticks during idle and wokeup frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs. * On x86 and CPUs that have APIC timer stoppage on idle CPUs, this periodic wakeup can result in a periodic additional interrupt on a CPU doing the timer broadcast. Also currently we are migrating the unpinned timers from an idle to the cpu doing idle load balancing (when all the cpus in the system are idle, there is no idle load balancing cpu and timers get added to the same idle cpu where the request was made. So the existing optimization works only on semi idle system). And In semi idle system, we no longer have periodic ticks on the idle load balancer CPU. Using that cpu will add more delays to the timers than intended (as that cpu's timer base may not be uptodate wrt jiffies etc). This was causing mysterious slowdowns during boot etc. For now, in the semi idle case, use the nearest busy cpu for migrating timers from an idle cpu. This is good for power-savings anyway. 
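The resulting control flow, condensed as a sketch; nohz_kick_needed(), nohz_balancer_kick(), on_null_domain() and nohz_idle_balance() are from the patch below, while the wrapper itself is illustrative (as if written inside sched_fair.c):

/*
 * Busy cpu, from its scheduler tick:
 *
 *   trigger_load_balance(rq, cpu)
 *     -> nohz_kick_needed(rq, cpu)    is there work for the idle cpus?
 *     -> nohz_balancer_kick(cpu)      IPI one idle cpu (the "kickee")
 *
 * Kicked idle cpu, from SCHED_SOFTIRQ:
 *
 *   run_rebalance_domains()
 *     -> nohz_idle_balance(this_cpu, idle)   rebalance on behalf of
 *                                            every cpu in
 *                                            nohz.idle_cpus_mask
 */
static inline void example_tick_kick(struct rq *rq, int cpu)
{
	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
		nohz_balancer_kick(cpu);
}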
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +- kernel/hrtimer.c | 8 +- kernel/sched.c | 34 ++++- kernel/sched_fair.c | 329 ++++++++++++++++++++++++++++------------------- kernel/time/tick-sched.c | 8 +- kernel/timer.c | 8 +- 6 files changed, 237 insertions(+), 159 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c2d4316a04bb..a3e5b1cd0438 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) -extern int select_nohz_load_balancer(int cpu); -extern int get_nohz_load_balancer(void); +extern void select_nohz_load_balancer(int stop_tick); +extern int get_nohz_timer_target(void); extern int nohz_ratelimit(int cpu); #else -static inline int select_nohz_load_balancer(int cpu) -{ - return 0; -} +static inline void select_nohz_load_balancer(int stop_tick) { } static inline int nohz_ratelimit(int cpu) { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..e934339fbbef 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } diff --git a/kernel/sched.c b/kernel/sched.c index a757f6b11cbd..132950b33dde 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -460,7 +460,7 @@ struct rq { unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu) } #ifdef CONFIG_NO_HZ +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
+ */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} /* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event @@ -7791,6 +7812,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7835,8 +7860,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 22b8b4f2b616..6ee2e0af665b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3091,13 +3091,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. 
Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! 
*/ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3383,11 +3428,101 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). 
+ * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); - - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. 
- */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..5f171f04ab00 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e18..48d6aec0789c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); -- cgit v1.2.3 From 9d5efe05eb0c904545a28b19c18b949f23334de0 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 8 Jun 2010 14:57:02 +1000 Subject: sched: Fix capacity calculations for SMT4 Handle cpu capacity being reported as 0 on cores with more number of hardware threads. For example on a Power7 core with 4 hardware threads, core power is 1177 and thus power of each hardware thread is 1177/4 = 294. This low power can lead to capacity for each hardware thread being calculated as 0, which leads to tasks bouncing within the core madly! Fix this by reporting capacity for hardware threads as 1, provided their power is not scaled down significantly because of frequency scaling or real-time tasks usage of cpu. 
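Working through the POWER7 numbers from the changelog (SCHED_LOAD_SCALE is 1024):

/*
 * core cpu_power        = 1177
 * per-thread cpu_power  = 1177 / 4 = 294
 * group_capacity        = DIV_ROUND_CLOSEST(294, 1024)
 *                       = (294 + 1024/2) / 1024 = 806 / 1024 = 0
 *
 * A capacity of 0 tells the load balancer the thread can run nothing,
 * hence the bouncing.  fix_small_capacity() below reports 1 instead,
 * but only for sibling (SMT) groups that still retain ~90% of their
 * original power (cpu_power * 32 > cpu_power_orig * 29), i.e. whose
 * low capacity is purely an artifact of dividing up the core power.
 */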
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Michael Neuling Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven LKML-Reference: <20100608045702.21D03CC895@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched_fair.c | 53 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a3e5b1cd0438..c731296e5e93 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -857,7 +857,7 @@ struct sched_group { * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU. */ - unsigned int cpu_power; + unsigned int cpu_power, cpu_power_orig; /* * The CPUs this group covers. diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6ee2e0af665b..b9b3462483b7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2285,13 +2285,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2301,6 +2294,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2333,6 +2335,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 < group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2426,6 +2453,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); } /** @@ -2724,8 +2753,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. 
 */
static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -2736,6 +2766,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 		unsigned long wl;
 
+		if (!capacity)
+			capacity = fix_small_capacity(sd, group);
+
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
@@ -2852,7 +2885,7 @@ redo:
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance, cpus);
+	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
-- cgit v1.2.3 

From 532cb4c401e225b084c14d6bd6a2f8ee561de2f1 Mon Sep 17 00:00:00 2001
From: Michael Neuling
Date: Tue, 8 Jun 2010 14:57:02 +1000
Subject: sched: Add asymmetric group packing option for sibling domain

Check to see if the group is packed in a sched domain. This is
primarily intended to be used at the sibling level. Some cores like
POWER7 prefer to use lower numbered SMT threads. In the case of POWER7,
it can move to lower SMT modes only when higher threads are idle. When
in lower SMT modes, the threads will perform better since they share
fewer core resources. Hence when we have idle threads, we want them to
be the higher ones.

This adds a hook into f_b_g() called check_asym_packing() to check the
packing. This packing function is run on idle threads. It checks to
see if the busiest CPU in this domain (core in the P7 case) has a
higher CPU number than the CPU the packing function is being run on.
If it is, we calculate the imbalance and return the busier,
higher-numbered thread as the busiest group to f_b_g(). Here we are
assuming a lower CPU number will be equivalent to a lower SMT thread
number.

It also creates a new SD_ASYM_PACKING flag to enable this feature at
any scheduler domain level. It also creates an arch hook to enable
this feature at the sibling level. The default function doesn't enable
this feature.

Based heavily on patch from Peter Zijlstra. Fixes from Srivatsa
Vaddagiri.

Signed-off-by: Michael Neuling
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Peter Zijlstra
Cc: Arjan van de Ven
Cc: "H.
Peter Anvin" Cc: Thomas Gleixner LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +- include/linux/topology.h | 1 + kernel/sched_fair.c | 139 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 126 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c731296e5e93..ff154e10752b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -801,7 +801,7 @@ enum cpu_idle_type { #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ - +#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { @@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void) return SD_PREFER_SIBLING; } +extern int __weak arch_sd_sibiling_asym_packing(void); + /* * Optimise SD flags for power savings: * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings. diff --git a/include/linux/topology.h b/include/linux/topology.h index c44df50a05ab..cf57f30d0dcb 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -103,6 +103,7 @@ int arch_update_cpu_topology(void); | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ + | arch_sd_sibiling_asym_packing() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b9b3462483b7..593424f91a8a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2457,12 +2457,54 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = fix_small_capacity(sd, group); } +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sds: sched_group statistics + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; +} + /** * update_sd_lb_stats - Update sched_group's statistics for load balancing. * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. 
@@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
-	struct sched_group *group = sd->groups;
+	struct sched_group *sg = sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
@@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
 				   local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->cpu_power;
+		sds->total_pwr += sg->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the group capacity to one so that we'll try
+		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
@@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-			    sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibiling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched domain.
+ *
+ * This is primarily intended to be used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share fewer core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the CPU the packing function is being run on. Here
+ * we are assuming a lower CPU number will be equivalent to a lower SMT
+ * thread number.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
+}
 
 /**
@@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
@@ -2929,7 +3033,8 @@ redo:
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
-- cgit v1.2.3 

From ecc55f84b2e9741f29daa787ded93986df6cbe17 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 21 May 2010 15:11:34 +0200
Subject: perf, trace: Inline perf_swevent_put_recursion_context()

Inline perf_swevent_put_recursion_context() into perf_tp_event(); this
shrinks the per-trace-template code footprint and saves a function call.
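For orientation, the calling pattern being changed looks roughly like
this (a sketch, not a complete file; the record-building step is elided
and raw_data, size, addr, count, regs and head stand for the usual
tracepoint sample state):

	int rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;		/* already nested inside a software event */

	/* ... build the raw trace record ... */

	/* Before this patch every trace template ended with two calls: */
	perf_tp_event(addr, count, raw_data, size, regs, head);
	perf_swevent_put_recursion_context(rctx);

	/* After it, rctx travels with the call and the put happens
	 * inside perf_tp_event() itself: */
	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);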
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/ftrace_event.h | 3 +--
 include/linux/perf_event.h   | 2 +-
 kernel/perf_event.c          | 8 ++++----
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3167f2df4126..0af31cd335d6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -257,8 +257,7 @@ static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 		       u64 count, struct pt_regs *regs, void *head)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head);
-	perf_swevent_put_recursion_context(rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
 }
 #endif
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5d0266d94985..c691a0b27bcd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1001,7 +1001,7 @@ static inline bool perf_paranoid_kernel(void)
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			   int entry_size, struct pt_regs *regs,
-			   struct hlist_head *head);
+			   struct hlist_head *head, int rctx);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..4bd3b597bcca 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4213,14 +4213,12 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void perf_swevent_put_recursion_context(int rctx)
+void inline perf_swevent_put_recursion_context(int rctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
 	cpuctx->recursion[rctx]--;
 }
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
-
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)
@@ -4601,7 +4599,7 @@ static int perf_tp_event_match(struct perf_event *event,
 }
 
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head)
+		   struct pt_regs *regs, struct hlist_head *head, int rctx)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -4621,6 +4619,8 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 			perf_swevent_add(event, count, 1, &data, regs);
 	}
 	rcu_read_unlock();
+
+	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
-- cgit v1.2.3 

From 3af9e859281bda7eb7c20b51879cf43aa788ac2e Mon Sep 17 00:00:00 2001
From: Eric B Munson
Date: Tue, 18 May 2010 15:30:49 +0100
Subject: perf: Add non-exec mmap() tracking

Add the capability to track data mmap()s. This can be used together
with PERF_SAMPLE_ADDR for data profiling.
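For context, a user-space profiler would opt into these new records
roughly as follows (a hedged sketch: only the perf_event_open() syscall
and the attribute bits shown in this patch are used; error handling and
the ring-buffer read loop are elided):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* Thin wrapper; glibc provides no perf_event_open() symbol. */
	static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
				    int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR;
		attr.mmap = 1;		/* executable mappings, as before */
		attr.mmap_data = 1;	/* new: non-exec (data) mmap()s too */

		fd = perf_event_open(&attr, 0 /* self */, -1, -1, 0);
		/* ... mmap() fd and consume PERF_RECORD_MMAP/SAMPLE records ... */
		return fd < 0;
	}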
Signed-off-by: Anton Blanchard [Updated code for stable perf ABI] Signed-off-by: Eric B Munson Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274193049-25997-1-git-send-email-ebmunson@us.ibm.com> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 12 +++--------- kernel/perf_event.c | 34 +++++++++++++++++++++++----------- mm/mmap.c | 6 +++++- tools/perf/builtin-record.c | 4 +++- 5 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index e19de6a80339..97d91a03fb13 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -653,6 +653,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c691a0b27bcd..36efad90cd43 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -214,8 +214,9 @@ struct perf_event_attr { * See also PERF_RECORD_MISC_EXACT_IP */ precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ - __reserved_1 : 47; + __reserved_1 : 46; union { __u32 wakeup_events; /* wakeup every n events */ @@ -962,14 +963,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) } } -extern void __perf_event_mmap(struct vm_area_struct *vma); - -static inline void perf_event_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_event_mmap(vma); -} - +extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b39bec346e80..227ed9c8ec34 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1891,7 +1891,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void 
perf_event_mmap_ctx(struct perf_event_context *ctx,
-		    struct perf_mmap_event *mmap_event)
+		    struct perf_mmap_event *mmap_event,
+		    int executable)
 {
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_event_mmap_match(event, mmap_event))
+		if (perf_event_mmap_match(event, mmap_event, executable))
 			perf_event_mmap_output(event, mmap_event);
 	}
 }
@@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		if (!vma->vm_mm) {
 			name = strncpy(tmp, "[vdso]", sizeof(tmp));
 			goto got_name;
+		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+				vma->vm_end >= vma->vm_mm->brk) {
+			name = strncpy(tmp, "[heap]", sizeof(tmp));
+			goto got_name;
+		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+				vma->vm_end >= vma->vm_mm->start_stack) {
+			name = strncpy(tmp, "[stack]", sizeof(tmp));
+			goto got_name;
 		}
 
 		name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3858,17 @@ got_name:
 
 	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
-		perf_event_mmap_ctx(ctx, mmap_event);
+		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	put_cpu_var(perf_cpu_context);
 	rcu_read_unlock();
 
 	kfree(buf);
 }
 
-void __perf_event_mmap(struct vm_area_struct *vma)
+void perf_event_mmap(struct vm_area_struct *vma)
 {
 	struct perf_mmap_event mmap_event;
 
@@ -4911,7 +4923,7 @@ done:
 	if (!event->parent) {
 		atomic_inc(&nr_events);
-		if (event->attr.mmap)
+		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
 			atomic_inc(&nr_comm_events);
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f27889..e38e910cb756 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1734,8 +1734,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		grow = (address - vma->vm_end) >> PAGE_SHIFT;
 
 		error = acct_stack_growth(vma, size, grow);
-		if (!error)
+		if (!error) {
 			vma->vm_end = address;
+			perf_event_mmap(vma);
+		}
 	}
 	anon_vma_unlock(vma);
 	return error;
@@ -1781,6 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma,
 		if (!error) {
 			vma->vm_start = address;
 			vma->vm_pgoff -= grow;
+			perf_event_mmap(vma);
 		}
 	}
 	anon_vma_unlock(vma);
@@ -2208,6 +2211,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
+	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
 		if (!mlock_vma_pages_range(vma, addr, addr + len))
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 5e5c6403a315..39c7247bc54a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -268,8 +268,10 @@ static void create_counter(int counter, int cpu)
 	if (inherit_stat)
 		attr->inherit_stat = 1;
 
-	if (sample_address)
+	if (sample_address) {
 		attr->sample_type	|= PERF_SAMPLE_ADDR;
+		attr->mmap_data = track;
+	}
 
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
-- cgit v1.2.3 

From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 25 May 2010 17:49:05 +0200
Subject: perf: Cleanup {start,commit,cancel}_txn details

Clarify some of the transactional group scheduling API details and
change it so that a successful ->commit_txn also closes the
transaction.
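The resulting protocol can be sketched as follows (pseudo-code modelled
on the group_sched_in() hunk below; enable_group() is an illustrative
stand-in for the loop that ->enable()s each group member):

	const struct pmu *pmu = group_event->pmu;

	pmu->start_txn(pmu);	/* ->enable() may now skip schedulability tests */

	if (enable_group(group_event, ctx)) {
		pmu->cancel_txn(pmu);	/* undo each successful ->enable() */
		return -EAGAIN;
	}

	if (!pmu->commit_txn(pmu))	/* one schedulability test for the group */
		return 0;		/* success now also closes the transaction */

	pmu->cancel_txn(pmu);		/* commit failed: transaction stayed open */
	return -EAGAIN;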
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_event.c | 7 ++++--- arch/sparc/kernel/perf_event.c | 7 ++++--- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- include/linux/perf_event.h | 27 ++++++++++++++++++++++----- kernel/perf_event.c | 9 +-------- 5 files changed, 36 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 43b83c35cf54..ac2a8c2554d9 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuhw->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuhw->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 0ec92c8861dd..beeb92fa3acd 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag |= PERF_EVENT_TXN_STARTED; + cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); - cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuhw->group_flag &= ~PERF_EVENT_TXN; } /* @@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a18..af04c6fa59cb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event 
*event)
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
 	x86_pmu_stop(event);
@@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
 
@@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	/*
 	 * Truncate the collected events.
 	 */
@@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-	/*
-	 * Clear out the txn count so that ->cancel_txn() which gets
-	 * run after ->commit_txn() doesn't undo things.
-	 */
-	cpuc->n_txn = 0;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 
 	return 0;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36efad90cd43..f1b6ba0770e0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -549,7 +549,10 @@ struct hw_perf_event {
 
 struct perf_event;
 
-#define PERF_EVENT_TXN_STARTED 1
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -563,14 +566,28 @@ struct pmu {
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * group events scheduling is treated as a transaction,
-	 * add group events as a whole and perform one schedulability test.
-	 * If test fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add group
+	 * events as a whole and perform one schedulability test. If the test
+	 * fails, roll back the whole group
 	 */
 
+	/*
+	 * Start the transaction, after this ->enable() doesn't need
+	 * to do schedulability tests.
+	 */
 	void (*start_txn)	(const struct pmu *pmu);
-	void (*cancel_txn)	(const struct pmu *pmu);
+	/*
+	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * then ->commit_txn() is required to perform one. On success
+	 * the transaction is closed. On error the transaction is kept
+	 * open until ->cancel_txn() is called.
+	 */
 	int  (*commit_txn)	(const struct pmu *pmu);
+	/*
+	 * Will cancel the transaction, assumes ->disable() is called for
+	 * each successful ->enable() during the transaction.
+	 */
+	void (*cancel_txn)	(const struct pmu *pmu);
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 227ed9c8ec34..6f60920772b3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
 	struct perf_event *event, *partial_group = NULL;
 	const struct pmu *pmu = group_event->pmu;
 	bool txn = false;
-	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn)
+	if (!txn || !pmu->commit_txn(pmu))
 		return 0;
 
-	ret = pmu->commit_txn(pmu);
-	if (!ret) {
-		pmu->cancel_txn(pmu);
-		return 0;
-	}
-
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
-- cgit v1.2.3 

From ca5135e6b4a3cbc7e187737520fbc4b508f6f7a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 28 May 2010 19:33:23 +0200
Subject: perf: Rename perf_mmap_data to perf_buffer

Rename to clarify code.
s/perf_mmap_data/perf_buffer/g and selective s/data/buffer/g Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +- kernel/perf_event.c | 308 ++++++++++++++++++++++----------------------- 2 files changed, 157 insertions(+), 157 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f1b6ba0770e0..2a0da021c23f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,7 +602,7 @@ enum perf_event_active_state { struct file; -struct perf_mmap_data { +struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC @@ -727,7 +727,7 @@ struct perf_event { atomic_t mmap_count; int mmap_locked; struct user_struct *mmap_user; - struct perf_mmap_data *data; + struct perf_buffer *buffer; /* poll related */ wait_queue_head_t waitq; @@ -825,7 +825,7 @@ struct perf_cpu_context { struct perf_output_handle { struct perf_event *event; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long wakeup; unsigned long size; void *addr; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6f60920772b3..93d545801e43 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1876,7 +1876,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1892,9 +1892,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -2119,13 +2119,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2335,14 +2335,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2376,15 +2376,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) 
@@ -2400,42 +2400,42 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(event->cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2449,17 +2449,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2472,18 +2472,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. 
*/ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2493,57 +2493,57 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(struct perf_event *event, int nr_pages) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2551,15 +2551,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2569,14 +2569,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2592,51 +2592,51 @@ 
unlock: } static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) { - long max_size = perf_data_size(data); + long max_size = perf_data_size(buffer); if (event->attr.watermark) { - data->watermark = min_t(long, max_size, + buffer->watermark = min_t(long, max_size, event->attr.wakeup_watermark); } - if (!data->watermark) - data->watermark = max_size / 2; + if (!buffer->watermark) + buffer->watermark = max_size / 2; - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); + atomic_set(&buffer->refcount, 1); + rcu_assign_pointer(event->buffer, buffer); } -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2651,16 +2651,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2678,7 +2678,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2699,7 +2699,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. 
*/ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2713,9 +2713,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2745,17 +2745,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(event, nr_pages); + if (!buffer) { ret = -ENOMEM; goto unlock; } - perf_mmap_data_init(event, data); + perf_buffer_init(event, buffer); if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + event->buffer->writable = 1; atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2964,15 +2964,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2985,7 +2985,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -3005,45 +3005,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. 
*/ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); out: @@ -3063,12 +3063,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3077,7 +3077,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3093,19 +3093,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3117,30 +3117,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. 
*/ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3148,7 +3148,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3159,15 +3159,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -5010,7 +5010,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5040,19 +5040,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } -- cgit v1.2.3 From d57e34fdd60be7ffd0b1d86bfa1a553df86b7172 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 May 2010 19:41:35 +0200 Subject: perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do everything needed Currently there are perf_buffer_alloc() + perf_buffer_init() + some separate bits, fold it all into a single 
perf_buffer_alloc() and only leave the attachment to the event separate. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/perf_event.c | 61 ++++++++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a0da021c23f..441992a9775c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -602,6 +602,8 @@ enum perf_event_active_state { struct file; +#define PERF_BUFFER_WRITABLE 0x01 + struct perf_buffer { atomic_t refcount; struct rcu_head rcu_head; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 93d545801e43..f75c9c9c8177 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2369,6 +2369,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2401,7 +2420,7 @@ static void *perf_mmap_alloc_page(int cpu) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2414,18 +2433,20 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) if (!buffer) goto fail; - buffer->user_page = perf_mmap_alloc_page(event->cpu); + buffer->user_page = perf_mmap_alloc_page(cpu); if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu); + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); if (!buffer->data_pages[i]) goto fail_data_pages; } buffer->nr_pages = nr_pages; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_data_pages: @@ -2516,7 +2537,7 @@ static void perf_buffer_free(struct perf_buffer *buffer) } static struct perf_buffer * -perf_buffer_alloc(struct perf_event *event, int nr_pages) +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *buffer; unsigned long size; @@ -2540,6 +2561,8 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages) buffer->page_order = ilog2(nr_pages); buffer->nr_pages = 1; + perf_buffer_init(buffer, watermark, flags); + return buffer; fail_all_buf: @@ -2591,23 +2614,6 @@ unlock: return ret; } -static void -perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer) -{ - long max_size = perf_data_size(buffer); - - if (event->attr.watermark) { - buffer->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - atomic_set(&buffer->refcount, 1); - rcu_assign_pointer(event->buffer, buffer); -} - static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { struct perf_buffer *buffer; @@ -2682,7 +2688,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. 
This would
@@ -2747,15 +2753,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON(event->buffer);
 
-	buffer = perf_buffer_alloc(event, nr_pages);
+	if (vma->vm_flags & VM_WRITE)
+		flags |= PERF_BUFFER_WRITABLE;
+
+	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
+				   event->cpu, flags);
 	if (!buffer) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
-
-	perf_buffer_init(event, buffer);
-	if (vma->vm_flags & VM_WRITE)
-		event->buffer->writable = 1;
+	rcu_assign_pointer(event->buffer, buffer);
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
-- cgit v1.2.3 

From a6e6dea68c18f705957573ee5596097c7e82d0e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 21 May 2010 14:27:58 +0200
Subject: perf: Add perf_event::child_count

Only child counters adding back their values into the parent counter
are responsible for cross-cpu updates to event->count.

So if we pull that out into a new child_count variable, we get an
event->count that is only modified locally.

Signed-off-by: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
Cc: Paul Mackerras
Cc: Mike Galbraith
Cc: Steven Rostedt
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 1 +
 kernel/perf_event.c        | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 441992a9775c..f34dab9b275e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -671,6 +671,7 @@ struct perf_event {
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
 	atomic64_t			count;
+	atomic64_t			child_count;
 
 	/*
 	 * These are the total time in nanoseconds that the event
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ab4c0ffc271c..a395fda2d94c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return atomic64_read(&event->count);
+	return atomic64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -5379,7 +5379,7 @@ static void sync_child_event(struct perf_event *child_event,
 	/*
 	 * Add back the child's count to the parent's count:
 	 */
-	atomic64_add(child_val, &parent_event->count);
+	atomic64_add(child_val, &parent_event->child_count);
 	atomic64_add(child_event->total_time_enabled,
 		     &parent_event->child_total_time_enabled);
 	atomic64_add(child_event->total_time_running,
-- cgit v1.2.3 

From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 21 May 2010 14:43:08 +0200
Subject: perf: Convert perf_event to local_t

Since now all modifications to event->count (and ->prev_count and
->period_left) are local to a cpu, change them to local64_t so we
avoid the LOCK'ed ops.
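The hot-path idiom after the conversion looks roughly like this (a
sketch based on the armpmu_event_update() hunk below; read_counter()
and shift stand in for the architecture-specific callback and counter
width adjustment):

	u64 prev_raw_count, new_raw_count;
	s64 delta;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	new_raw_count = read_counter(idx);

	/* Only this cpu updates these fields, so a plain (un-LOCKed)
	 * cmpxchg is enough to close the race with the overflow irq. */
	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		goto again;

	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);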
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 18 ++++++++--------- arch/powerpc/kernel/perf_event.c | 34 ++++++++++++++++---------------- arch/sh/kernel/perf_event.c | 6 +++--- arch/sparc/kernel/perf_event.c | 18 ++++++++--------- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++--------- include/linux/perf_event.h | 7 ++++--- kernel/perf_event.c | 42 ++++++++++++++++++++-------------------- 7 files changed, 72 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c45768614c8a..5b7cfafc0720 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event, if (left > (s64)armpmu->max_period) left = armpmu->max_period; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(idx, (u64)(-left) & 0xffffffff); @@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = armpmu->max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } err = 0; diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index ac2a8c2554d9..af1d9a7c65d1 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event) * Therefore we treat them like NMIs. */ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw, if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? 
pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } } @@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw, event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -666,11 +666,11 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; write_pmc(idx, val); perf_event_update_userpage(event); @@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event) if (left < 0x80000000L) val = 0x80000000L - left; write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); perf_enable(); local_irq_restore(flags); @@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val, int record = 0; /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. */ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; if (period) { if (left <= 0) { left += period; @@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); perf_event_update_userpage(event); } diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 81b6de41ae5d..7a3dc3567258 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event, * this is the simplest approach for maintaining consistency. 
*/ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = sh_pmu->read(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -203,7 +203,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static void sh_pmu_disable(struct perf_event *event) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index beeb92fa3acd..8a6660da8e08 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event, s64 delta; again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = read_pmc(idx); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -591,27 +591,27 @@ again: static int sparc_perf_event_set_period(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0; if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (left > MAX_PERIOD) left = MAX_PERIOD; - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); write_pmc(idx, (u64)(-left) & 0xffffffff); @@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = MAX_PERIOD; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db6..2d0d29069275 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = 
hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f34dab9b275e..7342979f95f2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -487,6 +487,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #define PERF_MAX_STACK_DEPTH 255 @@ -536,10 +537,10 @@ struct hw_perf_event { struct arch_hw_breakpoint info; #endif }; - atomic64_t prev_count; + local64_t prev_count; u64 sample_period; u64 last_period; - atomic64_t period_left; + local64_t period_left; u64 interrupts; u64 freq_time_stamp; @@ -670,7 +671,7 @@ struct perf_event { enum perf_event_active_state state; unsigned int attach_state; - atomic64_t count; + local64_t count; atomic64_t child_count; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a395fda2d94c..97c73018592e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1148,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. 
*/ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1540,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { + if (local64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); + local64_set(&hwc->period_left, 0); perf_event_start(event); perf_enable(); } @@ -1584,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; @@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return atomic64_read(&event->count) + atomic64_read(&event->child_count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } static u64 perf_event_read(struct perf_event *event) @@ -2141,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2359,7 +2359,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -4035,14 +4035,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4081,7 +4081,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4092,7 +4092,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4383,8 +4383,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4392,7 
+4392,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4424,9 +4424,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4436,7 +4436,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4879,7 +4879,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -5313,7 +5313,7 @@ inherit_event(struct perf_event *parent_event, hwc->sample_period = sample_period; hwc->last_period = sample_period; - atomic64_set(&hwc->period_left, sample_period); + local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; -- cgit v1.2.3 From 7be7923633a142402266d642ccebf74f556a649b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Jun 2010 11:57:23 +0200 Subject: perf: Fix build breakage for architectures without atomic64_t The local64.h include dependency was not dependent on PERF_EVENTS=y, which meant that architectures without atomic64_t support ended up including it and failed to build. Signed-off-by: Peter Zijlstra LKML-Reference: --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7342979f95f2..1218d05728b9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -462,6 +462,7 @@ enum perf_callchain_context { #ifdef CONFIG_PERF_EVENTS # include +# include #endif struct perf_guest_info_callbacks { @@ -487,7 +488,6 @@ struct perf_guest_info_callbacks { #include #include #include -#include #define PERF_MAX_STACK_DEPTH 255 -- cgit v1.2.3 From 039ca4e74a1cf60bd7487324a564ecf5c981f254 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 26 May 2010 17:22:17 +0800 Subject: tracing: Remove kmemtrace ftrace plugin We have been resisting new ftrace plugins and removing existing ones, and kmemtrace has been superseded by kmem trace events and perf-kmem, so we remove it.
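For illustration only, a minimal sketch of the replacement path (this is not code from the patch): an in-kernel consumer can attach a probe to the kmalloc trace event, using the same probe signature the removed kmemtrace code used, where the leading void *ignore is the tracepoint's private-data argument:

#include <linux/kernel.h>
#include <trace/events/kmem.h>

static void probe_kmalloc(void *ignore, unsigned long call_site,
			  const void *ptr, size_t bytes_req,
			  size_t bytes_alloc, gfp_t gfp_flags)
{
	/* report requested size, true allocation size and the caller */
	pr_info("kmalloc: req %zu alloc %zu from %pf\n",
		bytes_req, bytes_alloc, (void *)call_site);
}

/* hooked up with: register_trace_kmalloc(probe_kmalloc, NULL); */

Userspace gets the same data through the kmem trace events and perf-kmem mentioned above.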
Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Ingo Molnar Cc: Steven Rostedt [ remove kmemtrace from the makefile, handle slob too ] Signed-off-by: Frederic Weisbecker --- Documentation/ABI/testing/debugfs-kmemtrace | 71 ---- Documentation/trace/kmemtrace.txt | 126 ------- MAINTAINERS | 7 - include/linux/kmemtrace.h | 25 -- include/linux/slab_def.h | 3 +- include/linux/slub_def.h | 3 +- init/main.c | 2 - kernel/trace/Kconfig | 20 -- kernel/trace/Makefile | 1 - kernel/trace/kmemtrace.c | 529 ---------------------------- kernel/trace/trace.h | 12 - kernel/trace/trace_entries.h | 35 -- mm/slab.c | 1 - mm/slob.c | 4 +- mm/slub.c | 1 - 15 files changed, 7 insertions(+), 833 deletions(-) delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace delete mode 100644 Documentation/trace/kmemtrace.txt delete mode 100644 include/linux/kmemtrace.h delete mode 100644 kernel/trace/kmemtrace.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace deleted file mode 100644 index 5e6a92a02d85..000000000000 --- a/Documentation/ABI/testing/debugfs-kmemtrace +++ /dev/null @@ -1,71 +0,0 @@ -What: /sys/kernel/debug/kmemtrace/ -Date: July 2008 -Contact: Eduard - Gabriel Munteanu -Description: - -In kmemtrace-enabled kernels, the following files are created: - -/sys/kernel/debug/kmemtrace/ - cpu (0400) Per-CPU tracing data, see below. (binary) - total_overruns (0400) Total number of bytes which were dropped from - cpu files because of full buffer condition, - non-binary. (text) - abi_version (0400) Kernel's kmemtrace ABI version. (text) - -Each per-CPU file should be read according to the relay interface. That is, -the reader should set affinity to that specific CPU and, as currently done by -the userspace application (though there are other methods), use poll() with -an infinite timeout before every read(). Otherwise, erroneous data may be -read. The binary data has the following _core_ format: - - Event ID (1 byte) Unsigned integer, one of: - 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) - 1 - represents a freeing of previously allocated memory - (KMEMTRACE_EVENT_FREE) - Type ID (1 byte) Unsigned integer, one of: - 0 - this is a kmalloc() / kfree() - 1 - this is a kmem_cache_alloc() / kmem_cache_free() - 2 - this is a __get_free_pages() et al. - Event size (2 bytes) Unsigned integer representing the - size of this event. Used to extend - kmemtrace. Discard the bytes you - don't know about. - Sequence number (4 bytes) Signed integer used to reorder data - logged on SMP machines. Wraparound - must be taken into account, although - it is unlikely. - Caller address (8 bytes) Return address to the caller. - Pointer to mem (8 bytes) Pointer to target memory area. Can be - NULL, but not all such calls might be - recorded. - -In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: - - Requested bytes (8 bytes) Total number of requested bytes, - unsigned, must not be zero. - Allocated bytes (8 bytes) Total number of actually allocated - bytes, unsigned, must not be lower - than requested bytes. - Requested flags (4 bytes) GFP flags supplied by the caller. - Target CPU (4 bytes) Signed integer, valid for event id 1. - If equal to -1, target CPU is the same - as origin CPU, but the reverse might - not be true. - -The data is made available in the same endianness the machine has. - -Other event ids and type ids may be defined and added. 
Other fields may be -added by increasing event size, but see below for details. -Every modification to the ABI, including new id definitions, are followed -by bumping the ABI version by one. - -Adding new data to the packet (features) is done at the end of the mandatory -data: - Feature size (2 byte) - Feature ID (1 byte) - Feature data (Feature size - 3 bytes) - - -Users: - kmemtrace-user - git://repo.or.cz/kmemtrace-user.git - diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt deleted file mode 100644 index 6308735e58ca..000000000000 --- a/Documentation/trace/kmemtrace.txt +++ /dev/null @@ -1,126 +0,0 @@ - kmemtrace - Kernel Memory Tracer - - by Eduard - Gabriel Munteanu - - -I. Introduction -=============== - -kmemtrace helps kernel developers figure out two things: -1) how different allocators (SLAB, SLUB etc.) perform -2) how kernel code allocates memory and how much - -To do this, we trace every allocation and export information to the userspace -through the relay interface. We export things such as the number of requested -bytes, the number of bytes actually allocated (i.e. including internal -fragmentation), whether this is a slab allocation or a plain kmalloc() and so -on. - -The actual analysis is performed by a userspace tool (see section III for -details on where to get it from). It logs the data exported by the kernel, -processes it and (as of writing this) can provide the following information: -- the total amount of memory allocated and fragmentation per call-site -- the amount of memory allocated and fragmentation per allocation -- total memory allocated and fragmentation in the collected dataset -- number of cross-CPU allocation and frees (makes sense in NUMA environments) - -Moreover, it can potentially find inconsistent and erroneous behavior in -kernel code, such as using slab free functions on kmalloc'ed memory or -allocating less memory than requested (but not truly failed allocations). - -kmemtrace also makes provisions for tracing on some arch and analysing the -data on another. - -II. Design and goals -==================== - -kmemtrace was designed to handle rather large amounts of data. Thus, it uses -the relay interface to export whatever is logged to userspace, which then -stores it. Analysis and reporting is done asynchronously, that is, after the -data is collected and stored. By design, it allows one to log and analyse -on different machines and different arches. - -As of writing this, the ABI is not considered stable, though it might not -change much. However, no guarantees are made about compatibility yet. When -deemed stable, the ABI should still allow easy extension while maintaining -backward compatibility. This is described further in Documentation/ABI. - -Summary of design goals: - - allow logging and analysis to be done across different machines - - be fast and anticipate usage in high-load environments (*) - - be reasonably extensible - - make it possible for GNU/Linux distributions to have kmemtrace - included in their repositories - -(*) - one of the reasons Pekka Enberg's original userspace data analysis - tool's code was rewritten from Perl to C (although this is more than a - simple conversion) - - -III. Quick usage guide -====================== - -1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE). 
- -2) Get the userspace tool and build it: -$ git clone git://repo.or.cz/kmemtrace-user.git # current repository -$ cd kmemtrace-user/ -$ ./autogen.sh -$ ./configure -$ make - -3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the -'single' runlevel (so that relay buffers don't fill up easily), and run -kmemtrace: -# '$' does not mean user, but root here. -$ mount -t debugfs none /sys/kernel/debug -$ mount -t proc none /proc -$ cd path/to/kmemtrace-user/ -$ ./kmemtraced -Wait a bit, then stop it with CTRL+C. -$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't - # overrun, should - # be zero. -$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to - check its correctness] -$ ./kmemtrace-report - -Now you should have a nice and short summary of how the allocator performs. - -IV. FAQ and known issues -======================== - -Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix -this? Should I worry? -A: If it's non-zero, this affects kmemtrace's accuracy, depending on how -large the number is. You can fix it by supplying a higher -'kmemtrace.subbufs=N' kernel parameter. ---- - -Q: kmemtrace_check reports errors, how do I fix this? Should I worry? -A: This is a bug and should be reported. It can occur for a variety of -reasons: - - possible bugs in relay code - - possible misuse of relay by kmemtrace - - timestamps being collected unorderly -Or you may fix it yourself and send us a patch. ---- - -Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? -A: This is a known issue and I'm working on it. These might be true errors -in kernel code, which may have inconsistent behavior (e.g. allocating memory -with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed -out this behavior may work with SLAB, but may fail with other allocators. - -It may also be due to lack of tracing in some unusual allocator functions. - -We don't want bug reports regarding this issue yet. ---- - -V. See also -=========== - -Documentation/kernel-parameters.txt -Documentation/ABI/testing/debugfs-kmemtrace - diff --git a/MAINTAINERS b/MAINTAINERS index 33047a605438..67e6e9d848db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3361,13 +3361,6 @@ F: include/linux/kmemleak.h F: mm/kmemleak.c F: mm/kmemleak-test.c -KMEMTRACE -M: Eduard - Gabriel Munteanu -S: Maintained -F: Documentation/trace/kmemtrace.txt -F: include/linux/kmemtrace.h -F: kernel/trace/kmemtrace.c - KPROBES M: Ananth N Mavinakayanahalli M: Anil S Keshavamurthy diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index b616d3930c3b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. 
- */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include - -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 1812dac8c496..1acfa73ce2ac 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,8 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include + +#include #ifndef ARCH_KMALLOC_MINALIGN /* diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 55695c8d2f8a..2345d3a033e6 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,9 +10,10 @@ #include #include #include -#include #include +#include + enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ diff --git a/init/main.c b/init/main.c index 94f65efdc65a..e2a2bf3a169f 100644 --- a/init/main.c +++ b/init/main.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -652,7 +651,6 @@ asmlinkage void __init start_kernel(void) #endif page_cgroup_init(); enable_debug_pagealloc(); - kmemtrace_init(); kmemleak_init(); debug_objects_mem_init(); idr_init_cache(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 572992abc71c..f669092fdead 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -354,26 +354,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. 
- config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3aaeba82372..469a1c7555a5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg - * Copyright (C) 2008 Frederic Weisbecker - */ - -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - 
size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. 
- */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = 
trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - 
pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75a5e800a737..075cd2ea84a2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -30,19 +29,12 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; #undef __field #define __field(type, item) type item; @@ -208,10 +200,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c293364c984f..13abc157dbaf 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -291,41 +291,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - FTRACE_ENTRY(ksym_trace, ksym_trace_entry, TRACE_KSYM, diff --git a/mm/slab.c b/mm/slab.c index e49f8f46f46d..47360c3e5abd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index 23631e2bb57a..a82ab5811bd9 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -66,8 +66,10 @@ #include #include #include -#include #include + +#include + #include /* diff --git a/mm/slub.c b/mm/slub.c index 26f0cb9cc584..a61f1aad1070 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 8e4b50f94e8c1435a3e0ece42b7f97bc857d0145 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:26:28 +0200 Subject: firewire: core: add CSR SPLIT_TIMEOUT support Implement the SPLIT_TIMEOUT registers. 
Besides being required by the spec, this is desirable for some IIDC devices and necessary for many audio devices to be able to increase the timeout from userspace. Signed-off-by: Clemens Ladisch --- drivers/firewire/core-card.c | 4 ++ drivers/firewire/core-transaction.c | 76 +++++++++++++++++++++++++++++++------ include/linux/firewire.h | 5 +++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 901435cdd5c2..d0f15c2f1e1d 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -428,6 +428,10 @@ void fw_card_initialize(struct fw_card *card, card->device = device; card->current_tlabel = 0; card->tlabel_mask = 0; + card->split_timeout_hi = 0; + card->split_timeout_lo = 800 << 19; + card->split_timeout_cycles = 800; + card->split_timeout_jiffies = DIV_ROUND_UP(HZ, 10); card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 0034229dfd14..9a7d3ec23f2b 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -339,7 +339,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, setup_timer(&t->split_timeout_timer, split_transaction_timeout_callback, (unsigned long)t); /* FIXME: start this timer later, relative to t->timestamp */ - mod_timer(&t->split_timeout_timer, jiffies + DIV_ROUND_UP(HZ, 10)); + mod_timer(&t->split_timeout_timer, + jiffies + card->split_timeout_jiffies); t->callback = callback; t->callback_data = callback_data; @@ -673,11 +674,28 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header, } EXPORT_SYMBOL(fw_fill_response); -static struct fw_request *allocate_request(struct fw_packet *p) +static u32 compute_split_timeout_timestamp(struct fw_card *card, + u32 request_timestamp) +{ + unsigned int cycles; + u32 timestamp; + + cycles = card->split_timeout_cycles; + cycles += request_timestamp & 0x1fff; + + timestamp = request_timestamp & ~0x1fff; + timestamp += (cycles / 8000) << 13; + timestamp |= cycles % 8000; + + return timestamp; +} + +static struct fw_request *allocate_request(struct fw_card *card, + struct fw_packet *p) { struct fw_request *request; u32 *data, length; - int request_tcode, t; + int request_tcode; request_tcode = HEADER_GET_TCODE(p->header[0]); switch (request_tcode) { @@ -712,14 +730,9 @@ static struct fw_request *allocate_request(struct fw_packet *p) if (request == NULL) return NULL; - t = (p->timestamp & 0x1fff) + 4000; - if (t >= 8000) - t = (p->timestamp & ~0x1fff) + 0x2000 + t - 8000; - else - t = (p->timestamp & ~0x1fff) + t; - request->response.speed = p->speed; - request->response.timestamp = t; + request->response.timestamp = + compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -845,7 +858,7 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE) return; - request = allocate_request(p); + request = allocate_request(card, p); if (request == NULL) { /* FIXME: send statically allocated busy packet. 
*/ return; @@ -993,6 +1006,19 @@ static u32 read_state_register(struct fw_card *card) return 0; } +static void update_split_timeout(struct fw_card *card) +{ + unsigned int cycles; + + cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + + cycles = max(cycles, 800u); /* minimum as per the spec */ + cycles = min(cycles, 3u * 8000u); /* maximum OHCI timeout */ + + card->split_timeout_cycles = cycles; + card->split_timeout_jiffies = DIV_ROUND_UP(cycles * HZ, 8000); +} + static void handle_registers(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, int generation, int speed, unsigned long long offset, @@ -1001,6 +1027,7 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, int reg = offset & ~CSR_REGISTER_BASE; __be32 *data = payload; int rcode = RCODE_COMPLETE; + unsigned long flags; switch (reg) { case CSR_STATE_CLEAR: @@ -1039,6 +1066,33 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_SPLIT_TIMEOUT_HI: + if (tcode == TCODE_READ_QUADLET_REQUEST) { + *data = cpu_to_be32(card->split_timeout_hi); + } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + spin_lock_irqsave(&card->lock, flags); + card->split_timeout_hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + spin_unlock_irqrestore(&card->lock, flags); + } else { + rcode = RCODE_TYPE_ERROR; + } + break; + + case CSR_SPLIT_TIMEOUT_LO: + if (tcode == TCODE_READ_QUADLET_REQUEST) { + *data = cpu_to_be32(card->split_timeout_lo); + } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + spin_lock_irqsave(&card->lock, flags); + card->split_timeout_lo = + be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + spin_unlock_irqrestore(&card->lock, flags); + } else { + rcode = RCODE_TYPE_ERROR; + } + break; + case CSR_CYCLE_TIME: if (TCODE_IS_READ_REQUEST(tcode) && length == 4) *data = cpu_to_be32(card->driver-> diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 72e2b8ac2a5a..cdf8213c68ca 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -89,6 +89,11 @@ struct fw_card { struct list_head transaction_list; unsigned long reset_jiffies; + u32 split_timeout_hi; + u32 split_timeout_lo; + unsigned int split_timeout_cycles; + unsigned int split_timeout_jiffies; + unsigned long long guid; unsigned max_receive; int link_speed; -- cgit v1.2.3 From a1a1132bd83d0aea51d4f19be4b4a58a064a0131 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:35:06 +0200 Subject: firewire: add CSR PRIORITY_BUDGET support If supported by the OHCI controller, implement the PRIORITY_BUDGET register, which is required for nodes that can use asynchronous priority arbitration. To allow the core to determine what features the lowlevel device supports, add a new card driver callback. 
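As a minimal sketch of the new contract, not from this patch: a low-level driver that cannot do priority arbitration simply reports an empty feature mask, and the core then answers CSR_PRIORITY_BUDGET accesses with RCODE_ADDRESS_ERROR:

/* sketch only: a hypothetical controller without optional features */
static unsigned int dummy_get_features(struct fw_card *card)
{
	return 0;	/* FEATURE_PRIORITY_BUDGET not set */
}

static const struct fw_card_driver dummy_driver = {
	/* ...other callbacks... */
	.get_features		= dummy_get_features,
};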
Signed-off-by: Clemens Ladisch --- drivers/firewire/core-transaction.c | 14 ++++++++++++++ drivers/firewire/core.h | 4 ++++ drivers/firewire/ohci.c | 27 +++++++++++++++++++++++++++ include/linux/firewire.h | 1 + 4 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 8146133818dc..a61eb3fb9573 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1126,6 +1126,20 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_PRIORITY_BUDGET: + if (!(card->driver->get_features(card) & + FEATURE_PRIORITY_BUDGET)) + rcode = RCODE_ADDRESS_ERROR; + else if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = cpu_to_be32(card->driver-> + read_csr_reg(card, CSR_PRIORITY_BUDGET)); + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->driver->write_csr_reg(card, CSR_PRIORITY_BUDGET, + be32_to_cpu(*data)); + else + rcode = RCODE_TYPE_ERROR; + break; + case CSR_BROADCAST_CHANNEL: if (tcode == TCODE_READ_QUADLET_REQUEST) *data = cpu_to_be32(card->broadcast_channel); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index efcdeb2e31e6..3b8c0f042f49 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -38,6 +38,8 @@ struct fw_packet; #define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) #define BROADCAST_CHANNEL_VALID (1 << 30) +#define FEATURE_PRIORITY_BUDGET 0x01 + struct fw_card_driver { /* * Enable the given card with the given initial config rom. @@ -78,6 +80,8 @@ struct fw_card_driver { u32 (*read_csr_reg)(struct fw_card *card, int csr_offset); void (*write_csr_reg)(struct fw_card *card, int csr_offset, u32 value); + unsigned int (*get_features)(struct fw_card *card); + struct fw_iso_context * (*allocate_iso_context)(struct fw_card *card, int type, int channel, size_t header_size); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 9c588fd01250..0e5413531785 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -170,6 +170,7 @@ struct fw_ohci { int generation; int request_generation; /* for timestamping incoming requests */ unsigned quirks; + unsigned int pri_req_max; u32 bus_time; /* @@ -1738,6 +1739,11 @@ static int ohci_enable(struct fw_card *card, reg_write(ohci, OHCI1394_IsochronousCycleTimer, seconds << 25); ohci->bus_time = seconds & ~0x3f; + /* Get implemented bits of the priority arbitration request counter. 
*/ + reg_write(ohci, OHCI1394_FairnessControl, 0x3f); + ohci->pri_req_max = reg_read(ohci, OHCI1394_FairnessControl) & 0x3f; + reg_write(ohci, OHCI1394_FairnessControl, 0); + ar_context_run(&ohci->ar_request_ctx); ar_context_run(&ohci->ar_response_ctx); @@ -2028,6 +2034,10 @@ static u32 ohci_read_csr_reg(struct fw_card *card, int csr_offset) value = reg_read(ohci, OHCI1394_ATRetries); return (value >> 4) & 0x0ffff00f; + case CSR_PRIORITY_BUDGET: + return (reg_read(ohci, OHCI1394_FairnessControl) & 0x3f) | + (ohci->pri_req_max << 8); + default: WARN_ON(1); return 0; @@ -2065,12 +2075,28 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value) flush_writes(ohci); break; + case CSR_PRIORITY_BUDGET: + reg_write(ohci, OHCI1394_FairnessControl, value & 0x3f); + flush_writes(ohci); + break; + default: WARN_ON(1); break; } } +static unsigned int ohci_get_features(struct fw_card *card) +{ + struct fw_ohci *ohci = fw_ohci(card); + unsigned int features = 0; + + if (ohci->pri_req_max != 0) + features |= FEATURE_PRIORITY_BUDGET; + + return features; +} + static void copy_iso_headers(struct iso_context *ctx, void *p) { int i = ctx->header_length; @@ -2510,6 +2536,7 @@ static const struct fw_card_driver ohci_driver = { .enable_phys_dma = ohci_enable_phys_dma, .read_csr_reg = ohci_read_csr_reg, .write_csr_reg = ohci_write_csr_reg, + .get_features = ohci_get_features, .allocate_iso_context = ohci_allocate_iso_context, .free_iso_context = ohci_free_iso_context, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index cdf8213c68ca..a50377d91254 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -32,6 +32,7 @@ #define CSR_CYCLE_TIME 0x200 #define CSR_BUS_TIME 0x204 #define CSR_BUSY_TIMEOUT 0x210 +#define CSR_PRIORITY_BUDGET 0x218 #define CSR_BUS_MANAGER_ID 0x21c #define CSR_BANDWIDTH_AVAILABLE 0x220 #define CSR_CHANNELS_AVAILABLE 0x224 -- cgit v1.2.3 From 3d1f46eb60b155c705e389ecdf313f11b4b91976 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:35:37 +0200 Subject: firewire: core: add CSR MAINT_UTILITY support Implement the MAINT_UTILITY register, which is utterly optional but useful as a safe target for diagnostic read/write/broadcast transactions.
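For illustration, a minimal sketch, not part of this patch, of a diagnostic quadlet write to the new register from kernel code; it assumes the existing fw_run_transaction() helper and a hypothetical poke_maint_utility() name:

/* sketch only: write a quadlet to a remote node's MAINT_UTILITY register */
static int poke_maint_utility(struct fw_device *device, u32 value)
{
	__be32 data = cpu_to_be32(value);

	return fw_run_transaction(device->card,
				  TCODE_WRITE_QUADLET_REQUEST,
				  device->node_id, device->generation,
				  device->max_speed,
				  CSR_REGISTER_BASE + CSR_MAINT_UTILITY,
				  &data, sizeof(data));
}

A subsequent quadlet read of the same offset should return the value just written, which makes the register a convenient end-to-end check of the transaction machinery.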
Signed-off-by: Clemens Ladisch --- drivers/firewire/core-transaction.c | 9 +++++++++ include/linux/firewire.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index a61eb3fb9573..dd8ef650a7cb 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1140,6 +1140,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, rcode = RCODE_TYPE_ERROR; break; + case CSR_MAINT_UTILITY: + if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = card->maint_utility_register; + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->maint_utility_register = *data; + else + rcode = RCODE_TYPE_ERROR; + break; + case CSR_BROADCAST_CHANNEL: if (tcode == TCODE_READ_QUADLET_REQUEST) *data = cpu_to_be32(card->broadcast_channel); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index a50377d91254..f1160e831dad 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -38,6 +38,7 @@ #define CSR_CHANNELS_AVAILABLE 0x224 #define CSR_CHANNELS_AVAILABLE_HI 0x224 #define CSR_CHANNELS_AVAILABLE_LO 0x228 +#define CSR_MAINT_UTILITY 0x230 #define CSR_BROADCAST_CHANNEL 0x234 #define CSR_CONFIG_ROM 0x400 #define CSR_CONFIG_ROM_END 0x800 @@ -122,6 +123,8 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + __be32 maint_utility_register; }; struct fw_attribute_group { -- cgit v1.2.3 From 7e0e314f198d5048b74c8f0ef9f4c1c02e5ecfc9 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Thu, 10 Jun 2010 08:37:15 +0200 Subject: firewire: core: add CSR abdicate support Implement the abdicate bit, which is required for bus manager capable nodes and tested by the Base 1394 Test Suite. Finally, something to do at a command reset! 
:-) Signed-off-by: Clemens Ladisch --- drivers/firewire/core-card.c | 3 ++- drivers/firewire/core-topology.c | 2 ++ drivers/firewire/core-transaction.c | 13 +++++++++++-- drivers/firewire/core.h | 1 + include/linux/firewire.h | 2 ++ 5 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index d0f15c2f1e1d..7c4cf6cfa746 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -260,7 +260,8 @@ static void fw_card_bm_work(struct work_struct *work) grace = time_after(jiffies, card->reset_jiffies + DIV_ROUND_UP(HZ, 8)); - if (is_next_generation(generation, card->bm_generation) || + if ((is_next_generation(generation, card->bm_generation) && + !card->bm_abdicate) || (card->bm_generation != generation && grace)) { /* * This first step is to figure out who is IRM and diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 93ec64cdeef7..ca3c65318165 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -552,6 +552,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, smp_wmb(); card->generation = generation; card->reset_jiffies = jiffies; + card->bm_abdicate = card->csr_abdicate; + card->csr_abdicate = false; fw_schedule_bm_work(card, 0); local_node = build_tree(card, self_ids, self_id_count); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index e0c6cce894cf..85a54da243e2 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1008,6 +1008,10 @@ static u32 read_state_register(struct fw_card *card) /* Bit 8 (cmstr): */ value |= card->driver->read_csr_reg(card, CSR_STATE_CLEAR); + /* Bit 10 (abdicate): */ + if (card->csr_abdicate) + value |= CSR_STATE_BIT_ABDICATE; + return value; } @@ -1041,6 +1045,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { card->driver->write_csr_reg(card, CSR_STATE_CLEAR, be32_to_cpu(*data)); + if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) + card->csr_abdicate = false; } else { rcode = RCODE_TYPE_ERROR; } @@ -1052,7 +1058,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { card->driver->write_csr_reg(card, CSR_STATE_SET, be32_to_cpu(*data)); - /* FIXME: implement abdicate */ + if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) + card->csr_abdicate = true; } else { rcode = RCODE_TYPE_ERROR; } @@ -1070,7 +1077,9 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, break; case CSR_RESET_START: - if (tcode != TCODE_WRITE_QUADLET_REQUEST) + if (tcode == TCODE_WRITE_QUADLET_REQUEST) + card->csr_abdicate = false; + else rcode = RCODE_TYPE_ERROR; break; diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index aaecdd1c1767..a9ace1f8dc3f 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -41,6 +41,7 @@ struct fw_packet; #define FEATURE_PRIORITY_BUDGET 0x01 #define CSR_STATE_BIT_CMSTR (1 << 8) +#define CSR_STATE_BIT_ABDICATE (1 << 10) struct fw_card_driver { /* diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f1160e831dad..4d22643215ef 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -119,6 +119,8 @@ struct fw_card { int bm_retries; int bm_generation; __be32 bm_transaction_data[2]; + bool bm_abdicate; /* value of csr_abdicate 
before last bus reset */ + bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */ bool broadcast_channel_allocated; u32 broadcast_channel; -- cgit v1.2.3 From c6de9f08912311ddc1b3502b90e10fd449acd401 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 16:48:09 +0800 Subject: x86, mce: Add HW_ERR printk prefix for hardware error logging This makes hardware error related messages in the printk log more explicit, so that users can report them to the hardware vendor instead of to LKML or a software vendor. Signed-off-by: Huang Ying Reviewed-by: Hidetoshi Seto LKML-Reference: <1275295689.3444.462.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. Peter Anvin --- include/linux/kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8317ec4b9f3b..3bf740bb069e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -247,6 +247,13 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " +/* + * HW_ERR + * Add this to a message for hardware errors, so that user can report + * it to hardware vendor instead of LKML or software vendor. + */ +#define HW_ERR "[Hardware Error]: " + #ifdef CONFIG_PRINTK asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); -- cgit v1.2.3 From 551d55a944b143ef26fbd482d1c463199d6f65cf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Apr 2010 08:48:42 -0400 Subject: tree/tiny rcu: Add debug RCU head objects Helps find racy users of call_rcu(), which results in hangs because list entries are overwritten and/or skipped. Changelog since v4: - Bisectability is now OK - Now generate a WARN_ON_ONCE() for non-initialized rcu_head passed to call_rcu(). Statically initialized objects are detected with object_is_static(). - Rename rcu_head_init_on_stack to init_rcu_head_on_stack. - Remove init_rcu_head() completely. Changelog since v3: - Include comments from Lai Jiangshan This new patch version is based on the debugobjects with the newly introduced "active state" tracker. Non-initialized entries are all considered as "statically initialized". An activation fixup (triggered by call_rcu()) takes care of performing the debug object initialization without issuing any warning. Since we cannot increase the size of struct rcu_head, I don't see much room to put an identifier for statically initialized rcu_head structures. So for now, we have to live without "activation without explicit init" detection. But the main purpose of this debug option is to detect double-activations (double call_rcu() use of a rcu_head before the callback is executed), which is correctly addressed here. This also detects potential internal RCU callback corruption, which would cause the callbacks to be executed twice. Signed-off-by: Mathieu Desnoyers CC: David S. Miller CC: "Paul E. McKenney" CC: akpm@linux-foundation.org CC: mingo@elte.hu CC: laijs@cn.fujitsu.com CC: dipankar@in.ibm.com CC: josh@joshtriplett.org CC: dvhltc@us.ibm.com CC: niv@us.ibm.com CC: tglx@linutronix.de CC: peterz@infradead.org CC: rostedt@goodmis.org CC: Valdis.Kletnieks@vt.edu CC: dhowells@redhat.com CC: eric.dumazet@gmail.com CC: Alexey Dobriyan Signed-off-by: Paul E.
McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 49 +++++++++++++++ kernel/rcupdate.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny.c | 2 + kernel/rcutree.c | 2 + lib/Kconfig.debug | 6 ++ 5 files changed, 219 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b653b4aaa8a6..2b7fc506e479 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ @@ -79,6 +80,16 @@ extern void rcu_init(void); (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) +/* + * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic + * initialization and destruction of rcu_head on the stack. rcu_head structures + * allocated dynamically in the heap or defined statically don't need any + * initialization. + */ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +extern void init_rcu_head_on_stack(struct rcu_head *head); +extern void destroy_rcu_head_on_stack(struct rcu_head *head); +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head_on_stack(struct rcu_head *head) { } @@ -86,6 +97,7 @@ static inline void init_rcu_head_on_stack(struct rcu_head *head) static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -517,4 +529,41 @@ extern void call_rcu(struct rcu_head *head, extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 
+ */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ + debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..4d169835fb36 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. 
+ * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..196ec02f8be0 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..d5bc43976c5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e722e9d62221..142faa2ec665 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -307,6 +307,12 @@ config DEBUG_OBJECTS_WORK work queue routines to track the life time of work objects and validate the work operations. 
+config DEBUG_OBJECTS_RCU_HEAD + bool "Debug RCU callbacks objects" + depends on DEBUG_OBJECTS && PREEMPT + help + Enable this to turn on debugging of RCU list heads (call_rcu() usage). + config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 -- cgit v1.2.3 From f5155b33277c9678041a27869165619bb34f722f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:42:01 -0700 Subject: rcu: add an rcu_dereference_index_check() The sparse RCU-pointer checking relies on type magic that dereferences the pointer in question. This does not work if the pointer is in fact an array index. This commit therefore supplies a new RCU API that omits the sparse checking to continue to support rcu_dereference() on integers. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2b7fc506e479..9fbc54a2585d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -566,4 +566,37 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#ifndef CONFIG_PROVE_RCU +#define __do_rcu_dereference_check(c) do { } while (0) +#endif /* #ifdef CONFIG_PROVE_RCU */ + +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + __do_rcu_dereference_check(c); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) + +/** + * rcu_dereference_index_check() - rcu_dereference for indices with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * Similar to rcu_dereference_check(), but omits the sparse checking. + * This allows rcu_dereference_index_check() to be used on integers, + * which can then be used as array indices. Attempting to use + * rcu_dereference_check() on an integer will give compiler warnings + * because the sparse address-space mechanism relies on dereferencing + * the RCU-protected pointer. Dereferencing integers is not something + * that even gcc will put up with. + * + * Note that this function does not implicitly check for RCU read-side + * critical sections. If this function gains lots of uses, it might + * make sense to provide versions for each flavor of RCU, but it does + * not make sense as of early 2010. + */ +#define rcu_dereference_index_check(p, c) \ + __rcu_dereference_index_check((p), (c)) + #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3 From 71d1d5c722db9ae3b3f9c08ef7ddcd7759fbb1e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 May 2010 16:13:14 -0700 Subject: rcu: add __rcu API for later sparse checking This commit defines an __rcu API, but provides only vacuous definitions for it. This breaks dependencies among most of the subsequent patches, allowing them to reach mainline asynchronously via whatever trees are appropriate. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Christopher Li Cc: Josh Triplett --- include/linux/compiler.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a5a472b10746..c1a62c56a660 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,6 +16,7 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +# define __rcu extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else @@ -34,6 +35,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __release(x) (void)0 # define __cond_lock(x,c) (c) # define __percpu +# define __rcu #endif #ifdef __KERNEL__ -- cgit v1.2.3 From a25909a4d4a29e272f953e12595bf2f04a292dbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 May 2010 12:32:28 -0700 Subject: lockdep: Add an in_workqueue_context() lockdep-based test function Some recent uses of RCU make use of workqueues. In these uses, execution within the context of a specific workqueue takes the place of the usual RCU read-side primitives such as rcu_read_lock(), and flushing of workqueues takes the place of the usual RCU grace-period primitives. Checking for correct use of rcu_dereference() in such cases requires a test of whether the code is executing in the context of a particular workqueue. This commit adds an in_workqueue_context() function that provides this test. This new function is only defined when lockdep is enabled, which allows it to be used as the second argument of rcu_dereference_check(). Signed-off-by: Paul E. McKenney --- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9466e860d8c2..d0f7c8178498 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,4 +297,8 @@ static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) #else long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ + +#ifdef CONFIG_LOCKDEP +int in_workqueue_context(struct workqueue_struct *wq); +#endif #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..59fef1531dd2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,21 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; -- cgit v1.2.3 From 2c666df80764389886110c942a7916ba9622583d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Apr 2010 20:48:47 -0700 Subject: vfs: add fs.h to define struct file The sparse RCU-pointer annotations require definition of the underlying type of any pointer passed to rcu_dereference() and friends. So fcheck_files() needs "struct file" to be defined, so include fs.h. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Al Viro --- include/linux/fdtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 013dc529e95f..551671e87927 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -11,6 +11,7 @@ #include #include #include +#include #include -- cgit v1.2.3 From 81bdf5bd7349bd4523538cbd7878f334bc2bfe14 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 2 May 2010 18:10:06 -0700 Subject: net: Make accesses to ->br_port safe for sparse RCU The new versions of the rcu_dereference() APIs require that any pointers passed to one of these APIs be fully defined. The ->br_port field in struct net_device points to a struct net_bridge_port, which is an incomplete type. This commit therefore changes ->br_port to be a void*, introduces a br_port() helper function to convert the type to struct net_bridge_port, and applies this new helper function where required. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: David Miller Cc: Stephen Hemminger Cc: Eric Dumazet --- include/linux/if_bridge.h | 3 +++ net/bridge/br_fdb.c | 2 +- net/bridge/br_private.h | 5 +++++ net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_ulog.c | 4 ++-- net/bridge/netfilter/ebtables.c | 4 ++-- net/netfilter/nfnetlink_log.c | 4 ++-- net/netfilter/nfnetlink_queue.c | 4 ++-- 8 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 938b7e81df95..d001d782922d 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -101,6 +101,9 @@ struct __fdb_entry { #include +/* br_handle_frame_hook() needs the following forward declaration. */ +struct net_bridge_port; + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 26637439965b..845710bca49c 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -246,7 +246,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) return 0; rcu_read_lock(); - fdb = __br_fdb_get(dev->br_port->br, addr); + fdb = __br_fdb_get(br_port(dev)->br, addr); ret = fdb && fdb->dst->dev != dev && fdb->dst->state == BR_STATE_FORWARDING; rcu_read_unlock(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0f4a74bc6a9b..3255188355b5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -268,6 +268,11 @@ static inline int br_is_root_bridge(const struct net_bridge *br) return !memcmp(&br->bridge_id, &br->designated_root, 8); } +static inline struct net_bridge_port *br_port(const struct net_device *dev) +{ + return rcu_dereference(dev->br_port); +} + /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern netdev_tx_t br_dev_xmit(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 9e19166ba453..a39df0ae0f81 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -25,7 +25,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->hooknum != NF_BR_BROUTING) memcpy(eth_hdr(skb)->h_dest, - par->in->br_port->br->dev->dev_addr, ETH_ALEN); + br_port(par->in)->br->dev->dev_addr, ETH_ALEN); else memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index ae3c7cef1484..5a4996bbb090 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -178,7 +178,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, strcpy(pm->physindev, in->name); /* If in isn't a bridge, then physindev==indev */ if (in->br_port) - strcpy(pm->indev, in->br_port->br->dev->name); + strcpy(pm->indev,
br_port(in)->br->dev->name); else strcpy(pm->indev, in->name); } else @@ -187,7 +187,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, if (out) { /* If out exists, then out is a bridge port */ strcpy(pm->physoutdev, out->name); - strcpy(pm->outdev, out->br_port->br->dev->name); + strcpy(pm->outdev, br_port(out)->br->dev->name); } else pm->outdev[0] = pm->physoutdev[0] = '\0'; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 59ca00e40dec..4c2aab8cbfc7 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -141,10 +141,10 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h, if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) return 1; if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check( - e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN)) + e->logical_in, br_port(in)->br->dev), EBT_ILOGICALIN)) return 1; if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check( - e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT)) + e->logical_out, br_port(out)->br->dev), EBT_ILOGICALOUT)) return 1; if (e->bitmask & EBT_SOURCEMAC) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fc9a211e629e..78957cfa3bdd 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -404,7 +404,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -431,7 +431,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port(outdev)->br->dev->ifindex)); } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 12e1ab37fcd8..c3c17498298e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -297,7 +297,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -322,7 +322,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port(outdev)->br->dev->ifindex)); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ -- cgit v1.2.3 From db3c9cc105ee844f6cd7a1beb9926fb8e9a093ae Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 12 Jun 2010 20:30:21 +0200 Subject: firewire: replace get_features card driver hook by feature variables in the fw_card struct. The hook appeared to be an unnecessary abstraction in the card driver interface. 
Cleaner would be to pass those feature flags as arguments to fw_card_initialize() or fw_card_add(), but the FairnessControl register is in the SCLK domain and may therefore not be accessible while Link Power Status is off, i.e. before the card->driver->enable call from fw_card_add(). Signed-off-by: Stefan Richter --- drivers/firewire/core-topology.c | 3 +-- drivers/firewire/core-transaction.c | 3 +-- drivers/firewire/core.h | 5 ----- drivers/firewire/ohci.c | 14 ++------------ include/linux/firewire.h | 3 +++ 5 files changed, 7 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 00a556f3a585..3b9667c37b67 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -543,8 +543,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, spin_lock_irqsave(&card->lock, flags); - card->broadcast_channel_allocated = (card->driver->get_features(card) & - FEATURE_CHANNEL_31_ALLOCATED) != 0; + card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; card->node_id = node_id; /* * Update node_id before generation to prevent anybody from using diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 5069cfc75b50..62bf30560a3e 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1129,8 +1129,7 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, break; case CSR_PRIORITY_BUDGET: - if (!(card->driver->get_features(card) & - FEATURE_PRIORITY_BUDGET)) + if (!card->priority_budget_implemented) rcode = RCODE_ADDRESS_ERROR; else if (tcode == TCODE_READ_QUADLET_REQUEST) *data = cpu_to_be32(card->driver-> diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 3f9e39b60bca..8dc76d8711a5 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -38,9 +38,6 @@ struct fw_packet; #define BROADCAST_CHANNEL_INITIAL (1 << 31 | 31) #define BROADCAST_CHANNEL_VALID (1 << 30) -#define FEATURE_PRIORITY_BUDGET 0x01 -#define FEATURE_CHANNEL_31_ALLOCATED 0x02 - #define CSR_STATE_BIT_CMSTR (1 << 8) #define CSR_STATE_BIT_ABDICATE (1 << 10) @@ -84,8 +81,6 @@ struct fw_card_driver { u32 (*read_csr_reg)(struct fw_card *card, int csr_offset); void (*write_csr_reg)(struct fw_card *card, int csr_offset, u32 value); - unsigned int (*get_features)(struct fw_card *card); - struct fw_iso_context * (*allocate_iso_context)(struct fw_card *card, int type, int channel, size_t header_size); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 2abdb3268a10..09bba9315de9 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -172,7 +172,6 @@ struct fw_ohci { int request_generation; /* for timestamping incoming requests */ unsigned quirks; unsigned int pri_req_max; - unsigned int features; u32 bus_time; bool is_root; @@ -1753,15 +1752,14 @@ static int ohci_enable(struct fw_card *card, if (version >= OHCI_VERSION_1_1) { reg_write(ohci, OHCI1394_InitialChannelsAvailableHi, 0xfffffffe); - ohci->features |= FEATURE_CHANNEL_31_ALLOCATED; + card->broadcast_channel_auto_allocated = true; } /* Get implemented bits of the priority arbitration request counter. 
*/ reg_write(ohci, OHCI1394_FairnessControl, 0x3f); ohci->pri_req_max = reg_read(ohci, OHCI1394_FairnessControl) & 0x3f; reg_write(ohci, OHCI1394_FairnessControl, 0); - if (ohci->pri_req_max != 0) - ohci->features |= FEATURE_PRIORITY_BUDGET; + card->priority_budget_implemented = ohci->pri_req_max != 0; ar_context_run(&ohci->ar_request_ctx); ar_context_run(&ohci->ar_response_ctx); @@ -2132,13 +2130,6 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value) } } -static unsigned int ohci_get_features(struct fw_card *card) -{ - struct fw_ohci *ohci = fw_ohci(card); - - return ohci->features; -} - static void copy_iso_headers(struct iso_context *ctx, void *p) { int i = ctx->header_length; @@ -2578,7 +2569,6 @@ static const struct fw_card_driver ohci_driver = { .enable_phys_dma = ohci_enable_phys_dma, .read_csr_reg = ohci_read_csr_reg, .write_csr_reg = ohci_write_csr_reg, - .get_features = ohci_get_features, .allocate_iso_context = ohci_allocate_iso_context, .free_iso_context = ohci_free_iso_context, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 4d22643215ef..5acb5fc19180 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -122,6 +122,9 @@ struct fw_card { bool bm_abdicate; /* value of csr_abdicate before last bus reset */ bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */ + bool priority_budget_implemented; /* controller feature */ + bool broadcast_channel_auto_allocated; /* controller feature */ + bool broadcast_channel_allocated; u32 broadcast_channel; __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; -- cgit v1.2.3 From c8a94ded57e9cc2498d401b2f5c856213a3e19fb Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 12 Jun 2010 20:34:50 +0200 Subject: firewire: normalize STATE_CLEAR/SET CSR access interface Push the maintenance of STATE_CLEAR/SET.abdicate down into the card driver. This way, the read/write_csr_reg driver method works uniformly across all CSR offsets. 
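Once the driver methods cover the abdicate bit as well, core code can use a single quadlet access pattern for every driver-backed CSR. A hedged sketch of that uniform dispatch follows; the helper name is hypothetical, but the body mirrors the CSR_STATE_CLEAR/SET cases in the diff below:

static void csr_quadlet_access(struct fw_card *card, int tcode,
			       int csr_offset, __be32 *data, int *rcode)
{
	/* One shape for all state registers: reads and writes are
	 * delegated to the card driver, everything else is an error. */
	if (tcode == TCODE_READ_QUADLET_REQUEST)
		*data = cpu_to_be32(
			card->driver->read_csr_reg(card, csr_offset));
	else if (tcode == TCODE_WRITE_QUADLET_REQUEST)
		card->driver->write_csr_reg(card, csr_offset,
					    be32_to_cpu(*data));
	else
		*rcode = RCODE_TYPE_ERROR;
}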
Signed-off-by: Stefan Richter --- drivers/firewire/core-topology.c | 5 ++--- drivers/firewire/core-transaction.c | 41 +++++++++++-------------------------- drivers/firewire/core.h | 2 +- drivers/firewire/ohci.c | 19 ++++++++++++----- include/linux/firewire.h | 3 +-- 5 files changed, 30 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 3b9667c37b67..56e908ba43f1 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -524,7 +524,7 @@ static void update_topology_map(struct fw_card *card, } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, - int self_id_count, u32 *self_ids) + int self_id_count, u32 *self_ids, bool bm_abdicate) { struct fw_node *local_node; unsigned long flags; @@ -552,8 +552,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, smp_wmb(); card->generation = generation; card->reset_jiffies = jiffies; - card->bm_abdicate = card->csr_abdicate; - card->csr_abdicate = false; + card->bm_abdicate = bm_abdicate; fw_schedule_bm_work(card, 0); local_node = build_tree(card, self_ids, self_id_count); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 62bf30560a3e..87d69cddb231 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -982,20 +982,6 @@ static const struct fw_address_region registers_region = { .start = CSR_REGISTER_BASE, .end = CSR_REGISTER_BASE | CSR_CONFIG_ROM, }; -static u32 read_state_register(struct fw_card *card) -{ - u32 value; - - /* Bit 8 (cmstr): */ - value = card->driver->read_csr_reg(card, CSR_STATE_CLEAR); - - /* Bit 10 (abdicate): */ - if (card->csr_abdicate) - value |= CSR_STATE_BIT_ABDICATE; - - return value; -} - static void update_split_timeout(struct fw_card *card) { unsigned int cycles; @@ -1021,29 +1007,25 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, switch (reg) { case CSR_STATE_CLEAR: - if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(read_state_register(card)); - } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = cpu_to_be32(card->driver-> + read_csr_reg(card, CSR_STATE_CLEAR)); + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) card->driver->write_csr_reg(card, CSR_STATE_CLEAR, be32_to_cpu(*data)); - if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) - card->csr_abdicate = false; - } else { + else rcode = RCODE_TYPE_ERROR; - } break; case CSR_STATE_SET: - if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(read_state_register(card)); - } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { + if (tcode == TCODE_READ_QUADLET_REQUEST) + *data = cpu_to_be32(card->driver-> + read_csr_reg(card, CSR_STATE_SET)); + else if (tcode == TCODE_WRITE_QUADLET_REQUEST) card->driver->write_csr_reg(card, CSR_STATE_SET, be32_to_cpu(*data)); - if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE)) - card->csr_abdicate = true; - } else { + else rcode = RCODE_TYPE_ERROR; - } break; case CSR_NODE_IDS: @@ -1063,7 +1045,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_RESET_START: if (tcode == TCODE_WRITE_QUADLET_REQUEST) - card->csr_abdicate = false; + card->driver->write_csr_reg(card, CSR_STATE_CLEAR, + CSR_STATE_BIT_ABDICATE); else rcode = RCODE_TYPE_ERROR; break; diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 
8dc76d8711a5..8280c625170b 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -196,7 +196,7 @@ static inline void fw_node_put(struct fw_node *node) } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, - int generation, int self_id_count, u32 *self_ids); + int generation, int self_id_count, u32 *self_ids, bool bm_abdicate); void fw_destroy_nodes(struct fw_card *card); /* diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 09bba9315de9..a55cf0911b72 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -174,6 +174,7 @@ struct fw_ohci { unsigned int pri_req_max; u32 bus_time; bool is_root; + bool csr_state_setclear_abdicate; /* * Spinlock for accessing fw_ohci data. Never call out of @@ -1529,7 +1530,9 @@ static void bus_reset_tasklet(unsigned long data) self_id_count, ohci->self_id_buffer); fw_core_handle_bus_reset(&ohci->card, ohci->node_id, generation, - self_id_count, ohci->self_id_buffer); + self_id_count, ohci->self_id_buffer, + ohci->csr_state_setclear_abdicate); + ohci->csr_state_setclear_abdicate = false; } static irqreturn_t irq_handler(int irq, void *data) @@ -2032,13 +2035,16 @@ static u32 ohci_read_csr_reg(struct fw_card *card, int csr_offset) switch (csr_offset) { case CSR_STATE_CLEAR: case CSR_STATE_SET: - /* the controller driver handles only the cmstr bit */ if (ohci->is_root && (reg_read(ohci, OHCI1394_LinkControlSet) & OHCI1394_LinkControl_cycleMaster)) - return CSR_STATE_BIT_CMSTR; + value = CSR_STATE_BIT_CMSTR; else - return 0; + value = 0; + if (ohci->csr_state_setclear_abdicate) + value |= CSR_STATE_BIT_ABDICATE; + + return value; case CSR_NODE_IDS: return reg_read(ohci, OHCI1394_NodeID) << 16; @@ -2078,12 +2084,13 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value) switch (csr_offset) { case CSR_STATE_CLEAR: - /* the controller driver handles only the cmstr bit */ if ((value & CSR_STATE_BIT_CMSTR) && ohci->is_root) { reg_write(ohci, OHCI1394_LinkControlClear, OHCI1394_LinkControl_cycleMaster); flush_writes(ohci); } + if (value & CSR_STATE_BIT_ABDICATE) + ohci->csr_state_setclear_abdicate = false; break; case CSR_STATE_SET: @@ -2092,6 +2099,8 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value) OHCI1394_LinkControl_cycleMaster); flush_writes(ohci); } + if (value & CSR_STATE_BIT_ABDICATE) + ohci->csr_state_setclear_abdicate = true; break; case CSR_NODE_IDS: diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5acb5fc19180..5553018d45d6 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -119,8 +119,7 @@ struct fw_card { int bm_retries; int bm_generation; __be32 bm_transaction_data[2]; - bool bm_abdicate; /* value of csr_abdicate before last bus reset */ - bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */ + bool bm_abdicate; bool priority_budget_implemented; /* controller feature */ bool broadcast_channel_auto_allocated; /* controller feature */ -- cgit v1.2.3 From 33e553fe2b4a983ef34a57ab1440d8d33397bb12 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 20 Jun 2010 22:50:35 +0200 Subject: firewire: remove an unused function argument void (*fw_address_callback_t)(..., int speed, ...) is the speed that a remote node chose to transmit a request to us. In case of split transactions, firewire-core will transmit the response at that speed. 
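For reference, the callback prototype before and after this change, as taken from the include/linux/firewire.h hunk below:

/* Before: the callback received the sender's transmit speed. */
typedef void (*fw_address_callback_t)(struct fw_card *card,
				      struct fw_request *request,
				      int tcode, int destination, int source,
				      int generation, int speed,
				      unsigned long long offset,
				      void *data, size_t length,
				      void *callback_data);

/* After: the speed argument is gone; everything else is unchanged. */
typedef void (*fw_address_callback_t)(struct fw_card *card,
				      struct fw_request *request,
				      int tcode, int destination, int source,
				      int generation,
				      unsigned long long offset,
				      void *data, size_t length,
				      void *callback_data);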
Upper layer drivers on the other hand (firewire-net, -sbp2, firedtv, and userspace drivers) cannot do anything useful with that speed datum, except log it for debug purposes. But data that is merely potentially (not even actually) used for debug purposes does not belong into the API. Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 3 +-- drivers/firewire/core-transaction.c | 14 +++++++------- drivers/firewire/net.c | 4 ++-- drivers/firewire/sbp2.c | 3 +-- drivers/media/dvb/firewire/firedtv-fw.c | 4 ++-- include/linux/firewire.h | 2 +- 6 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index ca72cdaa68c9..4e0478d70d4d 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -632,8 +632,7 @@ static void release_request(struct client *client, static void handle_request(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, - int generation, int speed, - unsigned long long offset, + int generation, unsigned long long offset, void *payload, size_t length, void *callback_data) { struct address_handler_resource *handler = callback_data; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index cb6390fe3686..2f67c8d5ce91 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -802,7 +802,7 @@ static void handle_exclusive_region_request(struct fw_card *card, else handler->address_callback(card, request, tcode, destination, source, - p->generation, p->speed, offset, + p->generation, offset, request->data, request->length, handler->callback_data); } @@ -840,8 +840,8 @@ static void handle_fcp_region_request(struct fw_card *card, if (is_enclosing_handler(handler, offset, request->length)) handler->address_callback(card, NULL, tcode, destination, source, - p->generation, p->speed, - offset, request->data, + p->generation, offset, + request->data, request->length, handler->callback_data); } @@ -951,8 +951,8 @@ static const struct fw_address_region topology_map_region = static void handle_topology_map(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, int generation, - int speed, unsigned long long offset, - void *payload, size_t length, void *callback_data) + unsigned long long offset, void *payload, size_t length, + void *callback_data) { int start; @@ -996,8 +996,8 @@ static void update_split_timeout(struct fw_card *card) static void handle_registers(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, int generation, - int speed, unsigned long long offset, - void *payload, size_t length, void *callback_data) + unsigned long long offset, void *payload, size_t length, + void *callback_data) { int reg = offset & ~CSR_REGISTER_BASE; __be32 *data = payload; diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index 2d3dc7ded0a9..4bb3fb882f63 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -805,8 +805,8 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len, static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r, int tcode, int destination, int source, int generation, - int speed, unsigned long long offset, void *payload, - size_t length, void *callback_data) + unsigned long long offset, void *payload, size_t length, + void *callback_data) { struct fwnet_device *dev = callback_data; int rcode; diff --git 
a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index ae715c82da2e..1931964c4fbf 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -410,8 +410,7 @@ static void free_orb(struct kref *kref) static void sbp2_status_write(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, - int generation, int speed, - unsigned long long offset, + int generation, unsigned long long offset, void *payload, size_t length, void *callback_data) { struct sbp2_logical_unit *lu = callback_data; diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c index 4253b7ab0097..4dcae63f8cff 100644 --- a/drivers/media/dvb/firewire/firedtv-fw.c +++ b/drivers/media/dvb/firewire/firedtv-fw.c @@ -194,8 +194,8 @@ static const struct firedtv_backend backend = { static void handle_fcp(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, int generation, - int speed, unsigned long long offset, - void *payload, size_t length, void *callback_data) + unsigned long long offset, void *payload, size_t length, + void *callback_data) { struct firedtv *f, *fdtv = NULL; struct fw_device *device; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 5553018d45d6..e44b502c8341 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -265,7 +265,7 @@ typedef void (*fw_transaction_callback_t)(struct fw_card *card, int rcode, typedef void (*fw_address_callback_t)(struct fw_card *card, struct fw_request *request, int tcode, int destination, int source, - int generation, int speed, + int generation, unsigned long long offset, void *data, size_t length, void *callback_data); -- cgit v1.2.3 From 604f45167824e18ad5766e51ecf1d4d65f15118d Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 20 Jun 2010 22:52:55 +0200 Subject: firewire: cdev: freeze FW_CDEV_VERSION due to libraw1394 bug libraw1394 v2.0.0...v2.0.5 takes FW_CDEV_VERSION from an externally installed header file and uses it to declare its own implementation level in FW_CDEV_IOC_GET_INFO. This is wrong; it should set the real version for which it was actually written. If we add features to the kernel ABI that require the kernel to check a client's implementation level, we can not trust the client version if it was set from FW_CDEV_VERSION. Hence freeze FW_CDEV_VERSION at the current value (no damage has been done yet), clearly document FW_CDEV_VERSION as a dummy version and what clients are expected to do with fw_cdev_get_info.version, and use a new defined constant (which is not placed into the exported header file) as kernel implementation level. Note, in order to check in client program source code which features are present in an externally installed linux/firewire-cdev.h, use preprocessor directives like #ifdef FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE or #ifdef FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED instead of a check of FW_CDEV_VERSION. Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 7 ++++++- include/linux/firewire-cdev.h | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 8f8c8eeaf046..0cf86bcbeea2 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -46,6 +46,11 @@ #include "core.h" +/* + * ABI version history is documented in linux/firewire-cdev.h. 
+ */ +#define FW_CDEV_KERNEL_VERSION 3 struct client { u32 version; struct fw_device *device; @@ -395,7 +400,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) unsigned long ret = 0; client->version = a->version; - a->version = FW_CDEV_VERSION; + a->version = FW_CDEV_KERNEL_VERSION; a->card = client->device->card->index; down_read(&fw_device_rwsem); diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 6ffb24a1f2f2..0d0cc07358af 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -219,7 +219,7 @@ union fw_cdev_event { struct fw_cdev_event_response response; struct fw_cdev_event_request request; struct fw_cdev_event_iso_interrupt iso_interrupt; - struct fw_cdev_event_iso_resource iso_resource; + struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ }; /* available since kernel version 2.6.22 */ @@ -252,22 +252,32 @@ union fw_cdev_event { #define FW_CDEV_IOC_GET_CYCLE_TIMER2 _IOWR('#', 0x14, struct fw_cdev_get_cycle_timer2) /* - * FW_CDEV_VERSION History + * ABI version history * 1 (2.6.22) - initial version + * (2.6.24) - added %FW_CDEV_IOC_GET_CYCLE_TIMER * 2 (2.6.30) - changed &fw_cdev_event_iso_interrupt.header if * &fw_cdev_create_iso_context.header_size is 8 or more + * - added %FW_CDEV_IOC_*_ISO_RESOURCE*, + * %FW_CDEV_IOC_GET_SPEED, %FW_CDEV_IOC_SEND_BROADCAST_REQUEST, + * %FW_CDEV_IOC_SEND_STREAM_PACKET * (2.6.32) - added time stamp to xmit &fw_cdev_event_iso_interrupt * (2.6.33) - IR has always packet-per-buffer semantics now, not one of * dual-buffer or packet-per-buffer depending on hardware * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable + * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 */ -#define FW_CDEV_VERSION 3 +#define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */ /** * struct fw_cdev_get_info - General purpose information ioctl - * @version: The version field is just a running serial number. - * We never break backwards compatibility, but may add more - * structs and ioctls in later revisions. + * @version: The version field is just a running serial number. Both an + * input parameter (ABI version implemented by the client) and + * output parameter (ABI version implemented by the kernel). + * A client must not fill in an %FW_CDEV_VERSION defined from an + * included kernel header file but the actual version for which + * the client was implemented. This is necessary for forward + * compatibility. We never break backwards compatibility, but + * may add more structs, events, and ioctls in later revisions. * @rom_length: If @rom is non-zero, at most rom_length bytes of configuration * ROM will be copied into that user space address. In either * case, @rom_length is updated with the actual length of the -- cgit v1.2.3 From e205597d188a9ea69ce43f740a14f07b3f5b996a Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 20 Jun 2010 22:53:55 +0200 Subject: firewire: cdev: fix ABI for FCP and address range mapping, add fw_cdev_event_request2 The problem: A target-like userspace driver, e.g. AV/C target or SBP-2/3 target, needs to be able to act as responder and requester. In the latter role, it needs to send requests to nodes from which it received requests. This is currently impossible because fw_cdev_event_request lacks information about sender node ID. Reported-by: Jay Fenlason Libffado + libraw1394 + firewire-core is currently unable to drive two or more audio devices on the same bus.
Reported-by: Arnold Krille This is because libffado requires destination node ID of FCP requests and sender node ID of FCP responses to match. It even prohibits libffado from working with a bus on which libraw1394 opens a /dev/fw* as default ioctl device that does not correspond with the audio device. This is because libraw1394 does not receive the sender node ID from the kernel. Moreover, fw_cdev_event_request makes it impossible to tell unicast and broadcast write requests apart. The fix: Add a replacement of struct fw_cdev_event_request, boringly called struct fw_cdev_event_request2. The new event will be sent to a userspace client instead of the old one if the client claims compatibility with ABI version 4 or later. libraw1394 needs to be extended to make use of the new event, in order to properly support libffado and other FCP or address range mapping users who require correct sender node IDs. Further notes: While we are at it, change back the range of possible values of fw_cdev_event_request.tcode to 0x0...0xb like in ABI version <= 3. The preceding change "firewire: expose extended tcode of incoming lock requests to (userspace) drivers" expanded it to 0x0...0x17 which could catch sloppily coded clients by surprise. The extended range of codes is only used in the new fw_cdev_event_request2.tcode. Jay and I also suggested an alternative approach to fix the ABI for incoming requests: Add an FW_CDEV_IOC_GET_REQUEST_INFO ioctl which can be called after reception of an fw_cdev_event_request, before issuing of the closing FW_CDEV_IOC_SEND_RESPONSE ioctl. The new ioctl would reveal the vital information about a request that fw_cdev_event_request lacks. Jay showed an implementation of this approach. The former event approach adds 27 LOC of rather trivial code to core-cdev.c, the ioctl approach 34 LOC, some of which is nontrivial. The ioctl approach would certainly also add more LOC to userspace programs which require the expanded information on inbound requests. This approach is probably only on the lighter-weight side in case of clients that want to be compatible with kernels that lack the new capability, like libraw1394. However, the code to be added to such libraw1394-like clients in case of the event approach is a straightforward additional switch () case in its event handler. Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 45 ++++++++++++++++++++----- include/linux/firewire-cdev.h | 76 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 0cf86bcbeea2..9b8df2039155 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -49,7 +49,8 @@ /* * ABI version history is documented in linux/firewire-cdev.h.
*/ -#define FW_CDEV_KERNEL_VERSION 3 +#define FW_CDEV_KERNEL_VERSION 4 +#define FW_CDEV_VERSION_EVENT_REQUEST2 4 struct client { u32 version; @@ -176,7 +177,10 @@ struct outbound_transaction_event { struct inbound_transaction_event { struct event event; - struct fw_cdev_event_request request; + union { + struct fw_cdev_event_request request; + struct fw_cdev_event_request2 request2; + } req; }; struct iso_interrupt_event { @@ -645,6 +649,7 @@ static void handle_request(struct fw_card *card, struct fw_request *request, struct address_handler_resource *handler = callback_data; struct inbound_transaction_resource *r; struct inbound_transaction_event *e; + size_t event_size0; void *fcp_frame = NULL; int ret; @@ -678,15 +683,37 @@ static void handle_request(struct fw_card *card, struct fw_request *request, if (ret < 0) goto failed; - e->request.type = FW_CDEV_EVENT_REQUEST; - e->request.tcode = tcode; - e->request.offset = offset; - e->request.length = length; - e->request.handle = r->resource.handle; - e->request.closure = handler->closure; + if (handler->client->version < FW_CDEV_VERSION_EVENT_REQUEST2) { + struct fw_cdev_event_request *req = &e->req.request; + + if (tcode & 0x10) + tcode = TCODE_LOCK_REQUEST; + + req->type = FW_CDEV_EVENT_REQUEST; + req->tcode = tcode; + req->offset = offset; + req->length = length; + req->handle = r->resource.handle; + req->closure = handler->closure; + event_size0 = sizeof(*req); + } else { + struct fw_cdev_event_request2 *req = &e->req.request2; + + req->type = FW_CDEV_EVENT_REQUEST2; + req->tcode = tcode; + req->offset = offset; + req->source_node_id = source; + req->destination_node_id = destination; + req->card = card->index; + req->generation = generation; + req->length = length; + req->handle = r->resource.handle; + req->closure = handler->closure; + event_size0 = sizeof(*req); + } queue_event(handler->client, &e->event, - &e->request, sizeof(e->request), r->data, length); + &e->req, event_size0, r->data, length); return; failed: diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 0d0cc07358af..52c7ffe934ad 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -32,6 +32,9 @@ #define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED 0x04 #define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED 0x05 +/* available since kernel version 2.6.36 */ +#define FW_CDEV_EVENT_REQUEST2 0x06 + /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types * @closure: For arbitrary use by userspace @@ -98,11 +101,46 @@ struct fw_cdev_event_response { }; /** - * struct fw_cdev_event_request - Sent on incoming request to an address region + * struct fw_cdev_event_request - Old version of &fw_cdev_event_request2 * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl * @type: See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST + * @tcode: See &fw_cdev_event_request2 + * @offset: See &fw_cdev_event_request2 + * @handle: See &fw_cdev_event_request2 + * @length: See &fw_cdev_event_request2 + * @data: See &fw_cdev_event_request2 + * + * This event is sent instead of &fw_cdev_event_request2 if the kernel or + * the client implements ABI version <= 3. + * + * Unlike &fw_cdev_event_request2, the sender identity cannot be established, + * broadcast write requests cannot be distinguished from unicast writes, and + * @tcode of lock requests is %TCODE_LOCK_REQUEST. 
+ * + * Requests to the FCP_REQUEST or FCP_RESPONSE register are responded to as + * with &fw_cdev_event_request2, except in kernel 2.6.32 and older which send + * the response packet of the client's %FW_CDEV_IOC_SEND_RESPONSE ioctl. + */ +struct fw_cdev_event_request { + __u64 closure; + __u32 type; + __u32 tcode; + __u64 offset; + __u32 handle; + __u32 length; + __u32 data[0]; +}; + +/** + * struct fw_cdev_event_request2 - Sent on incoming request to an address region + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl + * @type: See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST2 * @tcode: Transaction code of the incoming request * @offset: The offset into the 48-bit per-node address space + * @source_node_id: Sender node ID + * @destination_node_id: Destination node ID + * @card: The index of the card from which the request came + * @generation: Bus generation in which the request is valid * @handle: Reference to the kernel-side pending request * @length: Data length, i.e. the request's payload size in bytes * @data: Incoming data, if any @@ -115,12 +153,42 @@ struct fw_cdev_event_response { * * The payload data for requests carrying data (write and lock requests) * follows immediately and can be accessed through the @data field. + * + * Unlike &fw_cdev_event_request, @tcode of lock requests is one of the + * firewire-core specific %TCODE_LOCK_MASK_SWAP...%TCODE_LOCK_VENDOR_DEPENDENT, + * i.e. encodes the extended transaction code. + * + * @card may differ from &fw_cdev_get_info.card because requests are received + * from all cards of the Linux host. @source_node_id, @destination_node_id, and + * @generation pertain to that card. Destination node ID and bus generation may + * therefore differ from the corresponding fields of the last + * &fw_cdev_event_bus_reset. + * + * @destination_node_id may also differ from the current node ID because of a + * non-local bus ID part or in case of a broadcast write request. Note, a + * client must call an %FW_CDEV_IOC_SEND_RESPONSE ioctl even in case of a + * broadcast write request; the kernel will then release the kernel-side pending + * request but will not actually send a response packet. + * + * In case of a write request to FCP_REQUEST or FCP_RESPONSE, the kernel already + * sent a write response immediately after the request was received; in this + * case the client must still call an %FW_CDEV_IOC_SEND_RESPONSE ioctl to + * release the kernel-side pending request, though another response won't be + * sent. + * + * If the client subsequently needs to initiate requests to the sender node of + * an &fw_cdev_event_request2, it needs to use a device file with matching + * card index, node ID, and generation for outbound requests. 
*/ -struct fw_cdev_event_request { +struct fw_cdev_event_request2 { __u64 closure; __u32 type; __u32 tcode; __u64 offset; + __u32 source_node_id; + __u32 destination_node_id; + __u32 card; + __u32 generation; __u32 handle; __u32 length; __u32 data[0]; @@ -200,6 +268,7 @@ struct fw_cdev_event_iso_resource { * @bus_reset: Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET * @response: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE * @request: Valid if @common.type == %FW_CDEV_EVENT_REQUEST + * @request2: Valid if @common.type == %FW_CDEV_EVENT_REQUEST2 * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT * @iso_resource: Valid if @common.type == * %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or @@ -218,6 +287,7 @@ union fw_cdev_event { struct fw_cdev_event_bus_reset bus_reset; struct fw_cdev_event_response response; struct fw_cdev_event_request request; + struct fw_cdev_event_request2 request2; /* added in 2.6.36 */ struct fw_cdev_event_iso_interrupt iso_interrupt; struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ }; @@ -263,8 +333,10 @@ union fw_cdev_event { * (2.6.32) - added time stamp to xmit &fw_cdev_event_iso_interrupt * (2.6.33) - IR has always packet-per-buffer semantics now, not one of * dual-buffer or packet-per-buffer depending on hardware + * - shared use and auto-response for FCP registers * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 + * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2 */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */ -- cgit v1.2.3 From 3b2b65d68fc87b02ac393a031a4ebb3de84a8218 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 20 Jun 2010 22:54:22 +0200 Subject: firewire: cdev: extend fw_cdev_event_iso_interrupt documentation Add information regarding the 2.6.32 update to the xmit variant of fw_cdev_event_iso_interrupt. Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 52c7ffe934ad..8b9b27373219 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -204,10 +204,21 @@ struct fw_cdev_event_request2 { * @header: Stripped headers, if any * * This event is sent when the controller has completed an &fw_cdev_iso_packet - * with the %FW_CDEV_ISO_INTERRUPT bit set. In the receive case, the headers - * stripped of all packets up until and including the interrupt packet are - * returned in the @header field. The amount of header data per packet is as - * specified at iso context creation by &fw_cdev_create_iso_context.header_size. + * with the %FW_CDEV_ISO_INTERRUPT bit set. + * + * Isochronous transmit events: + * + * In version 1 of the ABI, &header_length is 0. In version 3 and some + * implementations of version 2 of the ABI, &header_length is a multiple of 4 + * and &header contains timestamps of all packets up until the interrupt packet. + * The format of the timestamps is as described below for isochronous reception. + * + * Isochronous receive events: + * + * The headers stripped of all packets up until and including the interrupt + * packet are returned in the @header field. The amount of header data per + * packet is as specified at iso context creation by + * &fw_cdev_create_iso_context.header_size. 
* * In version 1 of this ABI, header data consisted of the 1394 isochronous * packet header, followed by quadlets from the packet payload if -- cgit v1.2.3 From 97dc135947181a6670949a480da56c3ebf8d3715 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Jun 2010 09:52:26 -0400 Subject: NFSv41: Clean up the NFSv4.1 minor version specific operations Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 14 ++++++-------- fs/nfs/nfs4_fs.h | 11 +++++++++++ fs/nfs/nfs4proc.c | 21 ++++++++++++++++++++- include/linux/nfs_fs_sb.h | 7 ++----- 4 files changed, 39 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25b5257b7a1..1df708fd4205 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) @@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_session = NULL; } - clp->cl_call_sync = _nfs4_call_sync; + clp->cl_mvops = nfs_v4_minor_ops[0]; #endif /* CONFIG_NFS_V4_1 */ } @@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); + nfs_callback_down(clp->cl_mvops->minor_version); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp) return error; } - error = nfs_callback_up(clp->cl_minorversion, + error = nfs_callback_up(clp->cl_mvops->minor_version, clp->cl_rpcclient->cl_xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", @@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp) */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { - clp->cl_call_sync = _nfs4_call_sync; - #if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion) { + if (clp->cl_mvops->minor_version) { struct nfs4_session *session = NULL; /* * Create the session and mark it expired. @@ -1158,7 +1157,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return -ENOMEM; clp->cl_session = session; - clp->cl_call_sync = _nfs4_call_sync_session; } #endif /* CONFIG_NFS_V4_1 */ @@ -1454,7 +1452,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout, - parent_client->cl_minorversion); + parent_client->cl_mvops->minor_version); if (error < 0) goto error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index bb1e95530699..5b01705e30f9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -52,6 +52,16 @@ enum nfs4_session_state { NFS4_SESSION_DRAINING, }; +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); +}; + /* * struct rpc_sequence ensures that RPC calls are sent in the exact * order that they appear on the list. 
@@ -260,6 +270,7 @@ static inline int nfs4_init_session(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; extern const u32 nfs4_fattr_bitmap[2]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fc972c6f1ce9..a938daf333da 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -667,7 +667,7 @@ int _nfs4_call_sync(struct nfs_server *server, } #define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ + (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ &(res)->seq_res, (cache_reply)) static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) @@ -5353,6 +5353,18 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .call_sync = _nfs4_call_sync, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .call_sync = _nfs4_call_sync_session, +}; +#endif + /* * Per minor version reboot and network partition recovery ops */ @@ -5378,6 +5390,13 @@ struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { #endif }; +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, +#if defined(CONFIG_NFS_V4_1) + [1] = &nfs_v4_1_minor_ops, +#endif +}; + static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d6e10a4c06e5..c82ee7cd6288 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; struct nfs_server; +struct nfs4_minor_version_ops; /* * The nfs_client identifies our client state to the server. 
@@ -70,11 +71,7 @@ struct nfs_client { */ char cl_ipaddr[48]; unsigned char cl_id_uniquifier; - int (* cl_call_sync)(struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply); + const struct nfs4_minor_version_ops *cl_mvops; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_1 -- cgit v1.2.3 From d77d76ffb638bd013782138cca6d8f4918c5afd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Jun 2010 09:52:27 -0400 Subject: NFSv41: Clean up exclusive create Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 ++++++----------- include/linux/nfs_xdr.h | 6 ++++-- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d1ab0c36e939..5d87563d0c1a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -744,19 +744,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_EXCL) { - if (nfs4_has_persistent_session(server->nfs_client)) { - /* GUARDED */ - p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); - } else { /* EXCLUSIVE4_1 */ - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; - } - } else if (flags & O_CREAT) { + if (flags & O_CREAT) { + u32 *s; + p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51914d7d6cc4..a319cb926abf 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -196,8 +196,10 @@ struct nfs_openargs { __u64 clientid; __u64 id; union { - struct iattr * attrs; /* UNCHECKED, GUARDED */ - nfs4_verifier verifier; /* EXCLUSIVE */ + struct { + struct iattr * attrs; /* UNCHECKED, GUARDED */ + nfs4_verifier verifier; /* EXCLUSIVE */ + }; nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ fmode_t delegation_type; /* CLAIM_PREVIOUS */ } u; -- cgit v1.2.3 From 69da9bcb98ccbfb5d5f751bc13418f1307332925 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 16 Jun 2010 17:57:28 +0200 Subject: ALSA: usb-audio: unify UAC macros and struct names Get rid of the last occurrences of _v1 suffixes, and move the version number right after the "uac" string. Now things are consistent again. Sorry for the back and forth, but it just looks much nicer this way.
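For illustration, a hedged sketch of the new naming in use, modeled on the gadget code changed below (all field values are invented; only the header names and macros come from the patch):

	#include <asm/byteorder.h>
	#include <linux/usb/ch9.h>
	#include <linux/usb/audio.h>

	DECLARE_UAC_AC_HEADER_DESCRIPTOR(1);

	/* Example only: a UAC v1 class-specific AC interface header with one
	 * streaming interface in the collection.  Before this patch the type
	 * was spelled struct uac_ac_header_descriptor_v1_1. */
	static struct uac1_ac_header_descriptor_1 ac_header = {
		.bLength	    = UAC_DT_AC_HEADER_SIZE(1),
		.bDescriptorType    = USB_DT_CS_INTERFACE,
		.bDescriptorSubtype = UAC_HEADER,
		.bcdADC		    = __constant_cpu_to_le16(0x0100), /* UAC v1.0 */
		.wTotalLength	    = __constant_cpu_to_le16(UAC_DT_AC_HEADER_SIZE(1)),
		.bInCollection	    = 1,
		.baInterfaceNr	    = { 1 },
	};

Apart from the _v1-to-uac1_ rename, such initializers are unchanged by this patch.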
Signed-off-by: Daniel Mack Signed-off-by: Takashi Iwai --- drivers/usb/gadget/f_audio.c | 6 +++--- drivers/usb/gadget/gmidi.c | 2 +- include/linux/usb/audio-v2.h | 2 +- include/linux/usb/audio.h | 12 ++++++------ sound/usb/card.c | 2 +- sound/usb/endpoint.c | 4 ++-- sound/usb/mixer.c | 14 +++++++------- 7 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c index b91115f84b13..1f48ceb55a77 100644 --- a/drivers/usb/gadget/f_audio.c +++ b/drivers/usb/gadget/f_audio.c @@ -61,7 +61,7 @@ DECLARE_UAC_AC_HEADER_DESCRIPTOR(2); #define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH + UAC_DT_INPUT_TERMINAL_SIZE \ + UAC_DT_OUTPUT_TERMINAL_SIZE + UAC_DT_FEATURE_UNIT_SIZE(0)) /* B.3.2 Class-Specific AC Interface Descriptor */ -static struct uac_ac_header_descriptor_v1_2 ac_header_desc = { +static struct uac1_ac_header_descriptor_2 ac_header_desc = { .bLength = UAC_DT_AC_HEADER_LENGTH, .bDescriptorType = USB_DT_CS_INTERFACE, .bDescriptorSubtype = UAC_HEADER, @@ -125,7 +125,7 @@ static struct usb_audio_control_selector feature_unit = { }; #define OUTPUT_TERMINAL_ID 3 -static struct uac_output_terminal_descriptor_v1 output_terminal_desc = { +static struct uac1_output_terminal_descriptor output_terminal_desc = { .bLength = UAC_DT_OUTPUT_TERMINAL_SIZE, .bDescriptorType = USB_DT_CS_INTERFACE, .bDescriptorSubtype = UAC_OUTPUT_TERMINAL, @@ -155,7 +155,7 @@ static struct usb_interface_descriptor as_interface_alt_1_desc = { }; /* B.4.2 Class-Specific AS Interface Descriptor */ -static struct uac_as_header_descriptor_v1 as_header_desc = { +static struct uac1_as_header_descriptor as_header_desc = { .bLength = UAC_DT_AS_HEADER_SIZE, .bDescriptorType = USB_DT_CS_INTERFACE, .bDescriptorSubtype = UAC_AS_GENERAL, diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c index 2b56ce621852..b7bf88019b06 100644 --- a/drivers/usb/gadget/gmidi.c +++ b/drivers/usb/gadget/gmidi.c @@ -238,7 +238,7 @@ static const struct usb_interface_descriptor ac_interface_desc = { }; /* B.3.2 Class-Specific AC Interface Descriptor */ -static const struct uac_ac_header_descriptor_v1_1 ac_header_desc = { +static const struct uac1_ac_header_descriptor_1 ac_header_desc = { .bLength = UAC_DT_AC_HEADER_SIZE(1), .bDescriptorType = USB_DT_CS_INTERFACE, .bDescriptorSubtype = USB_MS_HEADER, diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index 383b94ba8c20..716aebe339e8 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -121,7 +121,7 @@ struct uac2_feature_unit_descriptor { /* 4.9.2 Class-Specific AS Interface Descriptor */ -struct uac_as_header_descriptor_v2 { +struct uac2_as_header_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h index c51200c715e5..a54b8255d75f 100644 --- a/include/linux/usb/audio.h +++ b/include/linux/usb/audio.h @@ -39,8 +39,8 @@ #define UAC_MIXER_UNIT 0x04 #define UAC_SELECTOR_UNIT 0x05 #define UAC_FEATURE_UNIT 0x06 -#define UAC_PROCESSING_UNIT_V1 0x07 -#define UAC_EXTENSION_UNIT_V1 0x08 +#define UAC1_PROCESSING_UNIT 0x07 +#define UAC1_EXTENSION_UNIT 0x08 /* A.6 Audio Class-Specific AS Interface Descriptor Subtypes */ #define UAC_AS_GENERAL 0x01 @@ -151,7 +151,7 @@ /* Terminal Control Selectors */ /* 4.3.2 Class-Specific AC Interface Descriptor */ -struct uac_ac_header_descriptor_v1 { +struct uac1_ac_header_descriptor { __u8 bLength; /* 8 + n */ __u8 bDescriptorType; /* 
USB_DT_CS_INTERFACE */ __u8 bDescriptorSubtype; /* UAC_MS_HEADER */ @@ -165,7 +165,7 @@ struct uac_ac_header_descriptor_v1 { /* As above, but more useful for defining your own descriptors: */ #define DECLARE_UAC_AC_HEADER_DESCRIPTOR(n) \ -struct uac_ac_header_descriptor_v1_##n { \ +struct uac1_ac_header_descriptor_##n { \ __u8 bLength; \ __u8 bDescriptorType; \ __u8 bDescriptorSubtype; \ @@ -205,7 +205,7 @@ struct uac_input_terminal_descriptor { #define UAC_TERMINAL_CS_COPY_PROTECT_CONTROL 0x01 /* 4.3.2.2 Output Terminal Descriptor */ -struct uac_output_terminal_descriptor_v1 { +struct uac1_output_terminal_descriptor { __u8 bLength; /* in bytes: 9 */ __u8 bDescriptorType; /* CS_INTERFACE descriptor type */ __u8 bDescriptorSubtype; /* OUTPUT_TERMINAL descriptor subtype */ @@ -395,7 +395,7 @@ static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_desc } /* 4.5.2 Class-Specific AS Interface Descriptor */ -struct uac_as_header_descriptor_v1 { +struct uac1_as_header_descriptor { __u8 bLength; /* in bytes: 7 */ __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ __u8 bDescriptorSubtype; /* AS_GENERAL */ diff --git a/sound/usb/card.c b/sound/usb/card.c index 7a8ac1d81be7..9feb00c831a0 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -217,7 +217,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) switch (protocol) { case UAC_VERSION_1: { - struct uac_ac_header_descriptor_v1 *h1 = control_header; + struct uac1_ac_header_descriptor *h1 = control_header; if (!h1->bInCollection) { snd_printk(KERN_INFO "skipping empty audio interface (v1)\n"); diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 6f6596cf2b19..2af0f9e3dcdf 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -275,7 +275,7 @@ int snd_usb_parse_audio_endpoints(struct snd_usb_audio *chip, int iface_no) /* get audio formats */ switch (protocol) { case UAC_VERSION_1: { - struct uac_as_header_descriptor_v1 *as = + struct uac1_as_header_descriptor *as = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_AS_GENERAL); if (!as) { @@ -297,7 +297,7 @@ int snd_usb_parse_audio_endpoints(struct snd_usb_audio *chip, int iface_no) case UAC_VERSION_2: { struct uac2_input_terminal_descriptor *input_term; struct uac2_output_terminal_descriptor *output_term; - struct uac_as_header_descriptor_v2 *as = + struct uac2_as_header_descriptor *as = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_AS_GENERAL); if (!as) { diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 736d134cc03c..ba54eb6bb0c9 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -582,9 +582,9 @@ static int get_term_name(struct mixer_build *state, struct usb_audio_term *iterm switch (iterm->type >> 16) { case UAC_SELECTOR_UNIT: strcpy(name, "Selector"); return 8; - case UAC_PROCESSING_UNIT_V1: + case UAC1_PROCESSING_UNIT: strcpy(name, "Process Unit"); return 12; - case UAC_EXTENSION_UNIT_V1: + case UAC1_EXTENSION_UNIT: strcpy(name, "Ext Unit"); return 8; case UAC_MIXER_UNIT: strcpy(name, "Mixer"); return 5; @@ -672,8 +672,8 @@ static int check_input_term(struct mixer_build *state, int id, struct usb_audio_ term->name = uac_selector_unit_iSelector(d); return 0; } - case UAC_PROCESSING_UNIT_V1: - case UAC_EXTENSION_UNIT_V1: { + case UAC1_PROCESSING_UNIT: + case UAC1_EXTENSION_UNIT: { struct uac_processing_unit_descriptor *d = p1; if (d->bNrInPins) { id = d->baSourceID[0]; @@ -1855,13 +1855,13 @@ static int parse_audio_unit(struct mixer_build *state, int unitid) return 
parse_audio_selector_unit(state, unitid, p1); case UAC_FEATURE_UNIT: return parse_audio_feature_unit(state, unitid, p1); - case UAC_PROCESSING_UNIT_V1: + case UAC1_PROCESSING_UNIT: /* UAC2_EFFECT_UNIT has the same value */ if (state->mixer->protocol == UAC_VERSION_1) return parse_audio_processing_unit(state, unitid, p1); else return 0; /* FIXME - effect units not implemented yet */ - case UAC_EXTENSION_UNIT_V1: + case UAC1_EXTENSION_UNIT: /* UAC2_PROCESSING_UNIT_V2 has the same value */ if (state->mixer->protocol == UAC_VERSION_1) return parse_audio_extension_unit(state, unitid, p1); @@ -1925,7 +1925,7 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) p = NULL; while ((p = snd_usb_find_csint_desc(hostif->extra, hostif->extralen, p, UAC_OUTPUT_TERMINAL)) != NULL) { if (mixer->protocol == UAC_VERSION_1) { - struct uac_output_terminal_descriptor_v1 *desc = p; + struct uac1_output_terminal_descriptor *desc = p; if (desc->bLength < sizeof(*desc)) continue; /* invalid descriptor? */ -- cgit v1.2.3 From 157a57b6fae7d3c6d24b7623dcc6679c6d244621 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 16 Jun 2010 17:57:30 +0200 Subject: ALSA: usb-audio: move and add some comments Also add a list of open topics. Signed-off-by: Daniel Mack Signed-off-by: Takashi Iwai --- include/linux/usb/audio-v2.h | 15 +++++++++++++++ sound/usb/clock.c | 16 ++++++++++++++-- sound/usb/mixer.c | 24 ++++++++++++++++-------- 3 files changed, 45 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index 716aebe339e8..964cb603f7c7 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -18,6 +18,21 @@ /* v1.0 and v2.0 of this standard have many things in common. For the rest * of the definitions, please refer to audio.h */ +/* + * bmControl field decoders + * + * From the USB Audio spec v2.0: + * + * bmaControls() is a (ch+1)-element array of 4-byte bitmaps, + * each containing a set of bit pairs. If a Control is present, + * it must be Host readable. If a certain Control is not + * present then the bit pair must be set to 0b00. + * If a Control is present but read-only, the bit pair must be + * set to 0b01. If a Control is also Host programmable, the bit + * pair must be set to 0b11. The value 0b10 is not allowed. + * + */ + static inline bool uac2_control_is_readable(u32 bmControls, u8 control) { return (bmControls >> (control * 2)) & 0x1; diff --git a/sound/usb/clock.c b/sound/usb/clock.c index 386b09c5ce73..7279d6190875 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -120,8 +120,6 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, int source_id) return !!data; } -/* Try to find the clock source ID of a given clock entity */ - static int __uac_clock_find_source(struct snd_usb_audio *chip, struct usb_host_interface *host_iface, int entity_id, unsigned long *visited) @@ -154,6 +152,8 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, if (ret < 0) return ret; + /* Selector values are one-based */ + if (ret > selector->bNrInPins || ret < 1) { printk(KERN_ERR "%s(): selector reported illegal value, id %d, ret %d\n", @@ -176,6 +176,17 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, return -EINVAL; } +/* + * For all kinds of sample rate settings and other device queries, + * the clock source (end-leaf) must be used. However, clock selectors, + * clock multipliers and sample rate converters may be specified as + * clock source input to terminal. 
This functions walks the clock path + * to its end and tries to find the source. + * + * The 'visited' bitfield is used internally to detect recursive loops. + * + * Returns the clock source UnitID (>=0) on success, or an error. + */ int snd_usb_clock_find_source(struct snd_usb_audio *chip, struct usb_host_interface *host_iface, int entity_id) @@ -246,6 +257,7 @@ static int set_sample_rate_v2(struct snd_usb_audio *chip, int iface, return clock; if (!uac_clock_source_is_valid(chip, clock)) { + /* TODO: should we try to find valid clock setups by ourself? */ snd_printk(KERN_ERR "%d:%d:%d: clock source %d is not valid, cannot use\n", dev->devnum, iface, fmt->altsetting, clock); return -ENXIO; diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index ba54eb6bb0c9..1163ec3ca8a0 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -26,6 +26,22 @@ * */ +/* + * TODOs, for both the mixer and the streaming interfaces: + * + * - support for UAC2 effect units + * - support for graphical equalizers + * - RANGE and MEM set commands (UAC2) + * - RANGE and MEM interrupt dispatchers (UAC2) + * - audio channel clustering (UAC2) + * - audio sample rate converter units (UAC2) + * - proper handling of clock multipliers (UAC2) + * - dispatch clock change notifications (UAC2) + * - stop PCM streams which use a clock that became invalid + * - stop PCM streams which use a clock selector that has changed + * - parse available sample rates again when clock sources changed + */ + #include #include #include @@ -1199,14 +1215,6 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, void } } else { /* UAC_VERSION_2 */ for (i = 0; i < 30/2; i++) { - /* From the USB Audio spec v2.0: - bmaControls() is a (ch+1)-element array of 4-byte bitmaps, - each containing a set of bit pairs. If a Control is present, - it must be Host readable. If a certain Control is not - present then the bit pair must be set to 0b00. - If a Control is present but read-only, the bit pair must be - set to 0b01. If a Control is also Host programmable, the bit - pair must be set to 0b11. The value 0b10 is not allowed. */ unsigned int ch_bits = 0; unsigned int ch_read_only = 0; -- cgit v1.2.3 From 45a73372efe4a63f44aa2e1125d4a777c2fdc8d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 23 Jun 2010 23:00:37 +0200 Subject: hw_breakpoints: Fix per task breakpoint tracking Freeing a perf event can happen in several ways. A task calls perf_event_exit_task() right before exiting. This helper will detach all the events from the task context and queue their removal through free_event() if they are child tasks. The task also loses its context reference there. Releasing the breakpoint slot from the constraint table is made from free_event() that calls release_bp_slot(). We count the number of breakpoints this task is running by looking at the task's perf_event_ctxp and iterating through its attached events. But at this time, the reference to this context has been cleaned up already. So looking at the event->ctx instead of task->perf_event_ctxp to count the remaining breakpoints should solve the problem. At least it would for child breakpoints, but not for parent ones. If the parent exits before the child, it will remove all its events from the context but free_event() will be called later, on fd release time. And checking the number of breakpoints the task has attached to its context at this time is unreliable as all events have been removed from the context. To solve this, we keep track of the list of per task breakpoints. 
On top of it, we maintain our array of numbers of breakpoints used by the tasks. We use the context address as a task id. So, instead of looking at the number of events attached to a context, we walk through our list of per task breakpoints and count the number of breakpoints that use the same ctx as the one to be reserved or released from the constraint table, and update the count on top of this result. In addition to fixing a bad refcounting, this also resolves a warning reported by Paul. Badness at /home/paulus/kernel/perf/kernel/hw_breakpoint.c:114 NIP: c0000000000cb470 LR: c0000000000cb46c CTR: c00000000032d9b8 REGS: c000000118e7b570 TRAP: 0700 Not tainted (2.6.35-rc3-perf-00008-g76b0f13 ) MSR: 9000000000029032 CR: 44004424 XER: 000fffff TASK = c0000001187dcad0[3143] 'perf' THREAD: c000000118e78000 CPU: 1 GPR00: c0000000000cb46c c000000118e7b7f0 c0000000009866a0 0000000000000020 GPR04: 0000000000000000 000000000000001d 0000000000000000 0000000000000001 GPR08: c0000000009bed68 c00000000086dff8 c000000000a5bf10 0000000000000001 GPR12: 0000000024004422 c00000000ffff200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000018 00000000101150f4 GPR20: 0000000010206b40 0000000000000000 0000000000000000 00000000101150f4 GPR24: c0000001199090c0 0000000000000001 0000000000000000 0000000000000001 GPR28: 0000000000000000 0000000000000000 c0000000008ec290 0000000000000000 NIP [c0000000000cb470] .task_bp_pinned+0x5c/0x12c LR [c0000000000cb46c] .task_bp_pinned+0x58/0x12c Call Trace: [c000000118e7b7f0] [c0000000000cb46c] .task_bp_pinned+0x58/0x12c (unreliable) [c000000118e7b8a0] [c0000000000cb584] .toggle_bp_task_slot+0x44/0xe4 [c000000118e7b940] [c0000000000cb6c8] .toggle_bp_slot+0xa4/0x164 [c000000118e7b9f0] [c0000000000cbafc] .release_bp_slot+0x44/0x6c [c000000118e7ba80] [c0000000000c4178] .bp_perf_event_destroy+0x10/0x24 [c000000118e7bb00] [c0000000000c4aec] .free_event+0x180/0x1bc [c000000118e7bbc0] [c0000000000c54c4] .perf_event_release_kernel+0x14c/0x170 Reported-by: Paul Mackerras Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- include/linux/perf_event.h | 6 ++-- kernel/hw_breakpoint.c | 78 ++++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 63b5aa5dce69..0dd5f8ad77ac 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -533,8 +533,10 @@ struct hw_perf_event { struct hrtimer hrtimer; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT - /* breakpoint */ - struct arch_hw_breakpoint info; + struct { /* breakpoint */ + struct arch_hw_breakpoint info; + struct list_head bp_list; + }; #endif }; local64_t prev_count; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e0602..e34d94d50924 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) return 0; } -static int task_bp_pinned(struct
task_struct *tsk, enum bp_type_idx type) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct perf_event_context *ctx = bp->ctx; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - if (find_slot_idx(bp) == type) - count += hw_breakpoint_weight(bp); + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->ctx == ctx && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk, type); + slots->pinned += task_bp_pinned(bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk, type); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(bp, type); old_idx = old_count - 1; idx = old_idx + weight; + /* tsk_pinned[n] is the number of tasks having n breakpoints */ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[idx]++; @@ -222,23 +215,30 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + list_add_tail(&bp->hw.bp_list, &bp_task_head); } /* @@ -301,6 +301,10 @@ static int __reserve_bp_slot(struct perf_event *bp) weight = hw_breakpoint_weight(bp); 
fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ -- cgit v1.2.3 From 5cfaf214856eb934759ae500a0b812dd06a00bd9 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Wed, 23 Jun 2010 09:17:53 +0900 Subject: perf: Fix argument of perf_arch_fetch_caller_regs "struct regs" was used as the argument type in the fallback definition of perf_arch_fetch_caller_regs(). It should be "struct pt_regs". This fixes various build errors in archs that have CONFIG_PERF_EVENTS=y but no overridden implementation of perf_arch_fetch_caller_regs. cc1: warnings being treated as errors In file included from include/linux/ftrace_event.h:8, from include/trace/syscall.h:6, from include/linux/syscalls.h:75, from arch/sh/kernel/sys_sh32.c:9: include/linux/perf_event.h:937: error: 'struct regs' declared inside parameter list include/linux/perf_event.h:937: error: its scope is only this definition or declaration, which is probably not what you want include/linux/perf_event.h: In function 'perf_fetch_caller_regs': include/linux/perf_event.h:952: error: passing argument 1 of 'perf_arch_fetch_caller_regs' from incompatible pointer type Signed-off-by: Nobuhiro Iwamatsu Reported-by: Stephen Rothwell Cc: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Frederic Weisbecker --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dd5f8ad77ac..937495c25073 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -936,7 +936,7 @@ extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void -perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { } +perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* -- cgit v1.2.3 From b505ff5e7291cca6379549297e3852ce3622d550 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 18 Jun 2010 11:09:59 -0600 Subject: of: kill struct of_device Now that the device tree node pointer has been moved out of struct of_device and into the common struct device, there isn't anything unique about of_device anymore. In fact, there isn't much need for a separate of_bus when all busses have access to OF style probing. arch/powerpc and arch/microblaze are moving away from using the of_bus and using the regular platform bus instead for mmio devices. This patch makes of_device the same as platform_device as a stepping stone in migrating of_platform_drivers over to the platform bus. Signed-off-by: Grant Likely Acked-by: David S.
Miller Cc: Michal Simek Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell --- arch/microblaze/include/asm/of_device.h | 10 ---------- arch/powerpc/include/asm/of_device.h | 11 ----------- arch/powerpc/include/asm/smu.h | 4 ++-- arch/sparc/include/asm/device.h | 4 ++-- arch/sparc/include/asm/of_device.h | 14 -------------- drivers/net/niu.h | 4 ++-- include/linux/of_device.h | 17 +++++++++++++++++ 7 files changed, 23 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h index 73cb98040982..0a5f3f914b42 100644 --- a/arch/microblaze/include/asm/of_device.h +++ b/arch/microblaze/include/asm/of_device.h @@ -15,16 +15,6 @@ #include #include -/* - * The of_device is a kind of "base class" that is a superset of - * struct device for use by devices attached to an OF node and - * probed using OF properties. - */ -struct of_device { - struct device dev; /* Generic device interface */ - struct pdev_archdata archdata; -}; - extern ssize_t of_device_get_modalias(struct of_device *ofdev, char *str, ssize_t len); diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h index 444e97e2982e..cb36632f953c 100644 --- a/arch/powerpc/include/asm/of_device.h +++ b/arch/powerpc/include/asm/of_device.h @@ -5,17 +5,6 @@ #include #include -/* - * The of_device is a kind of "base class" that is a superset of - * struct device for use by devices attached to an OF node and - * probed using OF properties. - */ -struct of_device -{ - struct device dev; /* Generic device interface */ - struct pdev_archdata archdata; -}; - extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); diff --git a/arch/powerpc/include/asm/smu.h b/arch/powerpc/include/asm/smu.h index 7ae2753da565..e3bdada8c542 100644 --- a/arch/powerpc/include/asm/smu.h +++ b/arch/powerpc/include/asm/smu.h @@ -457,8 +457,8 @@ extern void smu_poll(void); */ extern int smu_init(void); extern int smu_present(void); -struct of_device; -extern struct of_device *smu_get_ofdev(void); +struct platform_device; +extern struct platform_device *smu_get_ofdev(void); /* diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h index f9740d065fe7..fb220e482039 100644 --- a/arch/sparc/include/asm/device.h +++ b/arch/sparc/include/asm/device.h @@ -9,13 +9,13 @@ #include struct device_node; -struct of_device; +struct platform_device; struct dev_archdata { void *iommu; void *stc; void *host_controller; - struct of_device *op; + struct platform_device *op; int numa_node; }; diff --git a/arch/sparc/include/asm/of_device.h b/arch/sparc/include/asm/of_device.h index 6d1844a547b4..22b9828fe693 100644 --- a/arch/sparc/include/asm/of_device.h +++ b/arch/sparc/include/asm/of_device.h @@ -7,20 +7,6 @@ #include #include -/* - * The of_device is a kind of "base class" that is a superset of - * struct device for use by devices attached to an OF node and - * probed using OF properties. 
- */ -struct of_device -{ - struct device dev; - u32 num_resources; - struct resource *resource; - - struct pdev_archdata archdata; -}; - extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name); extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size); diff --git a/drivers/net/niu.h b/drivers/net/niu.h index d6715465f35d..a41fa8ebe05f 100644 --- a/drivers/net/niu.h +++ b/drivers/net/niu.h @@ -3236,7 +3236,7 @@ struct niu_phy_ops { int (*link_status)(struct niu *np, int *); }; -struct of_device; +struct platform_device; struct niu { void __iomem *regs; struct net_device *dev; @@ -3297,7 +3297,7 @@ struct niu { struct niu_vpd vpd; u32 eeprom_len; - struct of_device *op; + struct platform_device *op; void __iomem *vir_regs_1; void __iomem *vir_regs_2; }; diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 11651facc5f1..a3ae5900fc58 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -3,9 +3,26 @@ #ifdef CONFIG_OF_DEVICE #include +#include #include #include + +/* + * The of_device *was* a kind of "base class" that was a superset of + * struct device for use by devices attached to an OF node and probed + * using OF properties. However, the important bit of OF-style + * probing, namely the device node pointer, has been moved into the + * common struct device when CONFIG_OF is set to make OF-style probing + * available to all bus types. So now, just make of_device and + * platform_device equivalent so that current of_platform bus users + * can be transparently migrated over to using the platform bus. + * + * This line will go away once all references to of_device are removed + * from the kernel. + */ +#define of_device platform_device + #include #define to_of_device(d) container_of(d, struct of_device, dev) -- cgit v1.2.3 From e3873444990dd6f8a095d1f72b5ad45192f8c506 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 18 Jun 2010 11:09:59 -0600 Subject: of/irq: Move irq_of_parse_and_map() to common code Merge common code between PowerPC and Microblaze. SPARC implements irq_of_parse_and_map(), but the implementation is different, so it does not use this code. Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Cc: Michal Simek Cc: "David S. 
Miller" Cc: Stephen Rothwell Cc: Jeremy Kerr --- arch/microblaze/include/asm/irq.h | 24 -------------------- arch/microblaze/include/asm/prom.h | 26 +--------------------- arch/microblaze/kernel/irq.c | 14 ++---------- arch/powerpc/include/asm/irq.h | 28 ------------------------ arch/powerpc/include/asm/prom.h | 27 +---------------------- arch/powerpc/kernel/irq.c | 14 ++---------- arch/sparc/include/asm/prom.h | 1 - drivers/of/Kconfig | 4 ++++ drivers/of/Makefile | 1 + drivers/of/irq.c | 45 ++++++++++++++++++++++++++++++++++++++ drivers/of/of_mdio.c | 1 + drivers/of/of_spi.c | 1 + include/linux/of_irq.h | 41 ++++++++++++++++++++++++++++++++++ 13 files changed, 99 insertions(+), 128 deletions(-) create mode 100644 drivers/of/irq.c create mode 100644 include/linux/of_irq.h (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/irq.h b/arch/microblaze/include/asm/irq.h index 31a35c33df63..ec5583d6111c 100644 --- a/arch/microblaze/include/asm/irq.h +++ b/arch/microblaze/include/asm/irq.h @@ -27,17 +27,6 @@ extern unsigned int nr_irq; struct pt_regs; extern void do_IRQ(struct pt_regs *regs); -/** - * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space - * @device: Device node of the device whose interrupt is to be mapped - * @index: Index of the interrupt to map - * - * This function is a wrapper that chains of_irq_map_one() and - * irq_create_of_mapping() to make things easier to callers - */ -struct device_node; -extern unsigned int irq_of_parse_and_map(struct device_node *dev, int index); - /** FIXME - not implement * irq_dispose_mapping - Unmap an interrupt * @virq: linux virq number of the interrupt to unmap @@ -62,17 +51,4 @@ struct irq_host; extern unsigned int irq_create_mapping(struct irq_host *host, irq_hw_number_t hwirq); -/** - * irq_create_of_mapping - Map a hardware interrupt into linux virq space - * @controller: Device node of the interrupt controller - * @inspec: Interrupt specifier from the device-tree - * @intsize: Size of the interrupt specifier from the device-tree - * - * This function is identical to irq_create_mapping except that it takes - * as input informations straight from the device-tree (typically the results - * of the of_irq_map_*() functions. - */ -extern unsigned int irq_create_of_mapping(struct device_node *controller, - u32 *intspec, unsigned int intsize); - #endif /* _ASM_MICROBLAZE_IRQ_H */ diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index e7d67a329bd7..e9fb2eb0035d 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -20,6 +20,7 @@ #ifndef __ASSEMBLY__ #include +#include #include #include #include @@ -92,18 +93,6 @@ extern const void *of_get_mac_address(struct device_node *np); * OF interrupt mapping */ -/* This structure is returned when an interrupt is mapped. 
The controller - * field needs to be put() after use - */ - -#define OF_MAX_IRQ_SPEC 4 /* We handle specifiers of at most 4 cells */ - -struct of_irq { - struct device_node *controller; /* Interrupt controller node */ - u32 size; /* Specifier size */ - u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */ -}; - /** * of_irq_map_init - Initialize the irq remapper * @flags: flags defining workarounds to enable @@ -138,19 +127,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize, const u32 *addr, struct of_irq *out_irq); -/** - * of_irq_map_one - Resolve an interrupt for a device - * @device: the device whose interrupt is to be resolved - * @index: index of the interrupt to resolve - * @out_irq: structure of_irq filled by this function - * - * This function resolves an interrupt, walking the tree, for a given - * device-tree node. It's the high level pendant to of_irq_map_raw(). - * It also implements the workarounds for OldWolrd Macs. - */ -extern int of_irq_map_one(struct device_node *device, int index, - struct of_irq *out_irq); - /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved diff --git a/arch/microblaze/kernel/irq.c b/arch/microblaze/kernel/irq.c index 8f120aca123d..dd32b09b4a3c 100644 --- a/arch/microblaze/kernel/irq.c +++ b/arch/microblaze/kernel/irq.c @@ -17,20 +17,10 @@ #include #include #include +#include #include -unsigned int irq_of_parse_and_map(struct device_node *dev, int index) -{ - struct of_irq oirq; - - if (of_irq_map_one(dev, index, &oirq)) - return NO_IRQ; - - return oirq.specifier[0]; -} -EXPORT_SYMBOL_GPL(irq_of_parse_and_map); - static u32 concurrent_irq; void __irq_entry do_IRQ(struct pt_regs *regs) @@ -104,7 +94,7 @@ unsigned int irq_create_mapping(struct irq_host *host, irq_hw_number_t hwirq) EXPORT_SYMBOL_GPL(irq_create_mapping); unsigned int irq_create_of_mapping(struct device_node *controller, - u32 *intspec, unsigned int intsize) + const u32 *intspec, unsigned int intsize) { return intspec[0]; } diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index e054baef1845..4e3051595b2b 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -300,34 +300,6 @@ extern unsigned int irq_alloc_virt(struct irq_host *host, */ extern void irq_free_virt(unsigned int virq, unsigned int count); - -/* -- OF helpers -- */ - -/** - * irq_create_of_mapping - Map a hardware interrupt into linux virq space - * @controller: Device node of the interrupt controller - * @inspec: Interrupt specifier from the device-tree - * @intsize: Size of the interrupt specifier from the device-tree - * - * This function is identical to irq_create_mapping except that it takes - * as input informations straight from the device-tree (typically the results - * of the of_irq_map_*() functions. 
- */ -extern unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize); - -/** - * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space - * @device: Device node of the device whose interrupt is to be mapped - * @index: Index of the interrupt to map - * - * This function is a wrapper that chains of_irq_map_one() and - * irq_create_of_mapping() to make things easier to callers - */ -extern unsigned int irq_of_parse_and_map(struct device_node *dev, int index); - -/* -- End OF helpers -- */ - /** * irq_early_init - Init irq remapping subsystem */ diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index ddd408a93b5a..47d41b67c94d 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -108,18 +109,6 @@ extern const void *of_get_mac_address(struct device_node *np); * OF interrupt mapping */ -/* This structure is returned when an interrupt is mapped. The controller - * field needs to be put() after use - */ - -#define OF_MAX_IRQ_SPEC 4 /* We handle specifiers of at most 4 cells */ - -struct of_irq { - struct device_node *controller; /* Interrupt controller node */ - u32 size; /* Specifier size */ - u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */ -}; - /** * of_irq_map_init - Initialize the irq remapper * @flags: flags defining workarounds to enable @@ -154,20 +143,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize, const u32 *addr, struct of_irq *out_irq); - -/** - * of_irq_map_one - Resolve an interrupt for a device - * @device: the device whose interrupt is to be resolved - * @index: index of the interrupt to resolve - * @out_irq: structure of_irq filled by this function - * - * This function resolves an interrupt, walking the tree, for a given - * device-tree node. It's the high level pendant to of_irq_map_raw(). - * It also implements the workarounds for OldWolrd Macs. - */ -extern int of_irq_map_one(struct device_node *device, int index, - struct of_irq *out_irq); - /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 30817d9b20cb..2676ef288bf5 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -53,6 +53,8 @@ #include #include #include +#include +#include #include #include @@ -813,18 +815,6 @@ unsigned int irq_create_of_mapping(struct device_node *controller, } EXPORT_SYMBOL_GPL(irq_create_of_mapping); -unsigned int irq_of_parse_and_map(struct device_node *dev, int index) -{ - struct of_irq oirq; - - if (of_irq_map_one(dev, index, &oirq)) - return NO_IRQ; - - return irq_create_of_mapping(oirq.controller, oirq.specifier, - oirq.size); -} -EXPORT_SYMBOL_GPL(irq_of_parse_and_map); - void irq_dispose_mapping(unsigned int virq) { struct irq_host *host; diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index f845828ca4c6..ac695742df85 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -56,7 +56,6 @@ extern void of_fill_in_cpu_data(void); * register them in the of_device objects, whereas powerpc computes them * on request. 
*/ -extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); static inline void irq_dispose_mapping(unsigned int virq) { } diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 7cecc8fea9bd..b87495efa16e 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -6,6 +6,10 @@ config OF_DYNAMIC def_bool y depends on OF && PPC_OF +config OF_IRQ + def_bool y + depends on OF && !SPARC + config OF_DEVICE def_bool y depends on OF && (SPARC || PPC_OF || MICROBLAZE) diff --git a/drivers/of/Makefile b/drivers/of/Makefile index f232cc98ce00..3631a5ea0b47 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,5 +1,6 @@ obj-y = base.o obj-$(CONFIG_OF_FLATTREE) += fdt.o +obj-$(CONFIG_OF_IRQ) += irq.o obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o obj-$(CONFIG_OF_I2C) += of_i2c.o diff --git a/drivers/of/irq.c b/drivers/of/irq.c new file mode 100644 index 000000000000..9b3397c27096 --- /dev/null +++ b/drivers/of/irq.c @@ -0,0 +1,45 @@ +/* + * Derived from arch/i386/kernel/irq.c + * Copyright (C) 1992 Linus Torvalds + * Adapted from arch/i386 by Gary Thomas + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Updated and modified by Cort Dougan + * Copyright (C) 1996-2001 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras + * Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This file contains the code used to make IRQ descriptions in the + * device tree to actual irq numbers on an interrupt controller + * driver. + */ + +#include +#include +#include +#include +#include + +/** + * irq_of_parse_and_map - Parse and map an interrupt into linux virq space + * @device: Device node of the device whose interrupt is to be mapped + * @index: Index of the interrupt to map + * + * This function is a wrapper that chains of_irq_map_one() and + * irq_create_of_mapping() to make things easier to callers + */ +unsigned int irq_of_parse_and_map(struct device_node *dev, int index) +{ + struct of_irq oirq; + + if (of_irq_map_one(dev, index, &oirq)) + return NO_IRQ; + + return irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); +} +EXPORT_SYMBOL_GPL(irq_of_parse_and_map); diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 42a6715f8e84..1fce00eb421b 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/drivers/of/of_spi.c b/drivers/of/of_spi.c index 5fed7e3c7da3..d504f1d1324b 100644 --- a/drivers/of/of_spi.c +++ b/drivers/of/of_spi.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /** diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h new file mode 100644 index 000000000000..0e37c05b7dd8 --- /dev/null +++ b/include/linux/of_irq.h @@ -0,0 +1,41 @@ +#ifndef __OF_IRQ_H +#define __OF_IRQ_H + +#if defined(CONFIG_OF) +struct of_irq; +#include +#include + +/* + * irq_of_parse_and_map() is used ba all OF enabled platforms; but SPARC + * implements it differently. However, the prototype is the same for all, + * so declare it here regardless of the CONFIG_OF_IRQ setting. 
+ */ +extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); + +#if defined(CONFIG_OF_IRQ) +/** + * of_irq - container for device_node/irq_specifier pair for an irq controller + * @controller: pointer to interrupt controller device tree node + * @size: size of interrupt specifier + * @specifier: array of cells @size long specifying the specific interrupt + * + * This structure is returned when an interrupt is mapped. The controller + * field needs to be put() after use + */ +#define OF_MAX_IRQ_SPEC 4 /* We handle specifiers of at most 4 cells */ +struct of_irq { + struct device_node *controller; /* Interrupt controller node */ + u32 size; /* Specifier size */ + u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */ +}; + +extern int of_irq_map_one(struct device_node *device, int index, + struct of_irq *out_irq); +extern unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, + unsigned int intsize); + +#endif /* CONFIG_OF_IRQ */ +#endif /* CONFIG_OF */ +#endif /* __OF_IRQ_H */ -- cgit v1.2.3 From c9642c49aae1272d7c24008a40ae614470b957a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 May 2010 16:22:30 +0800 Subject: tracing: Use a global field list for all syscall exit events All syscall exit events have the same fields. The kernel size drops 2.5K:
   text	   data	    bss	     dec	   hex	filename
7018612	2034376	7251132	16304120	f8c7f8	vmlinux.o.orig
7018612	2031888	7251132	16301632	f8be40	vmlinux.o
Signed-off-by: Li Zefan LKML-Reference: <4BFA3746.8070100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 2 -- include/trace/syscall.h | 1 - kernel/trace/trace_syscalls.c | 7 ++++--- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7f614ce274a9..7994bd44eb56 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -165,7 +165,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; .enter_event = &event_enter_##sname, \ .exit_event = &event_exit_##sname, \ .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ - .exit_fields = LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \ }; #define SYSCALL_DEFINE0(sname) \ @@ -180,7 +179,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; .enter_event = &event_enter__##sname, \ .exit_event = &event_exit__##sname, \ .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \ - .exit_fields = LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \ }; \ asmlinkage long sys_##sname(void) #else diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 257e08960d7b..31966a4fb8cc 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -26,7 +26,6 @@ struct syscall_metadata { const char **types; const char **args; struct list_head enter_fields; - struct list_head exit_fields; struct ftrace_event_call *enter_event; struct ftrace_event_call *exit_event; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) static struct list_head * syscall_get_exit_fields(struct ftrace_event_call *call) { - struct syscall_metadata *entry = call->data; - - return &entry->exit_fields; + return &syscall_exit_fields; } struct trace_event_functions enter_syscall_print_funcs = { -- cgit v1.2.3 From a1d0ce8213e9ddf4046ef5ba95c55762d075f541 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Jun 2010 11:22:06 -0400 Subject: tracing: Use class->reg() for all registering of events Because kprobes and syscalls need special processing to register events, the class->reg() method was created to handle the differences. But instead of creating a default ->reg for perf and ftrace events, the code was scattered with: if (class->reg) class->reg(); else default_reg(); This is messy and can also lead to bugs. This patch cleans up this code and creates a default reg() entry for the events allowing for the code to directly call the class->reg() without the condition. Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 2 ++ kernel/trace/trace_event_perf.c | 19 +++----------- kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++-------------- 4 files changed, 44 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0af31cd335d6..01df7ca4ead7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,9 @@ struct ftrace_event_class { int (*raw_init)(struct ftrace_event_call *); }; +extern int ftrace_event_reg(struct ftrace_event_call *event, + enum trace_reg type); + enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fc013a8201e9..55c1fd1bbc3d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -439,6 +439,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * .fields = LIST_HEAD_INIT(event_class_##call.fields), * .raw_init = trace_event_raw_init, * .probe = ftrace_raw_event_##call, + * .reg = ftrace_event_reg, * }; * * static struct ftrace_event_call __used @@ -567,6 +568,7 @@ static struct ftrace_event_class __used event_class_##call = { \ .fields = LIST_HEAD_INIT(event_class_##call.fields),\ .raw_init = trace_event_raw_init, \ .probe = ftrace_raw_event_##call, \ + .reg = ftrace_event_reg, \ _TRACE_PERF_INIT(call) \ }; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6053982dc302..23751659582e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -54,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - if (tp_event->class->reg) - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - else - ret = tracepoint_probe_register(tp_event->name, - tp_event->class->perf_probe, - tp_event); - + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); if (ret) goto fail; @@ -94,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && - (tp_event->class->perf_probe || - tp_event->class->reg) && + tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; @@ 
-136,12 +128,7 @@ void perf_trace_destroy(struct perf_event *p_event) if (--tp_event->perf_refcount > 0) goto out; - if (tp_event->class->reg) - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - else - tracepoint_probe_unregister(tp_event->name, - tp_event->class->perf_probe, - tp_event); + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* * Ensure our callback won't be called anymore. See diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 69bee4cc0e10..e8e6043f4d29 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -141,6 +141,35 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -151,23 +180,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; tracing_stop_cmdline_record(); - if (call->class->reg) - call->class->reg(call, TRACE_REG_UNREGISTER); - else - tracepoint_probe_unregister(call->name, - call->class->probe, - call); + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { tracing_start_cmdline_record(); - if (call->class->reg) - ret = call->class->reg(call, TRACE_REG_REGISTER); - else - ret = tracepoint_probe_register(call->name, - call->class->probe, - call); + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " @@ -205,8 +224,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (match && @@ -332,7 +350,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. 
*/ - if (call->class && (call->class->probe || call->class->reg)) + if (call->class && call->class->reg) return call; } @@ -485,8 +503,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) @@ -977,12 +994,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->probe || call->class->reg) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); #ifdef CONFIG_PERF_EVENTS - if (call->event.type && (call->class->perf_probe || call->class->reg)) + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); #endif -- cgit v1.2.3 From b56c0d8937e665a27d90517ee7a746d0aa05af46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:09 +0200 Subject: kthread: implement kthread_worker Implement a simple work processor for kthreads. This is to ease using kthreads. A single-thread workqueue used to be used for things like this, but workqueue won't guarantee a fixed kthread association anymore, to enable worker sharing. This can be used in cases where a specific kthread association is necessary, for example, when the work should have RT priority or be assigned to a certain cgroup. Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/kthread.h | 64 +++++++++++++++++++++ kernel/kthread.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index aabc8a13ba71..f93cb6979edc 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -34,4 +34,68 @@ int kthread_should_stop(void); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; +/* + * Simple work processor based on kthread. + * + * This provides an easier way to make use of kthreads. A kthread_work + * can be queued and flushed using queue/flush_kthread_work() + * respectively. Queued kthread_works are processed by a kthread + * running kthread_worker_fn(). + * + * A kthread_work can't be freed while it is executing.
+ */ +struct kthread_work; +typedef void (*kthread_work_func_t)(struct kthread_work *work); + +struct kthread_worker { + spinlock_t lock; + struct list_head work_list; + struct task_struct *task; +}; + +struct kthread_work { + struct list_head node; + kthread_work_func_t func; + wait_queue_head_t done; + atomic_t flushing; + int queue_seq; + int done_seq; +}; + +#define KTHREAD_WORKER_INIT(worker) { \ + .lock = SPIN_LOCK_UNLOCKED, \ + .work_list = LIST_HEAD_INIT((worker).work_list), \ + } + +#define KTHREAD_WORK_INIT(work, fn) { \ + .node = LIST_HEAD_INIT((work).node), \ + .func = (fn), \ + .done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \ + .flushing = ATOMIC_INIT(0), \ + } + +#define DEFINE_KTHREAD_WORKER(worker) \ + struct kthread_worker worker = KTHREAD_WORKER_INIT(worker) + +#define DEFINE_KTHREAD_WORK(work, fn) \ + struct kthread_work work = KTHREAD_WORK_INIT(work, fn) + +static inline void init_kthread_worker(struct kthread_worker *worker) +{ + *worker = (struct kthread_worker)KTHREAD_WORKER_INIT(*worker); +} + +static inline void init_kthread_work(struct kthread_work *work, + kthread_work_func_t fn) +{ + *work = (struct kthread_work)KTHREAD_WORK_INIT(*work, fn); +} + +int kthread_worker_fn(void *worker_ptr); + +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work); +void flush_kthread_work(struct kthread_work *work); +void flush_kthread_worker(struct kthread_worker *worker); + #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..8b63c7fee73b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -247,3 +249,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works.
+ */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @worker for async execution. @worker + * must have been initialized with init_kthread_worker(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished.
+ */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); -- cgit v1.2.3 From 82805ab77d25643f579d90397dcd34f05d1b750a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:09 +0200 Subject: kthread: implement kthread_data() Implement kthread_data() which takes @task pointing to a kthread and returns @data specified when creating the kthread. The caller is responsible for ensuring the validity of @task when calling this function. Signed-off-by: Tejun Heo --- include/linux/kthread.h | 1 + kernel/kthread.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index f93cb6979edc..685ea65eb803 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -30,6 +30,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); int kthread_should_stop(void); +void *kthread_data(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/kernel/kthread.c b/kernel/kthread.c index 8b63c7fee73b..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -37,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -56,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -66,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; -- cgit v1.2.3 From c790bce0481857412c964c5e9d46d56e41c4b051 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:09 +0200 Subject: workqueue: kill RT workqueue With stop_machine() converted to use cpu_stop, RT workqueue doesn't have any user left. Kill RT workqueue support. 
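Callers that genuinely need an RT worker can instead own the thread themselves. A minimal sketch, assuming the kthread_worker interface added earlier in this series; the my_worker/my_rt_worker names are illustrative, not part of this patch:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static DEFINE_KTHREAD_WORKER(my_worker);

static int __init my_worker_init(void)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *task;

	/* attach a dedicated kthread to the worker */
	task = kthread_run(kthread_worker_fn, &my_worker, "my_rt_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	/* the caller, not the workqueue core, now picks the scheduling policy */
	sched_setscheduler_nocheck(task, SCHED_FIFO, &param);
	return 0;
}

Work items are then queued with queue_kthread_work(&my_worker, ...) and behave like they would on a single-thread workqueue, but on a thread whose priority the driver controls.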
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 20 +++++++++----------- kernel/workqueue.c | 6 ------ 2 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9466e860d8c2..0697946c66a1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -181,12 +181,11 @@ static inline void destroy_work_on_stack(struct work_struct *work) { } extern struct workqueue_struct * -__create_workqueue_key(const char *name, int singlethread, - int freezeable, int rt, struct lock_class_key *key, - const char *lock_name); +__create_workqueue_key(const char *name, int singlethread, int freezeable, + struct lock_class_key *key, const char *lock_name); #ifdef CONFIG_LOCKDEP -#define __create_workqueue(name, singlethread, freezeable, rt) \ +#define __create_workqueue(name, singlethread, freezeable) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ @@ -197,19 +196,18 @@ __create_workqueue_key(const char *name, int singlethread, __lock_name = #name; \ \ __create_workqueue_key((name), (singlethread), \ - (freezeable), (rt), &__key, \ + (freezeable), &__key, \ __lock_name); \ }) #else -#define __create_workqueue(name, singlethread, freezeable, rt) \ - __create_workqueue_key((name), (singlethread), (freezeable), (rt), \ +#define __create_workqueue(name, singlethread, freezeable) \ + __create_workqueue_key((name), (singlethread), (freezeable), \ NULL, NULL) #endif -#define create_workqueue(name) __create_workqueue((name), 0, 0, 0) -#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1) -#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0) -#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0) +#define create_workqueue(name) __create_workqueue((name), 0, 0) +#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1) +#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..1a47fbf92fae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -62,7 +62,6 @@ struct workqueue_struct { const char *name; int singlethread; int freezeable; /* Freeze threads during suspend */ - int rt; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -947,7 +946,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; const char *fmt = is_wq_single_threaded(wq) ? 
"%s" : "%s/%d"; struct task_struct *p; @@ -963,8 +961,6 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) */ if (IS_ERR(p)) return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; trace_workqueue_creation(cwq->thread, cpu); @@ -986,7 +982,6 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) struct workqueue_struct *__create_workqueue_key(const char *name, int singlethread, int freezeable, - int rt, struct lock_class_key *key, const char *lock_name) { @@ -1008,7 +1003,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); wq->singlethread = singlethread; wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); if (singlethread) { -- cgit v1.2.3 From 4690c4ab56c71919893ca25252f2dd65b58188c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:10 +0200 Subject: workqueue: misc/cosmetic updates Make the following updates in preparation of concurrency managed workqueue. None of these changes causes any visible behavior difference. * Add comments and adjust indentations to data structures and several functions. * Rename wq_per_cpu() to get_cwq() and swap the position of two parameters for consistency. Convert a direct per_cpu_ptr() access to wq->cpu_wq to get_cwq(). * Add work_static() and Update set_wq_data() such that it sets the flags part to WORK_STRUCT_PENDING | WORK_STRUCT_STATIC if static | @extra_flags. * Move santiy check on work->entry emptiness from queue_work_on() to __queue_work() which all queueing paths share. * Make __queue_work() take @cpu and @wq instead of @cwq. * Restructure flush_work() and __create_workqueue_key() to make them easier to modify. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 5 ++ kernel/workqueue.c | 131 +++++++++++++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0697946c66a1..e724dafc9e6d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -96,9 +96,14 @@ struct execute_work { #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); +static inline unsigned int work_static(struct work_struct *work) +{ + return *work_data_bits(work) & (1 << WORK_STRUCT_STATIC); +} #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } +static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1a47fbf92fae..c56146a755e5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,6 +36,16 @@ #define CREATE_TRACE_POINTS #include +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Set during initialization and read-only afterwards. + * + * L: cwq->lock protected. Access with cwq->lock held. + * + * W: workqueue_lock protected. + */ + /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). 
@@ -48,8 +58,8 @@ struct cpu_workqueue_struct { wait_queue_head_t more_work; struct work_struct *current_work; - struct workqueue_struct *wq; - struct task_struct *thread; + struct workqueue_struct *wq; /* I: the owning workqueue */ + struct task_struct *thread; } ____cacheline_aligned; /* @@ -57,13 +67,13 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; + struct cpu_workqueue_struct *cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + const char *name; /* I: workqueue name */ int singlethread; int freezeable; /* Freeze threads during suspend */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; @@ -204,8 +214,8 @@ static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) ? cpu_singlethread_map : cpu_populated_map; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { if (unlikely(is_wq_single_threaded(wq))) cpu = singlethread_cpu; @@ -217,15 +227,13 @@ struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) * - Must *only* be called if the pending flag is set */ static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) { - unsigned long new; - BUG_ON(!work_pending(work)); - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) | + (1UL << WORK_STRUCT_PENDING) | extra_flags); } /* @@ -233,9 +241,7 @@ static inline void set_wq_data(struct work_struct *work, */ static inline void clear_wq_data(struct work_struct *work) { - unsigned long flags = *work_data_bits(work) & - (1UL << WORK_STRUCT_STATIC); - atomic_long_set(&work->data, flags); + atomic_long_set(&work->data, work_static(work)); } static inline @@ -244,29 +250,47 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } +/** + * insert_work - insert a work into cwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work into @cwq after @head. + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { trace_workqueue_insertion(cwq->thread, work); - set_wq_data(work, cwq); + /* we own @work, set data and link */ + set_wq_data(work, cwq, extra_flags); + /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). 
*/ smp_wmb(); + list_add_tail(&work->entry, head); wake_up(&cwq->more_work); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); unsigned long flags; debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); + BUG_ON(!list_empty(&work->entry)); + insert_work(cwq, work, &cwq->worklist, 0); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -308,8 +332,7 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -320,9 +343,8 @@ static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -366,7 +388,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -430,6 +452,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) spin_unlock_irq(&cwq->lock); } +/** + * worker_thread - the worker thread function + * @__cwq: cwq to serve + * + * The cwq worker thread function. + */ static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; @@ -468,6 +496,17 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @head: insertion point + * + * Insert barrier @barr into @cwq before @head. + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { @@ -479,11 +518,10 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - init_completion(&barr->done); debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, 0); } static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -517,9 +555,6 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. 
*/ void flush_workqueue(struct workqueue_struct *wq) { @@ -558,7 +593,6 @@ int flush_work(struct work_struct *work) lock_map_acquire(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); - prev = NULL; spin_lock_irq(&cwq->lock); if (!list_empty(&work->entry)) { /* @@ -567,22 +601,22 @@ int flush_work(struct work_struct *work) */ smp_rmb(); if (unlikely(cwq != get_wq_data(work))) - goto out; + goto already_gone; prev = &work->entry; } else { if (cwq->current_work != work) - goto out; + goto already_gone; prev = &cwq->worklist; } insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + spin_unlock_irq(&cwq->lock); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&cwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -665,7 +699,7 @@ static void wait_on_work(struct work_struct *work) cpu_map = wq_cpu_map(wq); for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + wait_on_cpu_work(get_cwq(cpu, wq), work); } static int __cancel_work_timer(struct work_struct *work, @@ -782,9 +816,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_wq_data(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -991,13 +1024,11 @@ struct workqueue_struct *__create_workqueue_key(const char *name, wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + if (!wq->cpu_wq) + goto err; wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); @@ -1041,6 +1072,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name, wq = NULL; } return wq; +err: + if (wq) { + free_percpu(wq->cpu_wq); + kfree(wq); + } + return NULL; } EXPORT_SYMBOL_GPL(__create_workqueue_key); -- cgit v1.2.3 From 97e37d7b9e65a6ac939f796f91081135b7a08acc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:10 +0200 Subject: workqueue: merge feature parameters into flags Currently, __create_workqueue_key() takes @singlethread and @freezeable parameters and stores them separately in workqueue_struct. Merge them into a single flags parameter and field, and use WQ_FREEZEABLE and WQ_SINGLE_THREAD.
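The call-site conversion is mechanical; a short sketch of the before and after (the queue name is illustrative):

/* before: positional booleans, (name, singlethread, freezeable) */
wq = __create_workqueue("my_wq", 1, 1);

/* after: one self-describing flags word */
wq = __create_workqueue("my_wq", WQ_FREEZEABLE | WQ_SINGLE_THREAD);

The create_workqueue(), create_freezeable_workqueue() and create_singlethread_workqueue() wrappers keep their signatures, so their callers need no change.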
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 25 +++++++++++++++---------- kernel/workqueue.c | 17 +++++++---------- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e724dafc9e6d..d89cfc143b1a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -184,13 +184,17 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define work_clear_pending(work) \ clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) +enum { + WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ + WQ_SINGLE_THREAD = 1 << 1, /* no per-cpu worker */ +}; extern struct workqueue_struct * -__create_workqueue_key(const char *name, int singlethread, int freezeable, +__create_workqueue_key(const char *name, unsigned int flags, struct lock_class_key *key, const char *lock_name); #ifdef CONFIG_LOCKDEP -#define __create_workqueue(name, singlethread, freezeable) \ +#define __create_workqueue(name, flags) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ @@ -200,19 +204,20 @@ __create_workqueue_key(const char *name, int singlethread, int freezeable, else \ __lock_name = #name; \ \ - __create_workqueue_key((name), (singlethread), \ - (freezeable), &__key, \ + __create_workqueue_key((name), (flags), &__key, \ __lock_name); \ }) #else -#define __create_workqueue(name, singlethread, freezeable) \ - __create_workqueue_key((name), (singlethread), (freezeable), \ - NULL, NULL) +#define __create_workqueue(name, flags) \ + __create_workqueue_key((name), (flags), NULL, NULL) #endif -#define create_workqueue(name) __create_workqueue((name), 0, 0) -#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1) -#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) +#define create_workqueue(name) \ + __create_workqueue((name), 0) +#define create_freezeable_workqueue(name) \ + __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD) +#define create_singlethread_workqueue(name) \ + __create_workqueue((name), WQ_SINGLE_THREAD) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c56146a755e5..68e4dd808ec0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -67,11 +67,10 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { + unsigned int flags; /* I: WQ_* flags */ struct cpu_workqueue_struct *cpu_wq; /* I: cwq's */ struct list_head list; /* W: list of all workqueues */ const char *name; /* I: workqueue name */ - int singlethread; - int freezeable; /* Freeze threads during suspend */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -203,9 +202,9 @@ static const struct cpumask *cpu_singlethread_map __read_mostly; static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. 
*/ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static inline bool is_wq_single_threaded(struct workqueue_struct *wq) { - return wq->singlethread; + return wq->flags & WQ_SINGLE_THREAD; } static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) @@ -463,7 +462,7 @@ static int worker_thread(void *__cwq) struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); - if (cwq->wq->freezeable) + if (cwq->wq->flags & WQ_FREEZEABLE) set_freezable(); for (;;) { @@ -1013,8 +1012,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) } struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, + unsigned int flags, struct lock_class_key *key, const char *lock_name) { @@ -1030,13 +1028,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name, if (!wq->cpu_wq) goto err; + wq->flags = flags; wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; INIT_LIST_HEAD(&wq->list); - if (singlethread) { + if (flags & WQ_SINGLE_THREAD) { cwq = init_cpu_workqueue(wq, singlethread_cpu); err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); -- cgit v1.2.3 From 22df02bb3fab24af97bff4c69cc6fd8529fc66fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:10 +0200 Subject: workqueue: define masks for work flags and conditionalize STATIC flags Work flags are about to see more traditional mask handling. Define WORK_STRUCT_*_BIT as the bit position constant and redefine WORK_STRUCT_* as bit masks. Also, make the WORK_STRUCT_STATIC_* flags conditional on CONFIG_DEBUG_OBJECTS_WORK. While at it, re-define these constants as enums and use WORK_STRUCT_STATIC instead of hard-coding 2 in WORK_DATA_STATIC_INIT().
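The split matters because the two forms feed different interfaces: the atomic bitops want a bit position, while flag composition wants a mask. A short sketch of both uses, mirroring the patterns in this patch:

/* bit position constants go to the atomic bitops... */
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
	__queue_work(cpu, wq, work);

/* ...while the mask forms compose whole flag words in one store */
atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
		WORK_STRUCT_PENDING | extra_flags);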
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 29 +++++++++++++++++++++-------- kernel/workqueue.c | 12 ++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d89cfc143b1a..d60c5701ab45 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -22,12 +22,25 @@ typedef void (*work_func_t)(struct work_struct *work); */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) +enum { + WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ +#ifdef CONFIG_DEBUG_OBJECTS_WORK + WORK_STRUCT_STATIC_BIT = 1, /* static initializer (debugobjects) */ +#endif + + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, +#ifdef CONFIG_DEBUG_OBJECTS_WORK + WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, +#else + WORK_STRUCT_STATIC = 0, +#endif + + WORK_STRUCT_FLAG_MASK = 3UL, + WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, +}; + struct work_struct { atomic_long_t data; -#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */ -#define WORK_STRUCT_STATIC 1 /* static initializer (debugobjects) */ -#define WORK_STRUCT_FLAG_MASK (3UL) -#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP @@ -36,7 +49,7 @@ struct work_struct { }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT(0) -#define WORK_DATA_STATIC_INIT() ATOMIC_LONG_INIT(2) +#define WORK_DATA_STATIC_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_STATIC) struct delayed_work { struct work_struct work; @@ -98,7 +111,7 @@ extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); static inline unsigned int work_static(struct work_struct *work) { - return *work_data_bits(work) & (1 << WORK_STRUCT_STATIC); + return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } @@ -167,7 +180,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * @work: The work item in question */ #define work_pending(work) \ - test_bit(WORK_STRUCT_PENDING, work_data_bits(work)) + test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently @@ -182,7 +195,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * @work: The work item in question */ #define work_clear_pending(work) \ - clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) + clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) enum { WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 68e4dd808ec0..5c49d762293b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -115,7 +115,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. 
*/ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -232,7 +232,7 @@ static inline void set_wq_data(struct work_struct *work, BUG_ON(!work_pending(work)); atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) | - (1UL << WORK_STRUCT_PENDING) | extra_flags); + WORK_STRUCT_PENDING | extra_flags); } /* @@ -330,7 +330,7 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = 1; } @@ -380,7 +380,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -516,7 +516,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); debug_work_activate(&barr->work); @@ -628,7 +628,7 @@ static int try_to_grab_pending(struct work_struct *work) struct cpu_workqueue_struct *cwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* -- cgit v1.2.3 From 0f900049cbe2767d47c2a62b54f0e822e1d66840 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:11 +0200 Subject: workqueue: update cwq alignment The work->data field is used for two purposes. It points to the cwq it's queued on and the lower bits are used for flags. Currently, two bits are reserved which is always safe as 4 byte alignment is guaranteed on every architecture. However, future changes will need more flag bits. On SMP, the percpu allocator is capable of honoring larger alignment (there are other users which depend on it) and larger alignment works just fine. On UP, the percpu allocator is a thin wrapper around kzalloc/kfree() and doesn't honor alignment requests. This patch introduces WORK_STRUCT_FLAG_BITS and implements alloc/free_cwqs() which guarantee max(1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)) alignment both on SMP and UP. On SMP, simply wrapping the percpu allocator is enough. On UP, extra space is allocated so that the cwq can be aligned and the original pointer can be stored after it, which is used in the free path. * The alignment problem on UP was reported by Michal Simek.
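The alignment requirement exists because work->data multiplexes a cwq pointer with flag bits. A sketch of the decode, mirroring get_wq_data() and the masks defined above:

unsigned long data = atomic_long_read(&work->data);

/*
 * The low WORK_STRUCT_FLAG_BITS of the cwq address must be zero,
 * which is exactly what alloc_cwqs() guarantees...
 */
struct cpu_workqueue_struct *cwq =
	(void *)(data & WORK_STRUCT_WQ_DATA_MASK);

/* ...leaving those bits free to carry the per-work flags */
bool pending = data & WORK_STRUCT_PENDING;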
Signed-off-by: Tejun Heo Cc: Christoph Lameter Cc: Ingo Molnar Cc: Frederic Weisbecker Reported-by: Michal Simek --- include/linux/workqueue.h | 5 +++- kernel/workqueue.c | 60 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d60c5701ab45..b90958a037dc 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -26,6 +26,9 @@ enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 1, /* static initializer (debugobjects) */ + WORK_STRUCT_FLAG_BITS = 2, +#else + WORK_STRUCT_FLAG_BITS = 1, #endif WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, @@ -35,7 +38,7 @@ enum { WORK_STRUCT_STATIC = 0, #endif - WORK_STRUCT_FLAG_MASK = 3UL, + WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc78956ccf03..74a38499b19a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,7 +46,9 @@ /* * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * possible cpu). The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { @@ -59,7 +61,7 @@ struct cpu_workqueue_struct { struct workqueue_struct *wq; /* I: the owning workqueue */ struct task_struct *thread; -} ____cacheline_aligned; +}; /* * The externally visible workqueue abstraction is an array of @@ -967,6 +969,53 @@ int current_is_keventd(void) } +static struct cpu_workqueue_struct *alloc_cwqs(void) +{ + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); + struct cpu_workqueue_struct *cwqs; +#ifndef CONFIG_SMP + void *ptr; + + /* + * On UP, percpu allocator doesn't honor alignment parameter + * and simply uses arch-dependent default. Allocate enough + * room to align cwq and put an extra pointer at the end + * pointing back to the originally allocated pointer which + * will be used for free. + * + * FIXME: This really belongs to UP percpu code. Update UP + * percpu code to honor alignment and remove this ugliness. 
+ */ + ptr = __alloc_percpu(size + align + sizeof(void *), 1); + cwqs = PTR_ALIGN(ptr, align); + *(void **)per_cpu_ptr(cwqs + 1, 0) = ptr; +#else + /* On SMP, percpu allocator can do it itself */ + cwqs = __alloc_percpu(size, align); +#endif + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align)); + return cwqs; +} + +static void free_cwqs(struct cpu_workqueue_struct *cwqs) +{ +#ifndef CONFIG_SMP + /* on UP, the pointer to free is stored right after the cwq */ + if (cwqs) + free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0)); +#else + free_percpu(cwqs); +#endif +} + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct workqueue_struct *wq = cwq->wq; @@ -1012,7 +1061,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, if (!wq) goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + wq->cpu_wq = alloc_cwqs(); if (!wq->cpu_wq) goto err; @@ -1031,6 +1080,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, for_each_possible_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); cwq->wq = wq; cwq->cpu = cpu; spin_lock_init(&cwq->lock); @@ -1059,7 +1109,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, return wq; err: if (wq) { - free_percpu(wq->cpu_wq); + free_cwqs(wq->cpu_wq); kfree(wq); } return NULL; } EXPORT_SYMBOL_GPL(__create_workqueue_key); -- cgit v1.2.3 From 73f53c4aa732eced5fcb1844d3d452c30905f20f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:11 +0200 Subject: workqueue: reimplement workqueue flushing using color coded works Reimplement workqueue flushing using color coded works. wq has the current work color which is painted on the works being issued via cwqs. Flushing a workqueue is achieved by advancing the current work colors of cwqs and waiting for all the works which have any of the previous colors to drain. Currently there are 16 possible colors, one is reserved for no color and 15 colors are usable, allowing 14 concurrent flushes. When the color space gets full, flush attempts are batched up and processed together when a color frees up, so even with many concurrent flushers, the new implementation won't build up a huge queue of flushers which has to be processed one after another. Only works which are queued via __queue_work() are colored. Works which are directly put on the queue using insert_work() use NO_COLOR and don't participate in workqueue flushing. Currently only works used for work-specific flushes fall in this category. This new implementation leaves only cleanup_workqueue_thread() as the user of flush_cpu_workqueue(). Just make its users use flush_workqueue() and kthread_stop() directly and kill cleanup_workqueue_thread(). As workqueue flushing doesn't use the barrier request anymore, the comment describing the complex synchronization around it in cleanup_workqueue_thread() is removed together with the function. This new implementation is to allow having and sharing multiple workers per cpu. Please note that one more bit is reserved for a future work flag by this patch. This is to avoid shifting bits and updating comments later.
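A sketch of how the color machinery fits together, using the helpers this patch adds (their definitions appear in the diff below):

/* queueing paints the work with the cwq's current work color */
cwq->nr_in_flight[cwq->work_color]++;
insert_work(cwq, work, &cwq->worklist,
	    work_color_to_flags(cwq->work_color));

/* completion reads the color back and retires it */
cwq_dec_nr_in_flight(cwq, get_work_color(work));

/* a flusher advances the color; 15 usable colors wrap around */
int next_color = work_next_color(wq->work_color);	/* (color + 1) % WORK_NR_COLORS */

Advancing past a color and then waiting for its nr_in_flight counters to reach zero is what makes a flush precise: only works issued before the flush carry the old color.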
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 21 ++- kernel/workqueue.c | 355 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 322 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b90958a037dc..8762f62103d8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -26,11 +26,13 @@ enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 1, /* static initializer (debugobjects) */ - WORK_STRUCT_FLAG_BITS = 2, + WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ #else - WORK_STRUCT_FLAG_BITS = 1, + WORK_STRUCT_COLOR_SHIFT = 2, /* color for workqueue flushing */ #endif + WORK_STRUCT_COLOR_BITS = 4, + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, @@ -38,6 +40,21 @@ enum { WORK_STRUCT_STATIC = 0, #endif + /* + * The last color is no color used for works which don't + * participate in workqueue flushing. + */ + WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, + WORK_NO_COLOR = WORK_NR_COLORS, + + /* + * Reserve 6 bits off of cwq pointer w/ debugobjects turned + * off. This makes cwqs aligned to 64 bytes which isn't too + * excessive while allowing 15 workqueue flush colors. + */ + WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + + WORK_STRUCT_COLOR_BITS, + WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 74a38499b19a..56e47c59d73b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,8 @@ * * L: cwq->lock protected. Access with cwq->lock held. * + * F: wq->flush_mutex protected. + * * W: workqueue_lock protected. */ @@ -60,9 +62,22 @@ struct cpu_workqueue_struct { unsigned int cpu; struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ struct task_struct *thread; }; +/* + * Structure used to wait for workqueue flush. 
+ */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -71,6 +86,15 @@ struct workqueue_struct { unsigned int flags; /* I: WQ_* flags */ struct cpu_workqueue_struct *cpu_wq; /* I: cwq's */ struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -207,6 +231,22 @@ static struct cpu_workqueue_struct *target_cwq(unsigned int cpu, return get_cwq(cpu, wq); } +static unsigned int work_color_to_flags(int color) +{ + return color << WORK_STRUCT_COLOR_SHIFT; +} + +static int get_work_color(struct work_struct *work) +{ + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; +} + /* * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set @@ -273,7 +313,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); BUG_ON(!list_empty(&work->entry)); - insert_work(cwq, work, &cwq->worklist, 0); + cwq->nr_in_flight[cwq->work_color]++; + insert_work(cwq, work, &cwq->worklist, + work_color_to_flags(cwq->work_color)); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -386,6 +428,44 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. 
+ */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + /** * process_one_work - process single work * @cwq: cwq to process work for @@ -404,6 +484,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { work_func_t f = work->func; + int work_color; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -417,6 +498,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq, /* claim and process */ debug_work_deactivate(work); cwq->current_work = work; + work_color = get_work_color(work); list_del_init(&work->entry); spin_unlock_irq(&cwq->lock); @@ -443,6 +525,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq, /* we're done with it, release */ cwq->current_work = NULL; + cwq_dec_nr_in_flight(cwq, work_color); } static void run_workqueue(struct cpu_workqueue_struct *cwq) @@ -529,29 +612,78 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, init_completion(&barr->done); debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head, 0); + insert_work(cwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR)); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. 
+ */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; + bool wait = false; + unsigned int cpu; - WARN_ON(cwq->thread == current); - - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_possible_cpu(cpu) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + spin_lock_irq(&cwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&cwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -566,13 +698,143 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void flush_workqueue(struct workqueue_struct *wq) { - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_possible_cpu(cpu) - flush_cpu_workqueue(get_cwq(cpu, wq)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. + * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. 
+ */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -659,6 +921,7 @@ static int try_to_grab_pending(struct work_struct *work) if (cwq == get_wq_data(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(cwq, get_work_color(work)); ret = 1; } } @@ -1066,6 +1329,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, goto err; wq->flags = flags; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -1083,6 +1350,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); cwq->wq = wq; cwq->cpu = cpu; + cwq->flush_color = -1; spin_lock_init(&cwq->lock); INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); @@ -1116,33 +1384,6 @@ err: } EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ - /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. - */ - if (cwq->thread == NULL) - return; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. 
- * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - kthread_stop(cwq->thread); - cwq->thread = NULL; -} - /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -1159,8 +1400,20 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); cpu_maps_update_done(); - for_each_possible_cpu(cpu) - cleanup_workqueue_thread(get_cwq(cpu, wq)); + flush_workqueue(wq); + + for_each_possible_cpu(cpu) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + if (cwq->thread) { + kthread_stop(cwq->thread); + cwq->thread = NULL; + } + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + } free_cwqs(wq->cpu_wq); kfree(wq); @@ -1185,9 +1438,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_POST_DEAD: - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - flush_cpu_workqueue(cwq); + flush_workqueue(wq); break; } } -- cgit v1.2.3 From affee4b294a0fc97d67c8a77dc080c4dd262a79e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:12 +0200 Subject: workqueue: reimplement work flushing using linked works A work is linked to the next one by having the WORK_STRUCT_LINKED bit set and these links can be chained. When a linked work is dispatched to a worker, all linked works are dispatched to the worker's newly added ->scheduled queue and processed back-to-back. Currently, as there's only a single worker per cwq, having linked works doesn't make any visible behavior difference. This change is to prepare for multiple shared workers per cpu.
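To make the linking rule concrete, here is a minimal userspace model of the series semantics (an illustrative sketch only, not kernel code; all model_* names are invented): a dispatch series starts at a given work and keeps extending as long as the current work carries the LINKED flag, mirroring the loop in move_linked_works().

#include <stdio.h>

#define MODEL_LINKED 0x1u /* stands in for WORK_STRUCT_LINKED */

struct model_work {
        int id;
        unsigned int flags;
};

/* move the series starting at src[start] into dst[]; returns count moved */
static int model_move_linked(const struct model_work *src, int n, int start,
                             struct model_work *dst)
{
        int moved = 0;
        int i;

        for (i = start; i < n; i++) {
                dst[moved++] = src[i];
                /* the series ends at the first work without LINKED set */
                if (!(src[i].flags & MODEL_LINKED))
                        break;
        }
        return moved;
}

int main(void)
{
        /* works 0 and 1 carry LINKED, so works 0, 1 and 2 form one series */
        struct model_work worklist[] = {
                { 0, MODEL_LINKED }, { 1, MODEL_LINKED }, { 2, 0 }, { 3, 0 },
        };
        struct model_work scheduled[4];
        int i, n;

        n = model_move_linked(worklist, 4, 0, scheduled);
        for (i = 0; i < n; i++)
                printf("scheduled: work %d\n", scheduled[i].id);
        return 0;
}

Compiled standalone, this moves works 0, 1 and 2 as one back-to-back series and leaves work 3 queued, which is how a linked series gets pulled onto a worker's ->scheduled list.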
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 +- kernel/workqueue.c | 152 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 134 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8762f62103d8..4f4fdba722c3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -24,8 +24,9 @@ typedef void (*work_func_t)(struct work_struct *work); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ + WORK_STRUCT_LINKED_BIT = 1, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 1, /* static initializer (debugobjects) */ + WORK_STRUCT_STATIC_BIT = 2, /* static initializer (debugobjects) */ WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ #else WORK_STRUCT_COLOR_SHIFT = 2, /* color for workqueue flushing */ @@ -34,6 +35,7 @@ enum { WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 600db10a4dbf..9953d3c7bd10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -51,6 +51,7 @@ struct cpu_workqueue_struct; struct worker { struct work_struct *current_work; /* L: work being processed */ + struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct cpu_workqueue_struct *cwq; /* I: the associated cwq */ int id; /* I: worker id */ @@ -445,6 +446,8 @@ static struct worker *alloc_worker(void) struct worker *worker; worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) + INIT_LIST_HEAD(&worker->scheduled); return worker; } @@ -530,6 +533,7 @@ static void destroy_worker(struct worker *worker) /* sanity check frenzy */ BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); kthread_stop(worker->task); kfree(worker); @@ -539,6 +543,47 @@ static void destroy_worker(struct worker *worker) spin_unlock(&workqueue_lock); } +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated.
+ */ + if (nextp) + *nextp = n; +} + /** * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest @@ -639,17 +684,25 @@ static void process_one_work(struct worker *worker, struct work_struct *work) cwq_dec_nr_in_flight(cwq, work_color); } -static void run_workqueue(struct worker *worker) +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(cwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) { - struct cpu_workqueue_struct *cwq = worker->cwq; - - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, struct work_struct, entry); process_one_work(worker, work); } - spin_unlock_irq(&cwq->lock); } /** @@ -684,7 +737,28 @@ static int worker_thread(void *__worker) get_cpu_mask(cwq->cpu)))) set_cpus_allowed_ptr(worker->task, get_cpu_mask(cwq->cpu)); - run_workqueue(worker); + + spin_lock_irq(&cwq->lock); + + while (!list_empty(&cwq->worklist)) { + struct work_struct *work = + list_first_entry(&cwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & + WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, + NULL); + process_scheduled_works(worker); + } + } + + spin_unlock_irq(&cwq->lock); } return 0; @@ -705,16 +779,33 @@ static void wq_barrier_func(struct work_struct *work) * insert_wq_barrier - insert a barrier work * @cwq: cwq to insert barrier into * @barr: wq_barrier to insert - * @head: insertion point + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing * - * Insert barrier @barr into @cwq before @head. + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. * * CONTEXT: * spin_lock_irq(cwq->lock). 
 */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* * debugobject calls are safe here even with cwq->lock locked * as we know for sure that this will not trigger any of the @@ -725,8 +816,24 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR)); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } /** @@ -964,8 +1071,8 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); @@ -985,14 +1092,14 @@ int flush_work(struct work_struct *work) smp_rmb(); if (unlikely(cwq != get_wq_data(work))) goto already_gone; - prev = &work->entry; } else { - if (!cwq->worker || cwq->worker->current_work != work) + if (cwq->worker && cwq->worker->current_work == work) + worker = cwq->worker; + if (!worker) goto already_gone; - prev = &cwq->worklist; } - insert_wq_barrier(cwq, &barr, prev->next); + insert_wq_barrier(cwq, &barr, work, worker); spin_unlock_irq(&cwq->lock); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); @@ -1048,16 +1155,19 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; spin_lock_irq(&cwq->lock); + + worker = NULL; if (unlikely(cwq->worker && cwq->worker->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; + worker = cwq->worker; + insert_wq_barrier(cwq, &barr, work, worker); } + spin_unlock_irq(&cwq->lock); - if (unlikely(running)) { + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } -- cgit v1.2.3 From 1e19ffc63dbbaea7a7d1c63d99c38d3e5a4c7edf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:12 +0200 Subject: workqueue: implement per-cwq active work limit Add cwq->nr_active, cwq->max_active and cwq->delayed_works. nr_active counts the number of active works per cwq. A work is active if it's flushable (colored) and is on cwq's worklist. If nr_active reaches max_active, new works are queued on cwq->delayed_works and activated later as works on the cwq complete and decrement nr_active. cwq->max_active can be specified via the new @max_active parameter to __create_workqueue() and is set to 1 for all workqueues for now. As each cwq has only a single worker now, this double queueing doesn't cause any behavior difference visible to its users. This will be used to reimplement freeze/thaw and implement a shared worker pool.
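As a rough sketch of the double queueing described above, with plain counters standing in for the kernel's lists (all model_* names are invented), queueing activates a work only while nr_active is below max_active, and each completion pulls one delayed work in:

#include <assert.h>
#include <stdio.h>

struct model_cwq {
        int nr_active;   /* works currently activated on the worklist */
        int max_active;  /* activation limit */
        int nr_delayed;  /* works parked on the delayed list */
};

static void model_queue(struct model_cwq *cwq)
{
        if (cwq->nr_active < cwq->max_active)
                cwq->nr_active++;       /* goes straight to the worklist */
        else
                cwq->nr_delayed++;      /* parked until something completes */
}

static void model_complete(struct model_cwq *cwq)
{
        assert(cwq->nr_active > 0);
        cwq->nr_active--;
        /* one down, submit a delayed one */
        if (cwq->nr_delayed > 0 && cwq->nr_active < cwq->max_active) {
                cwq->nr_delayed--;
                cwq->nr_active++;
        }
}

int main(void)
{
        struct model_cwq cwq = { .max_active = 1 };

        model_queue(&cwq);      /* active=1 delayed=0 */
        model_queue(&cwq);      /* active=1 delayed=1, over the limit */
        model_complete(&cwq);   /* active=1 delayed=0, delayed one activated */
        printf("active=%d delayed=%d\n", cwq.nr_active, cwq.nr_delayed);
        return 0;
}

With max_active fixed at 1 this reproduces the old one-at-a-time behavior, which is why the double queueing stays invisible to current users.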
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 18 +++++++++--------- kernel/workqueue.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4f4fdba722c3..eb753b7790e5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -225,11 +225,11 @@ enum { }; extern struct workqueue_struct * -__create_workqueue_key(const char *name, unsigned int flags, +__create_workqueue_key(const char *name, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name); #ifdef CONFIG_LOCKDEP -#define __create_workqueue(name, flags) \ +#define __create_workqueue(name, flags, max_active) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ @@ -239,20 +239,20 @@ __create_workqueue_key(const char *name, unsigned int flags, else \ __lock_name = #name; \ \ - __create_workqueue_key((name), (flags), &__key, \ - __lock_name); \ + __create_workqueue_key((name), (flags), (max_active), \ + &__key, __lock_name); \ }) #else -#define __create_workqueue(name, flags) \ - __create_workqueue_key((name), (flags), NULL, NULL) +#define __create_workqueue(name, flags, max_active) \ + __create_workqueue_key((name), (flags), (max_active), NULL, NULL) #endif #define create_workqueue(name) \ - __create_workqueue((name), 0) + __create_workqueue((name), 0, 1) #define create_freezeable_workqueue(name) \ - __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD) + __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD, 1) #define create_singlethread_workqueue(name) \ - __create_workqueue((name), WQ_SINGLE_THREAD) + __create_workqueue((name), WQ_SINGLE_THREAD, 1) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9953d3c7bd10..e541b5db67dd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -77,6 +77,9 @@ struct cpu_workqueue_struct { int flush_color; /* L: flushing color */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* I: max active works */ + struct list_head delayed_works; /* L: delayed works */ }; /* @@ -321,14 +324,24 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq); + struct list_head *worklist; unsigned long flags; debug_work_activate(work); + spin_lock_irqsave(&cwq->lock, flags); BUG_ON(!list_empty(&work->entry)); + cwq->nr_in_flight[cwq->work_color]++; - insert_work(cwq, work, &cwq->worklist, - work_color_to_flags(cwq->work_color)); + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = &cwq->worklist; + } else + worklist = &cwq->delayed_works; + + insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + spin_unlock_irqrestore(&cwq->lock, flags); } @@ -584,6 +597,15 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + move_linked_works(work, &cwq->worklist, NULL); + cwq->nr_active++; +} + /** * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest @@ -602,6 +624,12 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) return; 
cwq->nr_in_flight[color]--; + cwq->nr_active--; + + /* one down, submit a delayed one */ + if (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); /* is flush in progress and are we at the flushing tip? */ if (likely(cwq->flush_color != color)) @@ -1505,6 +1533,7 @@ static void free_cwqs(struct cpu_workqueue_struct *cwqs) struct workqueue_struct *__create_workqueue_key(const char *name, unsigned int flags, + int max_active, struct lock_class_key *key, const char *lock_name) { @@ -1513,6 +1542,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, bool failed = false; unsigned int cpu; + max_active = clamp_val(max_active, 1, INT_MAX); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) goto err; @@ -1544,8 +1575,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, cwq->cpu = cpu; cwq->wq = wq; cwq->flush_color = -1; + cwq->max_active = max_active; spin_lock_init(&cwq->lock); INIT_LIST_HEAD(&cwq->worklist); + INIT_LIST_HEAD(&cwq->delayed_works); init_waitqueue_head(&cwq->more_work); if (failed) @@ -1607,6 +1640,8 @@ void destroy_workqueue(struct workqueue_struct *wq) for (i = 0; i < WORK_NR_COLORS; i++) BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); } free_cwqs(wq->cpu_wq); -- cgit v1.2.3 From a0a1a5fd4fb15ec61117c759fe9f5c16c53d9e9c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:12 +0200 Subject: workqueue: reimplement workqueue freeze using max_active Currently, workqueue freezing is implemented by marking the worker freezeable and calling try_to_freeze() from the dispatch loop. Reimplement it using cwq->max_active so that the workqueue is frozen instead of the worker. * workqueue_struct->saved_max_active is added which stores the specified max_active on initialization. * On freeze, all cwq->max_active's are quenched to zero. Freezing is complete when nr_active on all cwqs reaches zero. * On thaw, all cwq->max_active's are restored to wq->saved_max_active and the worklist is repopulated. This new implementation allows having a single shared pool of workers per cpu.
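A minimal sketch of the freeze/thaw cycle under these rules, reusing the same counter model (illustration only; model_* names are invented, and the real code walks every cwq of every freezeable workqueue under the appropriate locks):

#include <stdbool.h>
#include <stdio.h>

struct model_cwq {
        int nr_active;
        int max_active;
        int saved_max_active;
        int nr_delayed;
};

static void model_freeze(struct model_cwq *cwq)
{
        cwq->max_active = 0;    /* new works can only go to the delayed list */
}

static bool model_busy(const struct model_cwq *cwq)
{
        return cwq->nr_active > 0;      /* freezing completes at zero */
}

static void model_thaw(struct model_cwq *cwq)
{
        cwq->max_active = cwq->saved_max_active;
        /* repopulate the worklist from the delayed list */
        while (cwq->nr_delayed > 0 && cwq->nr_active < cwq->max_active) {
                cwq->nr_delayed--;
                cwq->nr_active++;
        }
}

int main(void)
{
        struct model_cwq cwq = {
                .nr_active = 1, .max_active = 2,
                .saved_max_active = 2, .nr_delayed = 0,
        };

        model_freeze(&cwq);
        cwq.nr_delayed++;                       /* queued while frozen: parked */
        printf("busy=%d\n", model_busy(&cwq));  /* busy=1 */
        cwq.nr_active--;                        /* in-flight work completes */
        printf("busy=%d\n", model_busy(&cwq));  /* busy=0, freezing complete */
        model_thaw(&cwq);
        printf("active=%d delayed=%d\n", cwq.nr_active, cwq.nr_delayed);
        return 0;
}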
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 7 ++ kernel/power/process.c | 21 +++++- kernel/workqueue.c | 163 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 179 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index eb753b7790e5..ab0b7fb99bc2 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -340,4 +340,11 @@ static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) #else long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ + +#ifdef CONFIG_FREEZER +extern void freeze_workqueues_begin(void); +extern bool freeze_workqueues_busy(void); +extern void thaw_workqueues(void); +#endif /* CONFIG_FREEZER */ + #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..028a99598f49 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; @@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) */ printk("\n"); printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); @@ -157,6 +173,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e541b5db67dd..4d059c532792 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -78,7 +78,7 @@ struct cpu_workqueue_struct { int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ - int max_active; /* I: max active works */ + int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ }; @@ -108,6 +108,7 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ + int saved_max_active; /* I: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -228,6 +229,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static DEFINE_PER_CPU(struct ida, worker_ida); +static bool workqueue_freezing; /* W: have wqs started freezing? 
*/ static int worker_thread(void *__worker); @@ -745,19 +747,13 @@ static int worker_thread(void *__worker) struct cpu_workqueue_struct *cwq = worker->cwq; DEFINE_WAIT(wait); - if (cwq->wq->flags & WQ_FREEZEABLE) - set_freezable(); - for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && + if (!kthread_should_stop() && list_empty(&cwq->worklist)) schedule(); finish_wait(&cwq->more_work, &wait); - try_to_freeze(); - if (kthread_should_stop()) break; @@ -1553,6 +1549,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, goto err; wq->flags = flags; + wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); atomic_set(&wq->nr_cwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); @@ -1591,8 +1588,19 @@ struct workqueue_struct *__create_workqueue_key(const char *name, failed = true; } + /* + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. + */ spin_lock(&workqueue_lock); + + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_possible_cpu(cpu) + get_cwq(cpu, wq)->max_active = 0; + list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); cpu_maps_update_done(); @@ -1621,14 +1629,18 @@ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; + flush_workqueue(wq); + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. + */ cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); cpu_maps_update_done(); - flush_workqueue(wq); - for_each_possible_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); int i; @@ -1722,6 +1734,137 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of the cwq ones. + * + * CONTEXT: + * Grabs and releases workqueue_lock and cwq->lock's. + */ +void freeze_workqueues_begin(void) +{ + struct workqueue_struct *wq; + unsigned int cpu; + + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_possible_cpu(cpu) { + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + spin_lock_irq(&cwq->lock); + + if (wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + + spin_unlock_irq(&cwq->lock); + } + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + struct workqueue_struct *wq; + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_possible_cpu(cpu) { + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. 
+ */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective cwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and cwq->lock's. + */ +void thaw_workqueues(void) +{ + struct workqueue_struct *wq; + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_possible_cpu(cpu) { + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!(wq->flags & WQ_FREEZEABLE)) + continue; + + spin_lock_irq(&cwq->lock); + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + + wake_up(&cwq->more_work); + + spin_unlock_irq(&cwq->lock); + } + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + void __init init_workqueues(void) { unsigned int cpu; -- cgit v1.2.3 From db7bccf45cb87522096b8f43144e31ca605a9f24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:12 +0200 Subject: workqueue: reimplement CPU hotplugging support using trustee Reimplement CPU hotplugging support using a trustee thread. On CPU down, a trustee thread is created and each step of CPU down is executed by the trustee, while workqueue_cpu_callback() simply drives and waits for trustee state transitions. The CPU down operation no longer waits for works to be drained; the trustee sticks around until all pending works have been completed. If the CPU is brought back up while works are still draining, workqueue_cpu_callback() tells the trustee to step down and tells workers to rebind to the cpu. As it's difficult to tell whether cwqs are empty while freezing or frozen, the trustee doesn't consider draining to be complete while a gcwq is freezing or frozen (tracked by the new GCWQ_FREEZING flag). Also, workers which get unbound from their cpu are marked with WORKER_ROGUE. The trustee-based implementation doesn't bring any new feature at this point, but it will be used to manage the worker pool when the dynamic shared worker pool is implemented.
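The trustee state diagram in the patch below can also be read as a small transition table. The following sketch is purely illustrative (the kernel drives these transitions imperatively from trustee_thread() and workqueue_cpu_callback(), not from a table, and the event names here are invented):

#include <stdio.h>

enum trustee_state { ST_START, ST_IN_CHARGE, ST_BUTCHER, ST_RELEASE, ST_DONE };

enum trustee_event {
        EV_TOOK_OVER,   /* trustee bound itself and made workers rogue */
        EV_CPU_DOWN,    /* CPU_POST_DEAD: the cpu has gone down */
        EV_CPU_BACK,    /* CPU_DOWN_FAILED or CPU_ONLINE */
        EV_DRAINED,     /* worklist empty, idle workers butchered */
};

static enum trustee_state trustee_step(enum trustee_state s,
                                       enum trustee_event e)
{
        switch (s) {
        case ST_START:
                return e == EV_TOOK_OVER ? ST_IN_CHARGE : s;
        case ST_IN_CHARGE:
                if (e == EV_CPU_DOWN)
                        return ST_BUTCHER;
                return e == EV_CPU_BACK ? ST_RELEASE : s;
        case ST_BUTCHER:
                if (e == EV_CPU_BACK)
                        return ST_RELEASE;
                return e == EV_DRAINED ? ST_DONE : s;
        case ST_RELEASE:
                return ST_DONE;         /* return the workers, then done */
        default:
                return s;
        }
}

int main(void)
{
        /* normal CPU-down path: START -> IN_CHARGE -> BUTCHER -> DONE */
        enum trustee_state s = ST_START;

        s = trustee_step(s, EV_TOOK_OVER);
        s = trustee_step(s, EV_CPU_DOWN);
        s = trustee_step(s, EV_DRAINED);
        printf("final=%d done=%d\n", s, ST_DONE);
        return 0;
}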
Signed-off-by: Tejun Heo --- include/linux/cpu.h | 2 + kernel/workqueue.c | 293 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 279 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index de6b1722cdca..4823af64e9db 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -71,6 +71,8 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, + /* prepare workqueues for other notifiers */ + CPU_PRI_WORKQUEUE = 5, }; #ifdef CONFIG_SMP diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d64913aa486a..f57855f718d7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,14 +36,27 @@ #include enum { + /* global_cwq flags */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ }; /* @@ -83,6 +96,7 @@ struct worker { struct global_cwq { spinlock_t lock; /* the gcwq lock */ unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle ones */ @@ -93,6 +107,10 @@ struct global_cwq { /* L: hash of busy workers */ struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ } ____cacheline_aligned_in_smp; /* @@ -148,6 +166,10 @@ struct workqueue_struct { #endif }; +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -546,6 +568,9 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &gcwq->idle_list); + + if (unlikely(worker->flags & WORKER_ROGUE)) + wake_up_all(&gcwq->trustee_wait); } /** @@ -622,8 +647,15 @@ static struct worker *create_worker(struct cpu_workqueue_struct *cwq, bool bind) if (IS_ERR(worker->task)) goto fail; + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. 
+ */ if (bind) kthread_bind(worker->task, gcwq->cpu); + else + worker->task->flags |= PF_THREAD_BOUND; return worker; fail: @@ -882,10 +914,6 @@ static int worker_thread(void *__worker) struct cpu_workqueue_struct *cwq = worker->cwq; woke_up: - if (unlikely(!cpumask_equal(&worker->task->cpus_allowed, - get_cpu_mask(gcwq->cpu)))) - set_cpus_allowed_ptr(worker->task, get_cpu_mask(gcwq->cpu)); - spin_lock_irq(&gcwq->lock); /* DIE can be set only while we're idle, checking here is enough */ @@ -895,7 +923,7 @@ woke_up: } worker_leave_idle(worker); - +recheck: /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. @@ -908,6 +936,22 @@ woke_up: list_first_entry(&cwq->worklist, struct work_struct, entry); + /* + * The following is a rather inefficient way to close + * race window against cpu hotplug operations. Will + * be replaced soon. + */ + if (unlikely(!(worker->flags & WORKER_ROGUE) && + !cpumask_equal(&worker->task->cpus_allowed, + get_cpu_mask(gcwq->cpu)))) { + spin_unlock_irq(&gcwq->lock); + set_cpus_allowed_ptr(worker->task, + get_cpu_mask(gcwq->cpu)); + cpu_relax(); + spin_lock_irq(&gcwq->lock); + goto recheck; + } + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -1812,29 +1856,237 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); +/* + * CPU hotplug. + * + * CPU hotplug is implemented by allowing cwqs to be detached from + * CPU, running with unbound workers and allowing them to be + * reattached later if the cpu comes back online. A separate thread + * is created to govern cwqs in such state and is called the trustee. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * making all existing workers rogue. DOWN_PREPARE waits + * for trustee to enter this state. After reaching + * IN_CHARGE, trustee tries to execute the pending + * worklist until it's empty and the state is set to + * BUTCHER, or the state is set to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has gone down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and transits to DONE. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled.
+ */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct hlist_node *pos; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Make all multithread workers rogue. Trustee must be bound + * to the target cpu and can't be cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + + list_for_each_entry(worker, &gcwq->idle_list, entry) + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) + worker->flags |= WORKER_ROGUE; + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. + * Note that if the gcwq is frozen, there may be frozen works + * in freezeable cwqs. Don't declare completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. 
+ */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *worker; + struct hlist_node *pos; + unsigned long flags; + int i; action &= ~CPU_TASKS_FROZEN; - list_for_each_entry(wq, &workqueues, list) { - if (wq->flags & WQ_SINGLE_THREAD) - continue; + switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + } - cwq = get_cwq(cpu, wq); + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); - switch (action) { - case CPU_POST_DEAD: - flush_workqueue(wq); - break; + switch (action) { + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + break; + + case CPU_POST_DEAD: + gcwq->trustee_state = TRUSTEE_BUTCHER; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); } + + /* clear ROGUE from all multithread workers */ + list_for_each_entry(worker, &gcwq->idle_list, entry) + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) + worker->flags &= ~WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) + worker->flags &= ~WORKER_ROGUE; + break; } + spin_unlock_irqrestore(&gcwq->lock, flags); + return notifier_from_errno(0); } @@ -1912,6 +2164,9 @@ void freeze_workqueues_begin(void) spin_lock_irq(&gcwq->lock); + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); @@ -1995,6 +2250,9 @@ void thaw_workqueues(void) spin_lock_irq(&gcwq->lock); + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); @@ -2026,7 +2284,7 @@ void __init init_workqueues(void) int i; singlethread_cpu = cpumask_first(cpu_possible_mask); - hotcpu_notifier(workqueue_cpu_callback, 0); + hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ for_each_possible_cpu(cpu) { @@ -2040,6 +2298,9 @@ void __init init_workqueues(void) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); } keventd_wq = create_workqueue("events"); -- cgit v1.2.3 From 502ca9d819792e7d79b6e002afe9094c641fe410 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:13 +0200 Subject: workqueue: make single thread 
workqueue shared worker pool friendly Reimplement st (single thread) workqueue so that it's friendly to the shared worker pool. It was originally implemented by confining st workqueues to use the cwq of a fixed cpu and always having a worker for the cpu. This implementation isn't very friendly to the shared worker pool and is suboptimal in that it ends up crossing cpu boundaries often. Reimplement st workqueue using dynamic single cpu binding and cwq->max_active. WQ_SINGLE_THREAD is replaced with WQ_SINGLE_CPU. In a single cpu workqueue, at most one cwq is bound to the wq at any given time. Arbitration is done using atomic accesses to wq->single_cpu when queueing a work. Once bound, the binding stays until the workqueue is drained. Note that the binding is never broken while a workqueue is frozen. This is because idle cwqs may have works waiting in the delayed_works queue while frozen. On thaw, the cwq is restarted if there are any delayed works, or unbound otherwise. When combined with a max_active limit of 1, a single cpu workqueue has exactly the same execution properties as the original single thread workqueue while allowing sharing of per-cpu workers. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 +-- kernel/workqueue.c | 135 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 103 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ab0b7fb99bc2..10611f7fc809 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -221,7 +221,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } enum { WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ - WQ_SINGLE_THREAD = 1 << 1, /* no per-cpu worker */ + WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ }; extern struct workqueue_struct * @@ -250,9 +250,9 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active, #define create_workqueue(name) \ __create_workqueue((name), 0, 1) #define create_freezeable_workqueue(name) \ - __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD, 1) + __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU, 1) #define create_singlethread_workqueue(name) \ - __create_workqueue((name), WQ_SINGLE_THREAD, 1) + __create_workqueue((name), WQ_SINGLE_CPU, 1) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f57855f718d7..cfb8aa567e17 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -114,8 +114,7 @@ struct global_cwq { } ____cacheline_aligned_in_smp; /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). The lower WORK_STRUCT_FLAG_BITS of + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of * work_struct->data are used for flags and thus cwqs need to be * aligned at two's power of the number of flag bits.
*/ @@ -159,6 +158,8 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ + unsigned long single_cpu; /* cpu for single cpu wq */ + int saved_max_active; /* I: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP @@ -289,8 +290,6 @@ static DEFINE_PER_CPU(struct global_cwq, global_cwq); static int worker_thread(void *__worker); -static int singlethread_cpu __read_mostly; - static struct global_cwq *get_gcwq(unsigned int cpu) { return &per_cpu(global_cwq, cpu); @@ -302,14 +301,6 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, return per_cpu_ptr(wq->cpu_wq, cpu); } -static struct cpu_workqueue_struct *target_cwq(unsigned int cpu, - struct workqueue_struct *wq) -{ - if (unlikely(wq->flags & WQ_SINGLE_THREAD)) - cpu = singlethread_cpu; - return get_cwq(cpu, wq); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -410,17 +401,87 @@ static void insert_work(struct cpu_workqueue_struct *cwq, wake_up_process(cwq->worker->task); } +/** + * cwq_unbind_single_cpu - unbind cwq from single cpu workqueue processing + * @cwq: cwq to unbind + * + * Try to unbind @cwq from single cpu workqueue processing. If + * @cwq->wq is frozen, unbind is delayed till the workqueue is thawed. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq) +{ + struct workqueue_struct *wq = cwq->wq; + struct global_cwq *gcwq = cwq->gcwq; + + BUG_ON(wq->single_cpu != gcwq->cpu); + /* + * Unbind from workqueue if @cwq is not frozen. If frozen, + * thaw_workqueues() will either restart processing on this + * cpu or unbind if empty. This keeps works queued while + * frozen fully ordered and flushable. + */ + if (likely(!(gcwq->flags & GCWQ_FREEZING))) { + smp_wmb(); /* paired with cmpxchg() in __queue_work() */ + wq->single_cpu = NR_CPUS; + } +} + static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { - struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned long flags; + bool arbitrate; debug_work_activate(work); - spin_lock_irqsave(&gcwq->lock, flags); + /* determine gcwq to use */ + if (!(wq->flags & WQ_SINGLE_CPU)) { + /* just use the requested cpu for multicpu workqueues */ + gcwq = get_gcwq(cpu); + spin_lock_irqsave(&gcwq->lock, flags); + } else { + unsigned int req_cpu = cpu; + + /* + * It's a bit more complex for single cpu workqueues. + * We first need to determine which cpu is going to be + * used. If no cpu is currently serving this + * workqueue, arbitrate using atomic accesses to + * wq->single_cpu; otherwise, use the current one. + */ + retry: + cpu = wq->single_cpu; + arbitrate = cpu == NR_CPUS; + if (arbitrate) + cpu = req_cpu; + + gcwq = get_gcwq(cpu); + spin_lock_irqsave(&gcwq->lock, flags); + + /* + * The following cmpxchg() is a full barrier paired + * with smp_wmb() in cwq_unbind_single_cpu() and + * guarantees that all changes to wq->st_* fields are + * visible on the new cpu after this point. 
+ */ + if (arbitrate) + cmpxchg(&wq->single_cpu, NR_CPUS, cpu); + + if (unlikely(wq->single_cpu != cpu)) { + spin_unlock_irqrestore(&gcwq->lock, flags); + goto retry; + } + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + BUG_ON(!list_empty(&work->entry)); cwq->nr_in_flight[cwq->work_color]++; @@ -530,7 +591,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, target_cwq(raw_smp_processor_id(), wq), 0); + set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -790,10 +851,14 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) cwq->nr_in_flight[color]--; cwq->nr_active--; - /* one down, submit a delayed one */ - if (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } else if (!cwq->nr_active && cwq->wq->flags & WQ_SINGLE_CPU) { + /* this was the last work, unbind from single cpu */ + cwq_unbind_single_cpu(cwq); + } /* is flush in progress and are we at the flushing tip? */ if (likely(cwq->flush_color != color)) @@ -1727,7 +1792,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, struct lock_class_key *key, const char *lock_name) { - bool singlethread = flags & WQ_SINGLE_THREAD; struct workqueue_struct *wq; bool failed = false; unsigned int cpu; @@ -1748,6 +1812,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, atomic_set(&wq->nr_cwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); + wq->single_cpu = NR_CPUS; + wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -1773,8 +1839,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, if (failed) continue; - cwq->worker = create_worker(cwq, - cpu_online(cpu) && !singlethread); + cwq->worker = create_worker(cwq, cpu_online(cpu)); if (cwq->worker) start_worker(cwq->worker); else @@ -1958,18 +2023,16 @@ static int __cpuinit trustee_thread(void *__gcwq) spin_lock_irq(&gcwq->lock); /* - * Make all multithread workers rogue. Trustee must be bound - * to the target cpu and can't be cancelled. + * Make all workers rogue. Trustee must be bound to the + * target cpu and can't be cancelled. */ BUG_ON(gcwq->cpu != smp_processor_id()); list_for_each_entry(worker, &gcwq->idle_list, entry) - if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_ROGUE; for_each_busy_worker(worker, i, pos, gcwq) - if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_ROGUE; /* * We're now in charge. Notify and proceed to drain. 
We need @@ -2074,14 +2137,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_DONE); } - /* clear ROGUE from all multithread workers */ + /* clear ROGUE from all workers */ list_for_each_entry(worker, &gcwq->idle_list, entry) - if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) - worker->flags &= ~WORKER_ROGUE; + worker->flags &= ~WORKER_ROGUE; for_each_busy_worker(worker, i, pos, gcwq) - if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD)) - worker->flags &= ~WORKER_ROGUE; + worker->flags &= ~WORKER_ROGUE; break; } @@ -2266,6 +2327,11 @@ void thaw_workqueues(void) cwq->nr_active < cwq->max_active) cwq_activate_first_delayed(cwq); + /* perform delayed unbind from single cpu if empty */ + if (wq->single_cpu == gcwq->cpu && + !cwq->nr_active && list_empty(&cwq->delayed_works)) + cwq_unbind_single_cpu(cwq); + wake_up_process(cwq->worker->task); } @@ -2283,7 +2349,6 @@ void __init init_workqueues(void) unsigned int cpu; int i; - singlethread_cpu = cpumask_first(cpu_possible_mask); hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ -- cgit v1.2.3 From 7a22ad757ec75186ad43a5b4670fa7423ee8f480 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:13 +0200 Subject: workqueue: carry cpu number in work data once execution starts To implement a non-reentrant workqueue, the last gcwq a work was executed on must be reliably obtainable as long as the work structure is valid, even if the previous workqueue has been destroyed. To achieve this, work->data is overloaded to carry the last cpu number once execution starts, so that the previous gcwq can be located reliably. This means that once execution starts, only the gcwq, not the cwq, can be obtained from a work. Implement set_work_{cwq|cpu}(), get_work_[g]cwq() and clear_work_data() to set the work data to the cpu number when starting execution, to access the overloaded work data, and to clear it after cancellation. queue_delayed_work_on() is updated to preserve the last cpu while the work is in flight in its timer, and other callers which depended on getting the cwq from a work after execution starts are converted to depend on the gcwq instead. * Anton Blanchard fixed a compile error on powerpc due to a missing linux/threads.h include.
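For illustration only (not part of the patch): the overloading scheme is easiest to see in isolation. The standalone C sketch below mimics the encoding; PAGE_OFFSET, NR_CPUS and the two flag bits are stand-in values chosen for the example (the patch reserves more flag bits, e.g. for work colors, and uses the kernel's real constants), but the pointer-vs-cpu disambiguation works the same way.

/* work_data_sketch.c - toy model of the overloaded work->data word */
#include <stdio.h>

#define FLAG_BITS	2UL			/* stand-in for WORK_STRUCT_FLAG_BITS */
#define FLAG_MASK	((1UL << FLAG_BITS) - 1)
#define DATA_MASK	(~FLAG_MASK)		/* WORK_STRUCT_WQ_DATA_MASK */
#define NR_CPUS		64UL			/* stand-in cpu count */
#define NO_CPU		(NR_CPUS << FLAG_BITS)	/* WORK_STRUCT_NO_CPU */
#define PAGE_OFFSET	0xffff880000000000UL	/* illustrative x86-64 value */

/* while queued, data carries the cwq pointer (always >= PAGE_OFFSET) */
static unsigned long encode_cwq(void *cwq)
{
	return (unsigned long)cwq;
}

/* once execution starts, data carries the last cpu number instead */
static unsigned long encode_cpu(unsigned long cpu)
{
	return cpu << FLAG_BITS;
}

/* get_work_[g]cwq() style decoding: pointer vs cpu vs nothing */
static void decode(unsigned long data)
{
	data &= DATA_MASK;
	if (data >= PAGE_OFFSET)
		printf("cwq pointer %#lx -> cwq->gcwq\n", data);
	else if ((data >> FLAG_BITS) == NR_CPUS)
		printf("no cpu recorded, gcwq unknown\n");
	else
		printf("last ran on cpu %lu -> get_gcwq(%lu)\n",
		       data >> FLAG_BITS, data >> FLAG_BITS);
}

int main(void)
{
	decode(encode_cwq((void *)0xffff880012345678UL));	/* queued */
	decode(encode_cpu(3));					/* executing/executed */
	decode(NO_CPU);						/* freshly initialized */
	return 0;
}

Because valid kernel pointers are always at or above PAGE_OFFSET, a masked value below it can only be a shifted cpu number; this is exactly what the BUILD_BUG_ON() added to init_workqueues() below enforces at compile time.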
Signed-off-by: Tejun Heo Cc: Anton Blanchard --- include/linux/workqueue.h | 7 +- kernel/workqueue.c | 163 +++++++++++++++++++++++++++++----------------- 2 files changed, 109 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 10611f7fc809..0a7814131e66 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -9,6 +9,7 @@ #include #include #include +#include #include struct workqueue_struct; @@ -59,6 +60,7 @@ enum { WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, + WORK_STRUCT_NO_CPU = NR_CPUS << WORK_STRUCT_FLAG_BITS, }; struct work_struct { @@ -70,8 +72,9 @@ struct work_struct { #endif }; -#define WORK_DATA_INIT() ATOMIC_LONG_INIT(0) -#define WORK_DATA_STATIC_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_STATIC) +#define WORK_DATA_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU) +#define WORK_DATA_STATIC_INIT() \ + ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU | WORK_STRUCT_STATIC) struct delayed_work { struct work_struct work; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c276dec75ea4..c68277c204ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -319,31 +319,71 @@ static int work_next_color(int color) } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * Work data points to the cwq while a work is on queue. Once + * execution starts, it points to the cpu the work was last on. This + * can be distinguished by comparing the data value against + * PAGE_OFFSET. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq, - unsigned long extra_flags) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} - atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) | - WORK_STRUCT_PENDING | extra_flags); +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | extra_flags); } -/* - * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. - */ -static inline void clear_wq_data(struct work_struct *work) +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static inline unsigned long get_work_data(struct work_struct *work) +{ + return atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK; +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) { - atomic_long_set(&work->data, work_static(work)); + unsigned long data = get_work_data(work); + + return data >= PAGE_OFFSET ? 
(void *)data : NULL; } -static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +static struct global_cwq *get_work_gcwq(struct work_struct *work) { - return (void *)(atomic_long_read(&work->data) & - WORK_STRUCT_WQ_DATA_MASK); + unsigned long data = get_work_data(work); + unsigned int cpu; + + if (data >= PAGE_OFFSET) + return ((struct cpu_workqueue_struct *)data)->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == NR_CPUS) + return NULL; + + BUG_ON(cpu >= num_possible_cpus()); + return get_gcwq(cpu); } /** @@ -443,7 +483,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, unsigned int extra_flags) { /* we own @work, set data and link */ - set_wq_data(work, cwq, extra_flags); + set_work_cwq(work, cwq, extra_flags); /* * Ensure that we get the right work->data if we see the @@ -599,7 +639,7 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } @@ -639,13 +679,19 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned int lcpu = gcwq ? gcwq->cpu : raw_smp_processor_id(); + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + set_work_cwq(work, get_cwq(lcpu, wq), 0); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -970,11 +1016,14 @@ static void process_one_work(struct worker *worker, struct work_struct *work) worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); + + BUG_ON(get_work_cwq(work) != cwq); + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); spin_unlock_irq(&gcwq->lock); - BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); lock_map_acquire(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); @@ -1406,37 +1455,39 @@ EXPORT_SYMBOL_GPL(flush_workqueue); int flush_work(struct work_struct *work) { struct worker *worker = NULL; - struct cpu_workqueue_struct *cwq; struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - gcwq = cwq->gcwq; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. 
*/ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) goto already_gone; } else { - if (cwq->worker && cwq->worker->current_work == work) - worker = cwq->worker; + worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; + cwq = worker->current_cwq; } insert_wq_barrier(cwq, &barr, work, worker); spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; @@ -1453,7 +1504,6 @@ EXPORT_SYMBOL_GPL(flush_work); static int try_to_grab_pending(struct work_struct *work) { struct global_cwq *gcwq; - struct cpu_workqueue_struct *cwq; int ret = -1; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) @@ -1463,24 +1513,23 @@ static int try_to_grab_pending(struct work_struct *work) * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - gcwq = cwq->gcwq; spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); - cwq_dec_nr_in_flight(cwq, get_work_color(work)); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); ret = 1; } } @@ -1489,20 +1538,16 @@ static int try_to_grab_pending(struct work_struct *work) return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { - struct global_cwq *gcwq = cwq->gcwq; struct wq_barrier barr; struct worker *worker; spin_lock_irq(&gcwq->lock); - worker = NULL; - if (unlikely(cwq->worker && cwq->worker->current_work == work)) { - worker = cwq->worker; - insert_wq_barrier(cwq, &barr, work, worker); - } + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); spin_unlock_irq(&gcwq->lock); @@ -1514,8 +1559,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; int cpu; might_sleep(); @@ -1523,14 +1566,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - for_each_possible_cpu(cpu) - wait_on_cpu_work(get_cwq(cpu, wq), work); + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -1545,7 +1582,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - clear_wq_data(work); + clear_work_data(work); return ret; } @@ -1647,7 +1684,7 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - __queue_work(get_cpu(), get_wq_data(&dwork->work)->wq, + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, &dwork->work); put_cpu(); } @@ -2405,6 
+2442,14 @@ void __init init_workqueues(void) unsigned int cpu; int i; + /* + * The pointer part of work->data is either pointing to the + * cwq or contains the cpu number the work ran last on. Make + * sure cpu number won't overflow into kernel pointer area so + * that they can be distinguished. + */ + BUILD_BUG_ON(NR_CPUS << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET); + hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ -- cgit v1.2.3 From 18aa9effad4adb2c1efe123af4eb24fec9f59b30 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:13 +0200 Subject: workqueue: implement WQ_NON_REENTRANT With gcwq managing all the workers and work->data pointing to the last gcwq it was on, non-reentrance can be easily implemented by checking whether the work is still running on the previous gcwq on queueing. Implement it. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 32 +++++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0a7814131e66..07cf5e5f91cb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -225,6 +225,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } enum { WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ + WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ }; extern struct workqueue_struct * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c68277c204ab..bce1074bdec1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -534,11 +534,37 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* determine gcwq to use */ + /* + * Determine gcwq to use. SINGLE_CPU is inherently + * NON_REENTRANT, so test it first. + */ if (!(wq->flags & WQ_SINGLE_CPU)) { - /* just use the requested cpu for multicpu workqueues */ + struct global_cwq *last_gcwq; + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ gcwq = get_gcwq(cpu); - spin_lock_irqsave(&gcwq->lock, flags); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); } else { unsigned int req_cpu = cpu; -- cgit v1.2.3 From e22bee782b3b00bd4534ae9b1c5fb2e8e6573c5c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: implement concurrency managed dynamic worker pool Instead of creating a worker for each cwq and putting it into the shared pool, manage per-cpu workers dynamically. Works aren't supposed to be cpu cycle hogs and maintaining just enough concurrency to prevent work processing from stalling due to lack of processing context is optimal. gcwq keeps the number of concurrent active workers to minimum but no less. 
As long as one or more workers are running on the cpu, no new worker is scheduled, so that works can be processed in batches as much as possible; but when the last running worker blocks, the gcwq immediately schedules a new worker so that the cpu doesn't sit idle while there are works to be processed. The gcwq always keeps at least a single idle worker around. When a new worker is necessary and the worker is the last idle one, that worker assumes the role of "manager" and manages the worker pool - ie. it creates another worker. Forward progress is guaranteed by dedicated rescue workers for workqueues whose works may be necessary while creating a new worker. When the manager has trouble creating a new worker, the mayday timer activates and rescue workers are summoned to the cpu to execute works which might be necessary to create new workers. The trustee is expanded to serve the role of manager while a CPU is being taken down and stays down. As no new works are supposed to be queued on a dead cpu, it just needs to drain all the existing ones. The trustee continues to try to create new workers and summon rescuers as long as there are pending works. If the CPU is brought back up while the trustee is still trying to drain the gcwq from the previous offlining, the trustee kills all idle workers and tells workers which are still busy to rebind to the cpu, then passes control over to the gcwq, which assumes the manager role as necessary. The concurrency managed worker pool drastically reduces the number of workers: only workers which are necessary to keep the processing going are created and kept. It also reduces cache footprint by avoiding unnecessary context switches between different workers. Please note that this patch does not increase the max_active of any workqueue. All workqueues can still only process one work per cpu.
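For illustration only (not part of the patch): a minimal userspace model of the concurrency policy described above. The function names follow the patch, but the plain struct and int counters are simplifications - in the patch, nr_running is a per-cpu atomic_t maintained from scheduler hooks, and the remaining fields live in struct global_cwq under gcwq->lock.

/* toy_gcwq.c - toy model of the gcwq worker-pool policy functions */
#include <stdbool.h>
#include <stdio.h>

struct toy_gcwq {
	int nr_pending;	/* works sitting on ->worklist */
	int nr_running;	/* workers currently executing, not blocked */
	int nr_idle;	/* workers parked on ->idle_list */
};

/* wake an idle worker only when work is pending and nobody is running */
static bool need_more_worker(const struct toy_gcwq *g)
{
	return g->nr_pending > 0 && g->nr_running == 0;
}

/* a running worker keeps going while it is effectively the only runner */
static bool keep_working(const struct toy_gcwq *g)
{
	return g->nr_pending > 0 && g->nr_running <= 1;
}

/* the manager must create a worker if no idle one is left to take over */
static bool need_to_create_worker(const struct toy_gcwq *g)
{
	return need_more_worker(g) && g->nr_idle == 0;
}

int main(void)
{
	struct toy_gcwq g = { .nr_pending = 4, .nr_running = 1, .nr_idle = 1 };

	printf("wake another worker? %d\n", need_more_worker(&g));	/* 0: one runs */
	printf("keep working?        %d\n", keep_working(&g));		/* 1 */

	g.nr_running = 0;	/* the lone running worker just blocked on I/O */
	printf("wake another worker? %d\n", need_more_worker(&g));	/* 1 */

	g.nr_idle = 0;		/* no idle worker left to wake */
	printf("create a worker?     %d\n", need_to_create_worker(&g));	/* 1 */
	return 0;
}

The effect matches the description above: a worker is woken or created only at the moment the cpu would otherwise go idle while works are still pending.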
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 +- kernel/workqueue.c | 936 +++++++++++++++++++++++++++++++++++++++++----- kernel/workqueue_sched.h | 13 +- 3 files changed, 841 insertions(+), 116 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 07cf5e5f91cb..b8f4ec45c40a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -226,6 +226,7 @@ enum { WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ + WQ_RESCUER = 1 << 3, /* has an rescue worker */ }; extern struct workqueue_struct * @@ -252,11 +253,12 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active, #endif #define create_workqueue(name) \ - __create_workqueue((name), 0, 1) + __create_workqueue((name), WQ_RESCUER, 1) #define create_freezeable_workqueue(name) \ - __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU, 1) + __create_workqueue((name), \ + WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1) #define create_singlethread_workqueue(name) \ - __create_workqueue((name), WQ_SINGLE_CPU, 1) + __create_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c31fde092c6..0ad46523b423 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -34,17 +34,25 @@ #include #include #include -#include + +#include "workqueue_sched.h" enum { /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ GCWQ_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND, /* gcwq->trustee_state */ TRUSTEE_START = 0, /* start */ @@ -57,7 +65,19 @@ enum { BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, }; /* @@ -65,8 +85,16 @@ enum { * * I: Set during initialization and read-only afterwards. * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * * L: gcwq->lock protected. Access with gcwq->lock held. * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * While trustee is in charge, it's identical to L. + * * F: wq->flush_mutex protected. * * W: workqueue_lock protected. @@ -74,6 +102,10 @@ enum { struct global_cwq; +/* + * The poor guys doing the actual heavy lifting. 
All on-duty workers + * are either serving the manager role, on idle list or on busy hash. + */ struct worker { /* on idle list while idle, on busy hash table while busy */ union { @@ -86,12 +118,17 @@ struct worker { struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct global_cwq *gcwq; /* I: the associated gcwq */ - unsigned int flags; /* L: flags */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ }; /* - * Global per-cpu workqueue. + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ @@ -103,15 +140,19 @@ struct global_cwq { int nr_idle; /* L: currently idle ones */ /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* L: list of idle workers */ + struct list_head idle_list; /* X: list of idle workers */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + struct ida worker_ida; /* L: for worker IDs */ struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ } ____cacheline_aligned_in_smp; /* @@ -121,7 +162,6 @@ struct global_cwq { */ struct cpu_workqueue_struct { struct global_cwq *gcwq; /* I: the associated gcwq */ - struct worker *worker; struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -160,6 +200,9 @@ struct workqueue_struct { unsigned long single_cpu; /* cpu for single cpu wq */ + cpumask_var_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + int saved_max_active; /* I: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP @@ -286,7 +329,13 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ +/* + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. + */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); static int worker_thread(void *__worker); @@ -295,6 +344,11 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &per_cpu(global_cwq, cpu); } +static atomic_t *get_gcwq_nr_running(unsigned int cpu) +{ + return &per_cpu(gcwq_nr_running, cpu); +} + static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { @@ -385,6 +439,63 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return get_gcwq(cpu); } +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. 
+ */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && !atomic_read(nr_running); +} + +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) +{ + return gcwq->nr_idle; +} + +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; + + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; +} + +/* + * Wake up functions. + */ + /* Return the first worker. Safe with preemption disabled */ static struct worker *first_worker(struct global_cwq *gcwq) { @@ -412,12 +523,77 @@ static void wake_up_worker(struct global_cwq *gcwq) } /** - * worker_set_flags - set worker flags + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? 
to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: worker to set flags for * @flags: flags to set * @wakeup: wakeup an idle worker if necessary * - * Set @flags in @worker->flags. + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. * * LOCKING: * spin_lock_irq(gcwq->lock). @@ -425,22 +601,49 @@ static void wake_up_worker(struct global_cwq *gcwq) static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { + struct global_cwq *gcwq = worker->gcwq; + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + worker->flags |= flags; } /** - * worker_clr_flags - clear worker flags + * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: worker to set flags for * @flags: flags to clear * - * Clear @flags in @worker->flags. + * Clear @flags in @worker->flags and adjust nr_running accordingly. * * LOCKING: * spin_lock_irq(gcwq->lock). */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); } /** @@ -540,6 +743,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { + struct global_cwq *gcwq = cwq->gcwq; + /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -550,7 +755,16 @@ static void insert_work(struct cpu_workqueue_struct *cwq, smp_wmb(); list_add_tail(&work->entry, head); - wake_up_worker(cwq->gcwq); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. 
+ */ + smp_mb(); + + if (!atomic_read(get_gcwq_nr_running(gcwq->cpu))) + wake_up_worker(gcwq); } /** @@ -810,11 +1024,16 @@ static void worker_enter_idle(struct worker *worker) worker_set_flags(worker, WORKER_IDLE, false); gcwq->nr_idle++; + worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &gcwq->idle_list); - if (unlikely(worker->flags & WORKER_ROGUE)) + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else wake_up_all(&gcwq->trustee_wait); } @@ -837,6 +1056,81 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { + /* + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. + */ + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. 
+ */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -845,6 +1139,9 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; } return worker; } @@ -963,6 +1260,220 @@ static void destroy_worker(struct worker *worker) ida_remove(&gcwq->worker_ida, id); } +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); + } + } + + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + if (!cpumask_test_and_set_cpu(cwq->gcwq->cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); + } + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); +} + +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. 
+ */ +static bool maybe_create_worker(struct global_cwq *gcwq) +{ + if (!need_to_create_worker(gcwq)) + return false; +restart: + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + spin_unlock_irq(&gcwq->lock); + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; + } + + if (!need_to_create_worker(gcwq)) + break; + + spin_unlock_irq(&gcwq->lock); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); + spin_lock_irq(&gcwq->lock); + if (!need_to_create_worker(gcwq)) + break; + } + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; +} + +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) +{ + bool ret = false; + + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); + break; + } + + destroy_worker(worker); + ret = true; + } + + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled @@ -1169,24 +1680,39 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The cwq worker thread function. 
+ * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct global_cwq *gcwq = worker->gcwq; + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; woke_up: spin_lock_irq(&gcwq->lock); /* DIE can be set only while we're idle, checking here is enough */ if (worker->flags & WORKER_DIE) { spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; return 0; } worker_leave_idle(worker); recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. @@ -1194,27 +1720,18 @@ recheck: */ BUG_ON(!list_empty(&worker->scheduled)); - while (!list_empty(&gcwq->worklist)) { + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. + */ + worker_clr_flags(worker, WORKER_PREP); + + do { struct work_struct *work = list_first_entry(&gcwq->worklist, struct work_struct, entry); - /* - * The following is a rather inefficient way to close - * race window against cpu hotplug operations. Will - * be replaced soon. - */ - if (unlikely(!(worker->flags & WORKER_ROGUE) && - !cpumask_equal(&worker->task->cpus_allowed, - get_cpu_mask(gcwq->cpu)))) { - spin_unlock_irq(&gcwq->lock); - set_cpus_allowed_ptr(worker->task, - get_cpu_mask(gcwq->cpu)); - cpu_relax(); - spin_lock_irq(&gcwq->lock); - goto recheck; - } - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -1224,13 +1741,19 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; +sleep: /* - * gcwq->lock is held and there's no work to process, sleep. - * Workers are woken up only while holding gcwq->lock, so - * setting the current state before releasing gcwq->lock is - * enough to prevent losing any event. + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. */ worker_enter_idle(worker); __set_current_state(TASK_INTERRUPTIBLE); @@ -1239,6 +1762,68 @@ recheck: goto woke_up; } +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. 
+ * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + for_each_cpu(cpu, wq->mayday_mask) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + cpumask_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -1998,7 +2583,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, const char *lock_name) { struct workqueue_struct *wq; - bool failed = false; unsigned int cpu; max_active = clamp_val(max_active, 1, INT_MAX); @@ -2023,13 +2607,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - cpu_maps_update_begin(); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ for_each_possible_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); @@ -2040,14 +2617,25 @@ struct workqueue_struct *__create_workqueue_key(const char *name, cwq->flush_color = -1; cwq->max_active = max_active; INIT_LIST_HEAD(&cwq->delayed_works); + } - if (failed) - continue; - cwq->worker = create_worker(gcwq, cpu_online(cpu)); - if (cwq->worker) - start_worker(cwq->worker); - else - failed = true; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_cpumask_var(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + wq->rescuer = rescuer; + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } /* @@ -2065,16 +2653,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name, spin_unlock(&workqueue_lock); - cpu_maps_update_done(); - - if (failed) { - destroy_workqueue(wq); - wq = NULL; - } return wq; err: if (wq) { free_cwqs(wq->cpu_wq); + free_cpumask_var(wq->mayday_mask); + kfree(wq->rescuer); kfree(wq); } return NULL; @@ -2097,42 +2681,26 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. 
*/ - cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - cpu_maps_update_done(); + /* sanity check */ for_each_possible_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; int i; - if (cwq->worker) { - retry: - spin_lock_irq(&gcwq->lock); - /* - * Worker can only be destroyed while idle. - * Wait till it becomes idle. This is ugly - * and prone to starvation. It will go away - * once dynamic worker pool is implemented. - */ - if (!(cwq->worker->flags & WORKER_IDLE)) { - spin_unlock_irq(&gcwq->lock); - msleep(100); - goto retry; - } - destroy_worker(cwq->worker); - cwq->worker = NULL; - spin_unlock_irq(&gcwq->lock); - } - for (i = 0; i < WORK_NR_COLORS; i++) BUG_ON(cwq->nr_in_flight[i]); BUG_ON(cwq->nr_active); BUG_ON(!list_empty(&cwq->delayed_works)); } + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_cpumask_var(wq->mayday_mask); + } + free_cwqs(wq->cpu_wq); kfree(wq); } @@ -2141,10 +2709,18 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); /* * CPU hotplug. * - * CPU hotplug is implemented by allowing cwqs to be detached from - * CPU, running with unbound workers and allowing them to be - * reattached later if the cpu comes back online. A separate thread - * is created to govern cwqs in such state and is called the trustee. + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. * * Trustee states and their descriptions. * @@ -2152,11 +2728,12 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * new trustee is started with this state. * * IN_CHARGE Once started, trustee will enter this state after - * making all existing workers rogue. DOWN_PREPARE waits - * for trustee to enter this state. After reaching - * IN_CHARGE, trustee tries to execute the pending - * worklist until it's empty and the state is set to - * BUTCHER, or the state is set to RELEASE. + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. * * BUTCHER Command state which is set by the cpu callback after * the cpu has went down. Once this state is set trustee @@ -2167,7 +2744,9 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * RELEASE Command state which is set by the cpu callback if the * cpu down has been canceled or it has come online * again. After recognizing this state, trustee stops - * trying to drain or butcher and transits to DONE. + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. * * DONE Trustee will enter this state after BUTCHER or RELEASE * is complete. 
@@ -2233,17 +2812,24 @@ static int __cpuinit trustee_thread(void *__gcwq) { struct global_cwq *gcwq = __gcwq; struct worker *worker; + struct work_struct *work; struct hlist_node *pos; + long rc; int i; BUG_ON(gcwq->cpu != smp_processor_id()); spin_lock_irq(&gcwq->lock); /* - * Make all workers rogue. Trustee must be bound to the - * target cpu and can't be cancelled. + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. */ BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; list_for_each_entry(worker, &gcwq->idle_list, entry) worker_set_flags(worker, WORKER_ROGUE, false); @@ -2251,6 +2837,28 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_busy_worker(worker, i, pos, gcwq) worker_set_flags(worker, WORKER_ROGUE, false); + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. gcwq->nr_running should + * be zero and will stay that way, making need_more_worker() + * and keep_working() always return true as long as the + * worklist is not empty. + */ + WARN_ON_ONCE(atomic_read(get_gcwq_nr_running(gcwq->cpu)) != 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + /* * We're now in charge. Notify and proceed to drain. We need * to keep the gcwq running during the whole CPU down @@ -2263,18 +2871,90 @@ static int __cpuinit trustee_thread(void *__gcwq) /* * The original cpu is in the process of dying and may go away * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. - * Note that if the gcwq is frozen, there may be frozen works - * in freezeable cwqs. Don't declare completion while frozen. + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. */ while (gcwq->nr_workers != gcwq->nr_idle || gcwq->flags & GCWQ_FREEZING || gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker_set_flags(worker, WORKER_ROGUE, false); + start_worker(worker); + } + } + /* give a breather */ if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) break; } + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. 
+ */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker_set_flags(worker, WORKER_REBIND, false); + worker_clr_flags(worker, WORKER_ROGUE); + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, keventd_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + /* notify completion */ gcwq->trustee = NULL; gcwq->trustee_state = TRUSTEE_DONE; @@ -2313,10 +2993,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct task_struct *new_trustee = NULL; - struct worker *worker; - struct hlist_node *pos; + struct worker *uninitialized_var(new_worker); unsigned long flags; - int i; action &= ~CPU_TASKS_FROZEN; @@ -2327,6 +3005,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, if (IS_ERR(new_trustee)) return notifier_from_errno(PTR_ERR(new_trustee)); kthread_bind(new_trustee, cpu); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; + } } /* some are called w/ irq disabled, don't disturb irq status */ @@ -2340,26 +3027,50 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_START; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; break; case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; break; case CPU_DOWN_FAILED: case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; if (gcwq->trustee_state != TRUSTEE_DONE) { gcwq->trustee_state = TRUSTEE_RELEASE; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_DONE); } - /* clear ROGUE from all workers */ - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker_clr_flags(worker, WORKER_ROGUE); - - for_each_busy_worker(worker, i, pos, gcwq) - worker_clr_flags(worker, WORKER_ROGUE); + /* + * Trustee is done and there might be no worker left. 
+ * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; break; } @@ -2548,10 +3259,10 @@ void thaw_workqueues(void) if (wq->single_cpu == gcwq->cpu && !cwq->nr_active && list_empty(&cwq->delayed_works)) cwq_unbind_single_cpu(cwq); - - wake_up_process(cwq->worker->task); } + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); } @@ -2588,12 +3299,31 @@ void __init init_workqueues(void) for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + ida_init(&gcwq->worker_ida); gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); } + /* create the initial worker */ + for_each_online_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h index af040babb742..2d10fc98dc79 100644 --- a/kernel/workqueue_sched.h +++ b/kernel/workqueue_sched.h @@ -4,13 +4,6 @@ * Scheduler hooks for concurrency managed workqueue. Only to be * included from sched.c and workqueue.c. */ -static inline void wq_worker_waking_up(struct task_struct *task, - unsigned int cpu) -{ -} - -static inline struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) -{ - return NULL; -} +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); -- cgit v1.2.3 From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: increase max_active of keventd and kill current_is_keventd() Define WQ_MAX_ACTIVE and create keventd with max_active set to half of it which means that keventd now can process upto WQ_MAX_ACTIVE / 2 - 1 works concurrently. Unless some combination can result in dependency loop longer than max_active, deadlock won't happen and thus it's unnecessary to check whether current_is_keventd() before trying to schedule a work. Kill current_is_keventd(). (Lockdep annotations are broken. We need lock_map_acquire_read_norecurse()) Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Tony Luck Cc: Andi Kleen Cc: Oleg Nesterov --- arch/ia64/kernel/smpboot.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- include/linux/workqueue.h | 4 ++- kernel/workqueue.c | 63 ++++++++++------------------------------------ 4 files changed, 18 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 6a1380e90f87..99dcc85193c9 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -519,7 +519,7 @@ do_boot_cpu (int sapicid, int cpu) /* * We can't use kernel_thread since we must avoid to reschedule the child. 
*/ - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..4d90f376e985 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b8f4ec45c40a..33e24e734d50 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -227,6 +227,9 @@ enum { WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ WQ_RESCUER = 1 << 3, /* has an rescue worker */ + + WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ + WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; extern struct workqueue_struct * @@ -280,7 +283,6 @@ extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay) extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern int current_is_keventd(void); extern int keventd_up(void); extern void init_workqueues(void); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ad46523b423..4190e84cf995 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2398,7 +2398,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); @@ -2407,23 +2406,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -2494,41 +2482,6 @@ int keventd_up(void) return keventd_wq != NULL; } -int current_is_keventd(void) -{ - bool found = false; - unsigned int cpu; - - /* - * There no longer is one-to-one relation between worker and - * work queue and a worker task might be unbound from its cpu - * if the cpu was offlined. Match all busy workers. This - * function will go away once dynamic pool is implemented. 
- */ - for_each_possible_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; - struct hlist_node *pos; - unsigned long flags; - int i; - - spin_lock_irqsave(&gcwq->lock, flags); - - for_each_busy_worker(worker, i, pos, gcwq) { - if (worker->task == current) { - found = true; - break; - } - } - - spin_unlock_irqrestore(&gcwq->lock, flags); - if (found) - break; - } - - return found; -} - static struct cpu_workqueue_struct *alloc_cwqs(void) { /* @@ -2576,6 +2529,16 @@ static void free_cwqs(struct cpu_workqueue_struct *cwqs) #endif } +static int wq_clamp_max_active(int max_active, const char *name) +{ + if (max_active < 1 || max_active > WQ_MAX_ACTIVE) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, WQ_MAX_ACTIVE); + + return clamp_val(max_active, 1, WQ_MAX_ACTIVE); +} + struct workqueue_struct *__create_workqueue_key(const char *name, unsigned int flags, int max_active, @@ -2585,7 +2548,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, struct workqueue_struct *wq; unsigned int cpu; - max_active = clamp_val(max_active, 1, INT_MAX); + max_active = wq_clamp_max_active(max_active, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) @@ -3324,6 +3287,6 @@ void __init init_workqueues(void) spin_unlock_irq(&gcwq->lock); } - keventd_wq = create_workqueue("events"); + keventd_wq = __create_workqueue("events", 0, WQ_DFL_ACTIVE); BUG_ON(!keventd_wq); } -- cgit v1.2.3 From d320c03830b17af64e4547075003b1eeb274bc6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: s/__create_workqueue()/alloc_workqueue()/, and add system workqueues This patch makes changes to make new workqueue features available to its users. * Now that workqueue is more featureful, there should be a public workqueue creation function which takes paramters to control them. Rename __create_workqueue() to alloc_workqueue() and make 0 max_active mean WQ_DFL_ACTIVE. In the long run, all create_workqueue_*() will be converted over to alloc_workqueue(). * To further unify access interface, rename keventd_wq to system_wq and export it. * Add system_long_wq and system_nrt_wq. The former is to host long running works separately (so that flush_scheduled_work() dosen't take so long) and the latter guarantees any queued work item is never executed in parallel by multiple CPUs. These will be used by future patches to update workqueue users. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 40 +++++++++++++++++++++++++++++----------- kernel/workqueue.c | 42 +++++++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 33e24e734d50..48b7422f25ae 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -232,12 +232,31 @@ enum { WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; +/* + * System-wide workqueues which are always present. + * + * system_wq is the one used by schedule[_delayed]_work[_on](). + * Multi-CPU multi-threaded. There are users which expect relatively + * short queue flush time. Don't queue works which can run for too + * long. + * + * system_long_wq is similar to system_wq but may host long running + * works. Queue flushing might take relatively long. + * + * system_nrt_wq is non-reentrant and guarantees that any given work + * item is never executed in parallel by multiple CPUs. 
Queue + * flushing might take relatively long. + */ +extern struct workqueue_struct *system_wq; +extern struct workqueue_struct *system_long_wq; +extern struct workqueue_struct *system_nrt_wq; + extern struct workqueue_struct * -__create_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name); #ifdef CONFIG_LOCKDEP -#define __create_workqueue(name, flags, max_active) \ +#define alloc_workqueue(name, flags, max_active) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ @@ -247,21 +266,20 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active, else \ __lock_name = #name; \ \ - __create_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((name), (flags), (max_active), \ + &__key, __lock_name); \ }) #else -#define __create_workqueue(name, flags, max_active) \ - __create_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(name, flags, max_active) \ + __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) #endif #define create_workqueue(name) \ - __create_workqueue((name), WQ_RESCUER, 1) + alloc_workqueue((name), WQ_RESCUER, 1) #define create_freezeable_workqueue(name) \ - __create_workqueue((name), \ - WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1) + alloc_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1) #define create_singlethread_workqueue(name) \ - __create_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1) + alloc_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4190e84cf995..16ce617974d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -210,6 +210,13 @@ struct workqueue_struct { #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -2306,8 +2313,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -2321,7 +2326,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -2334,7 +2339,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -2349,7 +2354,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -2382,7 +2387,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work 
*dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -2447,7 +2452,7 @@ int schedule_on_each_cpu(work_func_t func) */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -2479,7 +2484,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } static struct cpu_workqueue_struct *alloc_cwqs(void) @@ -2539,15 +2544,16 @@ static int wq_clamp_max_active(int max_active, const char *name) return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } -struct workqueue_struct *__create_workqueue_key(const char *name, - unsigned int flags, - int max_active, - struct lock_class_key *key, - const char *lock_name) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { struct workqueue_struct *wq; unsigned int cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); @@ -2626,7 +2632,7 @@ err: } return NULL; } -EXPORT_SYMBOL_GPL(__create_workqueue_key); +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -2910,7 +2916,7 @@ static int __cpuinit trustee_thread(void *__gcwq) continue; debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, keventd_wq), rebind_work, + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } @@ -3287,6 +3293,8 @@ void __init init_workqueues(void) spin_unlock_irq(&gcwq->lock); } - keventd_wq = __create_workqueue("events", 0, WQ_DFL_ACTIVE); - BUG_ON(!keventd_wq); + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); } -- cgit v1.2.3 From dcd989cb73ab0f7b722d64ab6516f101d9f43f88 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: implement several utility APIs Implement the following utility APIs. workqueue_set_max_active() : adjust max_active of a wq workqueue_congested() : test whether a wq is congested work_cpu() : determine the last / current cpu of a work work_busy() : query whether a work is busy * Anton Blanchard fixed missing ret initialization in work_busy().
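To make the semantics of these helpers concrete, here is a minimal usage sketch (an editor's illustration, not part of the patch: io_wq, io_work and io_work_fn are invented names, and the max_active value of 16 is arbitrary). It relies on the earlier patch in this series making a max_active of 0 mean WQ_DFL_ACTIVE.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static struct workqueue_struct *io_wq;
static struct work_struct io_work;

static void io_work_fn(struct work_struct *work)
{
        /* real completion handling would go here */
}

static int __init io_example_init(void)
{
        io_wq = alloc_workqueue("io_wq", 0, 0); /* 0 -> WQ_DFL_ACTIVE */
        if (!io_wq)
                return -ENOMEM;

        INIT_WORK(&io_work, io_work_fn);
        queue_work(io_wq, &io_work);

        /* raise the concurrency limit at runtime */
        workqueue_set_max_active(io_wq, 16);

        /* both tests below are unsynchronized and advisory only */
        if (workqueue_congested(raw_smp_processor_id(), io_wq))
                pr_info("io_wq: delayed works queued on this cpu\n");
        if (work_busy(&io_work) & WORK_BUSY_RUNNING)
                pr_info("io_work running, last cpu %u\n", work_cpu(&io_work));

        return 0;
}

As the kerneldoc comments in the diff below stress, workqueue_congested() and work_busy() take no locks on behalf of the caller, so their results may be stale by the time they are acted upon; they are hints and debugging aids, not synchronization primitives.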
Signed-off-by: Tejun Heo Cc: Anton Blanchard --- include/linux/workqueue.h | 11 ++++- kernel/workqueue.c | 108 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 48b7422f25ae..0a7f79729380 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -61,6 +61,10 @@ enum { WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, WORK_STRUCT_NO_CPU = NR_CPUS << WORK_STRUCT_FLAG_BITS, + + /* bit mask for work_busy() return values */ + WORK_BUSY_PENDING = 1 << 0, + WORK_BUSY_RUNNING = 1 << 1, }; struct work_struct { @@ -307,9 +311,14 @@ extern void init_workqueues(void); int execute_in_process_context(work_func_t fn, struct execute_work *); extern int flush_work(struct work_struct *work); - extern int cancel_work_sync(struct work_struct *work); +extern void workqueue_set_max_active(struct workqueue_struct *wq, + int max_active); +extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); +extern unsigned int work_cpu(struct work_struct *work); +extern unsigned int work_busy(struct work_struct *work); + /* * Kill off a pending schedule_delayed_work(). Note that the work callback * function may still be running on return from cancel_delayed_work(), unless diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16ce617974d2..c1aa65c2ff38 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -203,7 +203,7 @@ struct workqueue_struct { cpumask_var_t mayday_mask; /* cpus requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int saved_max_active; /* I: saved cwq max_active */ + int saved_max_active; /* W: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -2675,6 +2675,112 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_possible_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. 
+ */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. NR_CPUS otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : NR_CPUS; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + /* * CPU hotplug. * -- cgit v1.2.3 From 649027d73a6309ac34dc2886362e662bd73456dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: implement high priority workqueue This patch implements high priority workqueue which can be specified with WQ_HIGHPRI flag on creation. A high priority workqueue has the following properties. * A work queued to it is queued at the head of the worklist of the respective gcwq after other highpri works, while normal works are always appended at the end. * As long as there are highpri works on gcwq->worklist, [__]need_more_worker() remains %true and process_one_work() wakes up another worker before it start executing a work. The above two properties guarantee that works queued to high priority workqueues are dispatched to workers and start execution as soon as possible regardless of the state of other works. Signed-off-by: Tejun Heo Cc: Andi Kleen Cc: Andrew Morton --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 70 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0a7f79729380..006dcf7e808a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -231,6 +231,7 @@ enum { WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ WQ_RESCUER = 1 << 3, /* has an rescue worker */ + WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? 
*/ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c1aa65c2ff38..5775717288d5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -43,6 +43,7 @@ enum { GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -452,15 +453,19 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) * assume that they're being called with gcwq->lock held. */ +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + /* * Need to wake up a worker? Called from anything but currently * running workers. */ static bool need_more_worker(struct global_cwq *gcwq) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - - return !list_empty(&gcwq->worklist) && !atomic_read(nr_running); + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); } /* Can I start working? Called from busy but !running workers. */ @@ -733,6 +738,43 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, work); } +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. 
+ */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + /** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to @@ -770,7 +812,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (!atomic_read(get_gcwq_nr_running(gcwq->cpu))) + if (__need_more_worker(gcwq)) wake_up_worker(gcwq); } @@ -887,7 +929,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { cwq->nr_active++; - worklist = &gcwq->worklist; + worklist = gcwq_determine_ins_pos(gcwq, cwq); } else worklist = &cwq->delayed_works; @@ -1526,8 +1568,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); - move_linked_works(work, &cwq->gcwq->worklist, NULL); + move_linked_works(work, pos, NULL); cwq->nr_active++; } @@ -1634,6 +1677,21 @@ static void process_one_work(struct worker *worker, struct work_struct *work) set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); -- cgit v1.2.3 From fb0e7beb5c1b6fb4da786ba709d7138373d5fb22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:15 +0200 Subject: workqueue: implement cpu intensive workqueue This patch implements cpu intensive workqueue which can be specified with WQ_CPU_INTENSIVE flag on creation. Works queued to a cpu intensive workqueue don't participate in concurrency management. IOW, it doesn't contribute to gcwq->nr_running and thus doesn't delay excution of other works. Note that although cpu intensive works won't delay other works, they can be delayed by other works. Combine with WQ_HIGHPRI to avoid being delayed by other works too. As the name suggests this is useful when using workqueue for cpu intensive works. Workers executing cpu intensive works are not considered for workqueue concurrency management and left for the scheduler to manage. 
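Since WQ_CPU_INTENSIVE combines with WQ_HIGHPRI as noted above, a CPU-bound job that should neither delay nor be delayed by regular works can be set up as in the following sketch (an editor's illustration, not part of the patch; crypt_wq, crypt_work and crypt_work_fn are invented names):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *crypt_wq;
static struct work_struct crypt_work;

static void crypt_work_fn(struct work_struct *work)
{
        /*
         * Long CPU burn.  The executing worker carries
         * WORKER_CPU_INTENSIVE and is ignored by concurrency
         * management, so other works on this cpu keep flowing.
         */
}

static int __init crypt_example_init(void)
{
        crypt_wq = alloc_workqueue("crypt",
                                   WQ_HIGHPRI | WQ_CPU_INTENSIVE, 0);
        if (!crypt_wq)
                return -ENOMEM;

        INIT_WORK(&crypt_work, crypt_work_fn);
        queue_work(crypt_wq, &crypt_work);
        return 0;
}

Without WQ_HIGHPRI the work could still queue behind earlier normal works on the same gcwq, which is exactly the caveat given in the message above.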
Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 006dcf7e808a..3f36d37ac5ba 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -232,6 +232,7 @@ enum { WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ WQ_RESCUER = 1 << 3, /* has an rescue worker */ WQ_HIGHPRI = 1 << 4, /* high priority */ + WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5775717288d5..6fa847c5c5e9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,8 +52,10 @@ enum { WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE, /* gcwq->trustee_state */ TRUSTEE_START = 0, /* start */ @@ -1641,6 +1643,7 @@ static void process_one_work(struct worker *worker, struct work_struct *work) struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct global_cwq *gcwq = cwq->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; int work_color; struct worker *collision; @@ -1692,6 +1695,13 @@ static void process_one_work(struct worker *worker, struct work_struct *work) gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; } + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -1713,6 +1723,10 @@ static void process_one_work(struct worker *worker, struct work_struct *work) spin_lock_irq(&gcwq->lock); + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* we're done with it, release */ hlist_del_init(&worker->hentry); worker->current_work = NULL; -- cgit v1.2.3 From 2ec57d448b2e8fcfba539a46701b43f14f037f17 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 29 Jun 2010 12:02:01 +1000 Subject: sched: Fix spelling of sibling No logic changes, only spelling. 
Signed-off-by: Michael Neuling Cc: linuxppc-dev@ozlabs.org Cc: David Howells Cc: Peter Zijlstra LKML-Reference: <15249.1277776921@neuling.org> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/process.c | 2 +- include/linux/topology.h | 2 +- kernel/sched_fair.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9b41ece010b6..22f08cb7e7d1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1270,7 +1270,7 @@ unsigned long randomize_et_dyn(unsigned long base) } #ifdef CONFIG_SMP -int arch_sd_sibiling_asym_packing(void) +int arch_sd_sibling_asym_packing(void) { if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); diff --git a/include/linux/topology.h b/include/linux/topology.h index cf57f30d0dcb..b572e432d2f3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -103,7 +103,7 @@ int arch_update_cpu_topology(void); | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ - | arch_sd_sibiling_asym_packing() \ + | arch_sd_sibling_asym_packing() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5e8f98c103fb..b4da534f4b8c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2567,7 +2567,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } while (sg != sd->groups); } -int __weak arch_sd_sibiling_asym_packing(void) +int __weak arch_sd_sibling_asym_packing(void) { return 0*SD_ASYM_PACKING; } -- cgit v1.2.3 From a53f4b61a76a7e95139b8e8abba02e9bfe87a58a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Jul 2010 12:45:34 -0700 Subject: Revert "net: Make accesses to ->br_port safe for sparse RCU" This reverts commit 81bdf5bd7349bd4523538cbd7878f334bc2bfe14, which is obsoleted by commit f350a0a87374 from the net tree. --- include/linux/if_bridge.h | 3 --- net/bridge/br_fdb.c | 2 +- net/bridge/br_private.h | 5 ----- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_ulog.c | 4 ++-- net/bridge/netfilter/ebtables.c | 4 ++-- net/netfilter/nfnetlink_log.c | 4 ++-- net/netfilter/nfnetlink_queue.c | 4 ++-- 8 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d001d782922d..938b7e81df95 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -101,9 +101,6 @@ struct __fdb_entry { #include -/* br_handle_frame_hook() needs the following forward declaration. 
*/ -struct net_bridge_port; - extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 09c479e05623..b01dde35a69e 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -244,7 +244,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) return 0; rcu_read_lock(); - fdb = __br_fdb_get(br_port(dev)->br, addr); + fdb = __br_fdb_get(dev->br_port->br, addr); ret = fdb && fdb->dst->dev != dev && fdb->dst->state == BR_STATE_FORWARDING; rcu_read_unlock(); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3255188355b5..0f4a74bc6a9b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -268,11 +268,6 @@ static inline int br_is_root_bridge(const struct net_bridge *br) return !memcmp(&br->bridge_id, &br->designated_root, 8); } -static inline struct net_bridge_port *br_port(const struct net_device *dev) -{ - return rcu_dereference(dev->br_port); -} - /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern netdev_tx_t br_dev_xmit(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index a39df0ae0f81..9e19166ba453 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -25,7 +25,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) if (par->hooknum != NF_BR_BROUTING) memcpy(eth_hdr(skb)->h_dest, - br_port(par->in)->br->dev->dev_addr, ETH_ALEN); + par->in->br_port->br->dev->dev_addr, ETH_ALEN); else memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 5a4996bbb090..ae3c7cef1484 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -178,7 +178,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, strcpy(pm->physindev, in->name); /* If in isn't a bridge, then physindev==indev */ if (in->br_port) - strcpy(pm->indev, br_port(in)->br->dev->name); + strcpy(pm->indev, in->br_port->br->dev->name); else strcpy(pm->indev, in->name); } else @@ -187,7 +187,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, if (out) { /* If out exists, then out is a bridge port */ strcpy(pm->physoutdev, out->name); - strcpy(pm->outdev, br_port(out)->br->dev->name); + strcpy(pm->outdev, out->br_port->br->dev->name); } else pm->outdev[0] = pm->physoutdev[0] = '\0'; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 4c2aab8cbfc7..59ca00e40dec 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -141,10 +141,10 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h, if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) return 1; if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check( - e->logical_in, br_port(in)->br->dev), EBT_ILOGICALIN)) + e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN)) return 1; if ((!out || !out->br_port) ? 
0 : FWINV2(ebt_dev_check( - e->logical_out, br_port(out)->br->dev), EBT_ILOGICALOUT)) + e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT)) return 1; if (e->bitmask & EBT_SOURCEMAC) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 78957cfa3bdd..fc9a211e629e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -404,7 +404,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(br_port(indev)->br->dev->ifindex)); + htonl(indev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -431,7 +431,7 @@ __build_packet_message(struct nfulnl_instance *inst, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(br_port(outdev)->br->dev->ifindex)); + htonl(outdev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index c3c17498298e..12e1ab37fcd8 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -297,7 +297,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(indev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(br_port(indev)->br->dev->ifindex)); + htonl(indev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -322,7 +322,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(br_port(outdev)->br->dev->ifindex)); + htonl(outdev->br_port->br->dev->ifindex)); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ -- cgit v1.2.3 From ad72cf9885c536e3adae03f8337557ac9dd1e4bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Jul 2010 10:03:52 +0200 Subject: libata: take advantage of cmwq and remove concurrency limitations libata has two concurrency related limitations. a. ata_wq which is used for polling PIO has single thread per CPU. If there are multiple devices doing polling PIO on the same CPU, they can't be executed simultaneously. b. ata_aux_wq which is used for SCSI probing has single thread. In cases where SCSI probing is stalled for extended period of time which is possible for ATAPI devices, this will stall all probing. #a is solved by increasing maximum concurrency of ata_wq. Please note that polling PIO might be used under allocation path and thus needs to be served by a separate wq with a rescuer. #b is solved by using the default wq instead and achieving exclusion via per-port mutex. 
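The per-port mutex mentioned in #b is one instance of a pattern this conversion enables generally: work items that previously relied on a singlethread workqueue for mutual exclusion can instead share a concurrency-managed workqueue and serialize explicitly. A sketch of that pattern (an editor's illustration with invented names; the actual code in the diff below uses ap->scsi_scan_mutex for the same purpose):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

struct my_port {
        struct mutex            scan_mutex;     /* serializes both works */
        struct work_struct      hotplug_work;
        struct work_struct      rescan_work;
};

static void my_hotplug_fn(struct work_struct *work)
{
        struct my_port *p = container_of(work, struct my_port,
                                         hotplug_work);

        mutex_lock(&p->scan_mutex);
        /*
         * May sleep here for a long time.  Only this port is held
         * up; the shared workqueue provides other execution contexts
         * so probing on other ports proceeds concurrently.
         */
        mutex_unlock(&p->scan_mutex);
}

static void my_port_init(struct my_port *p)
{
        mutex_init(&p->scan_mutex);
        INIT_WORK(&p->hotplug_work, my_hotplug_fn);
        /* rescan_work's function takes the same mutex */
}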
Signed-off-by: Tejun Heo Acked-by: Jeff Garzik --- drivers/ata/libata-core.c | 20 +++++--------------- drivers/ata/libata-eh.c | 4 ++-- drivers/ata/libata-scsi.c | 10 ++++++---- drivers/ata/libata-sff.c | 9 +-------- drivers/ata/libata.h | 1 - include/linux/libata.h | 1 + 6 files changed, 15 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ddf8e4862787..4f78741692dc 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -98,8 +98,6 @@ static unsigned long ata_dev_blacklisted(const struct ata_device *dev); unsigned int ata_print_id = 1; -struct workqueue_struct *ata_aux_wq; - struct ata_force_param { const char *name; unsigned int cbl; @@ -5611,6 +5609,7 @@ struct ata_port *ata_port_alloc(struct ata_host *host) ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN; #endif + mutex_init(&ap->scsi_scan_mutex); INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug); INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); INIT_LIST_HEAD(&ap->eh_done_q); @@ -6549,29 +6548,20 @@ static int __init ata_init(void) ata_parse_force_param(); - ata_aux_wq = create_singlethread_workqueue("ata_aux"); - if (!ata_aux_wq) - goto fail; - rc = ata_sff_init(); - if (rc) - goto fail; + if (rc) { + kfree(ata_force_tbl); + return rc; + } printk(KERN_DEBUG "libata version " DRV_VERSION " loaded.\n"); return 0; - -fail: - kfree(ata_force_tbl); - if (ata_aux_wq) - destroy_workqueue(ata_aux_wq); - return rc; } static void __exit ata_exit(void) { ata_sff_exit(); kfree(ata_force_tbl); - destroy_workqueue(ata_aux_wq); } subsys_initcall(ata_init); diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index f77a67303f8b..4d2af824dd23 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -727,7 +727,7 @@ void ata_scsi_error(struct Scsi_Host *host) if (ap->pflags & ATA_PFLAG_LOADING) ap->pflags &= ~ATA_PFLAG_LOADING; else if (ap->pflags & ATA_PFLAG_SCSI_HOTPLUG) - queue_delayed_work(ata_aux_wq, &ap->hotplug_task, 0); + schedule_delayed_work(&ap->hotplug_task, 0); if (ap->pflags & ATA_PFLAG_RECOVERED) ata_port_printk(ap, KERN_INFO, "EH complete\n"); @@ -2944,7 +2944,7 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link, ehc->i.flags |= ATA_EHI_SETMODE; /* schedule the scsi_rescan_device() here */ - queue_work(ata_aux_wq, &(ap->scsi_rescan_task)); + schedule_work(&(ap->scsi_rescan_task)); } else if (dev->class == ATA_DEV_UNKNOWN && ehc->tries[dev->devno] && ata_class_enabled(ehc->classes[dev->devno])) { diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index a54273d2c3c6..d75c9c479d1a 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3435,7 +3435,7 @@ void ata_scsi_scan_host(struct ata_port *ap, int sync) " switching to async\n"); } - queue_delayed_work(ata_aux_wq, &ap->hotplug_task, + queue_delayed_work(system_long_wq, &ap->hotplug_task, round_jiffies_relative(HZ)); } @@ -3582,6 +3582,7 @@ void ata_scsi_hotplug(struct work_struct *work) } DPRINTK("ENTER\n"); + mutex_lock(&ap->scsi_scan_mutex); /* Unplug detached devices. 
We cannot use link iterator here * because PMP links have to be scanned even if PMP is @@ -3595,6 +3596,7 @@ void ata_scsi_hotplug(struct work_struct *work) /* scan for new ones */ ata_scsi_scan_host(ap, 0); + mutex_unlock(&ap->scsi_scan_mutex); DPRINTK("EXIT\n"); } @@ -3673,9 +3675,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, * @work: Pointer to ATA port to perform scsi_rescan_device() * * After ATA pass thru (SAT) commands are executed successfully, - * libata need to propagate the changes to SCSI layer. This - * function must be executed from ata_aux_wq such that sdev - * attach/detach don't race with rescan. + * libata need to propagate the changes to SCSI layer. * * LOCKING: * Kernel thread context (may sleep). @@ -3688,6 +3688,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) struct ata_device *dev; unsigned long flags; + mutex_lock(&ap->scsi_scan_mutex); spin_lock_irqsave(ap->lock, flags); ata_for_each_link(link, ap, EDGE) { @@ -3707,6 +3708,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) } spin_unlock_irqrestore(ap->lock, flags); + mutex_unlock(&ap->scsi_scan_mutex); } /** diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index efa4a18cfb9d..674c1436491f 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -3318,14 +3318,7 @@ void ata_sff_port_init(struct ata_port *ap) int __init ata_sff_init(void) { - /* - * FIXME: In UP case, there is only one workqueue thread and if you - * have more than one PIO device, latency is bloody awful, with - * occasional multi-second "hiccups" as one PIO device waits for - * another. It's an ugly wart that users DO occasionally complain - * about; luckily most users have at most one PIO polled device. - */ - ata_sff_wq = create_workqueue("ata_sff"); + ata_sff_wq = alloc_workqueue("ata_sff", WQ_RESCUER, WQ_MAX_ACTIVE); if (!ata_sff_wq) return -ENOMEM; diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 4b84ed60324a..9ce1ecc63e39 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -54,7 +54,6 @@ enum { }; extern unsigned int ata_print_id; -extern struct workqueue_struct *ata_aux_wq; extern int atapi_passthru16; extern int libata_fua; extern int libata_noacpi; diff --git a/include/linux/libata.h b/include/linux/libata.h index b85f3ff34d7d..f010f18a0f86 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -751,6 +751,7 @@ struct ata_port { struct ata_host *host; struct device *dev; + struct mutex scsi_scan_mutex; struct delayed_work hotplug_task; struct work_struct scsi_rescan_task; -- cgit v1.2.3 From bdbc5dd7de5d07d6c9d3536e598956165a031d4c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Jul 2010 10:03:51 +0200 Subject: workqueue: prepare for WQ_UNBOUND implementation In preparation of WQ_UNBOUND addition, make the following changes. * Add WORK_CPU_* constants for pseudo cpu id numbers used (currently only WORK_CPU_NONE) and use them instead of NR_CPUS. This is to allow another pseudo cpu id for unbound cpu. * Reorder WQ_* flags. * Make workqueue_struct->cpu_wq a union which contains a percpu pointer, regular pointer and an unsigned long value and use kzalloc/kfree() in UP allocation path. This will be used to implement unbound workqueues which will use only one cwq on SMPs. * Move alloc_cwqs() allocation after initialization of wq fields, so that alloc_cwqs() has access to wq->flags. * Trivial relocation of wq local variables in freeze functions. These changes don't cause any functional change. 
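The "kzalloc/kfree() in UP allocation path" item uses a manual-alignment idiom worth spelling out: the cwq must be aligned to 1 << WORK_STRUCT_FLAG_BITS, and kzalloc() makes no such promise, so alloc_cwqs() in the diff below over-allocates, aligns by hand, and stashes the original pointer for later freeing. The same trick shown standalone (an editor's sketch; alloc_aligned/free_aligned are invented helper names, and @size is assumed to be a multiple of sizeof(void *), as it is for the cwq):

#include <linux/kernel.h>       /* PTR_ALIGN() */
#include <linux/slab.h>

static void *alloc_aligned(size_t size, size_t align)
{
        /* room to shift for alignment plus room for a back-pointer */
        void *ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
        void *obj;

        if (!ptr)
                return NULL;
        obj = PTR_ALIGN(ptr, align);
        /* stash the original pointer right after the object */
        *(void **)(obj + size) = ptr;
        return obj;
}

static void free_aligned(void *obj, size_t size)
{
        if (obj)
                kfree(*(void **)(obj + size));
}

free_cwqs() below expresses the same back-pointer slot through the typed pointer: *(void **)(wq->cpu_wq.single + 1) is the location one cwq past the object.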
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 10 ++++-- kernel/workqueue.c | 83 ++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3f36d37ac5ba..139069a6286c 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -50,6 +50,10 @@ enum { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, WORK_NO_COLOR = WORK_NR_COLORS, + /* special cpu IDs */ + WORK_CPU_NONE = NR_CPUS, + WORK_CPU_LAST = WORK_CPU_NONE, + /* * Reserve 6 bits off of cwq pointer w/ debugobjects turned * off. This makes cwqs aligned to 64 bytes which isn't too @@ -60,7 +64,7 @@ enum { WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, - WORK_STRUCT_NO_CPU = NR_CPUS << WORK_STRUCT_FLAG_BITS, + WORK_STRUCT_NO_CPU = WORK_CPU_NONE << WORK_STRUCT_FLAG_BITS, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, @@ -227,9 +231,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) enum { - WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ + WQ_NON_REENTRANT = 1 << 0, /* guarantee non-reentrance */ WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ - WQ_NON_REENTRANT = 1 << 2, /* guarantee non-reentrance */ + WQ_FREEZEABLE = 1 << 2, /* freeze during suspend */ WQ_RESCUER = 1 << 3, /* has an rescue worker */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2eb9fbddf5c6..a105ddf55f79 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -190,7 +190,11 @@ struct wq_flusher { */ struct workqueue_struct { unsigned int flags; /* I: WQ_* flags */ - struct cpu_workqueue_struct *cpu_wq; /* I: cwq's */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -362,7 +366,11 @@ static atomic_t *get_gcwq_nr_running(unsigned int cpu) static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { - return per_cpu_ptr(wq->cpu_wq, cpu); +#ifndef CONFIG_SMP + return wq->cpu_wq.single; +#else + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#endif } static unsigned int work_color_to_flags(int color) @@ -442,7 +450,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return ((struct cpu_workqueue_struct *)data)->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; - if (cpu == NR_CPUS) + if (cpu == WORK_CPU_NONE) return NULL; BUG_ON(cpu >= nr_cpu_ids); @@ -846,7 +854,7 @@ static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq) */ if (likely(!(gcwq->flags & GCWQ_FREEZING))) { smp_wmb(); /* paired with cmpxchg() in __queue_work() */ - wq->single_cpu = NR_CPUS; + wq->single_cpu = WORK_CPU_NONE; } } @@ -904,7 +912,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, */ retry: cpu = wq->single_cpu; - arbitrate = cpu == NR_CPUS; + arbitrate = cpu == WORK_CPU_NONE; if (arbitrate) cpu = req_cpu; @@ -918,7 +926,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * visible on the new cpu after this point. 
*/ if (arbitrate) - cmpxchg(&wq->single_cpu, NR_CPUS, cpu); + cmpxchg(&wq->single_cpu, WORK_CPU_NONE, cpu); if (unlikely(wq->single_cpu != cpu)) { spin_unlock_irqrestore(&gcwq->lock, flags); @@ -2572,7 +2580,7 @@ int keventd_up(void) return system_wq != NULL; } -static struct cpu_workqueue_struct *alloc_cwqs(void) +static int alloc_cwqs(struct workqueue_struct *wq) { /* * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. @@ -2582,40 +2590,36 @@ static struct cpu_workqueue_struct *alloc_cwqs(void) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); - struct cpu_workqueue_struct *cwqs; #ifndef CONFIG_SMP void *ptr; /* - * On UP, percpu allocator doesn't honor alignment parameter - * and simply uses arch-dependent default. Allocate enough - * room to align cwq and put an extra pointer at the end - * pointing back to the originally allocated pointer which - * will be used for free. - * - * FIXME: This really belongs to UP percpu code. Update UP - * percpu code to honor alignment and remove this ugliness. + * Allocate enough room to align cwq and put an extra pointer + * at the end pointing back to the originally allocated + * pointer which will be used for free. */ - ptr = __alloc_percpu(size + align + sizeof(void *), 1); - cwqs = PTR_ALIGN(ptr, align); - *(void **)per_cpu_ptr(cwqs + 1, 0) = ptr; + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } #else - /* On SMP, percpu allocator can do it itself */ - cwqs = __alloc_percpu(size, align); + /* On SMP, percpu allocator can align itself */ + wq->cpu_wq.pcpu = __alloc_percpu(size, align); #endif /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align)); - return cwqs; + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 
0 : -ENOMEM; } -static void free_cwqs(struct cpu_workqueue_struct *cwqs) +static void free_cwqs(struct workqueue_struct *wq) { #ifndef CONFIG_SMP /* on UP, the pointer to free is stored right after the cwq */ - if (cwqs) - free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0)); + if (wq->cpu_wq.single) + kfree(*(void **)(wq->cpu_wq.single + 1)); #else - free_percpu(cwqs); + free_percpu(wq->cpu_wq.pcpu); #endif } @@ -2645,22 +2649,21 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!wq) goto err; - wq->cpu_wq = alloc_cwqs(); - if (!wq->cpu_wq) - goto err; - wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); atomic_set(&wq->nr_cwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->single_cpu = NR_CPUS; + wq->single_cpu = WORK_CPU_NONE; wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); + if (alloc_cwqs(wq) < 0) + goto err; + for_each_possible_cpu(cpu) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); @@ -2710,7 +2713,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, return wq; err: if (wq) { - free_cwqs(wq->cpu_wq); + free_cwqs(wq); free_cpumask_var(wq->mayday_mask); kfree(wq->rescuer); kfree(wq); @@ -2755,7 +2758,7 @@ void destroy_workqueue(struct workqueue_struct *wq) free_cpumask_var(wq->mayday_mask); } - free_cwqs(wq->cpu_wq); + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); @@ -2821,13 +2824,13 @@ EXPORT_SYMBOL_GPL(workqueue_congested); * @work: the work of interest * * RETURNS: - * CPU number if @work was ever queued. NR_CPUS otherwise. + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. */ unsigned int work_cpu(struct work_struct *work) { struct global_cwq *gcwq = get_work_gcwq(work); - return gcwq ? gcwq->cpu : NR_CPUS; + return gcwq ? gcwq->cpu : WORK_CPU_NONE; } EXPORT_SYMBOL_GPL(work_cpu); @@ -3300,7 +3303,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - struct workqueue_struct *wq; unsigned int cpu; spin_lock(&workqueue_lock); @@ -3310,6 +3312,7 @@ void freeze_workqueues_begin(void) for_each_possible_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3344,7 +3347,6 @@ void freeze_workqueues_begin(void) */ bool freeze_workqueues_busy(void) { - struct workqueue_struct *wq; unsigned int cpu; bool busy = false; @@ -3353,6 +3355,7 @@ bool freeze_workqueues_busy(void) BUG_ON(!workqueue_freezing); for_each_possible_cpu(cpu) { + struct workqueue_struct *wq; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. @@ -3386,7 +3389,6 @@ out_unlock: */ void thaw_workqueues(void) { - struct workqueue_struct *wq; unsigned int cpu; spin_lock(&workqueue_lock); @@ -3396,6 +3398,7 @@ void thaw_workqueues(void) for_each_possible_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3443,7 +3446,7 @@ void __init init_workqueues(void) * sure cpu number won't overflow into kernel pointer area so * that they can be distinguished. 
*/ - BUILD_BUG_ON(NR_CPUS << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET); + BUILD_BUG_ON(WORK_CPU_LAST << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET); hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); -- cgit v1.2.3 From f34217977d717385a3e9fd7018ac39fade3964c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Jul 2010 10:03:51 +0200 Subject: workqueue: implement unbound workqueue This patch implements unbound workqueue which can be specified with WQ_UNBOUND flag on creation. An unbound workqueue has the following properties. * It uses a dedicated gcwq with a pseudo CPU number WORK_CPU_UNBOUND. This gcwq is always online and disassociated. * Workers are not bound to any CPU and not concurrency managed. Works are dispatched to workers as soon as possible and the only applied limitation is @max_active. IOW, all unbound workqeueues are implicitly high priority. Unbound workqueues can be used as simple execution context provider. Contexts unbound to any cpu are served as soon as possible. Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: David Howells --- include/linux/workqueue.h | 15 +++- kernel/workqueue.c | 218 +++++++++++++++++++++++++++++++++------------- 2 files changed, 173 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 139069a6286c..67ce734747f6 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -51,7 +51,8 @@ enum { WORK_NO_COLOR = WORK_NR_COLORS, /* special cpu IDs */ - WORK_CPU_NONE = NR_CPUS, + WORK_CPU_UNBOUND = NR_CPUS, + WORK_CPU_NONE = NR_CPUS + 1, WORK_CPU_LAST = WORK_CPU_NONE, /* @@ -237,11 +238,17 @@ enum { WQ_RESCUER = 1 << 3, /* has an rescue worker */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_UNBOUND = 1 << 6, /* not bound to any cpu */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ + WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; +/* unbound wq's aren't per-cpu, scale max_active according to #cpus */ +#define WQ_UNBOUND_MAX_ACTIVE \ + max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) + /* * System-wide workqueues which are always present. * @@ -256,10 +263,16 @@ enum { * system_nrt_wq is non-reentrant and guarantees that any given work * item is never executed in parallel by multiple CPUs. Queue * flushing might take relatively long. + * + * system_unbound_wq is unbound workqueue. Workers are not bound to + * any specific CPU, not concurrency managed, and all queued works are + * executed immediately as long as max_active limit is not reached and + * resources are available. 
*/ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_nrt_wq; +extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct * __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a105ddf55f79..4608563cdd63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -53,9 +53,10 @@ enum { WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE, + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, /* gcwq->trustee_state */ TRUSTEE_START = 0, /* start */ @@ -96,7 +97,7 @@ enum { * X: During normal operation, modification requires gcwq->lock and * should be done only from local cpu. Either disabling preemption * on local cpu or grabbing gcwq->lock is enough for read access. - * While trustee is in charge, it's identical to L. + * If GCWQ_DISASSOCIATED is set, it's identical to L. * * F: wq->flush_mutex protected. * @@ -220,14 +221,52 @@ struct workqueue_struct { struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -351,26 +390,46 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +/* + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. 
+ */ +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ + static int worker_thread(void *__worker); static struct global_cwq *get_gcwq(unsigned int cpu) { - return &per_cpu(global_cwq, cpu); + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; } static atomic_t *get_gcwq_nr_running(unsigned int cpu) { - return &per_cpu(gcwq_nr_running, cpu); + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { -#ifndef CONFIG_SMP - return wq->cpu_wq.single; + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); #else - return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); + return wq->cpu_wq.single; #endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; } static unsigned int work_color_to_flags(int color) @@ -453,7 +512,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (cpu == WORK_CPU_NONE) return NULL; - BUG_ON(cpu >= nr_cpu_ids); + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); return get_gcwq(cpu); } @@ -869,11 +928,14 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + /* * Determine gcwq to use. SINGLE_CPU is inherently * NON_REENTRANT, so test it first. */ - if (!(wq->flags & WQ_SINGLE_CPU)) { + if (!(wq->flags & (WQ_SINGLE_CPU | WQ_UNBOUND))) { struct global_cwq *last_gcwq; /* @@ -900,7 +962,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, } } else spin_lock_irqsave(&gcwq->lock, flags); - } else { + } else if (!(wq->flags & WQ_UNBOUND)) { unsigned int req_cpu = cpu; /* @@ -932,6 +994,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, spin_unlock_irqrestore(&gcwq->lock, flags); goto retry; } + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); } /* gcwq determined, get cwq and queue */ @@ -1166,7 +1231,8 @@ static bool worker_maybe_bind_and_lock(struct worker *worker) * it races with cpu hotunplug operation. Verify * against GCWQ_DISASSOCIATED. */ - set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); spin_lock_irq(&gcwq->lock); if (gcwq->flags & GCWQ_DISASSOCIATED) @@ -1231,8 +1297,9 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) { - int id = -1; + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; struct worker *worker = NULL; + int id = -1; spin_lock_irq(&gcwq->lock); while (ida_get_new(&gcwq->worker_ida, &id)) { @@ -1250,8 +1317,12 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->gcwq = gcwq; worker->id = id; - worker->task = kthread_create(worker_thread, worker, "kworker/%u:%d", - gcwq->cpu, id); + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); if (IS_ERR(worker->task)) goto fail; @@ -1260,10 +1331,13 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) * online later on. 
Make sure every worker has * PF_THREAD_BOUND set. */ - if (bind) + if (bind && !on_unbound_cpu) kthread_bind(worker->task, gcwq->cpu); - else + else { worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } return worker; fail: @@ -1358,12 +1432,17 @@ static bool send_mayday(struct work_struct *work) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; if (!(wq->flags & WQ_RESCUER)) return false; /* mayday mayday mayday */ - if (!cpumask_test_and_set_cpu(cwq->gcwq->cpu, wq->mayday_mask)) + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!cpumask_test_and_set_cpu(cpu, wq->mayday_mask)) wake_up_process(wq->rescuer->task); return true; } @@ -1882,6 +1961,7 @@ static int rescuer_thread(void *__wq) struct workqueue_struct *wq = __wq; struct worker *rescuer = wq->rescuer; struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -1891,8 +1971,13 @@ repeat: if (kthread_should_stop()) return 0; + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ for_each_cpu(cpu, wq->mayday_mask) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); struct global_cwq *gcwq = cwq->gcwq; struct work_struct *work, *n; @@ -2034,7 +2119,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, atomic_set(&wq->nr_cwqs_to_flush, 1); } - for_each_possible_cpu(cpu) { + for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = cwq->gcwq; @@ -2344,7 +2429,7 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - for_each_possible_cpu(cpu) + for_each_gcwq_cpu(cpu) wait_on_cpu_work(get_gcwq(cpu), work); } @@ -2590,23 +2675,25 @@ static int alloc_cwqs(struct workqueue_struct *wq) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); -#ifndef CONFIG_SMP - void *ptr; - /* - * Allocate enough room to align cwq and put an extra pointer - * at the end pointing back to the originally allocated - * pointer which will be used for free. - */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->cpu_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->cpu_wq.single + 1) = ptr; + if (CONFIG_SMP && !(wq->flags & WQ_UNBOUND)) { + /* on SMP, percpu allocator can align itself */ + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + } else { + void *ptr; + + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } } -#else - /* On SMP, percpu allocator can align itself */ - wq->cpu_wq.pcpu = __alloc_percpu(size, align); -#endif + /* just in case, make sure it's actually aligned */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 
0 : -ENOMEM; @@ -2614,23 +2701,25 @@ static int alloc_cwqs(struct workqueue_struct *wq) static void free_cwqs(struct workqueue_struct *wq) { -#ifndef CONFIG_SMP - /* on UP, the pointer to free is stored right after the cwq */ - if (wq->cpu_wq.single) + if (CONFIG_SMP && !(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ kfree(*(void **)(wq->cpu_wq.single + 1)); -#else - free_percpu(wq->cpu_wq.pcpu); -#endif + } } -static int wq_clamp_max_active(int max_active, const char *name) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - if (max_active < 1 || max_active > WQ_MAX_ACTIVE) + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; + + if (max_active < 1 || max_active > lim) printk(KERN_WARNING "workqueue: max_active %d requested for %s " "is out of range, clamping between %d and %d\n", - max_active, name, 1, WQ_MAX_ACTIVE); + max_active, name, 1, lim); - return clamp_val(max_active, 1, WQ_MAX_ACTIVE); + return clamp_val(max_active, 1, lim); } struct workqueue_struct *__alloc_workqueue_key(const char *name, @@ -2642,8 +2731,15 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, struct workqueue_struct *wq; unsigned int cpu; + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; + max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, name); + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) @@ -2664,7 +2760,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (alloc_cwqs(wq) < 0) goto err; - for_each_possible_cpu(cpu) { + for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); @@ -2703,7 +2799,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, spin_lock(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) - for_each_possible_cpu(cpu) + for_each_cwq_cpu(cpu, wq) get_cwq(cpu, wq)->max_active = 0; list_add(&wq->list, &workqueues); @@ -2743,7 +2839,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); /* sanity check */ - for_each_possible_cpu(cpu) { + for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); int i; @@ -2777,13 +2873,13 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { unsigned int cpu; - max_active = wq_clamp_max_active(max_active, wq->name); + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); spin_lock(&workqueue_lock); wq->saved_max_active = max_active; - for_each_possible_cpu(cpu) { + for_each_cwq_cpu(cpu, wq) { struct global_cwq *gcwq = get_gcwq(cpu); spin_lock_irq(&gcwq->lock); @@ -3310,7 +3406,7 @@ void freeze_workqueues_begin(void) BUG_ON(workqueue_freezing); workqueue_freezing = true; - for_each_possible_cpu(cpu) { + for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); struct workqueue_struct *wq; @@ -3322,7 +3418,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (wq->flags & WQ_FREEZEABLE) + if (cwq && wq->flags & WQ_FREEZEABLE) cwq->max_active = 0; } @@ -3354,7 +3450,7 @@ bool freeze_workqueues_busy(void) BUG_ON(!workqueue_freezing); - for_each_possible_cpu(cpu) { + 
for_each_gcwq_cpu(cpu) { struct workqueue_struct *wq; /* * nr_active is monotonically decreasing. It's safe @@ -3363,7 +3459,7 @@ bool freeze_workqueues_busy(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) continue; BUG_ON(cwq->nr_active < 0); @@ -3396,7 +3492,7 @@ void thaw_workqueues(void) if (!workqueue_freezing) goto out_unlock; - for_each_possible_cpu(cpu) { + for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); struct workqueue_struct *wq; @@ -3408,7 +3504,7 @@ void thaw_workqueues(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) continue; /* restore max_active and repopulate worklist */ @@ -3451,12 +3547,14 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ - for_each_possible_cpu(cpu) { + for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); spin_lock_init(&gcwq->lock); INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; + if (cpu == WORK_CPU_UNBOUND) + gcwq->flags |= GCWQ_DISASSOCIATED; INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) @@ -3476,7 +3574,7 @@ void __init init_workqueues(void) } /* create the initial worker */ - for_each_online_cpu(cpu) { + for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); struct worker *worker; @@ -3490,5 +3588,7 @@ void __init init_workqueues(void) system_wq = alloc_workqueue("events", 0, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); } -- cgit v1.2.3 From c7fc77f78f16d138ca997ce096a62f46e2e9420a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Jul 2010 10:03:51 +0200 Subject: workqueue: remove WQ_SINGLE_CPU and use WQ_UNBOUND instead WQ_SINGLE_CPU combined with @max_active of 1 is used to achieve full ordering among works queued to a workqueue. The same can be achieved using WQ_UNBOUND as unbound workqueues always use the gcwq for WORK_CPU_UNBOUND. As @max_active is always one and the benefit of cpu locality isn't accessible anyway, serving them with unbound workqueues should be fine. Drop WQ_SINGLE_CPU support and use WQ_UNBOUND instead. Note that most single-thread workqueue users will be converted to multithreaded or non-reentrant workqueues instead; only the ones which require strict ordering will keep using WQ_UNBOUND + @max_active of 1, as in the sketch below.
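For illustration, a minimal conversion sketch (not part of this patch; my_ordered_wq, work_a and work_b are hypothetical names). A user that previously relied on create_singlethread_workqueue() for ordering gets the same guarantee from an unbound workqueue with @max_active of 1, since all of its works funnel through the single WORK_CPU_UNBOUND gcwq:

	/* hypothetical driver code, shown only to illustrate the guarantee */
	struct workqueue_struct *my_ordered_wq;

	my_ordered_wq = alloc_workqueue("my_ordered", WQ_UNBOUND, 1);
	if (!my_ordered_wq)
		return -ENOMEM;

	queue_work(my_ordered_wq, &work_a);	/* runs first */
	queue_work(my_ordered_wq, &work_b);	/* runs only after work_a completes */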
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 7 ++-- kernel/workqueue.c | 100 +++++++++------------------------------------- 2 files changed, 21 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 67ce734747f6..d74a529ed13e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -233,12 +233,11 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } enum { WQ_NON_REENTRANT = 1 << 0, /* guarantee non-reentrance */ - WQ_SINGLE_CPU = 1 << 1, /* only single cpu at a time */ + WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZEABLE = 1 << 2, /* freeze during suspend */ WQ_RESCUER = 1 << 3, /* has an rescue worker */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_UNBOUND = 1 << 6, /* not bound to any cpu */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -300,9 +299,9 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, #define create_workqueue(name) \ alloc_workqueue((name), WQ_RESCUER, 1) #define create_freezeable_workqueue(name) \ - alloc_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1) + alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1) #define create_singlethread_workqueue(name) \ - alloc_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1) + alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4608563cdd63..20d6237d7498 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -206,8 +206,6 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ - unsigned long single_cpu; /* cpu for single cpu wq */ - cpumask_var_t mayday_mask; /* cpus requesting rescue */ struct worker *rescuer; /* I: rescue worker */ @@ -889,34 +887,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq, wake_up_worker(gcwq); } -/** - * cwq_unbind_single_cpu - unbind cwq from single cpu workqueue processing - * @cwq: cwq to unbind - * - * Try to unbind @cwq from single cpu workqueue processing. If - * @cwq->wq is frozen, unbind is delayed till the workqueue is thawed. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq) -{ - struct workqueue_struct *wq = cwq->wq; - struct global_cwq *gcwq = cwq->gcwq; - - BUG_ON(wq->single_cpu != gcwq->cpu); - /* - * Unbind from workqueue if @cwq is not frozen. If frozen, - * thaw_workqueues() will either restart processing on this - * cpu or unbind if empty. This keeps works queued while - * frozen fully ordered and flushable. - */ - if (likely(!(gcwq->flags & GCWQ_FREEZING))) { - smp_wmb(); /* paired with cmpxchg() in __queue_work() */ - wq->single_cpu = WORK_CPU_NONE; - } -} - static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -924,20 +894,16 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned long flags; - bool arbitrate; debug_work_activate(work); - if (unlikely(cpu == WORK_CPU_UNBOUND)) - cpu = raw_smp_processor_id(); - - /* - * Determine gcwq to use. SINGLE_CPU is inherently - * NON_REENTRANT, so test it first. 
- */ - if (!(wq->flags & (WQ_SINGLE_CPU | WQ_UNBOUND))) { + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + /* * It's multi cpu. If @wq is non-reentrant and @work * was previously on a different cpu, it might still @@ -962,38 +928,6 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, } } else spin_lock_irqsave(&gcwq->lock, flags); - } else if (!(wq->flags & WQ_UNBOUND)) { - unsigned int req_cpu = cpu; - - /* - * It's a bit more complex for single cpu workqueues. - * We first need to determine which cpu is going to be - * used. If no cpu is currently serving this - * workqueue, arbitrate using atomic accesses to - * wq->single_cpu; otherwise, use the current one. - */ - retry: - cpu = wq->single_cpu; - arbitrate = cpu == WORK_CPU_NONE; - if (arbitrate) - cpu = req_cpu; - - gcwq = get_gcwq(cpu); - spin_lock_irqsave(&gcwq->lock, flags); - - /* - * The following cmpxchg() is a full barrier paired - * with smp_wmb() in cwq_unbind_single_cpu() and - * guarantees that all changes to wq->st_* fields are - * visible on the new cpu after this point. - */ - if (arbitrate) - cmpxchg(&wq->single_cpu, WORK_CPU_NONE, cpu); - - if (unlikely(wq->single_cpu != cpu)) { - spin_unlock_irqrestore(&gcwq->lock, flags); - goto retry; - } } else { gcwq = get_gcwq(WORK_CPU_UNBOUND); spin_lock_irqsave(&gcwq->lock, flags); @@ -1105,19 +1039,30 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - struct global_cwq *gcwq = get_work_gcwq(work); - unsigned int lcpu = gcwq ? gcwq->cpu : raw_smp_processor_id(); + unsigned int lcpu; BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); + /* * This stores cwq for the moment, for the timer_fn. * Note that the work's gcwq is preserved to allow * reentrance detection for delayed works. */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -1696,9 +1641,6 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) /* one down, submit a delayed one */ if (cwq->nr_active < cwq->max_active) cwq_activate_first_delayed(cwq); - } else if (!cwq->nr_active && cwq->wq->flags & WQ_SINGLE_CPU) { - /* this was the last work, unbind from single cpu */ - cwq_unbind_single_cpu(cwq); } /* is flush in progress and are we at the flushing tip? 
*/ @@ -2751,7 +2693,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, atomic_set(&wq->nr_cwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->single_cpu = WORK_CPU_NONE; wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); @@ -3513,11 +3454,6 @@ void thaw_workqueues(void) while (!list_empty(&cwq->delayed_works) && cwq->nr_active < cwq->max_active) cwq_activate_first_delayed(cwq); - - /* perform delayed unbind from single cpu if empty */ - if (wq->single_cpu == gcwq->cpu && - !cwq->nr_active && list_empty(&cwq->delayed_works)) - cwq_unbind_single_cpu(cwq); } wake_up_worker(gcwq); -- cgit v1.2.3 From 7adde04a2f5a798f04a556dfb3b69bff388e5dc4 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 30 Jun 2010 17:57:22 +0800 Subject: slab: fix caller tracking on !CONFIG_DEBUG_SLAB && CONFIG_TRACING In slab, all of the __xxx_track_caller functions are defined under CONFIG_DEBUG_SLAB || CONFIG_TRACING, so caller tracking should also work when only CONFIG_TRACING is set. But if CONFIG_DEBUG_SLAB is not set, include/linux/slab.h will define xxx_track_caller to __xxx() without consideration of CONFIG_TRACING. This breaks the caller tracking behaviour in that configuration. Cc: Christoph Lameter Cc: Matt Mackall Cc: Vegard Nossum Cc: Dmitry Monakhov Cc: Catalin Marinas Acked-by: David Rientjes Signed-off-by: Xiaotian Feng Signed-off-by: Pekka Enberg --- include/linux/slab.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 49d1247cd6d9..59260e21bdf5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -268,7 +268,8 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * allocator where we care about the real place the memory allocation * request comes from. */ -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ + (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) @@ -286,7 +287,8 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); * standard allocator where we care about the real place the memory * allocation request comes from. */ -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ + (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ -- cgit v1.2.3 From 7dc2e1134a22dc242175d5321c0c9e97d16eb87b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:06 -0600 Subject: of/irq: merge irq mapping code Merge common irq mapping code between PowerPC and Microblaze. This patch merges of_irq_find_parent(), of_irq_map_raw() and of_irq_map_one(). The functions are dependent on one another, so all three are merged in a single patch. Other than cosmetic differences (i.e. DBG() vs. pr_debug()), the implementations are identical. of_irq_to_resource() is also merged, but in this case the implementations are different. This patch drops the microblaze version and uses the powerpc implementation unchanged. The microblaze version essentially open-coded irq_of_parse_and_map(), which it did not need to do. Therefore the powerpc version is safe to adopt.
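For context, a sketch of a typical consumer of the merged code (illustrative only; the "acme,widget" compatible string and the surrounding variables are hypothetical). irq_of_parse_and_map() resolves a node's "interrupts" property through of_irq_map_one(), which in turn walks the tree via of_irq_find_parent() and of_irq_map_raw():

	/* hypothetical caller, not part of this patch */
	struct device_node *np;
	struct resource r;
	int irq;

	np = of_find_compatible_node(NULL, NULL, "acme,widget");
	if (!np)
		return -ENODEV;

	irq = irq_of_parse_and_map(np, 0);	/* the tree walk happens here */

	/* or packaged as a struct resource */
	irq = of_irq_to_resource(np, 0, &r);
	of_node_put(np);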
Signed-off-by: Grant Likely CC: Michal Simek CC: Benjamin Herrenschmidt CC: Stephen Rothwell --- arch/microblaze/include/asm/prom.h | 31 ---- arch/microblaze/kernel/prom_parse.c | 290 ---------------------------------- arch/powerpc/include/asm/prom.h | 47 ------ arch/powerpc/kernel/prom_parse.c | 266 ------------------------------- drivers/of/irq.c | 301 ++++++++++++++++++++++++++++++++++++ include/linux/of_irq.h | 29 ++++ 6 files changed, 330 insertions(+), 634 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 4f34bc5baa83..5fbdfe76fe76 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -89,34 +89,6 @@ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); /* Get the MAC address */ extern const void *of_get_mac_address(struct device_node *np); -/* - * OF interrupt mapping - */ - -#define OF_IMAP_OLDWORLD_MAC 0x00000001 -#define OF_IMAP_NO_PHANDLE 0x00000002 - -/** - * of_irq_map_raw - Low level interrupt tree parsing - * @parent: the device interrupt parent - * @intspec: interrupt specifier ("interrupts" property of the device) - * @ointsize: size of the passed in interrupt specifier - * @addr: address specifier (start of "reg" property of the device) - * @out_irq: structure of_irq filled by this function - * - * Returns 0 on success and a negative number on error - * - * This function is a low-level interrupt tree walking function. It - * can be used to do a partial walk with synthetized reg and interrupts - * properties, for example when resolving PCI interrupts when no device - * node exist for the parent. - * - */ - -extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, - u32 ointsize, const u32 *addr, - struct of_irq *out_irq); - /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved @@ -131,9 +103,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, struct pci_dev; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); -extern int of_irq_to_resource(struct device_node *dev, int index, - struct resource *r); - /** * of_iomap - Maps the memory mapped IO for a given device_node * @device: the device whose io range will be mapped diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index cba05812ab46..e28968fa34c1 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -644,267 +644,6 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, *size = of_read_number(dma_window, cells); } -/* - * Interrupt remapper - */ - -static unsigned int of_irq_workarounds; -static struct device_node *of_irq_dflt_pic; - -static struct device_node *of_irq_find_parent(struct device_node *child) -{ - struct device_node *p; - const phandle *parp; - - if (!of_node_get(child)) - return NULL; - - do { - parp = of_get_property(child, "interrupt-parent", NULL); - if (parp == NULL) - p = of_get_parent(child); - else { - if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) - p = of_node_get(of_irq_dflt_pic); - else - p = of_find_node_by_phandle(*parp); - } - of_node_put(child); - child = p; - } while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL); - - return p; -} - -int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize, - const u32 *addr, struct of_irq *out_irq) -{ - struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; - 
const u32 *tmp, *imap, *imask; - u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; - int imaplen, match, i; - - pr_debug("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...]," - "ointsize=%d\n", - parent->full_name, intspec[0], intspec[1], ointsize); - - ipar = of_node_get(parent); - - /* First get the #interrupt-cells property of the current cursor - * that tells us how to interpret the passed-in intspec. If there - * is none, we are nice and just walk up the tree - */ - do { - tmp = of_get_property(ipar, "#interrupt-cells", NULL); - if (tmp != NULL) { - intsize = *tmp; - break; - } - tnode = ipar; - ipar = of_irq_find_parent(ipar); - of_node_put(tnode); - } while (ipar); - if (ipar == NULL) { - pr_debug(" -> no parent found !\n"); - goto fail; - } - - pr_debug("of_irq_map_raw: ipar=%s, size=%d\n", - ipar->full_name, intsize); - - if (ointsize != intsize) - return -EINVAL; - - /* Look for this #address-cells. We have to implement the old linux - * trick of looking for the parent here as some device-trees rely on it - */ - old = of_node_get(ipar); - do { - tmp = of_get_property(old, "#address-cells", NULL); - tnode = of_get_parent(old); - of_node_put(old); - old = tnode; - } while (old && tmp == NULL); - of_node_put(old); - old = NULL; - addrsize = (tmp == NULL) ? 2 : *tmp; - - pr_debug(" -> addrsize=%d\n", addrsize); - - /* Now start the actual "proper" walk of the interrupt tree */ - while (ipar != NULL) { - /* Now check if cursor is an interrupt-controller and if it is - * then we are done - */ - if (of_get_property(ipar, "interrupt-controller", NULL) != - NULL) { - pr_debug(" -> got it !\n"); - memcpy(out_irq->specifier, intspec, - intsize * sizeof(u32)); - out_irq->size = intsize; - out_irq->controller = ipar; - of_node_put(old); - return 0; - } - - /* Now look for an interrupt-map */ - imap = of_get_property(ipar, "interrupt-map", &imaplen); - /* No interrupt map, check for an interrupt parent */ - if (imap == NULL) { - pr_debug(" -> no map, getting parent\n"); - newpar = of_irq_find_parent(ipar); - goto skiplevel; - } - imaplen /= sizeof(u32); - - /* Look for a mask */ - imask = of_get_property(ipar, "interrupt-map-mask", NULL); - - /* If we were passed no "reg" property and we attempt to parse - * an interrupt-map, then #address-cells must be 0. - * Fail if it's not. - */ - if (addr == NULL && addrsize != 0) { - pr_debug(" -> no reg passed in when needed !\n"); - goto fail; - } - - /* Parse interrupt-map */ - match = 0; - while (imaplen > (addrsize + intsize + 1) && !match) { - /* Compare specifiers */ - match = 1; - for (i = 0; i < addrsize && match; ++i) { - u32 mask = imask ? imask[i] : 0xffffffffu; - match = ((addr[i] ^ imap[i]) & mask) == 0; - } - for (; i < (addrsize + intsize) && match; ++i) { - u32 mask = imask ? 
imask[i] : 0xffffffffu; - match = - ((intspec[i-addrsize] ^ imap[i]) - & mask) == 0; - } - imap += addrsize + intsize; - imaplen -= addrsize + intsize; - - pr_debug(" -> match=%d (imaplen=%d)\n", match, imaplen); - - /* Get the interrupt parent */ - if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) - newpar = of_node_get(of_irq_dflt_pic); - else - newpar = - of_find_node_by_phandle((phandle)*imap); - imap++; - --imaplen; - - /* Check if not found */ - if (newpar == NULL) { - pr_debug(" -> imap parent not found !\n"); - goto fail; - } - - /* Get #interrupt-cells and #address-cells of new - * parent - */ - tmp = of_get_property(newpar, "#interrupt-cells", NULL); - if (tmp == NULL) { - pr_debug(" -> parent lacks " - "#interrupt-cells!\n"); - goto fail; - } - newintsize = *tmp; - tmp = of_get_property(newpar, "#address-cells", NULL); - newaddrsize = (tmp == NULL) ? 0 : *tmp; - - pr_debug(" -> newintsize=%d, newaddrsize=%d\n", - newintsize, newaddrsize); - - /* Check for malformed properties */ - if (imaplen < (newaddrsize + newintsize)) - goto fail; - - imap += newaddrsize + newintsize; - imaplen -= newaddrsize + newintsize; - - pr_debug(" -> imaplen=%d\n", imaplen); - } - if (!match) - goto fail; - - of_node_put(old); - old = of_node_get(newpar); - addrsize = newaddrsize; - intsize = newintsize; - intspec = imap - intsize; - addr = intspec - addrsize; - -skiplevel: - /* Iterate again with new parent */ - pr_debug(" -> new parent: %s\n", - newpar ? newpar->full_name : "<>"); - of_node_put(ipar); - ipar = newpar; - newpar = NULL; - } -fail: - of_node_put(ipar); - of_node_put(old); - of_node_put(newpar); - - return -EINVAL; -} -EXPORT_SYMBOL_GPL(of_irq_map_raw); - -int of_irq_map_one(struct device_node *device, - int index, struct of_irq *out_irq) -{ - struct device_node *p; - const u32 *intspec, *tmp, *addr; - u32 intsize, intlen; - int res; - - pr_debug("of_irq_map_one: dev=%s, index=%d\n", - device->full_name, index); - - /* Get the interrupts property */ - intspec = of_get_property(device, "interrupts", (int *) &intlen); - if (intspec == NULL) - return -EINVAL; - intlen /= sizeof(u32); - - pr_debug(" intspec=%d intlen=%d\n", *intspec, intlen); - - /* Get the reg property (if any) */ - addr = of_get_property(device, "reg", NULL); - - /* Look for the interrupt parent. */ - p = of_irq_find_parent(device); - if (p == NULL) - return -EINVAL; - - /* Get size of interrupt specifier */ - tmp = of_get_property(p, "#interrupt-cells", NULL); - if (tmp == NULL) { - of_node_put(p); - return -EINVAL; - } - intsize = *tmp; - - pr_debug(" intsize=%d intlen=%d\n", intsize, intlen); - - /* Check index */ - if ((index + 1) * intsize > intlen) - return -EINVAL; - - /* Get new specifier and map it */ - res = of_irq_map_raw(p, intspec + index * intsize, intsize, - addr, out_irq); - of_node_put(p); - return res; -} -EXPORT_SYMBOL_GPL(of_irq_map_one); - /** * Search the device tree for the best MAC address to use. 'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC @@ -943,35 +682,6 @@ const void *of_get_mac_address(struct device_node *np) } EXPORT_SYMBOL(of_get_mac_address); -int of_irq_to_resource(struct device_node *dev, int index, struct resource *r) -{ - struct of_irq out_irq; - int irq; - int res; - - res = of_irq_map_one(dev, index, &out_irq); - - /* Get irq for the device */ - if (res) { - pr_debug("IRQ not found... code = %d", res); - return NO_IRQ; - } - /* Assuming single interrupt controller... 
*/ - irq = out_irq.specifier[0]; - - pr_debug("IRQ found = %d", irq); - - /* Only dereference the resource if both the - * resource and the irq are valid. */ - if (r && irq != NO_IRQ) { - r->start = r->end = irq; - r->flags = IORESOURCE_IRQ; - } - - return irq; -} -EXPORT_SYMBOL_GPL(of_irq_to_resource); - void __iomem *of_iomap(struct device_node *np, int index) { struct resource res; diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 4486765db6e7..10d5ee556702 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -105,50 +105,6 @@ struct device_node *of_find_next_cache_node(struct device_node *np); /* Get the MAC address */ extern const void *of_get_mac_address(struct device_node *np); -/* - * OF interrupt mapping - */ - -#define OF_IMAP_OLDWORLD_MAC 0x00000001 -#define OF_IMAP_NO_PHANDLE 0x00000002 - -#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) -/* Workarounds only needed for 32bit powermac machines */ -extern unsigned int of_irq_workarounds; -extern struct device_node *of_irq_dflt_pic; -extern int of_irq_map_oldworld(struct device_node *device, int index, - struct of_irq *out_irq); -#else -#define of_irq_workarounds (0) -#define of_irq_dflt_pic (NULL) -static inline int of_irq_map_oldworld(struct device_node *device, int index, - struct of_irq *out_irq) -{ - return -EINVAL; -} -#endif - -/** - * of_irq_map_raw - Low level interrupt tree parsing - * @parent: the device interrupt parent - * @intspec: interrupt specifier ("interrupts" property of the device) - * @ointsize: size of the passed in interrupt specifier - * @addr: address specifier (start of "reg" property of the device) - * @out_irq: structure of_irq filled by this function - * - * Returns 0 on success and a negative number on error - * - * This function is a low-level interrupt tree walking function. It - * can be used to do a partial walk with synthetized reg and interrupts - * properties, for example when resolving PCI interrupts when no device - * node exist for the parent. 
- * - */ - -extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, - u32 ointsize, const u32 *addr, - struct of_irq *out_irq); - /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved @@ -163,9 +119,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, struct pci_dev; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); -extern int of_irq_to_resource(struct device_node *dev, int index, - struct resource *r); - /** * of_iomap - Maps the memory mapped IO for a given device_node * @device: the device whose io range will be mapped diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index dfa6de6572bb..d61a5c5fe690 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -678,257 +678,6 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, *size = of_read_number(dma_window, cells); } -/* - * Interrupt remapper - */ - -static struct device_node *of_irq_find_parent(struct device_node *child) -{ - struct device_node *p; - const phandle *parp; - - if (!of_node_get(child)) - return NULL; - - do { - parp = of_get_property(child, "interrupt-parent", NULL); - if (parp == NULL) - p = of_get_parent(child); - else { - if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) - p = of_node_get(of_irq_dflt_pic); - else - p = of_find_node_by_phandle(*parp); - } - of_node_put(child); - child = p; - } while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL); - - return p; -} - -int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize, - const u32 *addr, struct of_irq *out_irq) -{ - struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; - const u32 *tmp, *imap, *imask; - u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; - int imaplen, match, i; - - DBG("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],ointsize=%d\n", - parent->full_name, intspec[0], intspec[1], ointsize); - - ipar = of_node_get(parent); - - /* First get the #interrupt-cells property of the current cursor - * that tells us how to interpret the passed-in intspec. If there - * is none, we are nice and just walk up the tree - */ - do { - tmp = of_get_property(ipar, "#interrupt-cells", NULL); - if (tmp != NULL) { - intsize = *tmp; - break; - } - tnode = ipar; - ipar = of_irq_find_parent(ipar); - of_node_put(tnode); - } while (ipar); - if (ipar == NULL) { - DBG(" -> no parent found !\n"); - goto fail; - } - - DBG("of_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize); - - if (ointsize != intsize) - return -EINVAL; - - /* Look for this #address-cells. We have to implement the old linux - * trick of looking for the parent here as some device-trees rely on it - */ - old = of_node_get(ipar); - do { - tmp = of_get_property(old, "#address-cells", NULL); - tnode = of_get_parent(old); - of_node_put(old); - old = tnode; - } while(old && tmp == NULL); - of_node_put(old); - old = NULL; - addrsize = (tmp == NULL) ? 
2 : *tmp; - - DBG(" -> addrsize=%d\n", addrsize); - - /* Now start the actual "proper" walk of the interrupt tree */ - while (ipar != NULL) { - /* Now check if cursor is an interrupt-controller and if it is - * then we are done - */ - if (of_get_property(ipar, "interrupt-controller", NULL) != - NULL) { - DBG(" -> got it !\n"); - memcpy(out_irq->specifier, intspec, - intsize * sizeof(u32)); - out_irq->size = intsize; - out_irq->controller = ipar; - of_node_put(old); - return 0; - } - - /* Now look for an interrupt-map */ - imap = of_get_property(ipar, "interrupt-map", &imaplen); - /* No interrupt map, check for an interrupt parent */ - if (imap == NULL) { - DBG(" -> no map, getting parent\n"); - newpar = of_irq_find_parent(ipar); - goto skiplevel; - } - imaplen /= sizeof(u32); - - /* Look for a mask */ - imask = of_get_property(ipar, "interrupt-map-mask", NULL); - - /* If we were passed no "reg" property and we attempt to parse - * an interrupt-map, then #address-cells must be 0. - * Fail if it's not. - */ - if (addr == NULL && addrsize != 0) { - DBG(" -> no reg passed in when needed !\n"); - goto fail; - } - - /* Parse interrupt-map */ - match = 0; - while (imaplen > (addrsize + intsize + 1) && !match) { - /* Compare specifiers */ - match = 1; - for (i = 0; i < addrsize && match; ++i) { - u32 mask = imask ? imask[i] : 0xffffffffu; - match = ((addr[i] ^ imap[i]) & mask) == 0; - } - for (; i < (addrsize + intsize) && match; ++i) { - u32 mask = imask ? imask[i] : 0xffffffffu; - match = - ((intspec[i-addrsize] ^ imap[i]) & mask) == 0; - } - imap += addrsize + intsize; - imaplen -= addrsize + intsize; - - DBG(" -> match=%d (imaplen=%d)\n", match, imaplen); - - /* Get the interrupt parent */ - if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) - newpar = of_node_get(of_irq_dflt_pic); - else - newpar = of_find_node_by_phandle((phandle)*imap); - imap++; - --imaplen; - - /* Check if not found */ - if (newpar == NULL) { - DBG(" -> imap parent not found !\n"); - goto fail; - } - - /* Get #interrupt-cells and #address-cells of new - * parent - */ - tmp = of_get_property(newpar, "#interrupt-cells", NULL); - if (tmp == NULL) { - DBG(" -> parent lacks #interrupt-cells !\n"); - goto fail; - } - newintsize = *tmp; - tmp = of_get_property(newpar, "#address-cells", NULL); - newaddrsize = (tmp == NULL) ? 0 : *tmp; - - DBG(" -> newintsize=%d, newaddrsize=%d\n", - newintsize, newaddrsize); - - /* Check for malformed properties */ - if (imaplen < (newaddrsize + newintsize)) - goto fail; - - imap += newaddrsize + newintsize; - imaplen -= newaddrsize + newintsize; - - DBG(" -> imaplen=%d\n", imaplen); - } - if (!match) - goto fail; - - of_node_put(old); - old = of_node_get(newpar); - addrsize = newaddrsize; - intsize = newintsize; - intspec = imap - intsize; - addr = intspec - addrsize; - - skiplevel: - /* Iterate again with new parent */ - DBG(" -> new parent: %s\n", newpar ? 
newpar->full_name : "<>"); - of_node_put(ipar); - ipar = newpar; - newpar = NULL; - } - fail: - of_node_put(ipar); - of_node_put(old); - of_node_put(newpar); - - return -EINVAL; -} -EXPORT_SYMBOL_GPL(of_irq_map_raw); - -int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq) -{ - struct device_node *p; - const u32 *intspec, *tmp, *addr; - u32 intsize, intlen; - int res = -EINVAL; - - DBG("of_irq_map_one: dev=%s, index=%d\n", device->full_name, index); - - /* OldWorld mac stuff is "special", handle out of line */ - if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC) - return of_irq_map_oldworld(device, index, out_irq); - - /* Get the interrupts property */ - intspec = of_get_property(device, "interrupts", &intlen); - if (intspec == NULL) - return -EINVAL; - intlen /= sizeof(u32); - - /* Get the reg property (if any) */ - addr = of_get_property(device, "reg", NULL); - - /* Look for the interrupt parent. */ - p = of_irq_find_parent(device); - if (p == NULL) - return -EINVAL; - - /* Get size of interrupt specifier */ - tmp = of_get_property(p, "#interrupt-cells", NULL); - if (tmp == NULL) - goto out; - intsize = *tmp; - - DBG(" intsize=%d intlen=%d\n", intsize, intlen); - - /* Check index */ - if ((index + 1) * intsize > intlen) - goto out; - - /* Get new specifier and map it */ - res = of_irq_map_raw(p, intspec + index * intsize, intsize, - addr, out_irq); -out: - of_node_put(p); - return res; -} -EXPORT_SYMBOL_GPL(of_irq_map_one); - /** * Search the device tree for the best MAC address to use. 'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC @@ -967,21 +716,6 @@ const void *of_get_mac_address(struct device_node *np) } EXPORT_SYMBOL(of_get_mac_address); -int of_irq_to_resource(struct device_node *dev, int index, struct resource *r) -{ - int irq = irq_of_parse_and_map(dev, index); - - /* Only dereference the resource if both the - * resource and the irq are valid. */ - if (r && irq != NO_IRQ) { - r->start = r->end = irq; - r->flags = IORESOURCE_IRQ; - } - - return irq; -} -EXPORT_SYMBOL_GPL(of_irq_to_resource); - void __iomem *of_iomap(struct device_node *np, int index) { struct resource res; diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 9b3397c27096..598454fbdd1e 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -43,3 +43,304 @@ unsigned int irq_of_parse_and_map(struct device_node *dev, int index) oirq.size); } EXPORT_SYMBOL_GPL(irq_of_parse_and_map); + +/** + * of_irq_find_parent - Given a device node, find its interrupt parent node + * @child: pointer to device node + * + * Returns a pointer to the interrupt parent node, or NULL if the interrupt + * parent could not be determined. 
+ */ +static struct device_node *of_irq_find_parent(struct device_node *child) +{ + struct device_node *p; + const phandle *parp; + + if (!of_node_get(child)) + return NULL; + + do { + parp = of_get_property(child, "interrupt-parent", NULL); + if (parp == NULL) + p = of_get_parent(child); + else { + if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) + p = of_node_get(of_irq_dflt_pic); + else + p = of_find_node_by_phandle(*parp); + } + of_node_put(child); + child = p; + } while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL); + + return p; +} + +/** + * of_irq_map_raw - Low level interrupt tree parsing + * @parent: the device interrupt parent + * @intspec: interrupt specifier ("interrupts" property of the device) + * @ointsize: size of the passed in interrupt specifier + * @addr: address specifier (start of "reg" property of the device) + * @out_irq: structure of_irq filled by this function + * + * Returns 0 on success and a negative number on error + * + * This function is a low-level interrupt tree walking function. It + * can be used to do a partial walk with synthetized reg and interrupts + * properties, for example when resolving PCI interrupts when no device + * node exist for the parent. + */ +int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize, + const u32 *addr, struct of_irq *out_irq) +{ + struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; + const u32 *tmp, *imap, *imask; + u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; + int imaplen, match, i; + + pr_debug("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],ointsize=%d\n", + parent->full_name, intspec[0], intspec[1], ointsize); + + ipar = of_node_get(parent); + + /* First get the #interrupt-cells property of the current cursor + * that tells us how to interpret the passed-in intspec. If there + * is none, we are nice and just walk up the tree + */ + do { + tmp = of_get_property(ipar, "#interrupt-cells", NULL); + if (tmp != NULL) { + intsize = *tmp; + break; + } + tnode = ipar; + ipar = of_irq_find_parent(ipar); + of_node_put(tnode); + } while (ipar); + if (ipar == NULL) { + pr_debug(" -> no parent found !\n"); + goto fail; + } + + pr_debug("of_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize); + + if (ointsize != intsize) + return -EINVAL; + + /* Look for this #address-cells. We have to implement the old linux + * trick of looking for the parent here as some device-trees rely on it + */ + old = of_node_get(ipar); + do { + tmp = of_get_property(old, "#address-cells", NULL); + tnode = of_get_parent(old); + of_node_put(old); + old = tnode; + } while (old && tmp == NULL); + of_node_put(old); + old = NULL; + addrsize = (tmp == NULL) ? 
2 : *tmp; + + pr_debug(" -> addrsize=%d\n", addrsize); + + /* Now start the actual "proper" walk of the interrupt tree */ + while (ipar != NULL) { + /* Now check if cursor is an interrupt-controller and if it is + * then we are done + */ + if (of_get_property(ipar, "interrupt-controller", NULL) != + NULL) { + pr_debug(" -> got it !\n"); + memcpy(out_irq->specifier, intspec, + intsize * sizeof(u32)); + out_irq->size = intsize; + out_irq->controller = ipar; + of_node_put(old); + return 0; + } + + /* Now look for an interrupt-map */ + imap = of_get_property(ipar, "interrupt-map", &imaplen); + /* No interrupt map, check for an interrupt parent */ + if (imap == NULL) { + pr_debug(" -> no map, getting parent\n"); + newpar = of_irq_find_parent(ipar); + goto skiplevel; + } + imaplen /= sizeof(u32); + + /* Look for a mask */ + imask = of_get_property(ipar, "interrupt-map-mask", NULL); + + /* If we were passed no "reg" property and we attempt to parse + * an interrupt-map, then #address-cells must be 0. + * Fail if it's not. + */ + if (addr == NULL && addrsize != 0) { + pr_debug(" -> no reg passed in when needed !\n"); + goto fail; + } + + /* Parse interrupt-map */ + match = 0; + while (imaplen > (addrsize + intsize + 1) && !match) { + /* Compare specifiers */ + match = 1; + for (i = 0; i < addrsize && match; ++i) { + u32 mask = imask ? imask[i] : 0xffffffffu; + match = ((addr[i] ^ imap[i]) & mask) == 0; + } + for (; i < (addrsize + intsize) && match; ++i) { + u32 mask = imask ? imask[i] : 0xffffffffu; + match = + ((intspec[i-addrsize] ^ imap[i]) & mask) == 0; + } + imap += addrsize + intsize; + imaplen -= addrsize + intsize; + + pr_debug(" -> match=%d (imaplen=%d)\n", match, imaplen); + + /* Get the interrupt parent */ + if (of_irq_workarounds & OF_IMAP_NO_PHANDLE) + newpar = of_node_get(of_irq_dflt_pic); + else + newpar = of_find_node_by_phandle((phandle)*imap); + imap++; + --imaplen; + + /* Check if not found */ + if (newpar == NULL) { + pr_debug(" -> imap parent not found !\n"); + goto fail; + } + + /* Get #interrupt-cells and #address-cells of new + * parent + */ + tmp = of_get_property(newpar, "#interrupt-cells", NULL); + if (tmp == NULL) { + pr_debug(" -> parent lacks #interrupt-cells!\n"); + goto fail; + } + newintsize = *tmp; + tmp = of_get_property(newpar, "#address-cells", NULL); + newaddrsize = (tmp == NULL) ? 0 : *tmp; + + pr_debug(" -> newintsize=%d, newaddrsize=%d\n", + newintsize, newaddrsize); + + /* Check for malformed properties */ + if (imaplen < (newaddrsize + newintsize)) + goto fail; + + imap += newaddrsize + newintsize; + imaplen -= newaddrsize + newintsize; + + pr_debug(" -> imaplen=%d\n", imaplen); + } + if (!match) + goto fail; + + of_node_put(old); + old = of_node_get(newpar); + addrsize = newaddrsize; + intsize = newintsize; + intspec = imap - intsize; + addr = intspec - addrsize; + + skiplevel: + /* Iterate again with new parent */ + pr_debug(" -> new parent: %s\n", newpar ? newpar->full_name : "<>"); + of_node_put(ipar); + ipar = newpar; + newpar = NULL; + } + fail: + of_node_put(ipar); + of_node_put(old); + of_node_put(newpar); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(of_irq_map_raw); + +/** + * of_irq_map_one - Resolve an interrupt for a device + * @device: the device whose interrupt is to be resolved + * @index: index of the interrupt to resolve + * @out_irq: structure of_irq filled by this function + * + * This function resolves an interrupt, walking the tree, for a given + * device-tree node. It's the high level pendant to of_irq_map_raw(). 
+ */ +int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq) +{ + struct device_node *p; + const u32 *intspec, *tmp, *addr; + u32 intsize, intlen; + int res = -EINVAL; + + pr_debug("of_irq_map_one: dev=%s, index=%d\n", device->full_name, index); + + /* OldWorld mac stuff is "special", handle out of line */ + if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC) + return of_irq_map_oldworld(device, index, out_irq); + + /* Get the interrupts property */ + intspec = of_get_property(device, "interrupts", &intlen); + if (intspec == NULL) + return -EINVAL; + intlen /= sizeof(u32); + + pr_debug(" intspec=%d intlen=%d\n", *intspec, intlen); + + /* Get the reg property (if any) */ + addr = of_get_property(device, "reg", NULL); + + /* Look for the interrupt parent. */ + p = of_irq_find_parent(device); + if (p == NULL) + return -EINVAL; + + /* Get size of interrupt specifier */ + tmp = of_get_property(p, "#interrupt-cells", NULL); + if (tmp == NULL) + goto out; + intsize = *tmp; + + pr_debug(" intsize=%d intlen=%d\n", intsize, intlen); + + /* Check index */ + if ((index + 1) * intsize > intlen) + goto out; + + /* Get new specifier and map it */ + res = of_irq_map_raw(p, intspec + index * intsize, intsize, + addr, out_irq); + out: + of_node_put(p); + return res; +} +EXPORT_SYMBOL_GPL(of_irq_map_one); + +/** + * of_irq_to_resource - Decode a node's IRQ and return it as a resource + * @dev: pointer to device tree node + * @index: zero-based index of the irq + * @r: pointer to resource structure to return result into. + */ +int of_irq_to_resource(struct device_node *dev, int index, struct resource *r) +{ + int irq = irq_of_parse_and_map(dev, index); + + /* Only dereference the resource if both the + * resource and the irq are valid. */ + if (r && irq != NO_IRQ) { + r->start = r->end = irq; + r->flags = IORESOURCE_IRQ; + } + + return irq; +} +EXPORT_SYMBOL_GPL(of_irq_to_resource); diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 0e37c05b7dd8..5929781c104d 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -4,6 +4,8 @@ #if defined(CONFIG_OF) struct of_irq; #include +#include +#include #include /* @@ -30,11 +32,38 @@ struct of_irq { u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */ }; +/* + * Workarounds only applied to 32bit powermac machines + */ +#define OF_IMAP_OLDWORLD_MAC 0x00000001 +#define OF_IMAP_NO_PHANDLE 0x00000002 + +#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) +extern unsigned int of_irq_workarounds; +extern struct device_node *of_irq_dflt_pic; +extern int of_irq_map_oldworld(struct device_node *device, int index, + struct of_irq *out_irq); +#else /* CONFIG_PPC32 && CONFIG_PPC_PMAC */ +#define of_irq_workarounds (0) +#define of_irq_dflt_pic (NULL) +static inline int of_irq_map_oldworld(struct device_node *device, int index, + struct of_irq *out_irq) +{ + return -EINVAL; +} +#endif /* CONFIG_PPC32 && CONFIG_PPC_PMAC */ + + +extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec, + u32 ointsize, const u32 *addr, + struct of_irq *out_irq); extern int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq); extern unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize); +extern int of_irq_to_resource(struct device_node *dev, int index, + struct resource *r); #endif /* CONFIG_OF_IRQ */ #endif /* CONFIG_OF */ -- cgit v1.2.3 From 6b884a8d50a6eea2fb3dad7befe748f67193073b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:09 
-0600 Subject: of/address: merge of_iomap() Merge common code between Microblaze and PowerPC. This patch creates new of_address.h and address.c files containing address translation and mapping routines. The first routine to be moved is of_iomap(). Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt CC: Michal Simek CC: Stephen Rothwell --- arch/microblaze/include/asm/prom.h | 10 +--------- arch/microblaze/kernel/prom_parse.c | 11 ----------- arch/powerpc/include/asm/prom.h | 10 +--------- arch/powerpc/kernel/prom_parse.c | 11 ----------- drivers/of/Kconfig | 4 ++++ drivers/of/Makefile | 1 + drivers/of/address.c | 22 ++++++++++++++++++++++ include/linux/of_address.h | 9 +++++++++ 8 files changed, 38 insertions(+), 40 deletions(-) create mode 100644 drivers/of/address.c create mode 100644 include/linux/of_address.h (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 5fbdfe76fe76..6411c3b3a80f 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -20,6 +20,7 @@ #ifndef __ASSEMBLY__ #include +#include #include #include #include @@ -103,15 +104,6 @@ extern const void *of_get_mac_address(struct device_node *np); struct pci_dev; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); -/** - * of_iomap - Maps the memory mapped IO for a given device_node - * @device: the device whose io range will be mapped - * @index: index of the io range - * - * Returns a pointer to the mapped memory - */ -extern void __iomem *of_iomap(struct device_node *device, int index); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_MICROBLAZE_PROM_H */ diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index e28968fa34c1..1159ba52ad4a 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -681,14 +681,3 @@ const void *of_get_mac_address(struct device_node *np) return NULL; } EXPORT_SYMBOL(of_get_mac_address); - -void __iomem *of_iomap(struct device_node *np, int index) -{ - struct resource res; - - if (of_address_to_resource(np, index, &res)) - return NULL; - - return ioremap(res.start, 1 + res.end - res.start); -} -EXPORT_SYMBOL(of_iomap); diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 10d5ee556702..0abe379c6f33 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -119,14 +120,5 @@ extern const void *of_get_mac_address(struct device_node *np); struct pci_dev; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); -/** - * of_iomap - Maps the memory mapped IO for a given device_node - * @device: the device whose io range will be mapped - * @index: index of the io range - * - * Returns a pointer to the mapped memory - */ -extern void __iomem *of_iomap(struct device_node *device, int index); - #endif /* __KERNEL__ */ #endif /* _POWERPC_PROM_H */ diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index d61a5c5fe690..1d5d4f6dfef5 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -715,14 +715,3 @@ const void *of_get_mac_address(struct device_node *np) return NULL; } EXPORT_SYMBOL(of_get_mac_address); - -void __iomem *of_iomap(struct device_node *np, int index) -{ - struct resource res; - - if (of_address_to_resource(np, index, &res)) - return NULL; - - return ioremap(res.start,
1 + res.end - res.start); -} -EXPORT_SYMBOL(of_iomap); diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index b87495efa16e..097f42aebe90 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -6,6 +6,10 @@ config OF_DYNAMIC def_bool y depends on OF && PPC_OF +config OF_ADDRESS + def_bool y + depends on OF && !SPARC + config OF_IRQ def_bool y depends on OF && !SPARC diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 3631a5ea0b47..0052c405463a 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,5 +1,6 @@ obj-y = base.o obj-$(CONFIG_OF_FLATTREE) += fdt.o +obj-$(CONFIG_OF_ADDRESS) += address.o obj-$(CONFIG_OF_IRQ) += irq.o obj-$(CONFIG_OF_DEVICE) += device.o platform.o obj-$(CONFIG_OF_GPIO) += gpio.o diff --git a/drivers/of/address.c b/drivers/of/address.c new file mode 100644 index 000000000000..258528d6c4fe --- /dev/null +++ b/drivers/of/address.c @@ -0,0 +1,22 @@ + +#include +#include +#include + +/** + * of_iomap - Maps the memory mapped IO for a given device_node + * @device: the device whose io range will be mapped + * @index: index of the io range + * + * Returns a pointer to the mapped memory + */ +void __iomem *of_iomap(struct device_node *np, int index) +{ + struct resource res; + + if (of_address_to_resource(np, index, &res)) + return NULL; + + return ioremap(res.start, 1 + res.end - res.start); +} +EXPORT_SYMBOL(of_iomap); diff --git a/include/linux/of_address.h b/include/linux/of_address.h new file mode 100644 index 000000000000..570831d7e795 --- /dev/null +++ b/include/linux/of_address.h @@ -0,0 +1,9 @@ +#ifndef __OF_ADDRESS_H +#define __OF_ADDRESS_H +#include +#include + +extern void __iomem *of_iomap(struct device_node *device, int index); + +#endif /* __OF_ADDRESS_H */ + -- cgit v1.2.3 From 1f5bef30cf6c66f097ea5dfc580a41924df888d1 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:09 -0600 Subject: of/address: merge of_address_to_resource() Merge common code between PowerPC and Microblaze. This patch also moves the prototype of pci_address_to_pio() out of pci-bridge.h and into prom.h because the only user of pci_address_to_pio() is of_address_to_resource(). 
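For context, a minimal sketch of how a driver would consume the two merged helpers; the foo_probe() function and its error codes are illustrative only, not part of this series:

#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/of.h>
#include <linux/of_address.h>

static int foo_probe(struct device_node *np)
{
	struct resource res;
	void __iomem *base;

	/* Translate the first "reg" entry of np into a CPU resource */
	if (of_address_to_resource(np, 0, &res))
		return -EINVAL;

	/* of_iomap() wraps the same translation plus the ioremap() call */
	base = of_iomap(np, 0);
	if (!base)
		return -ENOMEM;

	/* ... access device registers via readl(base + offset) ... */
	iounmap(base);
	return 0;
}

Note the two failure conventions visible above: of_address_to_resource() returns a negative errno, while of_iomap() returns NULL.
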
Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt CC: Michal Simek CC: Stephen Rothwell --- arch/microblaze/include/asm/pci-bridge.h | 5 ---- arch/microblaze/include/asm/prom.h | 17 ++++++----- arch/microblaze/kernel/prom_parse.c | 46 +--------------------------- arch/powerpc/include/asm/pci-bridge.h | 5 ---- arch/powerpc/include/asm/prom.h | 17 ++++++----- arch/powerpc/kernel/prom_parse.c | 47 +---------------------------- drivers/of/address.c | 51 ++++++++++++++++++++++++++++++++ include/linux/of_address.h | 5 ++++ 8 files changed, 76 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h index 0c77cda9f5d8..0c68764ab547 100644 --- a/arch/microblaze/include/asm/pci-bridge.h +++ b/arch/microblaze/include/asm/pci-bridge.h @@ -172,13 +172,8 @@ static inline int pci_has_flag(int flag) extern struct list_head hose_list; -extern unsigned long pci_address_to_pio(phys_addr_t address); extern int pcibios_vaddr_is_ioport(void __iomem *address); #else -static inline unsigned long pci_address_to_pio(phys_addr_t address) -{ - return (unsigned long)-1; -} static inline int pcibios_vaddr_is_ioport(void __iomem *address) { return 0; diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 6411c3b3a80f..4e94c0706c50 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -65,17 +65,18 @@ extern const u32 *of_get_address(struct device_node *dev, int index, extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, unsigned int *flags); -/* Get an address as a resource. Note that if your address is - * a PIO address, the conversion will fail if the physical address - * can't be internally converted to an IO token with - * pci_address_to_pio(), that is because it's either called to early - * or it can't be matched to any host bridge IO space - */ -extern int of_address_to_resource(struct device_node *dev, int index, - struct resource *r); extern int of_pci_address_to_resource(struct device_node *dev, int bar, struct resource *r); +#ifdef CONFIG_PCI +extern unsigned long pci_address_to_pio(phys_addr_t address); +#else +static inline unsigned long pci_address_to_pio(phys_addr_t address) +{ + return (unsigned long)-1; +} +#endif /* CONFIG_PCI */ + /* Parse the ibm,dma-window property of an OF node into the busno, phys and * size parameters. 
*/ diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index 1159ba52ad4a..2f9cdd26ca12 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -17,9 +18,6 @@ (ns) > 0) static struct of_bus *of_match_bus(struct device_node *np); -static int __of_address_to_resource(struct device_node *dev, - const u32 *addrp, u64 size, unsigned int flags, - struct resource *r); /* Debug utility */ #ifdef DEBUG @@ -576,48 +574,6 @@ const u32 *of_get_address(struct device_node *dev, int index, u64 *size, } EXPORT_SYMBOL(of_get_address); -static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, - struct resource *r) -{ - u64 taddr; - - if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0) - return -EINVAL; - taddr = of_translate_address(dev, addrp); - if (taddr == OF_BAD_ADDR) - return -EINVAL; - memset(r, 0, sizeof(struct resource)); - if (flags & IORESOURCE_IO) { - unsigned long port; - port = -1; /* pci_address_to_pio(taddr); */ - if (port == (unsigned long)-1) - return -EINVAL; - r->start = port; - r->end = port + size - 1; - } else { - r->start = taddr; - r->end = taddr + size - 1; - } - r->flags = flags; - r->name = dev->name; - return 0; -} - -int of_address_to_resource(struct device_node *dev, int index, - struct resource *r) -{ - const u32 *addrp; - u64 size; - unsigned int flags; - - addrp = of_get_address(dev, index, &size, &flags); - if (addrp == NULL) - return -EINVAL; - return __of_address_to_resource(dev, addrp, size, flags, r); -} -EXPORT_SYMBOL_GPL(of_address_to_resource); - void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) { diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 76e1f313a58e..51e9e6f90d12 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -303,13 +303,8 @@ extern void pcibios_free_controller(struct pci_controller *phb); extern void pcibios_setup_phb_resources(struct pci_controller *hose); #ifdef CONFIG_PCI -extern unsigned long pci_address_to_pio(phys_addr_t address); extern int pcibios_vaddr_is_ioport(void __iomem *address); #else -static inline unsigned long pci_address_to_pio(phys_addr_t address) -{ - return (unsigned long)-1; -} static inline int pcibios_vaddr_is_ioport(void __iomem *address) { return 0; diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 0abe379c6f33..ceace966c51c 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -70,14 +70,6 @@ static inline const u32 *of_get_pci_address(struct device_node *dev, } #endif /* CONFIG_PCI */ -/* Get an address as a resource. 
Note that if your address is - * a PIO address, the conversion will fail if the physical address - * can't be internally converted to an IO token with - * pci_address_to_pio(), that is because it's either called to early - * or it can't be matched to any host bridge IO space - */ -extern int of_address_to_resource(struct device_node *dev, int index, - struct resource *r); #ifdef CONFIG_PCI extern int of_pci_address_to_resource(struct device_node *dev, int bar, struct resource *r); @@ -89,6 +81,15 @@ static inline int of_pci_address_to_resource(struct device_node *dev, int bar, } #endif /* CONFIG_PCI */ +#ifdef CONFIG_PCI +extern unsigned long pci_address_to_pio(phys_addr_t address); +#else +static inline unsigned long pci_address_to_pio(phys_addr_t address) +{ + return (unsigned long)-1; +} +#endif /* CONFIG_PCI */ + /* Parse the ibm,dma-window property of an OF node into the busno, phys and * size parameters. */ diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 1d5d4f6dfef5..1dac535de78f 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -27,10 +28,6 @@ (ns) > 0) static struct of_bus *of_match_bus(struct device_node *np); -static int __of_address_to_resource(struct device_node *dev, - const u32 *addrp, u64 size, unsigned int flags, - struct resource *r); - /* Debug utility */ #ifdef DEBUG @@ -610,48 +607,6 @@ const u32 *of_get_address(struct device_node *dev, int index, u64 *size, } EXPORT_SYMBOL(of_get_address); -static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, - struct resource *r) -{ - u64 taddr; - - if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0) - return -EINVAL; - taddr = of_translate_address(dev, addrp); - if (taddr == OF_BAD_ADDR) - return -EINVAL; - memset(r, 0, sizeof(struct resource)); - if (flags & IORESOURCE_IO) { - unsigned long port; - port = pci_address_to_pio(taddr); - if (port == (unsigned long)-1) - return -EINVAL; - r->start = port; - r->end = port + size - 1; - } else { - r->start = taddr; - r->end = taddr + size - 1; - } - r->flags = flags; - r->name = dev->name; - return 0; -} - -int of_address_to_resource(struct device_node *dev, int index, - struct resource *r) -{ - const u32 *addrp; - u64 size; - unsigned int flags; - - addrp = of_get_address(dev, index, &size, &flags); - if (addrp == NULL) - return -EINVAL; - return __of_address_to_resource(dev, addrp, size, flags, r); -} -EXPORT_SYMBOL_GPL(of_address_to_resource); - void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) { diff --git a/drivers/of/address.c b/drivers/of/address.c index 258528d6c4fe..c3819550f907 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -3,6 +3,57 @@ #include #include +int __of_address_to_resource(struct device_node *dev, const u32 *addrp, + u64 size, unsigned int flags, + struct resource *r) +{ + u64 taddr; + + if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0) + return -EINVAL; + taddr = of_translate_address(dev, addrp); + if (taddr == OF_BAD_ADDR) + return -EINVAL; + memset(r, 0, sizeof(struct resource)); + if (flags & IORESOURCE_IO) { + unsigned long port; + port = pci_address_to_pio(taddr); + if (port == (unsigned long)-1) + return -EINVAL; + r->start = port; + r->end = port + size - 1; + } else { + r->start = taddr; + r->end = taddr + size - 1; + } + r->flags = flags; + 
r->name = dev->name; + return 0; +} + +/** + * of_address_to_resource - Translate device tree address and return as resource + * + * Note that if your address is a PIO address, the conversion will fail if + * the physical address can't be internally converted to an IO token with + * pci_address_to_pio(), that is because it's either called to early or it + * can't be matched to any host bridge IO space + */ +int of_address_to_resource(struct device_node *dev, int index, + struct resource *r) +{ + const u32 *addrp; + u64 size; + unsigned int flags; + + addrp = of_get_address(dev, index, &size, &flags); + if (addrp == NULL) + return -EINVAL; + return __of_address_to_resource(dev, addrp, size, flags, r); +} +EXPORT_SYMBOL_GPL(of_address_to_resource); + + /** * of_iomap - Maps the memory mapped IO for a given device_node * @device: the device whose io range will be mapped diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 570831d7e795..474b794ed9d2 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -3,6 +3,11 @@ #include #include +extern int __of_address_to_resource(struct device_node *dev, const u32 *addrp, + u64 size, unsigned int flags, + struct resource *r); +extern int of_address_to_resource(struct device_node *dev, int index, + struct resource *r); extern void __iomem *of_iomap(struct device_node *device, int index); #endif /* __OF_ADDRESS_H */ -- cgit v1.2.3 From dbbdee94734bf6f1db7af42008a53655e77cab8f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:10 -0600 Subject: of/address: Merge all of the bus translation code Microblaze and PowerPC share a large chunk of code for translating OF device tree data into usable addresses. The differences between the two are cosmetic, apart from the dma-ranges support code that powerpc has and microblaze lacks. This patch moves the powerpc version into common code and applies many of the cosmetic (non-functional) changes from the microblaze version. Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt CC: Michal Simek CC: Wolfram Sang CC: Stephen Rothwell --- arch/microblaze/include/asm/prom.h | 4 - arch/microblaze/kernel/prom_parse.c | 489 ---------------------------------- arch/powerpc/include/asm/prom.h | 4 - arch/powerpc/kernel/prom_parse.c | 515 ----------------------------------- drivers/of/address.c | 517 +++++++++++++++++++++++++++++++++++- include/linux/of_address.h | 4 +- 6 files changed, 515 insertions(+), 1018 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 4e94c0706c50..cb9c3dd9a23b 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -52,10 +52,6 @@ extern void pci_create_OF_bus_map(void); * OF address retreival & translation */ -/* Translate an OF address block into a CPU physical address - */ -extern u64 of_translate_address(struct device_node *np, const u32 *addr); - /* Extract an address from a device, returns the region size and * the address space flags too.
The PCI version uses a BAR number * instead of an absolute index diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index 2f9cdd26ca12..d33ba17601fa 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -10,213 +10,7 @@ #include #include -#define PRu64 "%llx" - -/* Max address size we deal with */ -#define OF_MAX_ADDR_CELLS 4 -#define OF_CHECK_COUNTS(na, ns) ((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \ - (ns) > 0) - -static struct of_bus *of_match_bus(struct device_node *np); - -/* Debug utility */ -#ifdef DEBUG -static void of_dump_addr(const char *s, const u32 *addr, int na) -{ - printk(KERN_INFO "%s", s); - while (na--) - printk(KERN_INFO " %08x", *(addr++)); - printk(KERN_INFO "\n"); -} -#else -static void of_dump_addr(const char *s, const u32 *addr, int na) { } -#endif - -/* Callbacks for bus specific translators */ -struct of_bus { - const char *name; - const char *addresses; - int (*match)(struct device_node *parent); - void (*count_cells)(struct device_node *child, - int *addrc, int *sizec); - u64 (*map)(u32 *addr, const u32 *range, - int na, int ns, int pna); - int (*translate)(u32 *addr, u64 offset, int na); - unsigned int (*get_flags)(const u32 *addr); -}; - -/* - * Default translator (generic bus) - */ - -static void of_bus_default_count_cells(struct device_node *dev, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = of_n_addr_cells(dev); - if (sizec) - *sizec = of_n_size_cells(dev); -} - -static u64 of_bus_default_map(u32 *addr, const u32 *range, - int na, int ns, int pna) -{ - u64 cp, s, da; - - cp = of_read_number(range, na); - s = of_read_number(range + na + pna, ns); - da = of_read_number(addr, na); - - pr_debug("OF: default map, cp="PRu64", s="PRu64", da="PRu64"\n", - cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_default_translate(u32 *addr, u64 offset, int na) -{ - u64 a = of_read_number(addr, na); - memset(addr, 0, na * 4); - a += offset; - if (na > 1) - addr[na - 2] = a >> 32; - addr[na - 1] = a & 0xffffffffu; - - return 0; -} - -static unsigned int of_bus_default_get_flags(const u32 *addr) -{ - return IORESOURCE_MEM; -} - #ifdef CONFIG_PCI -/* - * PCI bus specific translator - */ - -static int of_bus_pci_match(struct device_node *np) -{ - /* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */ - return !strcmp(np->type, "pci") || !strcmp(np->type, "vci"); -} - -static void of_bus_pci_count_cells(struct device_node *np, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = 3; - if (sizec) - *sizec = 2; -} - -static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna) -{ - u64 cp, s, da; - - /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x03000000) - return OF_BAD_ADDR; - - /* Read address values, skipping high cell */ - cp = of_read_number(range + 1, na - 1); - s = of_read_number(range + na + pna, ns); - da = of_read_number(addr + 1, na - 1); - - pr_debug("OF: PCI map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_pci_translate(u32 *addr, u64 offset, int na) -{ - return of_bus_default_translate(addr + 1, offset, na - 1); -} - -static unsigned int of_bus_pci_get_flags(const u32 *addr) -{ - unsigned int flags = 0; - u32 w = addr[0]; - - switch ((w >> 24) & 0x03) { - case 0x01: - flags |= IORESOURCE_IO; - break; - case 0x02: /* 32 bits */ - case 0x03: /* 64 bits */ - flags |= IORESOURCE_MEM; 
- break; - } - if (w & 0x40000000) - flags |= IORESOURCE_PREFETCH; - return flags; -} - -const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, - unsigned int *flags) -{ - const u32 *prop; - unsigned int psize; - struct device_node *parent; - struct of_bus *bus; - int onesize, i, na, ns; - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - return NULL; - bus = of_match_bus(parent); - if (strcmp(bus->name, "pci")) { - of_node_put(parent); - return NULL; - } - bus->count_cells(dev, &na, &ns); - of_node_put(parent); - if (!OF_CHECK_COUNTS(na, ns)) - return NULL; - - /* Get "reg" or "assigned-addresses" property */ - prop = of_get_property(dev, bus->addresses, &psize); - if (prop == NULL) - return NULL; - psize /= 4; - - onesize = na + ns; - for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) - if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) { - if (size) - *size = of_read_number(prop + na, ns); - if (flags) - *flags = bus->get_flags(prop); - return prop; - } - return NULL; -} -EXPORT_SYMBOL(of_get_pci_address); - -int of_pci_address_to_resource(struct device_node *dev, int bar, - struct resource *r) -{ - const u32 *addrp; - u64 size; - unsigned int flags; - - addrp = of_get_pci_address(dev, bar, &size, &flags); - if (addrp == NULL) - return -EINVAL; - return __of_address_to_resource(dev, addrp, size, flags, r); -} -EXPORT_SYMBOL_GPL(of_pci_address_to_resource); - -static u8 of_irq_pci_swizzle(u8 slot, u8 pin) -{ - return (((pin - 1) + slot) % 4) + 1; -} - int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) { struct device_node *dn, *ppnode; @@ -291,289 +85,6 @@ int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) EXPORT_SYMBOL_GPL(of_irq_map_pci); #endif /* CONFIG_PCI */ -/* - * ISA bus specific translator - */ - -static int of_bus_isa_match(struct device_node *np) -{ - return !strcmp(np->name, "isa"); -} - -static void of_bus_isa_count_cells(struct device_node *child, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = 2; - if (sizec) - *sizec = 1; -} - -static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna) -{ - u64 cp, s, da; - - /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x00000001) - return OF_BAD_ADDR; - - /* Read address values, skipping high cell */ - cp = of_read_number(range + 1, na - 1); - s = of_read_number(range + na + pna, ns); - da = of_read_number(addr + 1, na - 1); - - pr_debug("OF: ISA map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_isa_translate(u32 *addr, u64 offset, int na) -{ - return of_bus_default_translate(addr + 1, offset, na - 1); -} - -static unsigned int of_bus_isa_get_flags(const u32 *addr) -{ - unsigned int flags = 0; - u32 w = addr[0]; - - if (w & 1) - flags |= IORESOURCE_IO; - else - flags |= IORESOURCE_MEM; - return flags; -} - -/* - * Array of bus specific translators - */ - -static struct of_bus of_busses[] = { -#ifdef CONFIG_PCI - /* PCI */ - { - .name = "pci", - .addresses = "assigned-addresses", - .match = of_bus_pci_match, - .count_cells = of_bus_pci_count_cells, - .map = of_bus_pci_map, - .translate = of_bus_pci_translate, - .get_flags = of_bus_pci_get_flags, - }, -#endif /* CONFIG_PCI */ - /* ISA */ - { - .name = "isa", - .addresses = "reg", - .match = of_bus_isa_match, - .count_cells = of_bus_isa_count_cells, - .map = of_bus_isa_map, - .translate = of_bus_isa_translate, - .get_flags = 
of_bus_isa_get_flags, - }, - /* Default */ - { - .name = "default", - .addresses = "reg", - .match = NULL, - .count_cells = of_bus_default_count_cells, - .map = of_bus_default_map, - .translate = of_bus_default_translate, - .get_flags = of_bus_default_get_flags, - }, -}; - -static struct of_bus *of_match_bus(struct device_node *np) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(of_busses); i++) - if (!of_busses[i].match || of_busses[i].match(np)) - return &of_busses[i]; - BUG(); - return NULL; -} - -static int of_translate_one(struct device_node *parent, struct of_bus *bus, - struct of_bus *pbus, u32 *addr, - int na, int ns, int pna) -{ - const u32 *ranges; - unsigned int rlen; - int rone; - u64 offset = OF_BAD_ADDR; - - /* Normally, an absence of a "ranges" property means we are - * crossing a non-translatable boundary, and thus the addresses - * below the current not cannot be converted to CPU physical ones. - * Unfortunately, while this is very clear in the spec, it's not - * what Apple understood, and they do have things like /uni-n or - * /ht nodes with no "ranges" property and a lot of perfectly - * useable mapped devices below them. Thus we treat the absence of - * "ranges" as equivalent to an empty "ranges" property which means - * a 1:1 translation at that level. It's up to the caller not to try - * to translate addresses that aren't supposed to be translated in - * the first place. --BenH. - */ - ranges = of_get_property(parent, "ranges", (int *) &rlen); - if (ranges == NULL || rlen == 0) { - offset = of_read_number(addr, na); - memset(addr, 0, pna * 4); - pr_debug("OF: no ranges, 1:1 translation\n"); - goto finish; - } - - pr_debug("OF: walking ranges...\n"); - - /* Now walk through the ranges */ - rlen /= 4; - rone = na + pna + ns; - for (; rlen >= rone; rlen -= rone, ranges += rone) { - offset = bus->map(addr, ranges, na, ns, pna); - if (offset != OF_BAD_ADDR) - break; - } - if (offset == OF_BAD_ADDR) { - pr_debug("OF: not found !\n"); - return 1; - } - memcpy(addr, ranges + na, 4 * pna); - - finish: - of_dump_addr("OF: parent translation for:", addr, pna); - pr_debug("OF: with offset: "PRu64"\n", offset); - - /* Translate it into parent bus space */ - return pbus->translate(addr, offset, pna); -} - -/* - * Translate an address from the device-tree into a CPU physical address, - * this walks up the tree and applies the various bus mappings on the - * way. - * - * Note: We consider that crossing any level with #size-cells == 0 to mean - * that translation is impossible (that is we are not dealing with a value - * that can be mapped to a cpu physical address). 
This is not really specified - * that way, but this is traditionally the way IBM at least do things - */ -u64 of_translate_address(struct device_node *dev, const u32 *in_addr) -{ - struct device_node *parent = NULL; - struct of_bus *bus, *pbus; - u32 addr[OF_MAX_ADDR_CELLS]; - int na, ns, pna, pns; - u64 result = OF_BAD_ADDR; - - pr_debug("OF: ** translation for device %s **\n", dev->full_name); - - /* Increase refcount at current level */ - of_node_get(dev); - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - goto bail; - bus = of_match_bus(parent); - - /* Cound address cells & copy address locally */ - bus->count_cells(dev, &na, &ns); - if (!OF_CHECK_COUNTS(na, ns)) { - printk(KERN_ERR "prom_parse: Bad cell count for %s\n", - dev->full_name); - goto bail; - } - memcpy(addr, in_addr, na * 4); - - pr_debug("OF: bus is %s (na=%d, ns=%d) on %s\n", - bus->name, na, ns, parent->full_name); - of_dump_addr("OF: translating address:", addr, na); - - /* Translate */ - for (;;) { - /* Switch to parent bus */ - of_node_put(dev); - dev = parent; - parent = of_get_parent(dev); - - /* If root, we have finished */ - if (parent == NULL) { - pr_debug("OF: reached root node\n"); - result = of_read_number(addr, na); - break; - } - - /* Get new parent bus and counts */ - pbus = of_match_bus(parent); - pbus->count_cells(dev, &pna, &pns); - if (!OF_CHECK_COUNTS(pna, pns)) { - printk(KERN_ERR "prom_parse: Bad cell count for %s\n", - dev->full_name); - break; - } - - pr_debug("OF: parent bus is %s (na=%d, ns=%d) on %s\n", - pbus->name, pna, pns, parent->full_name); - - /* Apply bus translation */ - if (of_translate_one(dev, bus, pbus, addr, na, ns, pna)) - break; - - /* Complete the move up one level */ - na = pna; - ns = pns; - bus = pbus; - - of_dump_addr("OF: one level translation:", addr, na); - } - bail: - of_node_put(parent); - of_node_put(dev); - - return result; -} -EXPORT_SYMBOL(of_translate_address); - -const u32 *of_get_address(struct device_node *dev, int index, u64 *size, - unsigned int *flags) -{ - const u32 *prop; - unsigned int psize; - struct device_node *parent; - struct of_bus *bus; - int onesize, i, na, ns; - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - return NULL; - bus = of_match_bus(parent); - bus->count_cells(dev, &na, &ns); - of_node_put(parent); - if (!OF_CHECK_COUNTS(na, ns)) - return NULL; - - /* Get "reg" or "assigned-addresses" property */ - prop = of_get_property(dev, bus->addresses, (int *) &psize); - if (prop == NULL) - return NULL; - psize /= 4; - - onesize = na + ns; - for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) - if (i == index) { - if (size) - *size = of_read_number(prop + na, ns); - if (flags) - *flags = bus->get_flags(prop); - return prop; - } - return NULL; -} -EXPORT_SYMBOL(of_get_address); - void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) { diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index ceace966c51c..f864722679e8 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -45,10 +45,6 @@ extern void pci_create_OF_bus_map(void); * OF address retreival & translation */ -/* Translate an OF address block into a CPU physical address - */ -extern u64 of_translate_address(struct device_node *np, const u32 *addr); - /* Translate a DMA address from device space to CPU space */ extern u64 
of_translate_dma_address(struct device_node *dev, const u32 *in_addr); diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 1dac535de78f..88334af038e5 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -10,225 +10,7 @@ #include #include -#ifdef DEBUG -#define DBG(fmt...) do { printk(fmt); } while(0) -#else -#define DBG(fmt...) do { } while(0) -#endif - -#ifdef CONFIG_PPC64 -#define PRu64 "%lx" -#else -#define PRu64 "%llx" -#endif - -/* Max address size we deal with */ -#define OF_MAX_ADDR_CELLS 4 -#define OF_CHECK_COUNTS(na, ns) ((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \ - (ns) > 0) - -static struct of_bus *of_match_bus(struct device_node *np); - -/* Debug utility */ -#ifdef DEBUG -static void of_dump_addr(const char *s, const u32 *addr, int na) -{ - printk("%s", s); - while(na--) - printk(" %08x", *(addr++)); - printk("\n"); -} -#else -static void of_dump_addr(const char *s, const u32 *addr, int na) { } -#endif - - -/* Callbacks for bus specific translators */ -struct of_bus { - const char *name; - const char *addresses; - int (*match)(struct device_node *parent); - void (*count_cells)(struct device_node *child, - int *addrc, int *sizec); - u64 (*map)(u32 *addr, const u32 *range, - int na, int ns, int pna); - int (*translate)(u32 *addr, u64 offset, int na); - unsigned int (*get_flags)(const u32 *addr); -}; - - -/* - * Default translator (generic bus) - */ - -static void of_bus_default_count_cells(struct device_node *dev, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = of_n_addr_cells(dev); - if (sizec) - *sizec = of_n_size_cells(dev); -} - -static u64 of_bus_default_map(u32 *addr, const u32 *range, - int na, int ns, int pna) -{ - u64 cp, s, da; - - cp = of_read_number(range, na); - s = of_read_number(range + na + pna, ns); - da = of_read_number(addr, na); - - DBG("OF: default map, cp="PRu64", s="PRu64", da="PRu64"\n", - cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_default_translate(u32 *addr, u64 offset, int na) -{ - u64 a = of_read_number(addr, na); - memset(addr, 0, na * 4); - a += offset; - if (na > 1) - addr[na - 2] = a >> 32; - addr[na - 1] = a & 0xffffffffu; - - return 0; -} - -static unsigned int of_bus_default_get_flags(const u32 *addr) -{ - return IORESOURCE_MEM; -} - - #ifdef CONFIG_PCI -/* - * PCI bus specific translator - */ - -static int of_bus_pci_match(struct device_node *np) -{ - /* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */ - return !strcmp(np->type, "pci") || !strcmp(np->type, "vci"); -} - -static void of_bus_pci_count_cells(struct device_node *np, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = 3; - if (sizec) - *sizec = 2; -} - -static unsigned int of_bus_pci_get_flags(const u32 *addr) -{ - unsigned int flags = 0; - u32 w = addr[0]; - - switch((w >> 24) & 0x03) { - case 0x01: - flags |= IORESOURCE_IO; - break; - case 0x02: /* 32 bits */ - case 0x03: /* 64 bits */ - flags |= IORESOURCE_MEM; - break; - } - if (w & 0x40000000) - flags |= IORESOURCE_PREFETCH; - return flags; -} - -static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna) -{ - u64 cp, s, da; - unsigned int af, rf; - - af = of_bus_pci_get_flags(addr); - rf = of_bus_pci_get_flags(range); - - /* Check address type match */ - if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO)) - return OF_BAD_ADDR; - - /* Read address values, skipping high cell */ - cp = of_read_number(range + 1, na - 1); - s = of_read_number(range + na + pna, 
ns); - da = of_read_number(addr + 1, na - 1); - - DBG("OF: PCI map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_pci_translate(u32 *addr, u64 offset, int na) -{ - return of_bus_default_translate(addr + 1, offset, na - 1); -} - -const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, - unsigned int *flags) -{ - const u32 *prop; - unsigned int psize; - struct device_node *parent; - struct of_bus *bus; - int onesize, i, na, ns; - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - return NULL; - bus = of_match_bus(parent); - if (strcmp(bus->name, "pci")) { - of_node_put(parent); - return NULL; - } - bus->count_cells(dev, &na, &ns); - of_node_put(parent); - if (!OF_CHECK_COUNTS(na, ns)) - return NULL; - - /* Get "reg" or "assigned-addresses" property */ - prop = of_get_property(dev, bus->addresses, &psize); - if (prop == NULL) - return NULL; - psize /= 4; - - onesize = na + ns; - for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) - if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) { - if (size) - *size = of_read_number(prop + na, ns); - if (flags) - *flags = bus->get_flags(prop); - return prop; - } - return NULL; -} -EXPORT_SYMBOL(of_get_pci_address); - -int of_pci_address_to_resource(struct device_node *dev, int bar, - struct resource *r) -{ - const u32 *addrp; - u64 size; - unsigned int flags; - - addrp = of_get_pci_address(dev, bar, &size, &flags); - if (addrp == NULL) - return -EINVAL; - return __of_address_to_resource(dev, addrp, size, flags, r); -} -EXPORT_SYMBOL_GPL(of_pci_address_to_resource); - int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) { struct device_node *dn, *ppnode; @@ -310,303 +92,6 @@ int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) EXPORT_SYMBOL_GPL(of_irq_map_pci); #endif /* CONFIG_PCI */ -/* - * ISA bus specific translator - */ - -static int of_bus_isa_match(struct device_node *np) -{ - return !strcmp(np->name, "isa"); -} - -static void of_bus_isa_count_cells(struct device_node *child, - int *addrc, int *sizec) -{ - if (addrc) - *addrc = 2; - if (sizec) - *sizec = 1; -} - -static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna) -{ - u64 cp, s, da; - - /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x00000001) - return OF_BAD_ADDR; - - /* Read address values, skipping high cell */ - cp = of_read_number(range + 1, na - 1); - s = of_read_number(range + na + pna, ns); - da = of_read_number(addr + 1, na - 1); - - DBG("OF: ISA map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da); - - if (da < cp || da >= (cp + s)) - return OF_BAD_ADDR; - return da - cp; -} - -static int of_bus_isa_translate(u32 *addr, u64 offset, int na) -{ - return of_bus_default_translate(addr + 1, offset, na - 1); -} - -static unsigned int of_bus_isa_get_flags(const u32 *addr) -{ - unsigned int flags = 0; - u32 w = addr[0]; - - if (w & 1) - flags |= IORESOURCE_IO; - else - flags |= IORESOURCE_MEM; - return flags; -} - - -/* - * Array of bus specific translators - */ - -static struct of_bus of_busses[] = { -#ifdef CONFIG_PCI - /* PCI */ - { - .name = "pci", - .addresses = "assigned-addresses", - .match = of_bus_pci_match, - .count_cells = of_bus_pci_count_cells, - .map = of_bus_pci_map, - .translate = of_bus_pci_translate, - .get_flags = of_bus_pci_get_flags, - }, -#endif /* CONFIG_PCI */ - /* ISA */ - { - .name = "isa", - .addresses = "reg", - 
.match = of_bus_isa_match, - .count_cells = of_bus_isa_count_cells, - .map = of_bus_isa_map, - .translate = of_bus_isa_translate, - .get_flags = of_bus_isa_get_flags, - }, - /* Default */ - { - .name = "default", - .addresses = "reg", - .match = NULL, - .count_cells = of_bus_default_count_cells, - .map = of_bus_default_map, - .translate = of_bus_default_translate, - .get_flags = of_bus_default_get_flags, - }, -}; - -static struct of_bus *of_match_bus(struct device_node *np) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(of_busses); i ++) - if (!of_busses[i].match || of_busses[i].match(np)) - return &of_busses[i]; - BUG(); - return NULL; -} - -static int of_translate_one(struct device_node *parent, struct of_bus *bus, - struct of_bus *pbus, u32 *addr, - int na, int ns, int pna, const char *rprop) -{ - const u32 *ranges; - unsigned int rlen; - int rone; - u64 offset = OF_BAD_ADDR; - - /* Normally, an absence of a "ranges" property means we are - * crossing a non-translatable boundary, and thus the addresses - * below the current not cannot be converted to CPU physical ones. - * Unfortunately, while this is very clear in the spec, it's not - * what Apple understood, and they do have things like /uni-n or - * /ht nodes with no "ranges" property and a lot of perfectly - * useable mapped devices below them. Thus we treat the absence of - * "ranges" as equivalent to an empty "ranges" property which means - * a 1:1 translation at that level. It's up to the caller not to try - * to translate addresses that aren't supposed to be translated in - * the first place. --BenH. - */ - ranges = of_get_property(parent, rprop, &rlen); - if (ranges == NULL || rlen == 0) { - offset = of_read_number(addr, na); - memset(addr, 0, pna * 4); - DBG("OF: no ranges, 1:1 translation\n"); - goto finish; - } - - DBG("OF: walking ranges...\n"); - - /* Now walk through the ranges */ - rlen /= 4; - rone = na + pna + ns; - for (; rlen >= rone; rlen -= rone, ranges += rone) { - offset = bus->map(addr, ranges, na, ns, pna); - if (offset != OF_BAD_ADDR) - break; - } - if (offset == OF_BAD_ADDR) { - DBG("OF: not found !\n"); - return 1; - } - memcpy(addr, ranges + na, 4 * pna); - - finish: - of_dump_addr("OF: parent translation for:", addr, pna); - DBG("OF: with offset: "PRu64"\n", offset); - - /* Translate it into parent bus space */ - return pbus->translate(addr, offset, pna); -} - - -/* - * Translate an address from the device-tree into a CPU physical address, - * this walks up the tree and applies the various bus mappings on the - * way. - * - * Note: We consider that crossing any level with #size-cells == 0 to mean - * that translation is impossible (that is we are not dealing with a value - * that can be mapped to a cpu physical address). 
This is not really specified - * that way, but this is traditionally the way IBM at least do things - */ -u64 __of_translate_address(struct device_node *dev, const u32 *in_addr, - const char *rprop) -{ - struct device_node *parent = NULL; - struct of_bus *bus, *pbus; - u32 addr[OF_MAX_ADDR_CELLS]; - int na, ns, pna, pns; - u64 result = OF_BAD_ADDR; - - DBG("OF: ** translation for device %s **\n", dev->full_name); - - /* Increase refcount at current level */ - of_node_get(dev); - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - goto bail; - bus = of_match_bus(parent); - - /* Cound address cells & copy address locally */ - bus->count_cells(dev, &na, &ns); - if (!OF_CHECK_COUNTS(na, ns)) { - printk(KERN_ERR "prom_parse: Bad cell count for %s\n", - dev->full_name); - goto bail; - } - memcpy(addr, in_addr, na * 4); - - DBG("OF: bus is %s (na=%d, ns=%d) on %s\n", - bus->name, na, ns, parent->full_name); - of_dump_addr("OF: translating address:", addr, na); - - /* Translate */ - for (;;) { - /* Switch to parent bus */ - of_node_put(dev); - dev = parent; - parent = of_get_parent(dev); - - /* If root, we have finished */ - if (parent == NULL) { - DBG("OF: reached root node\n"); - result = of_read_number(addr, na); - break; - } - - /* Get new parent bus and counts */ - pbus = of_match_bus(parent); - pbus->count_cells(dev, &pna, &pns); - if (!OF_CHECK_COUNTS(pna, pns)) { - printk(KERN_ERR "prom_parse: Bad cell count for %s\n", - dev->full_name); - break; - } - - DBG("OF: parent bus is %s (na=%d, ns=%d) on %s\n", - pbus->name, pna, pns, parent->full_name); - - /* Apply bus translation */ - if (of_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop)) - break; - - /* Complete the move up one level */ - na = pna; - ns = pns; - bus = pbus; - - of_dump_addr("OF: one level translation:", addr, na); - } - bail: - of_node_put(parent); - of_node_put(dev); - - return result; -} - -u64 of_translate_address(struct device_node *dev, const u32 *in_addr) -{ - return __of_translate_address(dev, in_addr, "ranges"); -} -EXPORT_SYMBOL(of_translate_address); - -u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr) -{ - return __of_translate_address(dev, in_addr, "dma-ranges"); -} -EXPORT_SYMBOL(of_translate_dma_address); - -const u32 *of_get_address(struct device_node *dev, int index, u64 *size, - unsigned int *flags) -{ - const u32 *prop; - unsigned int psize; - struct device_node *parent; - struct of_bus *bus; - int onesize, i, na, ns; - - /* Get parent & match bus type */ - parent = of_get_parent(dev); - if (parent == NULL) - return NULL; - bus = of_match_bus(parent); - bus->count_cells(dev, &na, &ns); - of_node_put(parent); - if (!OF_CHECK_COUNTS(na, ns)) - return NULL; - - /* Get "reg" or "assigned-addresses" property */ - prop = of_get_property(dev, bus->addresses, &psize); - if (prop == NULL) - return NULL; - psize /= 4; - - onesize = na + ns; - for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) - if (i == index) { - if (size) - *size = of_read_number(prop + na, ns); - if (flags) - *flags = bus->get_flags(prop); - return prop; - } - return NULL; -} -EXPORT_SYMBOL(of_get_address); - void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) { diff --git a/drivers/of/address.c b/drivers/of/address.c index c3819550f907..2a905d560c1c 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -1,11 +1,522 @@ #include #include +#include #include 
+#include +#include -int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, - struct resource *r) +/* Max address size we deal with */ +#define OF_MAX_ADDR_CELLS 4 +#define OF_CHECK_COUNTS(na, ns) ((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \ + (ns) > 0) + +static struct of_bus *of_match_bus(struct device_node *np); +static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, + u64 size, unsigned int flags, + struct resource *r); + +/* Debug utility */ +#ifdef DEBUG +static void of_dump_addr(const char *s, const u32 *addr, int na) +{ + printk(KERN_DEBUG "%s", s); + while (na--) + printk(" %08x", *(addr++)); + printk("\n"); +} +#else +static void of_dump_addr(const char *s, const u32 *addr, int na) { } +#endif + +/* Callbacks for bus specific translators */ +struct of_bus { + const char *name; + const char *addresses; + int (*match)(struct device_node *parent); + void (*count_cells)(struct device_node *child, + int *addrc, int *sizec); + u64 (*map)(u32 *addr, const u32 *range, + int na, int ns, int pna); + int (*translate)(u32 *addr, u64 offset, int na); + unsigned int (*get_flags)(const u32 *addr); +}; + +/* + * Default translator (generic bus) + */ + +static void of_bus_default_count_cells(struct device_node *dev, + int *addrc, int *sizec) +{ + if (addrc) + *addrc = of_n_addr_cells(dev); + if (sizec) + *sizec = of_n_size_cells(dev); +} + +static u64 of_bus_default_map(u32 *addr, const u32 *range, + int na, int ns, int pna) +{ + u64 cp, s, da; + + cp = of_read_number(range, na); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr, na); + + pr_debug("OF: default map, cp=%llx, s=%llx, da=%llx\n", + (unsigned long long)cp, (unsigned long long)s, + (unsigned long long)da); + + if (da < cp || da >= (cp + s)) + return OF_BAD_ADDR; + return da - cp; +} + +static int of_bus_default_translate(u32 *addr, u64 offset, int na) +{ + u64 a = of_read_number(addr, na); + memset(addr, 0, na * 4); + a += offset; + if (na > 1) + addr[na - 2] = a >> 32; + addr[na - 1] = a & 0xffffffffu; + + return 0; +} + +static unsigned int of_bus_default_get_flags(const u32 *addr) +{ + return IORESOURCE_MEM; +} + +#ifdef CONFIG_PCI +/* + * PCI bus specific translator + */ + +static int of_bus_pci_match(struct device_node *np) +{ + /* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */ + return !strcmp(np->type, "pci") || !strcmp(np->type, "vci"); +} + +static void of_bus_pci_count_cells(struct device_node *np, + int *addrc, int *sizec) +{ + if (addrc) + *addrc = 3; + if (sizec) + *sizec = 2; +} + +static unsigned int of_bus_pci_get_flags(const u32 *addr) +{ + unsigned int flags = 0; + u32 w = addr[0]; + + switch((w >> 24) & 0x03) { + case 0x01: + flags |= IORESOURCE_IO; + break; + case 0x02: /* 32 bits */ + case 0x03: /* 64 bits */ + flags |= IORESOURCE_MEM; + break; + } + if (w & 0x40000000) + flags |= IORESOURCE_PREFETCH; + return flags; +} + +static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna) +{ + u64 cp, s, da; + unsigned int af, rf; + + af = of_bus_pci_get_flags(addr); + rf = of_bus_pci_get_flags(range); + + /* Check address type match */ + if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO)) + return OF_BAD_ADDR; + + /* Read address values, skipping high cell */ + cp = of_read_number(range + 1, na - 1); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr + 1, na - 1); + + pr_debug("OF: PCI map, cp=%llx, s=%llx, da=%llx\n", + (unsigned long long)cp, (unsigned long long)s, + (unsigned 
long long)da); + + if (da < cp || da >= (cp + s)) + return OF_BAD_ADDR; + return da - cp; +} + +static int of_bus_pci_translate(u32 *addr, u64 offset, int na) +{ + return of_bus_default_translate(addr + 1, offset, na - 1); +} + +const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size, + unsigned int *flags) +{ + const u32 *prop; + unsigned int psize; + struct device_node *parent; + struct of_bus *bus; + int onesize, i, na, ns; + + /* Get parent & match bus type */ + parent = of_get_parent(dev); + if (parent == NULL) + return NULL; + bus = of_match_bus(parent); + if (strcmp(bus->name, "pci")) { + of_node_put(parent); + return NULL; + } + bus->count_cells(dev, &na, &ns); + of_node_put(parent); + if (!OF_CHECK_COUNTS(na, ns)) + return NULL; + + /* Get "reg" or "assigned-addresses" property */ + prop = of_get_property(dev, bus->addresses, &psize); + if (prop == NULL) + return NULL; + psize /= 4; + + onesize = na + ns; + for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) + if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) { + if (size) + *size = of_read_number(prop + na, ns); + if (flags) + *flags = bus->get_flags(prop); + return prop; + } + return NULL; +} +EXPORT_SYMBOL(of_get_pci_address); + +int of_pci_address_to_resource(struct device_node *dev, int bar, + struct resource *r) +{ + const u32 *addrp; + u64 size; + unsigned int flags; + + addrp = of_get_pci_address(dev, bar, &size, &flags); + if (addrp == NULL) + return -EINVAL; + return __of_address_to_resource(dev, addrp, size, flags, r); +} +EXPORT_SYMBOL_GPL(of_pci_address_to_resource); +#endif /* CONFIG_PCI */ + +/* + * ISA bus specific translator + */ + +static int of_bus_isa_match(struct device_node *np) +{ + return !strcmp(np->name, "isa"); +} + +static void of_bus_isa_count_cells(struct device_node *child, + int *addrc, int *sizec) +{ + if (addrc) + *addrc = 2; + if (sizec) + *sizec = 1; +} + +static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna) +{ + u64 cp, s, da; + + /* Check address type match */ + if ((addr[0] ^ range[0]) & 0x00000001) + return OF_BAD_ADDR; + + /* Read address values, skipping high cell */ + cp = of_read_number(range + 1, na - 1); + s = of_read_number(range + na + pna, ns); + da = of_read_number(addr + 1, na - 1); + + pr_debug("OF: ISA map, cp=%llx, s=%llx, da=%llx\n", + (unsigned long long)cp, (unsigned long long)s, + (unsigned long long)da); + + if (da < cp || da >= (cp + s)) + return OF_BAD_ADDR; + return da - cp; +} + +static int of_bus_isa_translate(u32 *addr, u64 offset, int na) +{ + return of_bus_default_translate(addr + 1, offset, na - 1); +} + +static unsigned int of_bus_isa_get_flags(const u32 *addr) +{ + unsigned int flags = 0; + u32 w = addr[0]; + + if (w & 1) + flags |= IORESOURCE_IO; + else + flags |= IORESOURCE_MEM; + return flags; +} + +/* + * Array of bus specific translators + */ + +static struct of_bus of_busses[] = { +#ifdef CONFIG_PCI + /* PCI */ + { + .name = "pci", + .addresses = "assigned-addresses", + .match = of_bus_pci_match, + .count_cells = of_bus_pci_count_cells, + .map = of_bus_pci_map, + .translate = of_bus_pci_translate, + .get_flags = of_bus_pci_get_flags, + }, +#endif /* CONFIG_PCI */ + /* ISA */ + { + .name = "isa", + .addresses = "reg", + .match = of_bus_isa_match, + .count_cells = of_bus_isa_count_cells, + .map = of_bus_isa_map, + .translate = of_bus_isa_translate, + .get_flags = of_bus_isa_get_flags, + }, + /* Default */ + { + .name = "default", + .addresses = "reg", + .match = NULL, + .count_cells 
= of_bus_default_count_cells, + .map = of_bus_default_map, + .translate = of_bus_default_translate, + .get_flags = of_bus_default_get_flags, + }, +}; + +static struct of_bus *of_match_bus(struct device_node *np) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(of_busses); i++) + if (!of_busses[i].match || of_busses[i].match(np)) + return &of_busses[i]; + BUG(); + return NULL; +} + +static int of_translate_one(struct device_node *parent, struct of_bus *bus, + struct of_bus *pbus, u32 *addr, + int na, int ns, int pna, const char *rprop) +{ + const u32 *ranges; + unsigned int rlen; + int rone; + u64 offset = OF_BAD_ADDR; + + /* Normally, an absence of a "ranges" property means we are + * crossing a non-translatable boundary, and thus the addresses + * below the current not cannot be converted to CPU physical ones. + * Unfortunately, while this is very clear in the spec, it's not + * what Apple understood, and they do have things like /uni-n or + * /ht nodes with no "ranges" property and a lot of perfectly + * useable mapped devices below them. Thus we treat the absence of + * "ranges" as equivalent to an empty "ranges" property which means + * a 1:1 translation at that level. It's up to the caller not to try + * to translate addresses that aren't supposed to be translated in + * the first place. --BenH. + */ + ranges = of_get_property(parent, rprop, &rlen); + if (ranges == NULL || rlen == 0) { + offset = of_read_number(addr, na); + memset(addr, 0, pna * 4); + pr_debug("OF: no ranges, 1:1 translation\n"); + goto finish; + } + + pr_debug("OF: walking ranges...\n"); + + /* Now walk through the ranges */ + rlen /= 4; + rone = na + pna + ns; + for (; rlen >= rone; rlen -= rone, ranges += rone) { + offset = bus->map(addr, ranges, na, ns, pna); + if (offset != OF_BAD_ADDR) + break; + } + if (offset == OF_BAD_ADDR) { + pr_debug("OF: not found !\n"); + return 1; + } + memcpy(addr, ranges + na, 4 * pna); + + finish: + of_dump_addr("OF: parent translation for:", addr, pna); + pr_debug("OF: with offset: %llx\n", (unsigned long long)offset); + + /* Translate it into parent bus space */ + return pbus->translate(addr, offset, pna); +} + +/* + * Translate an address from the device-tree into a CPU physical address, + * this walks up the tree and applies the various bus mappings on the + * way. + * + * Note: We consider that crossing any level with #size-cells == 0 to mean + * that translation is impossible (that is we are not dealing with a value + * that can be mapped to a cpu physical address). 
This is not really specified + * that way, but this is traditionally the way IBM at least do things + */ +u64 __of_translate_address(struct device_node *dev, const u32 *in_addr, + const char *rprop) +{ + struct device_node *parent = NULL; + struct of_bus *bus, *pbus; + u32 addr[OF_MAX_ADDR_CELLS]; + int na, ns, pna, pns; + u64 result = OF_BAD_ADDR; + + pr_debug("OF: ** translation for device %s **\n", dev->full_name); + + /* Increase refcount at current level */ + of_node_get(dev); + + /* Get parent & match bus type */ + parent = of_get_parent(dev); + if (parent == NULL) + goto bail; + bus = of_match_bus(parent); + + /* Cound address cells & copy address locally */ + bus->count_cells(dev, &na, &ns); + if (!OF_CHECK_COUNTS(na, ns)) { + printk(KERN_ERR "prom_parse: Bad cell count for %s\n", + dev->full_name); + goto bail; + } + memcpy(addr, in_addr, na * 4); + + pr_debug("OF: bus is %s (na=%d, ns=%d) on %s\n", + bus->name, na, ns, parent->full_name); + of_dump_addr("OF: translating address:", addr, na); + + /* Translate */ + for (;;) { + /* Switch to parent bus */ + of_node_put(dev); + dev = parent; + parent = of_get_parent(dev); + + /* If root, we have finished */ + if (parent == NULL) { + pr_debug("OF: reached root node\n"); + result = of_read_number(addr, na); + break; + } + + /* Get new parent bus and counts */ + pbus = of_match_bus(parent); + pbus->count_cells(dev, &pna, &pns); + if (!OF_CHECK_COUNTS(pna, pns)) { + printk(KERN_ERR "prom_parse: Bad cell count for %s\n", + dev->full_name); + break; + } + + pr_debug("OF: parent bus is %s (na=%d, ns=%d) on %s\n", + pbus->name, pna, pns, parent->full_name); + + /* Apply bus translation */ + if (of_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop)) + break; + + /* Complete the move up one level */ + na = pna; + ns = pns; + bus = pbus; + + of_dump_addr("OF: one level translation:", addr, na); + } + bail: + of_node_put(parent); + of_node_put(dev); + + return result; +} + +u64 of_translate_address(struct device_node *dev, const u32 *in_addr) +{ + return __of_translate_address(dev, in_addr, "ranges"); +} +EXPORT_SYMBOL(of_translate_address); + +u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr) +{ + return __of_translate_address(dev, in_addr, "dma-ranges"); +} +EXPORT_SYMBOL(of_translate_dma_address); + +const u32 *of_get_address(struct device_node *dev, int index, u64 *size, + unsigned int *flags) +{ + const u32 *prop; + unsigned int psize; + struct device_node *parent; + struct of_bus *bus; + int onesize, i, na, ns; + + /* Get parent & match bus type */ + parent = of_get_parent(dev); + if (parent == NULL) + return NULL; + bus = of_match_bus(parent); + bus->count_cells(dev, &na, &ns); + of_node_put(parent); + if (!OF_CHECK_COUNTS(na, ns)) + return NULL; + + /* Get "reg" or "assigned-addresses" property */ + prop = of_get_property(dev, bus->addresses, &psize); + if (prop == NULL) + return NULL; + psize /= 4; + + onesize = na + ns; + for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) + if (i == index) { + if (size) + *size = of_read_number(prop + na, ns); + if (flags) + *flags = bus->get_flags(prop); + return prop; + } + return NULL; +} +EXPORT_SYMBOL(of_get_address); + +static int __of_address_to_resource(struct device_node *dev, const u32 *addrp, + u64 size, unsigned int flags, + struct resource *r) { u64 taddr; diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 474b794ed9d2..cc567df9a00d 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -3,9 +3,7 
@@ #include #include -extern int __of_address_to_resource(struct device_node *dev, const u32 *addrp, - u64 size, unsigned int flags, - struct resource *r); +extern u64 of_translate_address(struct device_node *np, const u32 *addr); extern int of_address_to_resource(struct device_node *dev, int index, struct resource *r); extern void __iomem *of_iomap(struct device_node *device, int index); -- cgit v1.2.3 From dd27dcda37f0b1a3b674760fb411abc5c8fe309c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:12 -0600 Subject: of/device: merge of_device_uevent Merge common code between powerpc and microblaze Signed-off-by: Grant Likely CC: Michal Simek CC: Wolfram Sang CC: Stephen Rothwell CC: Benjamin Herrenschmidt CC: microblaze-uclinux@itee.uq.edu.au CC: linuxppc-dev@ozlabs.org --- arch/microblaze/include/asm/of_device.h | 3 -- arch/microblaze/kernel/of_device.c | 48 -------------------------------- arch/powerpc/include/asm/of_device.h | 3 -- arch/powerpc/kernel/of_device.c | 49 --------------------------------- drivers/of/device.c | 48 ++++++++++++++++++++++++++++++++ include/linux/of_device.h | 4 +++ 6 files changed, 52 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h index 0a5f3f914b42..58e627dc1412 100644 --- a/arch/microblaze/include/asm/of_device.h +++ b/arch/microblaze/include/asm/of_device.h @@ -22,9 +22,6 @@ extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); -extern int of_device_uevent(struct device *dev, - struct kobj_uevent_env *env); - extern void of_device_make_bus_id(struct of_device *dev); /* This is just here during the transition */ diff --git a/arch/microblaze/kernel/of_device.c b/arch/microblaze/kernel/of_device.c index b372787886ed..3a367d788451 100644 --- a/arch/microblaze/kernel/of_device.c +++ b/arch/microblaze/kernel/of_device.c @@ -62,51 +62,3 @@ struct of_device *of_device_alloc(struct device_node *np, return dev; } EXPORT_SYMBOL(of_device_alloc); - -int of_device_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct of_device *ofdev; - const char *compat; - int seen = 0, cplen, sl; - - if (!dev) - return -ENODEV; - - ofdev = to_of_device(dev); - - if (add_uevent_var(env, "OF_NAME=%s", ofdev->dev.of_node->name)) - return -ENOMEM; - - if (add_uevent_var(env, "OF_TYPE=%s", ofdev->dev.of_node->type)) - return -ENOMEM; - - /* Since the compatible field can contain pretty much anything - * it's not really legal to split it out with commas. We split it - * up using a number of environment variables instead. 
*/ - - compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen); - while (compat && *compat && cplen > 0) { - if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat)) - return -ENOMEM; - - sl = strlen(compat) + 1; - compat += sl; - cplen -= sl; - seen++; - } - - if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen)) - return -ENOMEM; - - /* modalias is trickier, we add it in 2 steps */ - if (add_uevent_var(env, "MODALIAS=")) - return -ENOMEM; - sl = of_device_get_modalias(ofdev, &env->buf[env->buflen-1], - sizeof(env->buf) - env->buflen); - if (sl >= (sizeof(env->buf) - env->buflen)) - return -ENOMEM; - env->buflen += sl; - - return 0; -} -EXPORT_SYMBOL(of_device_uevent); diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h index cb36632f953c..5d5103cac641 100644 --- a/arch/powerpc/include/asm/of_device.h +++ b/arch/powerpc/include/asm/of_device.h @@ -9,8 +9,5 @@ extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); -extern int of_device_uevent(struct device *dev, - struct kobj_uevent_env *env); - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_OF_DEVICE_H */ diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c index df78e0236a02..db91a9dbafb5 100644 --- a/arch/powerpc/kernel/of_device.c +++ b/arch/powerpc/kernel/of_device.c @@ -82,52 +82,3 @@ struct of_device *of_device_alloc(struct device_node *np, return dev; } EXPORT_SYMBOL(of_device_alloc); - -int of_device_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct of_device *ofdev; - const char *compat; - int seen = 0, cplen, sl; - - if (!dev) - return -ENODEV; - - ofdev = to_of_device(dev); - - if (add_uevent_var(env, "OF_NAME=%s", ofdev->dev.of_node->name)) - return -ENOMEM; - - if (add_uevent_var(env, "OF_TYPE=%s", ofdev->dev.of_node->type)) - return -ENOMEM; - - /* Since the compatible field can contain pretty much anything - * it's not really legal to split it out with commas. We split it - * up using a number of environment variables instead. 
*/ - - compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen); - while (compat && *compat && cplen > 0) { - if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat)) - return -ENOMEM; - - sl = strlen (compat) + 1; - compat += sl; - cplen -= sl; - seen++; - } - - if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen)) - return -ENOMEM; - - /* modalias is trickier, we add it in 2 steps */ - if (add_uevent_var(env, "MODALIAS=")) - return -ENOMEM; - sl = of_device_get_modalias(ofdev, &env->buf[env->buflen-1], - sizeof(env->buf) - env->buflen); - if (sl >= (sizeof(env->buf) - env->buflen)) - return -ENOMEM; - env->buflen += sl; - - return 0; -} -EXPORT_SYMBOL(of_device_uevent); -EXPORT_SYMBOL(of_device_get_modalias); diff --git a/drivers/of/device.c b/drivers/of/device.c index 7d18f8e0b013..275cc9cee14c 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -170,3 +170,51 @@ ssize_t of_device_get_modalias(struct of_device *ofdev, return tsize; } + +/** + * of_device_uevent - Display OF related uevent information + */ +int of_device_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const char *compat; + int seen = 0, cplen, sl; + + if ((!dev) || (!dev->of_node)) + return -ENODEV; + + if (add_uevent_var(env, "OF_NAME=%s", dev->of_node->name)) + return -ENOMEM; + + if (add_uevent_var(env, "OF_TYPE=%s", dev->of_node->type)) + return -ENOMEM; + + /* Since the compatible field can contain pretty much anything + * it's not really legal to split it out with commas. We split it + * up using a number of environment variables instead. */ + + compat = of_get_property(dev->of_node, "compatible", &cplen); + while (compat && *compat && cplen > 0) { + if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat)) + return -ENOMEM; + + sl = strlen(compat) + 1; + compat += sl; + cplen -= sl; + seen++; + } + + if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen)) + return -ENOMEM; + + /* modalias is trickier, we add it in 2 steps */ + if (add_uevent_var(env, "MODALIAS=")) + return -ENOMEM; + + sl = of_device_get_modalias(to_of_device(dev), &env->buf[env->buflen-1], + sizeof(env->buf) - env->buflen); + if (sl >= (sizeof(env->buf) - env->buflen)) + return -ENOMEM; + env->buflen += sl; + + return 0; +} diff --git a/include/linux/of_device.h b/include/linux/of_device.h index a3ae5900fc58..da83e734c026 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -44,6 +44,10 @@ static inline void of_device_free(struct of_device *dev) extern ssize_t of_device_get_modalias(struct of_device *ofdev, char *str, ssize_t len); + +extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env); + + #endif /* CONFIG_OF_DEVICE */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3 From 34a1c1e8c700f7cd849deb21193718a172722f8d Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:13 -0600 Subject: of: Modify of_device_get_modalias to be passed struct device Now that the of_node pointer is part of struct device, of_device_get_modalias could be used on any struct device that has the device node pointer set. This patch changes of_device_get_modalias to accept a struct device instead of a struct of_device. 
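For illustration, a minimal sketch (not taken from this series) of a sysfs modalias attribute built on the reworked helper; since of_device_get_modalias() now accepts a bare struct device, the to_of_device() round trip disappears from every caller:

#include <linux/device.h>
#include <linux/of_device.h>

/* Sketch only: mirrors the modalias_show() callers touched below. */
static ssize_t modalias_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	/* Leave room for the trailing newline and NUL appended below. */
	ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);

	buf[len] = '\n';
	buf[len + 1] = '\0';
	return len + 1;
}

The emitted string has the form of:N<name>T<type> followed by the node's compatible entries, which is also what the MODALIAS= uevent variable from the previous patch carries to userspace.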
Signed-off-by: Grant Likely CC: Michal Simek CC: Benjamin Herrenschmidt CC: Wolfram Sang CC: Stephen Rothwell CC: microblaze-uclinux@itee.uq.edu.au CC: linuxppc-dev@ozlabs.org --- arch/microblaze/include/asm/of_device.h | 3 --- drivers/macintosh/macio_sysfs.c | 5 +---- drivers/of/device.c | 16 ++++++---------- include/linux/of_device.h | 2 +- 4 files changed, 8 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h index 58e627dc1412..c9be53487434 100644 --- a/arch/microblaze/include/asm/of_device.h +++ b/arch/microblaze/include/asm/of_device.h @@ -15,9 +15,6 @@ #include #include -extern ssize_t of_device_get_modalias(struct of_device *ofdev, - char *str, ssize_t len); - extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c index 6999ce59fd10..6024038a5b9d 100644 --- a/drivers/macintosh/macio_sysfs.c +++ b/drivers/macintosh/macio_sysfs.c @@ -41,10 +41,7 @@ compatible_show (struct device *dev, struct device_attribute *attr, char *buf) static ssize_t modalias_show (struct device *dev, struct device_attribute *attr, char *buf) { - struct of_device *ofdev = to_of_device(dev); - int len; - - len = of_device_get_modalias(ofdev, buf, PAGE_SIZE - 2); + int len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); buf[len] = '\n'; buf[len+1] = 0; diff --git a/drivers/of/device.c b/drivers/of/device.c index 275cc9cee14c..c2a98f5ca80d 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -68,10 +68,7 @@ static ssize_t name_show(struct device *dev, static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct of_device *ofdev = to_of_device(dev); - ssize_t len = 0; - - len = of_device_get_modalias(ofdev, buf, PAGE_SIZE - 2); + ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); buf[len] = '\n'; buf[len+1] = 0; return len+1; @@ -123,19 +120,18 @@ void of_device_unregister(struct of_device *ofdev) } EXPORT_SYMBOL(of_device_unregister); -ssize_t of_device_get_modalias(struct of_device *ofdev, - char *str, ssize_t len) +ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len) { const char *compat; int cplen, i; ssize_t tsize, csize, repend; /* Name & Type */ - csize = snprintf(str, len, "of:N%sT%s", ofdev->dev.of_node->name, - ofdev->dev.of_node->type); + csize = snprintf(str, len, "of:N%sT%s", dev->of_node->name, + dev->of_node->type); /* Get compatible property if any */ - compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen); + compat = of_get_property(dev->of_node, "compatible", &cplen); if (!compat) return csize; @@ -210,7 +206,7 @@ int of_device_uevent(struct device *dev, struct kobj_uevent_env *env) if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; - sl = of_device_get_modalias(to_of_device(dev), &env->buf[env->buflen-1], + sl = of_device_get_modalias(dev, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; diff --git a/include/linux/of_device.h b/include/linux/of_device.h index da83e734c026..238e92e007e6 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -42,7 +42,7 @@ static inline void of_device_free(struct of_device *dev) of_release_dev(&dev->dev); } -extern ssize_t of_device_get_modalias(struct of_device *ofdev, +extern ssize_t of_device_get_modalias(struct device *dev, char *str, 
ssize_t len); extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env); -- cgit v1.2.3 From 5fd200f3b351183b5489cef69961c60af9cead2f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:13 -0600 Subject: of/device: Merge of_platform_bus_probe() Merge common code between PowerPC and microblaze. This patch merges the code that scans the tree and registers devices. The functions merged are of_platform_bus_probe(), of_platform_bus_create(), and of_platform_device_create(). This patch also moves the of_default_bus_ids[] table out of a Microblaze header file and makes it non-static. The device ids table isn't merged because powerpc and microblaze use different default data. Signed-off-by: Grant Likely CC: Michal Simek CC: Grant Likely CC: Benjamin Herrenschmidt CC: Stephen Rothwell CC: microblaze-uclinux@itee.uq.edu.au CC: linuxppc-dev@ozlabs.org --- arch/microblaze/include/asm/of_platform.h | 33 ------- arch/microblaze/kernel/of_platform.c | 141 ++++------------------------- arch/powerpc/include/asm/of_platform.h | 11 --- arch/powerpc/kernel/of_platform.c | 131 +-------------------------- drivers/of/platform.c | 143 ++++++++++++++++++++++++++++++ include/linux/of_platform.h | 17 ++++ 6 files changed, 179 insertions(+), 297 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_platform.h b/arch/microblaze/include/asm/of_platform.h index 37491276c6ca..625003f70888 100644 --- a/arch/microblaze/include/asm/of_platform.h +++ b/arch/microblaze/include/asm/of_platform.h @@ -14,39 +14,6 @@ /* This is just here during the transition */ #include -/* - * The list of OF IDs below is used for matching bus types in the - * system whose devices are to be exposed as of_platform_devices. - * - * This is the default list valid for most platforms. This file provides - * functions who can take an explicit list if necessary though - * - * The search is always performed recursively looking for children of - * the provided device_node and recursively if such a children matches - * a bus type in the list - */ - -static const struct of_device_id of_default_bus_ids[] = { - { .type = "soc", }, - { .compatible = "soc", }, - { .type = "plb5", }, - { .type = "plb4", }, - { .type = "opb", }, - { .type = "simple", }, - {}, -}; - -/* Platform devices and busses creation */ -extern struct of_device *of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent); -/* pseudo "matches" value to not do deep probe */ -#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1) - -extern int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent); - extern struct of_device *of_find_device_by_phandle(phandle ph); extern void of_instantiate_rtc(void); diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c index ccf6f4257f4b..a07abdd6859b 100644 --- a/arch/microblaze/kernel/of_platform.c +++ b/arch/microblaze/kernel/of_platform.c @@ -37,132 +37,27 @@ static int __init of_bus_driver_init(void) } postcore_initcall(of_bus_driver_init); -struct of_device *of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - struct of_device *dev; - - dev = of_device_alloc(np, bus_id, parent); - if (!dev) - return NULL; - - dev->archdata.dma_mask = 0xffffffffUL; - dev->dev.bus = &of_platform_bus_type; - - /* We do not fill the DMA ops for platform devices by default.
- * This is currently the responsibility of the platform code - * to do such, possibly using a device notifier - */ - - if (of_device_register(dev) != 0) { - of_device_free(dev); - return NULL; - } - - return dev; -} -EXPORT_SYMBOL(of_platform_device_create); - -/** - * of_platform_bus_create - Create an OF device for a bus node and all its - * children. Optionally recursively instanciate matching busses. - * @bus: device node of the bus to instanciate - * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to - * disallow recursive creation of child busses - */ -static int of_platform_bus_create(const struct device_node *bus, - const struct of_device_id *matches, - struct device *parent) -{ - struct device_node *child; - struct of_device *dev; - int rc = 0; - - for_each_child_of_node(bus, child) { - pr_debug(" create child: %s\n", child->full_name); - dev = of_platform_device_create(child, NULL, parent); - if (dev == NULL) - rc = -ENOMEM; - else if (!of_match_node(matches, child)) - continue; - if (rc == 0) { - pr_debug(" and sub busses\n"); - rc = of_platform_bus_create(child, matches, &dev->dev); - } - if (rc) { - of_node_put(child); - break; - } - } - return rc; -} - - -/** - * of_platform_bus_probe - Probe the device-tree for platform busses - * @root: parent of the first level to probe or NULL for the root of the tree - * @matches: match table, NULL to use the default - * @parent: parent to hook devices from, NULL for toplevel +/* + * The list of OF IDs below is used for matching bus types in the + * system whose devices are to be exposed as of_platform_devices. * - * Note that children of the provided root are not instanciated as devices - * unless the specified root itself matches the bus list and is not NULL. + * This is the default list valid for most platforms. 
This file provides + * functions who can take an explicit list if necessary though + * + * The search is always performed recursively looking for children of + * the provided device_node and recursively if such a children matches + * a bus type in the list */ -int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent) -{ - struct device_node *child; - struct of_device *dev; - int rc = 0; - - if (matches == NULL) - matches = of_default_bus_ids; - if (matches == OF_NO_DEEP_PROBE) - return -EINVAL; - if (root == NULL) - root = of_find_node_by_path("/"); - else - of_node_get(root); - - pr_debug("of_platform_bus_probe()\n"); - pr_debug(" starting at: %s\n", root->full_name); - - /* Do a self check of bus type, if there's a match, create - * children - */ - if (of_match_node(matches, root)) { - pr_debug(" root match, create all sub devices\n"); - dev = of_platform_device_create(root, NULL, parent); - if (dev == NULL) { - rc = -ENOMEM; - goto bail; - } - pr_debug(" create all sub busses\n"); - rc = of_platform_bus_create(root, matches, &dev->dev); - goto bail; - } - for_each_child_of_node(root, child) { - if (!of_match_node(matches, child)) - continue; - - pr_debug(" match: %s\n", child->full_name); - dev = of_platform_device_create(child, NULL, parent); - if (dev == NULL) - rc = -ENOMEM; - else - rc = of_platform_bus_create(child, matches, &dev->dev); - if (rc) { - of_node_put(child); - break; - } - } - bail: - of_node_put(root); - return rc; -} -EXPORT_SYMBOL(of_platform_bus_probe); +const struct of_device_id of_default_bus_ids[] = { + { .type = "soc", }, + { .compatible = "soc", }, + { .type = "plb5", }, + { .type = "plb4", }, + { .type = "opb", }, + { .type = "simple", }, + {}, +}; static int of_dev_node_match(struct device *dev, void *data) { diff --git a/arch/powerpc/include/asm/of_platform.h b/arch/powerpc/include/asm/of_platform.h index d4aaa3489440..b37d2dcddb97 100644 --- a/arch/powerpc/include/asm/of_platform.h +++ b/arch/powerpc/include/asm/of_platform.h @@ -11,17 +11,6 @@ * */ -/* Platform devices and busses creation */ -extern struct of_device *of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent); -/* pseudo "matches" value to not do deep probe */ -#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1) - -extern int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent); - extern struct of_device *of_find_device_by_phandle(phandle ph); extern void of_instantiate_rtc(void); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 487a98851ba6..0b5cc6d892a7 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -40,7 +40,7 @@ * a bus type in the list */ -static const struct of_device_id of_default_bus_ids[] = { +const struct of_device_id of_default_bus_ids[] = { { .type = "soc", }, { .compatible = "soc", }, { .type = "spider", }, @@ -64,135 +64,6 @@ static int __init of_bus_driver_init(void) postcore_initcall(of_bus_driver_init); -struct of_device* of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - struct of_device *dev; - - dev = of_device_alloc(np, bus_id, parent); - if (!dev) - return NULL; - - dev->archdata.dma_mask = 0xffffffffUL; - dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); - - dev->dev.bus = &of_platform_bus_type; - - /* We do not fill the DMA ops for platform devices by default. 
- * This is currently the responsibility of the platform code - * to do such, possibly using a device notifier - */ - - if (of_device_register(dev) != 0) { - of_device_free(dev); - return NULL; - } - - return dev; -} -EXPORT_SYMBOL(of_platform_device_create); - - - -/** - * of_platform_bus_create - Create an OF device for a bus node and all its - * children. Optionally recursively instanciate matching busses. - * @bus: device node of the bus to instanciate - * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to - * disallow recursive creation of child busses - */ -static int of_platform_bus_create(const struct device_node *bus, - const struct of_device_id *matches, - struct device *parent) -{ - struct device_node *child; - struct of_device *dev; - int rc = 0; - - for_each_child_of_node(bus, child) { - pr_debug(" create child: %s\n", child->full_name); - dev = of_platform_device_create(child, NULL, parent); - if (dev == NULL) - rc = -ENOMEM; - else if (!of_match_node(matches, child)) - continue; - if (rc == 0) { - pr_debug(" and sub busses\n"); - rc = of_platform_bus_create(child, matches, &dev->dev); - } if (rc) { - of_node_put(child); - break; - } - } - return rc; -} - -/** - * of_platform_bus_probe - Probe the device-tree for platform busses - * @root: parent of the first level to probe or NULL for the root of the tree - * @matches: match table, NULL to use the default - * @parent: parent to hook devices from, NULL for toplevel - * - * Note that children of the provided root are not instanciated as devices - * unless the specified root itself matches the bus list and is not NULL. - */ - -int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent) -{ - struct device_node *child; - struct of_device *dev; - int rc = 0; - - if (matches == NULL) - matches = of_default_bus_ids; - if (matches == OF_NO_DEEP_PROBE) - return -EINVAL; - if (root == NULL) - root = of_find_node_by_path("/"); - else - of_node_get(root); - - pr_debug("of_platform_bus_probe()\n"); - pr_debug(" starting at: %s\n", root->full_name); - - /* Do a self check of bus type, if there's a match, create - * children - */ - if (of_match_node(matches, root)) { - pr_debug(" root match, create all sub devices\n"); - dev = of_platform_device_create(root, NULL, parent); - if (dev == NULL) { - rc = -ENOMEM; - goto bail; - } - pr_debug(" create all sub busses\n"); - rc = of_platform_bus_create(root, matches, &dev->dev); - goto bail; - } - for_each_child_of_node(root, child) { - if (!of_match_node(matches, child)) - continue; - - pr_debug(" match: %s\n", child->full_name); - dev = of_platform_device_create(child, NULL, parent); - if (dev == NULL) - rc = -ENOMEM; - else - rc = of_platform_bus_create(child, matches, &dev->dev); - if (rc) { - of_node_put(child); - break; - } - } - bail: - of_node_put(root); - return rc; -} -EXPORT_SYMBOL(of_platform_bus_probe); - static int of_dev_node_match(struct device *dev, void *data) { return to_of_device(dev)->dev.of_node == data; diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 7dacc1ebe91e..d9c81e93bdd0 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -396,3 +397,145 @@ void of_unregister_driver(struct of_platform_driver *drv) driver_unregister(&drv->driver); } EXPORT_SYMBOL(of_unregister_driver); + +#if !defined(CONFIG_SPARC) +/* + * The following routines scan a subtree and registers a device for + * each applicable node. 
+ * + * Note: sparc doesn't use these routines because it has a different + * mechanism for creating devices from device tree nodes. + */ + +/** + * of_platform_device_create - Alloc, initialize and register an of_device + * @np: pointer to node to create device for + * @bus_id: name to assign device + * @parent: Linux device model parent device. + */ +struct of_device *of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent) +{ + struct of_device *dev; + + dev = of_device_alloc(np, bus_id, parent); + if (!dev) + return NULL; + + dev->archdata.dma_mask = 0xffffffffUL; + dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); + dev->dev.bus = &of_platform_bus_type; + + /* We do not fill the DMA ops for platform devices by default. + * This is currently the responsibility of the platform code + * to do such, possibly using a device notifier + */ + + if (of_device_register(dev) != 0) { + of_device_free(dev); + return NULL; + } + + return dev; +} +EXPORT_SYMBOL(of_platform_device_create); + +/** + * of_platform_bus_create - Create an OF device for a bus node and all its + * children. Optionally recursively instantiate matching busses. + * @bus: device node of the bus to instantiate + * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to + * disallow recursive creation of child busses + */ +static int of_platform_bus_create(const struct device_node *bus, + const struct of_device_id *matches, + struct device *parent) +{ + struct device_node *child; + struct of_device *dev; + int rc = 0; + + for_each_child_of_node(bus, child) { + pr_debug(" create child: %s\n", child->full_name); + dev = of_platform_device_create(child, NULL, parent); + if (dev == NULL) + rc = -ENOMEM; + else if (!of_match_node(matches, child)) + continue; + if (rc == 0) { + pr_debug(" and sub busses\n"); + rc = of_platform_bus_create(child, matches, &dev->dev); + } + if (rc) { + of_node_put(child); + break; + } + } + return rc; +} + +/** + * of_platform_bus_probe - Probe the device-tree for platform busses + * @root: parent of the first level to probe or NULL for the root of the tree + * @matches: match table, NULL to use the default + * @parent: parent to hook devices from, NULL for toplevel + * + * Note that children of the provided root are not instantiated as devices + * unless the specified root itself matches the bus list and is not NULL. 
+ */ +int of_platform_bus_probe(struct device_node *root, + const struct of_device_id *matches, + struct device *parent) +{ + struct device_node *child; + struct of_device *dev; + int rc = 0; + + if (matches == NULL) + matches = of_default_bus_ids; + if (matches == OF_NO_DEEP_PROBE) + return -EINVAL; + if (root == NULL) + root = of_find_node_by_path("/"); + else + of_node_get(root); + + pr_debug("of_platform_bus_probe()\n"); + pr_debug(" starting at: %s\n", root->full_name); + + /* Do a self check of bus type, if there's a match, create + * children + */ + if (of_match_node(matches, root)) { + pr_debug(" root match, create all sub devices\n"); + dev = of_platform_device_create(root, NULL, parent); + if (dev == NULL) { + rc = -ENOMEM; + goto bail; + } + pr_debug(" create all sub busses\n"); + rc = of_platform_bus_create(root, matches, &dev->dev); + goto bail; + } + for_each_child_of_node(root, child) { + if (!of_match_node(matches, child)) + continue; + + pr_debug(" match: %s\n", child->full_name); + dev = of_platform_device_create(child, NULL, parent); + if (dev == NULL) + rc = -ENOMEM; + else + rc = of_platform_bus_create(child, matches, &dev->dev); + if (rc) { + of_node_put(child); + break; + } + } + bail: + of_node_put(root); + return rc; +} +EXPORT_SYMBOL(of_platform_bus_probe); +#endif /* !CONFIG_SPARC */ diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 1643d3761eb4..4bbba41396ef 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -25,6 +25,8 @@ */ extern struct bus_type of_platform_bus_type; +extern const struct of_device_id of_default_bus_ids[]; + /* * An of_platform_driver driver is attached to a basic of_device on * the "platform bus" (of_platform_bus_type). @@ -63,6 +65,21 @@ static inline void of_unregister_platform_driver(struct of_platform_driver *drv) extern struct of_device *of_find_device_by_node(struct device_node *np); extern int of_bus_type_init(struct bus_type *bus, const char *name); + +#if !defined(CONFIG_SPARC) /* SPARC has its own device registration method */ +/* Platform devices and busses creation */ +extern struct of_device *of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent); + +/* pseudo "matches" value to not do deep probe */ +#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1) + +extern int of_platform_bus_probe(struct device_node *root, + const struct of_device_id *matches, + struct device *parent); +#endif /* !CONFIG_SPARC */ + #endif /* CONFIG_OF_DEVICE */ #endif /* _LINUX_OF_PLATFORM_H */ -- cgit v1.2.3 From 94c0931983ee9d1cd96c32d52ac64c17464f0bbd Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:14 -0600 Subject: of: Merge of_device_alloc() and of_device_make_bus_id() This patch merges the common routines of_device_alloc() and of_device_make_bus_id() from powerpc and microblaze. 
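To make the merged naming behaviour concrete, here is a hedged sketch (hypothetical caller, not part of the patch) of registering a single device tree node through of_device_alloc(); passing a NULL bus_id lets the consolidated of_device_make_bus_id() pick the name:

#include <linux/of_device.h>
#include <linux/of_platform.h>

/* Sketch: with bus_id == NULL the name is derived from the node, e.g.
 * "80006000.serial" when the first "reg" entry translates, falling back
 * to "serial.0" with a global counter (example values hypothetical). */
static struct of_device *register_one_node(struct device_node *np,
					   struct device *parent)
{
	struct of_device *dev = of_device_alloc(np, NULL, parent);

	if (!dev)
		return NULL;

	dev->dev.bus = &of_platform_bus_type;
	if (of_device_register(dev) != 0) {
		of_device_free(dev);
		return NULL;
	}
	return dev;
}

This is essentially what of_platform_device_create() does minus the DMA mask setup, which is why merging the helpers keeps that common path in one place.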
Signed-off-by: Grant Likely CC: Michal Simek CC: Grant Likely CC: Benjamin Herrenschmidt CC: Stephen Rothwell CC: microblaze-uclinux@itee.uq.edu.au CC: linuxppc-dev@ozlabs.org CC: devicetree-discuss@lists.ozlabs.org --- arch/microblaze/include/asm/of_device.h | 15 ------ arch/microblaze/kernel/Makefile | 2 +- arch/microblaze/kernel/of_device.c | 64 ------------------------ arch/powerpc/include/asm/of_device.h | 10 ---- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/of_device.c | 84 ------------------------------- drivers/of/platform.c | 87 +++++++++++++++++++++++++++++++++ include/linux/of_platform.h | 3 ++ 8 files changed, 92 insertions(+), 175 deletions(-) delete mode 100644 arch/microblaze/kernel/of_device.c delete mode 100644 arch/powerpc/kernel/of_device.c (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h index c9be53487434..47e8d42aee8f 100644 --- a/arch/microblaze/include/asm/of_device.h +++ b/arch/microblaze/include/asm/of_device.h @@ -10,19 +10,4 @@ #ifndef _ASM_MICROBLAZE_OF_DEVICE_H #define _ASM_MICROBLAZE_OF_DEVICE_H -#ifdef __KERNEL__ - -#include -#include - -extern struct of_device *of_device_alloc(struct device_node *np, - const char *bus_id, - struct device *parent); - -extern void of_device_make_bus_id(struct of_device *dev); - -/* This is just here during the transition */ -#include - -#endif /* __KERNEL__ */ #endif /* _ASM_MICROBLAZE_OF_DEVICE_H */ diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile index e51bc1520825..727e2cbff9c6 100644 --- a/arch/microblaze/kernel/Makefile +++ b/arch/microblaze/kernel/Makefile @@ -15,7 +15,7 @@ endif extra-y := head.o vmlinux.lds obj-y += dma.o exceptions.o \ - hw_exception_handler.o init_task.o intc.o irq.o of_device.o \ + hw_exception_handler.o init_task.o intc.o irq.o \ of_platform.o process.o prom.o prom_parse.o ptrace.o \ setup.o signal.o sys_microblaze.o timer.o traps.o reset.o diff --git a/arch/microblaze/kernel/of_device.c b/arch/microblaze/kernel/of_device.c deleted file mode 100644 index 3a367d788451..000000000000 --- a/arch/microblaze/kernel/of_device.c +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -void of_device_make_bus_id(struct of_device *dev) -{ - static atomic_t bus_no_reg_magic; - struct device_node *node = dev->dev.of_node; - const u32 *reg; - u64 addr; - int magic; - - /* - * For MMIO, get the physical address - */ - reg = of_get_property(node, "reg", NULL); - if (reg) { - addr = of_translate_address(node, reg); - if (addr != OF_BAD_ADDR) { - dev_set_name(&dev->dev, "%llx.%s", - (unsigned long long)addr, node->name); - return; - } - } - - /* - * No BusID, use the node name and add a globally incremented - * counter (and pray...) 
- */ - magic = atomic_add_return(1, &bus_no_reg_magic); - dev_set_name(&dev->dev, "%s.%d", node->name, magic - 1); -} -EXPORT_SYMBOL(of_device_make_bus_id); - -struct of_device *of_device_alloc(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - struct of_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - - dev->dev.of_node = of_node_get(np); - dev->dev.dma_mask = &dev->archdata.dma_mask; - dev->dev.parent = parent; - dev->dev.release = of_release_dev; - - if (bus_id) - dev_set_name(&dev->dev, bus_id); - else - of_device_make_bus_id(dev); - - return dev; -} -EXPORT_SYMBOL(of_device_alloc); diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h index 5d5103cac641..04f76717f82c 100644 --- a/arch/powerpc/include/asm/of_device.h +++ b/arch/powerpc/include/asm/of_device.h @@ -1,13 +1,3 @@ #ifndef _ASM_POWERPC_OF_DEVICE_H #define _ASM_POWERPC_OF_DEVICE_H -#ifdef __KERNEL__ - -#include -#include - -extern struct of_device *of_device_alloc(struct device_node *np, - const char *bus_id, - struct device *parent); - -#endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_OF_DEVICE_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 58d0572de6f9..83aa1fd0908f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o -obj-$(CONFIG_PPC_OF) += of_device.o of_platform.o prom_parse.o +obj-$(CONFIG_PPC_OF) += of_platform.o prom_parse.o obj-$(CONFIG_PPC_CLOCK) += clock.o procfs-y := proc_powerpc.o obj-$(CONFIG_PROC_FS) += $(procfs-y) diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c deleted file mode 100644 index db91a9dbafb5..000000000000 --- a/arch/powerpc/kernel/of_device.c +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static void of_device_make_bus_id(struct of_device *dev) -{ - static atomic_t bus_no_reg_magic; - struct device_node *node = dev->dev.of_node; - const u32 *reg; - u64 addr; - int magic; - - /* - * If it's a DCR based device, use 'd' for native DCRs - * and 'D' for MMIO DCRs. - */ -#ifdef CONFIG_PPC_DCR - reg = of_get_property(node, "dcr-reg", NULL); - if (reg) { -#ifdef CONFIG_PPC_DCR_NATIVE - dev_set_name(&dev->dev, "d%x.%s", *reg, node->name); -#else /* CONFIG_PPC_DCR_NATIVE */ - addr = of_translate_dcr_address(node, *reg, NULL); - if (addr != OF_BAD_ADDR) { - dev_set_name(&dev->dev, "D%llx.%s", - (unsigned long long)addr, node->name); - return; - } -#endif /* !CONFIG_PPC_DCR_NATIVE */ - } -#endif /* CONFIG_PPC_DCR */ - - /* - * For MMIO, get the physical address - */ - reg = of_get_property(node, "reg", NULL); - if (reg) { - addr = of_translate_address(node, reg); - if (addr != OF_BAD_ADDR) { - dev_set_name(&dev->dev, "%llx.%s", - (unsigned long long)addr, node->name); - return; - } - } - - /* - * No BusID, use the node name and add a globally incremented - * counter (and pray...) 
- */ - magic = atomic_add_return(1, &bus_no_reg_magic); - dev_set_name(&dev->dev, "%s.%d", node->name, magic - 1); -} - -struct of_device *of_device_alloc(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - struct of_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - - dev->dev.of_node = of_node_get(np); - dev->dev.dma_mask = &dev->archdata.dma_mask; - dev->dev.parent = parent; - dev->dev.release = of_release_dev; - - if (bus_id) - dev_set_name(&dev->dev, "%s", bus_id); - else - of_device_make_bus_id(dev); - - return dev; -} -EXPORT_SYMBOL(of_device_alloc); diff --git a/drivers/of/platform.c b/drivers/of/platform.c index d9c81e93bdd0..ea87a3cf7860 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -407,6 +407,93 @@ EXPORT_SYMBOL(of_unregister_driver); * mechanism for creating devices from device tree nodes. */ +/** + * of_device_make_bus_id - Use the device node data to assign a unique name + * @dev: pointer to device structure that is linked to a device tree node + * + * This routine will first try using either the dcr-reg or the reg property + * value to derive a unique name. As a last resort it will use the node + * name followed by a unique number. + */ +static void of_device_make_bus_id(struct device *dev) +{ + static atomic_t bus_no_reg_magic; + struct device_node *node = dev->of_node; + const u32 *reg; + u64 addr; + int magic; + +#ifdef CONFIG_PPC_DCR + /* + * If it's a DCR based device, use 'd' for native DCRs + * and 'D' for MMIO DCRs. + */ + reg = of_get_property(node, "dcr-reg", NULL); + if (reg) { +#ifdef CONFIG_PPC_DCR_NATIVE + dev_set_name(dev, "d%x.%s", *reg, node->name); +#else /* CONFIG_PPC_DCR_NATIVE */ + u64 addr = of_translate_dcr_address(node, *reg, NULL); + if (addr != OF_BAD_ADDR) { + dev_set_name(dev, "D%llx.%s", + (unsigned long long)addr, node->name); + return; + } +#endif /* !CONFIG_PPC_DCR_NATIVE */ + } +#endif /* CONFIG_PPC_DCR */ + + /* + * For MMIO, get the physical address + */ + reg = of_get_property(node, "reg", NULL); + if (reg) { + addr = of_translate_address(node, reg); + if (addr != OF_BAD_ADDR) { + dev_set_name(dev, "%llx.%s", + (unsigned long long)addr, node->name); + return; + } + } + + /* + * No BusID, use the node name and add a globally incremented + * counter (and pray...) + */ + magic = atomic_add_return(1, &bus_no_reg_magic); + dev_set_name(dev, "%s.%d", node->name, magic - 1); +} + +/** + * of_device_alloc - Allocate and initialize an of_device + * @np: device node to assign to device + * @bus_id: Name to assign to the device. May be null to use default name. + * @parent: Parent device. 
+ */ +struct of_device *of_device_alloc(struct device_node *np, + const char *bus_id, + struct device *parent) +{ + struct of_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + dev->dev.of_node = of_node_get(np); + dev->dev.dma_mask = &dev->archdata.dma_mask; + dev->dev.parent = parent; + dev->dev.release = of_release_dev; + + if (bus_id) + dev_set_name(&dev->dev, "%s", bus_id); + else + of_device_make_bus_id(&dev->dev); + + return dev; +} +EXPORT_SYMBOL(of_device_alloc); + /** * of_platform_device_create - Alloc, initialize and register an of_device * @np: pointer to node to create device for diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 4bbba41396ef..a51fd30176aa 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -60,6 +60,9 @@ static inline void of_unregister_platform_driver(struct of_platform_driver *drv) of_unregister_driver(drv); } +extern struct of_device *of_device_alloc(struct device_node *np, + const char *bus_id, + struct device *parent); #include extern struct of_device *of_find_device_by_node(struct device_node *np); -- cgit v1.2.3 From a19e3da5bc5fc6c10ab73f310bea80f3845b4531 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 8 Jun 2010 07:48:16 -0600 Subject: of/gpio: Kill of_gpio_chip and add members directly to gpio_chip The OF gpio infrastructure is great for describing GPIO connections within the device tree. However, using a GPIO binding still requires changes to the gpio controller just to add an of_gpio structure. In most cases, the gpio controller doesn't actually need any special support and the simple OF gpio mapping function is more than sufficient. Additionally, the current scheme of using of_gpio_chip requires a convoluted approach to maintain 1:1 mappings between of_gpio_chip and gpio_chip instances. If the struct of_gpio_chip data members were moved into struct gpio_chip, then it would simplify the processing of OF gpio bindings, and it would make it trivial to use device tree OF connections on existing gpiolib controller drivers. This patch eliminates the of_gpio_chip structure and moves the relevant fields into struct gpio_chip (conditional on CONFIG_OF_GPIO). This move simplifies the existing code and prepares for adding automatic device tree support to existing drivers.
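As a quick illustration of the new model, a hedged sketch (hypothetical driver, not from this series) of registering a DT-aware GPIO controller now that the OF fields live directly in struct gpio_chip:

#include <linux/gpio.h>
#include <linux/of.h>
#include <linux/of_gpio.h>

/* Sketch: a real driver would also fill the direction_input,
 * direction_output, get and set callbacks, and kstrdup() the label. */
static int my_gpio_register(struct device_node *np, struct gpio_chip *gc)
{
	gc->label = np->full_name;
	gc->ngpio = 8;			/* hypothetical pin count */
	gc->base = -1;			/* let gpiolib assign numbers */

	/* Formerly struct of_gpio_chip members, now plain gpio_chip fields: */
	gc->of_node = of_node_get(np);
	gc->of_gpio_n_cells = 2;
	gc->of_xlate = of_gpio_simple_xlate;

	return gpiochip_add(gc);
}

No container_of() hop between of_gpio_chip and gpio_chip remains; the driver conversions below all follow this pattern.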
Signed-off-by: Grant Likely Cc: Andrew Morton Cc: Anton Vorontsov Cc: Grant Likely Cc: David Brownell Cc: Bill Gatliff Cc: Dmitry Eremin-Solenikov Cc: Benjamin Herrenschmidt Cc: Jean Delvare --- arch/microblaze/kernel/reset.c | 12 +++---- arch/powerpc/platforms/52xx/mpc52xx_gpio.c | 32 ++++++++--------- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 29 ++++++++------- arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 15 ++++---- arch/powerpc/platforms/86xx/gef_gpio.c | 24 ++++++------- arch/powerpc/sysdev/cpm1.c | 12 +++---- arch/powerpc/sysdev/cpm_common.c | 6 ++-- arch/powerpc/sysdev/mpc8xxx_gpio.c | 6 ++-- arch/powerpc/sysdev/ppc4xx_gpio.c | 6 ++-- arch/powerpc/sysdev/qe_lib/gpio.c | 32 ++++++++--------- arch/powerpc/sysdev/simple_gpio.c | 6 ++-- drivers/gpio/xilinx_gpio.c | 16 ++++----- drivers/of/gpio.c | 50 +++++++++++++------------- include/asm-generic/gpio.h | 12 +++++++ include/linux/of_gpio.h | 29 +++------------ 15 files changed, 129 insertions(+), 158 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c index a1721a33042e..5476d3caf045 100644 --- a/arch/microblaze/kernel/reset.c +++ b/arch/microblaze/kernel/reset.c @@ -24,8 +24,8 @@ static int of_reset_gpio_handle(void) int ret; /* variable which stored handle reset gpio pin */ struct device_node *root; /* root node */ struct device_node *gpio; /* gpio node */ - struct of_gpio_chip *of_gc = NULL; - enum of_gpio_flags flags ; + struct gpio_chip *gc; + u32 flags; const void *gpio_spec; /* find out root node */ @@ -39,19 +39,19 @@ static int of_reset_gpio_handle(void) goto err0; } - of_gc = gpio->data; - if (!of_gc) { + gc = gpio->data; + if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", root->full_name, gpio->full_name); ret = -ENODEV; goto err1; } - ret = of_gc->xlate(of_gc, root, gpio_spec, &flags); + ret = gc->of_xlate(gc, root, gpio_spec, &flags); if (ret < 0) goto err1; - ret += of_gc->gc.base; + ret += gc->base; err1: of_node_put(gpio); err0: diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c index ca5305a5bd61..fd0912eeffe2 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c @@ -152,21 +152,21 @@ static int __devinit mpc52xx_wkup_gpiochip_probe(struct of_device *ofdev, { struct mpc52xx_gpiochip *chip; struct mpc52xx_gpio_wkup __iomem *regs; - struct of_gpio_chip *ofchip; + struct gpio_chip *gc; int ret; chip = kzalloc(sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; - ofchip = &chip->mmchip.of_gc; + gc = &chip->mmchip.gc; - ofchip->gpio_cells = 2; - ofchip->gc.ngpio = 8; - ofchip->gc.direction_input = mpc52xx_wkup_gpio_dir_in; - ofchip->gc.direction_output = mpc52xx_wkup_gpio_dir_out; - ofchip->gc.get = mpc52xx_wkup_gpio_get; - ofchip->gc.set = mpc52xx_wkup_gpio_set; + gc->of_gpio_n_cells = 2; + gc->ngpio = 8; + gc->direction_input = mpc52xx_wkup_gpio_dir_in; + gc->direction_output = mpc52xx_wkup_gpio_dir_out; + gc->get = mpc52xx_wkup_gpio_get; + gc->set = mpc52xx_wkup_gpio_set; ret = of_mm_gpiochip_add(ofdev->dev.of_node, &chip->mmchip); if (ret) @@ -315,7 +315,7 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev, const struct of_device_id *match) { struct mpc52xx_gpiochip *chip; - struct of_gpio_chip *ofchip; + struct gpio_chip *gc; struct mpc52xx_gpio __iomem *regs; int ret; @@ -323,14 +323,14 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev, if (!chip) return -ENOMEM; - ofchip = 
&chip->mmchip.of_gc; + gc = &chip->mmchip.gc; - ofchip->gpio_cells = 2; - ofchip->gc.ngpio = 32; - ofchip->gc.direction_input = mpc52xx_simple_gpio_dir_in; - ofchip->gc.direction_output = mpc52xx_simple_gpio_dir_out; - ofchip->gc.get = mpc52xx_simple_gpio_get; - ofchip->gc.set = mpc52xx_simple_gpio_set; + gc->of_gpio_n_cells = 2; + gc->ngpio = 32; + gc->direction_input = mpc52xx_simple_gpio_dir_in; + gc->direction_output = mpc52xx_simple_gpio_dir_out; + gc->get = mpc52xx_simple_gpio_get; + gc->set = mpc52xx_simple_gpio_set; ret = of_mm_gpiochip_add(ofdev->dev.of_node, &chip->mmchip); if (ret) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 46c93578cbf0..3f2ee47f1d01 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -78,7 +78,7 @@ MODULE_LICENSE("GPL"); * @dev: pointer to device structure * @regs: virtual address of GPT registers * @lock: spinlock to coordinate between different functions. - * @of_gc: of_gpio_chip instance structure; used when GPIO is enabled + * @gc: gpio_chip instance structure; used when GPIO is enabled * @irqhost: Pointer to irq_host instance; used when IRQ mode is supported * @wdt_mode: only relevant for gpt0: bit 0 (MPC52xx_GPT_CAN_WDT) indicates * if the gpt may be used as wdt, bit 1 (MPC52xx_GPT_IS_WDT) indicates @@ -94,7 +94,7 @@ struct mpc52xx_gpt_priv { u8 wdt_mode; #if defined(CONFIG_GPIOLIB) - struct of_gpio_chip of_gc; + struct gpio_chip gc; #endif }; @@ -280,7 +280,7 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) #if defined(CONFIG_GPIOLIB) static inline struct mpc52xx_gpt_priv *gc_to_mpc52xx_gpt(struct gpio_chip *gc) { - return container_of(to_of_gpio_chip(gc), struct mpc52xx_gpt_priv,of_gc); + return container_of(gc, struct mpc52xx_gpt_priv, gc); } static int mpc52xx_gpt_gpio_get(struct gpio_chip *gc, unsigned int gpio) @@ -336,28 +336,27 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) if (!of_find_property(node, "gpio-controller", NULL)) return; - gpt->of_gc.gc.label = kstrdup(node->full_name, GFP_KERNEL); - if (!gpt->of_gc.gc.label) { + gpt->gc.label = kstrdup(node->full_name, GFP_KERNEL); + if (!gpt->gc.label) { dev_err(gpt->dev, "out of memory\n"); return; } - gpt->of_gc.gpio_cells = 2; - gpt->of_gc.gc.ngpio = 1; - gpt->of_gc.gc.direction_input = mpc52xx_gpt_gpio_dir_in; - gpt->of_gc.gc.direction_output = mpc52xx_gpt_gpio_dir_out; - gpt->of_gc.gc.get = mpc52xx_gpt_gpio_get; - gpt->of_gc.gc.set = mpc52xx_gpt_gpio_set; - gpt->of_gc.gc.base = -1; - gpt->of_gc.xlate = of_gpio_simple_xlate; - node->data = &gpt->of_gc; + gpt->gc.ngpio = 1; + gpt->gc.direction_input = mpc52xx_gpt_gpio_dir_in; + gpt->gc.direction_output = mpc52xx_gpt_gpio_dir_out; + gpt->gc.get = mpc52xx_gpt_gpio_get; + gpt->gc.set = mpc52xx_gpt_gpio_set; + gpt->gc.base = -1; + gpt->gc.of_gpio_n_cells = 2; + gpt->gc.of_xlate = of_gpio_simple_xlate; of_node_get(node); /* Setup external pin in GPIO mode */ clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK, MPC52xx_GPT_MODE_MS_GPIO); - rc = gpiochip_add(&gpt->of_gc.gc); + rc = gpiochip_add(&gpt->gc); if (rc) dev_err(gpt->dev, "gpiochip_add() failed; rc=%i\n", rc); diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index d119a7c1c17a..e49f4bd2f991 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -37,7 +37,7 @@ struct mcu { struct mutex 
lock; struct device_node *np; struct i2c_client *client; - struct of_gpio_chip of_gc; + struct gpio_chip gc; u8 reg_ctrl; }; @@ -56,8 +56,7 @@ static void mcu_power_off(void) static void mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_gpio_chip *of_gc = to_of_gpio_chip(gc); - struct mcu *mcu = container_of(of_gc, struct mcu, of_gc); + struct mcu *mcu = container_of(gc, struct mcu, gc); u8 bit = 1 << (4 + gpio); mutex_lock(&mcu->lock); @@ -79,8 +78,7 @@ static int mcu_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int mcu_gpiochip_add(struct mcu *mcu) { struct device_node *np; - struct of_gpio_chip *of_gc = &mcu->of_gc; - struct gpio_chip *gc = &of_gc->gc; + struct gpio_chip *gc = &mcu->gc; int ret; np = of_find_compatible_node(NULL, NULL, "fsl,mcu-mpc8349emitx"); @@ -94,10 +92,9 @@ static int mcu_gpiochip_add(struct mcu *mcu) gc->base = -1; gc->set = mcu_gpio_set; gc->direction_output = mcu_gpio_dir_out; - of_gc->gpio_cells = 2; - of_gc->xlate = of_gpio_simple_xlate; + gc->of_gpio_n_cells = 2; + gc->of_xlate = of_gpio_simple_xlate; - np->data = of_gc; mcu->np = np; /* @@ -114,7 +111,7 @@ static int mcu_gpiochip_remove(struct mcu *mcu) { int ret; - ret = gpiochip_remove(&mcu->of_gc.gc); + ret = gpiochip_remove(&mcu->gc); if (ret) return ret; of_node_put(mcu->np); diff --git a/arch/powerpc/platforms/86xx/gef_gpio.c b/arch/powerpc/platforms/86xx/gef_gpio.c index b8cb08dbd89c..4ff7b1e7bbad 100644 --- a/arch/powerpc/platforms/86xx/gef_gpio.c +++ b/arch/powerpc/platforms/86xx/gef_gpio.c @@ -118,12 +118,12 @@ static int __init gef_gpio_init(void) } /* Setup pointers to chip functions */ - gef_gpio_chip->of_gc.gpio_cells = 2; - gef_gpio_chip->of_gc.gc.ngpio = 19; - gef_gpio_chip->of_gc.gc.direction_input = gef_gpio_dir_in; - gef_gpio_chip->of_gc.gc.direction_output = gef_gpio_dir_out; - gef_gpio_chip->of_gc.gc.get = gef_gpio_get; - gef_gpio_chip->of_gc.gc.set = gef_gpio_set; + gef_gpio_chip->gc.of_gpio_n_cells = 2; + gef_gpio_chip->gc.ngpio = 19; + gef_gpio_chip->gc.direction_input = gef_gpio_dir_in; + gef_gpio_chip->gc.direction_output = gef_gpio_dir_out; + gef_gpio_chip->gc.get = gef_gpio_get; + gef_gpio_chip->gc.set = gef_gpio_set; /* This function adds a memory mapped GPIO chip */ retval = of_mm_gpiochip_add(np, gef_gpio_chip); @@ -146,12 +146,12 @@ static int __init gef_gpio_init(void) } /* Setup pointers to chip functions */ - gef_gpio_chip->of_gc.gpio_cells = 2; - gef_gpio_chip->of_gc.gc.ngpio = 6; - gef_gpio_chip->of_gc.gc.direction_input = gef_gpio_dir_in; - gef_gpio_chip->of_gc.gc.direction_output = gef_gpio_dir_out; - gef_gpio_chip->of_gc.gc.get = gef_gpio_get; - gef_gpio_chip->of_gc.gc.set = gef_gpio_set; + gef_gpio_chip->gc.of_gpio_n_cells = 2; + gef_gpio_chip->gc.ngpio = 6; + gef_gpio_chip->gc.direction_input = gef_gpio_dir_in; + gef_gpio_chip->gc.direction_output = gef_gpio_dir_out; + gef_gpio_chip->gc.get = gef_gpio_get; + gef_gpio_chip->gc.set = gef_gpio_set; /* This function adds a memory mapped GPIO chip */ retval = of_mm_gpiochip_add(np, gef_gpio_chip); diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 8d103ca6d6ab..d5cf7d4ccf81 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -621,7 +621,6 @@ int cpm1_gpiochip_add16(struct device_node *np) { struct cpm1_gpio16_chip *cpm1_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL); @@ -631,11 +630,10 @@ int cpm1_gpiochip_add16(struct device_node *np) 
spin_lock_init(&cpm1_gc->lock); mm_gc = &cpm1_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = cpm1_gpio16_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = 16; gc->direction_input = cpm1_gpio16_dir_in; gc->direction_output = cpm1_gpio16_dir_out; @@ -745,7 +743,6 @@ int cpm1_gpiochip_add32(struct device_node *np) { struct cpm1_gpio32_chip *cpm1_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL); @@ -755,11 +752,10 @@ int cpm1_gpiochip_add32(struct device_node *np) spin_lock_init(&cpm1_gc->lock); mm_gc = &cpm1_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = cpm1_gpio32_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = cpm1_gpio32_dir_in; gc->direction_output = cpm1_gpio32_dir_out; diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c index 88b9812c854f..67e9b47dcf8e 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -325,7 +325,6 @@ int cpm2_gpiochip_add32(struct device_node *np) { struct cpm2_gpio32_chip *cpm2_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; cpm2_gc = kzalloc(sizeof(*cpm2_gc), GFP_KERNEL); @@ -335,11 +334,10 @@ int cpm2_gpiochip_add32(struct device_node *np) spin_lock_init(&cpm2_gc->lock); mm_gc = &cpm2_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = cpm2_gpio32_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = cpm2_gpio32_dir_in; gc->direction_output = cpm2_gpio32_dir_out; diff --git a/arch/powerpc/sysdev/mpc8xxx_gpio.c b/arch/powerpc/sysdev/mpc8xxx_gpio.c index 83f519655fac..ec8fcd42101e 100644 --- a/arch/powerpc/sysdev/mpc8xxx_gpio.c +++ b/arch/powerpc/sysdev/mpc8xxx_gpio.c @@ -257,7 +257,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np) { struct mpc8xxx_gpio_chip *mpc8xxx_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; unsigned hwirq; int ret; @@ -271,11 +270,10 @@ static void __init mpc8xxx_add_controller(struct device_node *np) spin_lock_init(&mpc8xxx_gc->lock); mm_gc = &mpc8xxx_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = mpc8xxx_gpio_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = MPC8XXX_GPIO_PINS; gc->direction_input = mpc8xxx_gpio_dir_in; gc->direction_output = mpc8xxx_gpio_dir_out; diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/sysdev/ppc4xx_gpio.c index 3812fc366bec..42e7a5eea665 100644 --- a/arch/powerpc/sysdev/ppc4xx_gpio.c +++ b/arch/powerpc/sysdev/ppc4xx_gpio.c @@ -181,7 +181,6 @@ static int __init ppc4xx_add_gpiochips(void) int ret; struct ppc4xx_gpio_chip *ppc4xx_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; ppc4xx_gc = kzalloc(sizeof(*ppc4xx_gc), GFP_KERNEL); @@ -193,10 +192,9 @@ static int __init ppc4xx_add_gpiochips(void) spin_lock_init(&ppc4xx_gc->lock); mm_gc = &ppc4xx_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = ppc4xx_gpio_dir_in; gc->direction_output = ppc4xx_gpio_dir_out; diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c index dc8f8d618074..194478c2f4b4 100644 --- 
a/arch/powerpc/sysdev/qe_lib/gpio.c +++ b/arch/powerpc/sysdev/qe_lib/gpio.c @@ -138,8 +138,8 @@ struct qe_pin { struct qe_pin *qe_pin_request(struct device_node *np, int index) { struct qe_pin *qe_pin; - struct device_node *gc; - struct of_gpio_chip *of_gc = NULL; + struct device_node *gpio_np; + struct gpio_chip *gc; struct of_mm_gpio_chip *mm_gc; struct qe_gpio_chip *qe_gc; int err; @@ -155,40 +155,40 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index) } err = of_parse_phandles_with_args(np, "gpios", "#gpio-cells", index, - &gc, &gpio_spec); + &gpio_np, &gpio_spec); if (err) { pr_debug("%s: can't parse gpios property\n", __func__); goto err0; } - if (!of_device_is_compatible(gc, "fsl,mpc8323-qe-pario-bank")) { + if (!of_device_is_compatible(gpio_np, "fsl,mpc8323-qe-pario-bank")) { pr_debug("%s: tried to get a non-qe pin\n", __func__); err = -EINVAL; goto err1; } - of_gc = gc->data; - if (!of_gc) { + gc = gpio_np->data; + if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", - np->full_name, gc->full_name); + np->full_name, gpio_np->full_name); err = -ENODEV; goto err1; } - gpio_cells = of_get_property(gc, "#gpio-cells", &size); + gpio_cells = of_get_property(gpio_np, "#gpio-cells", &size); if (!gpio_cells || size != sizeof(*gpio_cells) || - *gpio_cells != of_gc->gpio_cells) { + *gpio_cells != gc->of_gpio_n_cells) { pr_debug("%s: wrong #gpio-cells for %s\n", - np->full_name, gc->full_name); + np->full_name, gpio_np->full_name); err = -EINVAL; goto err1; } - err = of_gc->xlate(of_gc, np, gpio_spec, NULL); + err = gc->of_xlate(gc, np, gpio_spec, NULL); if (err < 0) goto err1; - mm_gc = to_of_mm_gpio_chip(&of_gc->gc); + mm_gc = to_of_mm_gpio_chip(gc); qe_gc = to_qe_gpio_chip(mm_gc); spin_lock_irqsave(&qe_gc->lock, flags); @@ -206,7 +206,7 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index) if (!err) return qe_pin; err1: - of_node_put(gc); + of_node_put(gpio_np); err0: kfree(qe_pin); pr_debug("%s failed with status %d\n", __func__, err); @@ -307,7 +307,6 @@ static int __init qe_add_gpiochips(void) int ret; struct qe_gpio_chip *qe_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; qe_gc = kzalloc(sizeof(*qe_gc), GFP_KERNEL); @@ -319,11 +318,10 @@ static int __init qe_add_gpiochips(void) spin_lock_init(&qe_gc->lock); mm_gc = &qe_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = qe_gpio_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = QE_PIO_PINS; gc->direction_input = qe_gpio_dir_in; gc->direction_output = qe_gpio_dir_out; diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index d5fb173e588c..b7559aa0c165 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -91,7 +91,6 @@ static int __init u8_simple_gpiochip_add(struct device_node *np) int ret; struct u8_gpio_chip *u8_gc; struct of_mm_gpio_chip *mm_gc; - struct of_gpio_chip *of_gc; struct gpio_chip *gc; u8_gc = kzalloc(sizeof(*u8_gc), GFP_KERNEL); @@ -101,11 +100,10 @@ static int __init u8_simple_gpiochip_add(struct device_node *np) spin_lock_init(&u8_gc->lock); mm_gc = &u8_gc->mm_gc; - of_gc = &mm_gc->of_gc; - gc = &of_gc->gc; + gc = &mm_gc->gc; mm_gc->save_regs = u8_gpio_save_regs; - of_gc->gpio_cells = 2; + gc->of_gpio_n_cells = 2; gc->ngpio = 8; gc->direction_input = u8_gpio_dir_in; gc->direction_output = u8_gpio_dir_out; diff --git a/drivers/gpio/xilinx_gpio.c b/drivers/gpio/xilinx_gpio.c index b8fa65b5bfca..2993c40b48e1 
100644 --- a/drivers/gpio/xilinx_gpio.c +++ b/drivers/gpio/xilinx_gpio.c @@ -161,14 +161,12 @@ static void xgpio_save_regs(struct of_mm_gpio_chip *mm_gc) static int __devinit xgpio_of_probe(struct device_node *np) { struct xgpio_instance *chip; - struct of_gpio_chip *ofchip; int status = 0; const u32 *tree_info; chip = kzalloc(sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; - ofchip = &chip->mmchip.of_gc; /* Update GPIO state shadow register with default value */ tree_info = of_get_property(np, "xlnx,dout-default", NULL); @@ -182,21 +180,21 @@ static int __devinit xgpio_of_probe(struct device_node *np) chip->gpio_dir = *tree_info; /* Check device node and parent device node for device width */ - ofchip->gc.ngpio = 32; /* By default assume full GPIO controller */ + chip->mmchip.gc.ngpio = 32; /* By default assume full GPIO controller */ tree_info = of_get_property(np, "xlnx,gpio-width", NULL); if (!tree_info) tree_info = of_get_property(np->parent, "xlnx,gpio-width", NULL); if (tree_info) - ofchip->gc.ngpio = *tree_info; + chip->mmchip.gc.ngpio = *tree_info; spin_lock_init(&chip->gpio_lock); - ofchip->gpio_cells = 2; - ofchip->gc.direction_input = xgpio_dir_in; - ofchip->gc.direction_output = xgpio_dir_out; - ofchip->gc.get = xgpio_get; - ofchip->gc.set = xgpio_set; + chip->mmchip.gc.of_gpio_n_cells = 2; + chip->mmchip.gc.direction_input = xgpio_dir_in; + chip->mmchip.gc.direction_output = xgpio_dir_out; + chip->mmchip.gc.get = xgpio_get; + chip->mmchip.gc.set = xgpio_set; chip->mmchip.save_regs = xgpio_save_regs; diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c index a1b31a4abae4..fde53a3a45a3 100644 --- a/drivers/of/gpio.c +++ b/drivers/of/gpio.c @@ -33,32 +33,32 @@ int of_get_gpio_flags(struct device_node *np, int index, enum of_gpio_flags *flags) { int ret; - struct device_node *gc; - struct of_gpio_chip *of_gc = NULL; + struct device_node *gpio_np; + struct gpio_chip *gc; int size; const void *gpio_spec; const __be32 *gpio_cells; ret = of_parse_phandles_with_args(np, "gpios", "#gpio-cells", index, - &gc, &gpio_spec); + &gpio_np, &gpio_spec); if (ret) { pr_debug("%s: can't parse gpios property\n", __func__); goto err0; } - of_gc = gc->data; - if (!of_gc) { + gc = gpio_np->data; + if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", - np->full_name, gc->full_name); + np->full_name, gpio_np->full_name); ret = -ENODEV; goto err1; } - gpio_cells = of_get_property(gc, "#gpio-cells", &size); + gpio_cells = of_get_property(gpio_np, "#gpio-cells", &size); if (!gpio_cells || size != sizeof(*gpio_cells) || - be32_to_cpup(gpio_cells) != of_gc->gpio_cells) { + be32_to_cpup(gpio_cells) != gc->of_gpio_n_cells) { pr_debug("%s: wrong #gpio-cells for %s\n", - np->full_name, gc->full_name); + np->full_name, gpio_np->full_name); ret = -EINVAL; goto err1; } @@ -67,13 +67,13 @@ int of_get_gpio_flags(struct device_node *np, int index, if (flags) *flags = 0; - ret = of_gc->xlate(of_gc, np, gpio_spec, flags); + ret = gc->of_xlate(gc, np, gpio_spec, flags); if (ret < 0) goto err1; - ret += of_gc->gc.base; + ret += gc->base; err1: - of_node_put(gc); + of_node_put(gpio_np); err0: pr_debug("%s exited with status %d\n", __func__, ret); return ret; @@ -116,7 +116,7 @@ EXPORT_SYMBOL(of_gpio_count); /** * of_gpio_simple_xlate - translate gpio_spec to the GPIO number and flags - * @of_gc: pointer to the of_gpio_chip structure + * @gc: pointer to the gpio_chip structure * @np: device node of the GPIO chip * @gpio_spec: gpio specifier as found in the device tree * @flags: a flags pointer to fill in 
@@ -125,8 +125,8 @@ EXPORT_SYMBOL(of_gpio_count); * gpio chips. This function performs only one sanity check: whether gpio * is less than ngpios (that is specified in the gpio_chip). */ -int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np, - const void *gpio_spec, enum of_gpio_flags *flags) +int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, + const void *gpio_spec, u32 *flags) { const __be32 *gpio = gpio_spec; const u32 n = be32_to_cpup(gpio); @@ -137,12 +137,12 @@ int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np, * number and the flags from a single gpio cell -- this is possible, * but not recommended). */ - if (of_gc->gpio_cells < 2) { + if (gc->of_gpio_n_cells < 2) { WARN_ON(1); return -EINVAL; } - if (n > of_gc->gc.ngpio) + if (n > gc->ngpio) return -EINVAL; if (flags) @@ -161,10 +161,8 @@ EXPORT_SYMBOL(of_gpio_simple_xlate); * * 1) In the gpio_chip structure: * - all the callbacks - * - * 2) In the of_gpio_chip structure: - * - gpio_cells - * - xlate callback (optional) + * - of_gpio_n_cells + * - of_xlate callback (optional) * * 3) In the of_mm_gpio_chip structure: * - save_regs callback (optional) @@ -177,8 +175,7 @@ int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc) { int ret = -ENOMEM; - struct of_gpio_chip *of_gc = &mm_gc->of_gc; - struct gpio_chip *gc = &of_gc->gc; + struct gpio_chip *gc = &mm_gc->gc; gc->label = kstrdup(np->full_name, GFP_KERNEL); if (!gc->label) @@ -190,13 +187,14 @@ int of_mm_gpiochip_add(struct device_node *np, gc->base = -1; - if (!of_gc->xlate) - of_gc->xlate = of_gpio_simple_xlate; + if (!gc->of_xlate) + gc->of_xlate = of_gpio_simple_xlate; if (mm_gc->save_regs) mm_gc->save_regs(mm_gc); - np->data = of_gc; + np->data = &mm_gc->gc; + mm_gc->gc.of_node = np; ret = gpiochip_add(gc); if (ret) diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 4f3d75e1ad39..af2544ef0b59 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -31,6 +31,7 @@ static inline int gpio_is_valid(int number) struct device; struct seq_file; struct module; +struct device_node; /** * struct gpio_chip - abstract a GPIO controller @@ -106,6 +107,17 @@ struct gpio_chip { const char *const *names; unsigned can_sleep:1; unsigned exported:1; + +#if defined(CONFIG_OF_GPIO) + /* + * If CONFIG_OF is enabled, then all GPIO controllers described in the + * device tree automatically may have an OF translation + */ + struct device_node *of_node; + int of_gpio_n_cells; + int (*of_xlate)(struct gpio_chip *gc, struct device_node *np, + const void *gpio_spec, u32 *flags); +#endif }; extern const char *gpiochip_is_requested(struct gpio_chip *chip, diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index fc2472c3c254..460d6810c5eb 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -32,35 +32,18 @@ enum of_gpio_flags { #ifdef CONFIG_OF_GPIO -/* - * Generic OF GPIO chip - */ -struct of_gpio_chip { - struct gpio_chip gc; - int gpio_cells; - int (*xlate)(struct of_gpio_chip *of_gc, struct device_node *np, - const void *gpio_spec, enum of_gpio_flags *flags); -}; - -static inline struct of_gpio_chip *to_of_gpio_chip(struct gpio_chip *gc) -{ - return container_of(gc, struct of_gpio_chip, gc); -} - /* * OF GPIO chip for memory mapped banks */ struct of_mm_gpio_chip { - struct of_gpio_chip of_gc; + struct gpio_chip gc; void (*save_regs)(struct of_mm_gpio_chip *mm_gc); void __iomem *regs; }; static inline struct of_mm_gpio_chip 
*to_of_mm_gpio_chip(struct gpio_chip *gc) { - struct of_gpio_chip *of_gc = to_of_gpio_chip(gc); - - return container_of(of_gc, struct of_mm_gpio_chip, of_gc); + return container_of(gc, struct of_mm_gpio_chip, gc); } extern int of_get_gpio_flags(struct device_node *np, int index, @@ -69,11 +52,9 @@ extern unsigned int of_gpio_count(struct device_node *np); extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); -extern int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, - struct device_node *np, - const void *gpio_spec, - enum of_gpio_flags *flags); -#else +extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, + const void *gpio_spec, u32 *flags); +#else /* CONFIG_OF_GPIO */ /* Drivers may not strictly depend on the GPIO support, so let them link. */ static inline int of_get_gpio_flags(struct device_node *np, int index, -- cgit v1.2.3 From 594fa265e084073443390c5b93d5410fd28e9bcd Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:16 -0600 Subject: of/gpio: stop using device_node data pointer to find gpio_chip Currently the kernel uses the struct device_node.data pointer to resolve a struct gpio_chip pointer from a device tree node. However, the .data member doesn't provide any type checking and there aren't any rules enforced on what it should be used for. There's no guarantee that the data stored in it actually points to a gpio_chip. Instead of relying on the .data pointer, this patch modifies the code to add a lookup function which scans through the registered gpio_chips and returns the gpio_chip that has a pointer to the specified device_node. Signed-off-by: Grant Likely CC: Andrew Morton CC: Anton Vorontsov CC: Grant Likely CC: David Brownell CC: Bill Gatliff CC: Dmitry Eremin-Solenikov CC: Benjamin Herrenschmidt CC: Jean Delvare CC: linux-kernel@vger.kernel.org CC: devicetree-discuss@lists.ozlabs.org --- arch/microblaze/kernel/reset.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 1 + arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 23 +++--------------- arch/powerpc/sysdev/qe_lib/gpio.c | 2 +- drivers/gpio/gpiolib.c | 32 ++++++++++++++++++++++++++ drivers/of/gpio.c | 15 +++++++++--- include/asm-generic/gpio.h | 3 +++ include/linux/of_gpio.h | 3 +++ 8 files changed, 56 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c index 5476d3caf045..bd8ccab5ceff 100644 --- a/arch/microblaze/kernel/reset.c +++ b/arch/microblaze/kernel/reset.c @@ -39,7 +39,7 @@ static int of_reset_gpio_handle(void) goto err0; } - gc = gpio->data; + gc = of_node_to_gpiochip(gpio); if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", root->full_name, gpio->full_name); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 3f2ee47f1d01..6e82bd27132c 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -350,6 +350,7 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) gpt->gc.base = -1; gpt->gc.of_gpio_n_cells = 2; gpt->gc.of_xlate = of_gpio_simple_xlate; + gpt->gc.of_node = node; of_node_get(node); /* Setup external pin in GPIO mode */ diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index e49f4bd2f991..f0dbace6185a 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -35,7 +35,6 @@
struct mcu { struct mutex lock; - struct device_node *np; struct i2c_client *client; struct gpio_chip gc; u8 reg_ctrl; @@ -79,7 +78,6 @@ static int mcu_gpiochip_add(struct mcu *mcu) { struct device_node *np; struct gpio_chip *gc = &mcu->gc; - int ret; np = of_find_compatible_node(NULL, NULL, "fsl,mcu-mpc8349emitx"); if (!np) @@ -94,29 +92,14 @@ static int mcu_gpiochip_add(struct mcu *mcu) gc->direction_output = mcu_gpio_dir_out; gc->of_gpio_n_cells = 2; gc->of_xlate = of_gpio_simple_xlate; + gc->of_node = np; - mcu->np = np; - - /* - * We don't want to lose the node, its ->data and ->full_name... - * So, if succeeded, we don't put the node here. - */ - ret = gpiochip_add(gc); - if (ret) - of_node_put(np); - return ret; + return gpiochip_add(gc); } static int mcu_gpiochip_remove(struct mcu *mcu) { - int ret; - - ret = gpiochip_remove(&mcu->gc); - if (ret) - return ret; - of_node_put(mcu->np); - - return 0; + return gpiochip_remove(&mcu->gc); } static int __devinit mcu_probe(struct i2c_client *client, diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c index 194478c2f4b4..32e9440010a1 100644 --- a/arch/powerpc/sysdev/qe_lib/gpio.c +++ b/arch/powerpc/sysdev/qe_lib/gpio.c @@ -167,7 +167,7 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index) goto err1; } - gc = gpio_np->data; + gc = of_node_to_gpiochip(gpio_np); if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", np->full_name, gpio_np->full_name); diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 713ca0e37f23..73fd328f6fe4 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1153,6 +1153,38 @@ int gpiochip_remove(struct gpio_chip *chip) } EXPORT_SYMBOL_GPL(gpiochip_remove); +/** + * gpiochip_find() - iterator for locating a specific gpio_chip + * @data: data to pass to match function + * @match: Callback function to check gpio_chip + * + * Similar to bus_find_device. It returns a reference to a gpio_chip as + * determined by a user supplied @match callback. The callback should return + * 0 if the device doesn't match and non-zero if it does. If the callback + * returns non-zero, this function will return to the caller and not iterate + * over any more gpio_chips. + */ +struct gpio_chip *gpiochip_find(void *data, + int (*match)(struct gpio_chip *chip, void *data)) +{ + struct gpio_chip *chip = NULL; + unsigned long flags; + int i; + + spin_lock_irqsave(&gpio_lock, flags); + for (i = 0; i < ARCH_NR_GPIOS; i++) { + if (!gpio_desc[i].chip) + continue; + + if (match(gpio_desc[i].chip, data)) { + chip = gpio_desc[i].chip; + break; + } + } + spin_unlock_irqrestore(&gpio_lock, flags); + + return chip; +} /* These "optional" allocation calls help prevent drivers from stomping * on each other, and help provide better diagnostics in debugfs.
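The gpiochip_find() helper added above is a generic predicate-driven lookup: walk every registered descriptor under the lock and hand each chip to a caller-supplied match function, returning the first one accepted. The following is a minimal userspace model of that pattern; the structures, table contents and names are invented for illustration, and the kernel version additionally holds gpio_lock and scans gpio_desc[]:

#include <stdio.h>
#include <stddef.h>

struct chip {
	const char *label;
	const void *node;	/* stands in for chip->of_node */
};

/* A toy "registered chips" table; real code scans gpio_desc[]. */
static struct chip registered[] = {
	{ "bank-a", (void *)0x1000 },
	{ "bank-b", (void *)0x2000 },
};

static struct chip *chip_find(const void *data,
			      int (*match)(struct chip *chip, const void *data))
{
	size_t i;

	for (i = 0; i < sizeof(registered) / sizeof(registered[0]); i++)
		if (match(&registered[i], data))
			return &registered[i];	/* first match wins */
	return NULL;
}

/* Predicate in the spirit of of_gpiochip_is_match(): compare node pointers. */
static int node_matches(struct chip *chip, const void *data)
{
	return chip->node == data;
}

int main(void)
{
	struct chip *chip = chip_find((void *)0x2000, node_matches);

	printf("found: %s\n", chip ? chip->label : "(none)");
	return 0;
}

of_node_to_gpiochip() in the next hunk is then exactly this lookup specialized with a node-pointer equality predicate.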
diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c index fde53a3a45a3..c8618d3282cf 100644 --- a/drivers/of/gpio.c +++ b/drivers/of/gpio.c @@ -46,7 +46,7 @@ int of_get_gpio_flags(struct device_node *np, int index, goto err0; } - gc = gpio_np->data; + gc = of_node_to_gpiochip(gpio_np); if (!gc) { pr_debug("%s: gpio controller %s isn't registered\n", np->full_name, gpio_np->full_name); @@ -193,7 +193,6 @@ int of_mm_gpiochip_add(struct device_node *np, if (mm_gc->save_regs) mm_gc->save_regs(mm_gc); - np->data = &mm_gc->gc; mm_gc->gc.of_node = np; ret = gpiochip_add(gc); @@ -207,7 +206,6 @@ int of_mm_gpiochip_add(struct device_node *np, np->full_name, gc->base); return 0; err2: - np->data = NULL; iounmap(mm_gc->regs); err1: kfree(gc->label); @@ -217,3 +215,14 @@ err0: return ret; } EXPORT_SYMBOL(of_mm_gpiochip_add); + +/* Private function for resolving node pointer to gpio_chip */ +static int of_gpiochip_is_match(struct gpio_chip *chip, void *data) +{ + return chip->of_node == data; +} + +struct gpio_chip *of_node_to_gpiochip(struct device_node *np) +{ + return gpiochip_find(np, of_gpiochip_is_match); +} diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index af2544ef0b59..c7376bf80b06 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -127,6 +127,9 @@ extern int __must_check gpiochip_reserve(int start, int ngpio); /* add/remove chips */ extern int gpiochip_add(struct gpio_chip *chip); extern int __must_check gpiochip_remove(struct gpio_chip *chip); +extern struct gpio_chip *gpiochip_find(void *data, + int (*match)(struct gpio_chip *chip, + void *data)); /* Always use the library code for GPIO management calls, diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 460d6810c5eb..1020587efed1 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -54,6 +54,9 @@ extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, const void *gpio_spec, u32 *flags); + +extern struct gpio_chip *of_node_to_gpiochip(struct device_node *np); + #else /* CONFIG_OF_GPIO */ /* Drivers may not strictly depend on the GPIO support, so let them link. */ -- cgit v1.2.3 From 391c970c0dd1100e3b9e1681f7d0f20aac35455a Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 8 Jun 2010 07:48:17 -0600 Subject: of/gpio: add default of_xlate function if device has a node pointer Implement generic OF gpio hooks and thus make device-enabled GPIO chips (i.e. the ones that have gpio_chip->dev specified) automatically attach to the OpenFirmware subsystem. This means that we can now handle I2C and SPI GPIO chips almost* transparently. * "Almost" because some chips still require platform data, and for these chips OF-glue is still needed, though with this change the glue will be much smaller.
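The key mechanism is of_gpiochip_add() installing defaults at registration time: a chip that supplies no translate hook gets the two-cell of_gpio_simple_xlate, so drivers can drop that boilerplate. Below is a rough standalone sketch of the "fill in defaults in the add hook" idea, assuming a two-cell (pin, flags) specifier; all names are invented and the kernel's error codes and range checks differ in detail:

#include <stdio.h>
#include <stdint.h>

struct chip {
	int ngpio;
	int n_cells;
	int (*xlate)(struct chip *chip, const uint32_t *spec, uint32_t *flags);
};

/* Default translator: cell 0 is the pin number, cell 1 the flags. */
static int simple_xlate(struct chip *chip, const uint32_t *spec,
			uint32_t *flags)
{
	if (chip->n_cells < 2)
		return -1;	/* cannot split pin and flags from one cell */
	if (spec[0] >= (uint32_t)chip->ngpio)
		return -1;	/* pin out of range */
	if (flags)
		*flags = spec[1];
	return (int)spec[0];
}

/* Registration hook: install defaults only when the driver set none. */
static void chip_add(struct chip *chip)
{
	if (!chip->xlate) {
		chip->n_cells = 2;
		chip->xlate = simple_xlate;
	}
}

int main(void)
{
	struct chip chip = { .ngpio = 32 };
	uint32_t spec[2] = { 7, 1 }, flags;
	int pin;

	chip_add(&chip);
	pin = chip.xlate(&chip, spec, &flags);
	printf("pin=%d flags=%u\n", pin, flags);
	return 0;
}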
Signed-off-by: Anton Vorontsov Signed-off-by: Grant Likely Cc: David Brownell Cc: Bill Gatliff Cc: Dmitry Eremin-Solenikov Cc: Benjamin Herrenschmidt Cc: Jean Delvare Cc: Andrew Morton CC: linux-kernel@vger.kernel.org CC: devicetree-discuss@lists.ozlabs.org --- arch/powerpc/platforms/52xx/mpc52xx_gpio.c | 2 -- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 3 --- arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 2 -- arch/powerpc/sysdev/cpm1.c | 2 -- arch/powerpc/sysdev/cpm_common.c | 1 - arch/powerpc/sysdev/mpc8xxx_gpio.c | 1 - arch/powerpc/sysdev/ppc4xx_gpio.c | 1 - arch/powerpc/sysdev/qe_lib/gpio.c | 1 - arch/powerpc/sysdev/simple_gpio.c | 1 - drivers/gpio/gpiolib.c | 5 ++++ drivers/gpio/xilinx_gpio.c | 1 - drivers/of/gpio.c | 33 +++++++++++++++++++------- include/linux/of_gpio.h | 7 ++++-- 13 files changed, 34 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c index fd0912eeffe2..0855e804fc0d 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c @@ -161,7 +161,6 @@ static int __devinit mpc52xx_wkup_gpiochip_probe(struct of_device *ofdev, gc = &chip->mmchip.gc; - gc->of_gpio_n_cells = 2; gc->ngpio = 8; gc->direction_input = mpc52xx_wkup_gpio_dir_in; gc->direction_output = mpc52xx_wkup_gpio_dir_out; @@ -325,7 +324,6 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev, gc = &chip->mmchip.gc; - gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = mpc52xx_simple_gpio_dir_in; gc->direction_output = mpc52xx_simple_gpio_dir_out; diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 6e82bd27132c..5d7d607617cd 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -348,10 +348,7 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) gpt->gc.get = mpc52xx_gpt_gpio_get; gpt->gc.set = mpc52xx_gpt_gpio_set; gpt->gc.base = -1; - gpt->gc.of_gpio_n_cells = 2; - gpt->gc.of_xlate = of_gpio_simple_xlate; gpt->gc.of_node = node; - of_node_get(node); /* Setup external pin in GPIO mode */ clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK, diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index f0dbace6185a..59b0ed1a56b1 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -90,8 +90,6 @@ static int mcu_gpiochip_add(struct mcu *mcu) gc->base = -1; gc->set = mcu_gpio_set; gc->direction_output = mcu_gpio_dir_out; - gc->of_gpio_n_cells = 2; - gc->of_xlate = of_gpio_simple_xlate; gc->of_node = np; return gpiochip_add(gc); diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index d5cf7d4ccf81..00852124ff4a 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -633,7 +633,6 @@ int cpm1_gpiochip_add16(struct device_node *np) gc = &mm_gc->gc; mm_gc->save_regs = cpm1_gpio16_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = 16; gc->direction_input = cpm1_gpio16_dir_in; gc->direction_output = cpm1_gpio16_dir_out; @@ -755,7 +754,6 @@ int cpm1_gpiochip_add32(struct device_node *np) gc = &mm_gc->gc; mm_gc->save_regs = cpm1_gpio32_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = cpm1_gpio32_dir_in; gc->direction_output = cpm1_gpio32_dir_out; diff --git a/arch/powerpc/sysdev/cpm_common.c 
b/arch/powerpc/sysdev/cpm_common.c index 67e9b47dcf8e..2b69aa0315b3 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -337,7 +337,6 @@ int cpm2_gpiochip_add32(struct device_node *np) gc = &mm_gc->gc; mm_gc->save_regs = cpm2_gpio32_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = cpm2_gpio32_dir_in; gc->direction_output = cpm2_gpio32_dir_out; diff --git a/arch/powerpc/sysdev/mpc8xxx_gpio.c b/arch/powerpc/sysdev/mpc8xxx_gpio.c index ec8fcd42101e..2b69084d0f0c 100644 --- a/arch/powerpc/sysdev/mpc8xxx_gpio.c +++ b/arch/powerpc/sysdev/mpc8xxx_gpio.c @@ -273,7 +273,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np) gc = &mm_gc->gc; mm_gc->save_regs = mpc8xxx_gpio_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = MPC8XXX_GPIO_PINS; gc->direction_input = mpc8xxx_gpio_dir_in; gc->direction_output = mpc8xxx_gpio_dir_out; diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/sysdev/ppc4xx_gpio.c index 42e7a5eea665..fc65ad1b3293 100644 --- a/arch/powerpc/sysdev/ppc4xx_gpio.c +++ b/arch/powerpc/sysdev/ppc4xx_gpio.c @@ -194,7 +194,6 @@ static int __init ppc4xx_add_gpiochips(void) mm_gc = &ppc4xx_gc->mm_gc; gc = &mm_gc->gc; - gc->of_gpio_n_cells = 2; gc->ngpio = 32; gc->direction_input = ppc4xx_gpio_dir_in; gc->direction_output = ppc4xx_gpio_dir_out; diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c index 32e9440010a1..36bf845df127 100644 --- a/arch/powerpc/sysdev/qe_lib/gpio.c +++ b/arch/powerpc/sysdev/qe_lib/gpio.c @@ -321,7 +321,6 @@ static int __init qe_add_gpiochips(void) gc = &mm_gc->gc; mm_gc->save_regs = qe_gpio_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = QE_PIO_PINS; gc->direction_input = qe_gpio_dir_in; gc->direction_output = qe_gpio_dir_out; diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index b7559aa0c165..b6defda5ccc9 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -103,7 +103,6 @@ static int __init u8_simple_gpiochip_add(struct device_node *np) gc = &mm_gc->gc; mm_gc->save_regs = u8_gpio_save_regs; - gc->of_gpio_n_cells = 2; gc->ngpio = 8; gc->direction_input = u8_gpio_dir_in; gc->direction_output = u8_gpio_dir_out; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 73fd328f6fe4..83cbc34e3a76 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -1099,6 +1100,8 @@ int gpiochip_add(struct gpio_chip *chip) } } + of_gpiochip_add(chip); + unlock: spin_unlock_irqrestore(&gpio_lock, flags); @@ -1133,6 +1136,8 @@ int gpiochip_remove(struct gpio_chip *chip) spin_lock_irqsave(&gpio_lock, flags); + of_gpiochip_remove(chip); + for (id = chip->base; id < chip->base + chip->ngpio; id++) { if (test_bit(FLAG_REQUESTED, &gpio_desc[id].flags)) { status = -EBUSY; diff --git a/drivers/gpio/xilinx_gpio.c b/drivers/gpio/xilinx_gpio.c index 2993c40b48e1..709690995d0d 100644 --- a/drivers/gpio/xilinx_gpio.c +++ b/drivers/gpio/xilinx_gpio.c @@ -190,7 +190,6 @@ static int __devinit xgpio_of_probe(struct device_node *np) spin_lock_init(&chip->gpio_lock); - chip->mmchip.gc.of_gpio_n_cells = 2; chip->mmchip.gc.direction_input = xgpio_dir_in; chip->mmchip.gc.direction_output = xgpio_dir_out; chip->mmchip.gc.get = xgpio_get; diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c index c8618d3282cf..09f05a178668 100644 --- a/drivers/of/gpio.c +++ b/drivers/of/gpio.c @@ -125,8 +125,8 @@ EXPORT_SYMBOL(of_gpio_count); * 
gpio chips. This function performs only one sanity check: whether gpio * is less than ngpios (that is specified in the gpio_chip). */ -int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, - const void *gpio_spec, u32 *flags) +static int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, + const void *gpio_spec, u32 *flags) { const __be32 *gpio = gpio_spec; const u32 n = be32_to_cpup(gpio); @@ -150,7 +150,6 @@ int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, return n; } -EXPORT_SYMBOL(of_gpio_simple_xlate); /** * of_mm_gpiochip_add - Add memory mapped GPIO chip (bank) @@ -187,9 +186,6 @@ int of_mm_gpiochip_add(struct device_node *np, gc->base = -1; - if (!gc->of_xlate) - gc->of_xlate = of_gpio_simple_xlate; - if (mm_gc->save_regs) mm_gc->save_regs(mm_gc); @@ -199,9 +195,6 @@ int of_mm_gpiochip_add(struct device_node *np, if (ret) goto err2; - /* We don't want to lose the node and its ->data */ - of_node_get(np); - pr_debug("%s: registered as generic GPIO chip, base is %d\n", np->full_name, gc->base); return 0; @@ -216,6 +209,28 @@ err0: } EXPORT_SYMBOL(of_mm_gpiochip_add); +void of_gpiochip_add(struct gpio_chip *chip) +{ + if ((!chip->of_node) && (chip->dev)) + chip->of_node = chip->dev->of_node; + + if (!chip->of_node) + return; + + if (!chip->of_xlate) { + chip->of_gpio_n_cells = 2; + chip->of_xlate = of_gpio_simple_xlate; + } + + of_node_get(chip->of_node); +} + +void of_gpiochip_remove(struct gpio_chip *chip) +{ + if (chip->of_node) + of_node_put(chip->of_node); +} + /* Private function for resolving node pointer to gpio_chip */ static int of_gpiochip_is_match(struct gpio_chip *chip, void *data) { diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 1020587efed1..6598c04dab01 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -52,9 +52,9 @@ extern unsigned int of_gpio_count(struct device_node *np); extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); -extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np, - const void *gpio_spec, u32 *flags); +extern void of_gpiochip_add(struct gpio_chip *gc); +extern void of_gpiochip_remove(struct gpio_chip *gc); extern struct gpio_chip *of_node_to_gpiochip(struct device_node *np); #else /* CONFIG_OF_GPIO */ @@ -71,6 +71,9 @@ static inline unsigned int of_gpio_count(struct device_node *np) return 0; } +static inline void of_gpiochip_add(struct gpio_chip *gc) { } +static inline void of_gpiochip_remove(struct gpio_chip *gc) { } + #endif /* CONFIG_OF_GPIO */ /** -- cgit v1.2.3 From 8cec0e7b4c7c0b76f2b5285f250211ad81c3eafd Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:17 -0600 Subject: of/device: Add OF style matching helper function Add of_driver_match_device() helper function. This function can be used by bus types to determine if a driver works with a device when using OF style matching. If CONFIG_OF is unselected, then it is a nop. 
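OF style matching means the driver publishes a table of acceptable node properties (typically "compatible" strings) and the bus core compares a device's node against it. A simplified userspace model of the idea follows; the real of_match_device()/of_match_node() also checks name and device_type fields, and the structures here are invented stand-ins, not the kernel ABI:

#include <stdio.h>
#include <string.h>

struct of_match {
	const char *compatible;	/* a NULL compatible terminates the table */
};

struct device {
	const char *compatible;	/* NULL models a device without an OF node */
};

static const struct of_match *match_device(const struct of_match *matches,
					   const struct device *dev)
{
	/* mirrors the NULL-table and missing-node checks in the diff */
	if (!matches || !dev->compatible)
		return NULL;
	for (; matches->compatible; matches++)
		if (!strcmp(matches->compatible, dev->compatible))
			return matches;
	return NULL;
}

/* The helper the commit adds: a thin predicate over the table lookup. */
static int driver_match_device(const struct device *dev,
			       const struct of_match *tbl)
{
	return match_device(tbl, dev) != NULL;
}

int main(void)
{
	static const struct of_match tbl[] = {
		{ "fsl,mpc8349-i2c" }, { "fsl,mpc5200-i2c" }, { NULL },
	};
	struct device dev = { "fsl,mpc5200-i2c" };

	printf("match: %d\n", driver_match_device(&dev, tbl));
	return 0;
}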
Signed-off-by: Grant Likely CC: Greg Kroah-Hartman CC: Michal Simek CC: Grant Likely CC: Benjamin Herrenschmidt CC: Stephen Rothwell CC: linux-kernel@vger.kernel.org CC: microblaze-uclinux@itee.uq.edu.au CC: linuxppc-dev@ozlabs.org CC: devicetree-discuss@lists.ozlabs.org --- drivers/of/device.c | 2 +- include/linux/of_device.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/of/device.c b/drivers/of/device.c index c2a98f5ca80d..5282a202f5a9 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -20,7 +20,7 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { - if (!dev->of_node) + if ((!matches) || (!dev->of_node)) return NULL; return of_match_node(matches, dev->of_node); } diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 238e92e007e6..91d75fb0c726 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -30,6 +30,17 @@ extern const struct of_device_id *of_match_device( const struct of_device_id *matches, const struct device *dev); +/** + * of_driver_match_device - Tell if a driver's of_match_table matches a device. + * @drv: the device_driver structure to test + * @dev: the device structure to match against + */ +static inline int of_driver_match_device(const struct device *dev, + const struct device_driver *drv) +{ + return of_match_device(drv->of_match_table, dev) != NULL; +} + extern struct of_device *of_dev_get(struct of_device *dev); extern void of_dev_put(struct of_device *dev); @@ -48,6 +59,14 @@ extern ssize_t of_device_get_modalias(struct device *dev, extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env); +#else /* CONFIG_OF_DEVICE */ + +static inline int of_driver_match_device(struct device *dev, + struct device_driver *drv) +{ + return 0; +} + #endif /* CONFIG_OF_DEVICE */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3 From f9f5a4669f1334a558f102c311debfd008e7c2bc Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 9 Jun 2010 22:22:17 -0600 Subject: of/device: Move struct of_device define outside of CONFIG_OF_DEVICE test Some code uses of_device even when CONFIG_OF_DEVICE is not set. This patch makes of_device valid all the time by moving it outside of the ifdef CONFIG_OF_DEVICE test. Reported-by: Randy Dunlap Signed-off-by: Grant Likely Acked-by: Randy Dunlap --- include/linux/of_device.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 91d75fb0c726..7d27f5a878f6 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -1,13 +1,6 @@ #ifndef _LINUX_OF_DEVICE_H #define _LINUX_OF_DEVICE_H -#ifdef CONFIG_OF_DEVICE -#include -#include -#include -#include - - /* * The of_device *was* a kind of "base class" that was a superset of * struct device for use by devices attached to an OF node and probed @@ -22,7 +15,12 @@ * from the kernel. */ #define of_device platform_device +#include +#ifdef CONFIG_OF_DEVICE +#include +#include +#include #include #define to_of_device(d) container_of(d, struct of_device, dev) -- cgit v1.2.3 From 9fd049927ccba1c1d0343239b82f28c4e07fb95d Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:18 -0600 Subject: of/i2c: Generalize OF support This patch cleans up the i2c OF support code to make it selectable by all architectures and allow for automatic registration of i2c devices. 
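The automatic registration hinges on the adapter carrying a device-tree node: of_i2c_register_devices() walks the node's children, derives a modalias, and validates the "reg" address before creating each client, skipping bad entries rather than aborting the walk. A small self-contained model of that loop is sketched below; the child list is hard-coded (no real device tree is parsed), the names are invented, and only the 10-bit address bound mirrors the check in the diff:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct child_node {
	const char *modalias;	/* from the node's compatible string */
	int32_t reg;		/* i2c address; -1 models a missing "reg" */
};

static const struct child_node children[] = {
	{ "rtc-ds1307",   0x68 },
	{ "eeprom",       -1   },	/* invalid: no reg property */
	{ "gpio-pca953x", 0x20 },
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(children) / sizeof(children[0]); i++) {
		const struct child_node *node = &children[i];

		/* validate the address, then skip and keep walking on error */
		if (node->reg < 0 || node->reg > (1 << 10) - 1) {
			fprintf(stderr, "of_i2c: invalid reg on %s\n",
				node->modalias);
			continue;
		}
		printf("register %s at 0x%02x\n", node->modalias,
		       (unsigned)node->reg);
	}
	return 0;
}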
Signed-off-by: Grant Likely --- drivers/i2c/busses/i2c-cpm.c | 3 ++- drivers/i2c/busses/i2c-ibm_iic.c | 3 ++- drivers/i2c/busses/i2c-mpc.c | 3 ++- drivers/of/Kconfig | 2 +- drivers/of/of_i2c.c | 50 +++++++++++++++++++++++----------------- include/linux/of_i2c.h | 13 ++++++++--- 6 files changed, 46 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index b02b4533651d..03ae62e69596 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -652,6 +652,7 @@ static int __devinit cpm_i2c_probe(struct of_device *ofdev, cpm->adap = cpm_ops; i2c_set_adapdata(&cpm->adap, cpm); cpm->adap.dev.parent = &ofdev->dev; + cpm->adap.dev.of_node = of_node_get(ofdev->dev.of_node); result = cpm_i2c_setup(cpm); if (result) { @@ -679,7 +680,7 @@ static int __devinit cpm_i2c_probe(struct of_device *ofdev, /* * register OF I2C devices */ - of_register_i2c_devices(&cpm->adap, ofdev->dev.of_node); + of_i2c_register_devices(&cpm->adap); return 0; out_shut: diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index bf344135647a..d9641210dd3a 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -745,6 +745,7 @@ static int __devinit iic_probe(struct of_device *ofdev, /* Register it with i2c layer */ adap = &dev->adap; adap->dev.parent = &ofdev->dev; + adap->dev.of_node = of_node_get(np); strlcpy(adap->name, "IBM IIC", sizeof(adap->name)); i2c_set_adapdata(adap, dev); adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; @@ -761,7 +762,7 @@ static int __devinit iic_probe(struct of_device *ofdev, dev->fast_mode ? "fast (400 kHz)" : "standard (100 kHz)"); /* Now register all the child nodes */ - of_register_i2c_devices(adap, np); + of_i2c_register_devices(adap); return 0; diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index df00eb1f11f9..d2e26d290e7a 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -600,13 +600,14 @@ static int __devinit fsl_i2c_probe(struct of_device *op, i2c->adap = mpc_ops; i2c_set_adapdata(&i2c->adap, i2c); i2c->adap.dev.parent = &op->dev; + i2c->adap.dev.of_node = of_node_get(op->dev.of_node); result = i2c_add_adapter(&i2c->adap); if (result < 0) { dev_err(i2c->dev, "failed to add adapter\n"); goto fail_add; } - of_register_i2c_devices(&i2c->adap, op->dev.of_node); + of_i2c_register_devices(&i2c->adap); return result; diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 097f42aebe90..80dd6318db68 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -26,7 +26,7 @@ config OF_GPIO config OF_I2C def_tristate I2C - depends on (PPC_OF || MICROBLAZE) && I2C + depends on OF && !SPARC && I2C help OpenFirmware I2C accessors diff --git a/drivers/of/of_i2c.c b/drivers/of/of_i2c.c index ab6522c8e4fe..0a694debd226 100644 --- a/drivers/of/of_i2c.c +++ b/drivers/of/of_i2c.c @@ -14,57 +14,65 @@ #include #include #include +#include #include -void of_register_i2c_devices(struct i2c_adapter *adap, - struct device_node *adap_node) +void of_i2c_register_devices(struct i2c_adapter *adap) { void *result; struct device_node *node; - for_each_child_of_node(adap_node, node) { + /* Only register child devices if the adapter has a node pointer set */ + if (!adap->dev.of_node) + return; + + dev_dbg(&adap->dev, "of_i2c: walking child nodes\n"); + + for_each_child_of_node(adap->dev.of_node, node) { struct i2c_board_info info = {}; struct dev_archdata dev_ad = {}; const __be32 *addr; int len; - if 
(of_modalias_node(node, info.type, sizeof(info.type)) < 0) + dev_dbg(&adap->dev, "of_i2c: register %s\n", node->full_name); + + if (of_modalias_node(node, info.type, sizeof(info.type)) < 0) { + dev_err(&adap->dev, "of_i2c: modalias failure on %s\n", + node->full_name); continue; + } addr = of_get_property(node, "reg", &len); - if (!addr || len < sizeof(int) || *addr > (1 << 10) - 1) { - printk(KERN_ERR - "of-i2c: invalid i2c device entry\n"); + if (!addr || (len < sizeof(int))) { + dev_err(&adap->dev, "of_i2c: invalid reg on %s\n", + node->full_name); continue; } - info.irq = irq_of_parse_and_map(node, 0); - info.addr = be32_to_cpup(addr); + if (info.addr > (1 << 10) - 1) { + dev_err(&adap->dev, "of_i2c: invalid addr=%x on %s\n", + info.addr, node->full_name); + continue; + } - info.of_node = node; + info.irq = irq_of_parse_and_map(node, 0); + info.of_node = of_node_get(node); info.archdata = &dev_ad; request_module("%s", info.type); result = i2c_new_device(adap, &info); if (result == NULL) { - printk(KERN_ERR - "of-i2c: Failed to load driver for %s\n", - info.type); + dev_err(&adap->dev, "of_i2c: Failure registering %s\n", + node->full_name); + of_node_put(node); irq_dispose_mapping(info.irq); continue; } - - /* - * Get the node to not lose the dev_archdata->of_node. - * Currently there is no way to put it back, as well as no - * of_unregister_i2c_devices() call. - */ - of_node_get(node); } } -EXPORT_SYMBOL(of_register_i2c_devices); +EXPORT_SYMBOL(of_i2c_register_devices); static int of_dev_node_match(struct device *dev, void *data) { diff --git a/include/linux/of_i2c.h b/include/linux/of_i2c.h index 34974b5a76f7..0efe8d465f55 100644 --- a/include/linux/of_i2c.h +++ b/include/linux/of_i2c.h @@ -12,12 +12,19 @@ #ifndef __LINUX_OF_I2C_H #define __LINUX_OF_I2C_H +#if defined(CONFIG_OF_I2C) || defined(CONFIG_OF_I2C_MODULE) #include <linux/i2c.h> -void of_register_i2c_devices(struct i2c_adapter *adap, - struct device_node *adap_node); +extern void of_i2c_register_devices(struct i2c_adapter *adap); /* must call put_device() when done with returned i2c_client device */ -struct i2c_client *of_find_i2c_device_by_node(struct device_node *node); +extern struct i2c_client *of_find_i2c_device_by_node(struct device_node *node); + +#else +static inline void of_i2c_register_devices(struct i2c_adapter *adap) +{ + return; +} +#endif /* CONFIG_OF_I2C */ #endif /* __LINUX_OF_I2C_H */ -- cgit v1.2.3 From 8eab945c5616fc984e97b922d6a2559be93f39a1 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 1 Jul 2010 18:05:56 +0300 Subject: sunrpc: make the cache cleaner workqueue deferrable This patch makes the cache_cleaner workqueue deferrable, to prevent unnecessary system wake-ups, which is very important for embedded battery-powered devices. do_cache_clean() is called every 30 seconds at the moment, and often makes the system wake up from its power-save sleep state. With this change, when the workqueue uses a deferrable timer, the do_cache_clean() invocation will be delayed and combined with the closest "real" wake-up. This improves the power consumption situation. Note, I tried to create a DECLARE_DELAYED_WORK_DEFERRABLE() helper macro, similar to DECLARE_DELAYED_WORK(), but failed because of the way the timer wheel core stores the deferrable flag (it is the LSBit in the timer->base pointer). My attempt to define a static variable with this bit set ended up with the "initializer element is not constant" error.
Thus, I have to use run-time initialization, so I created a new cache_initialize() function which is called once when sunrpc is being initialized. Signed-off-by: Artem Bityutskiy Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/cache.h | 1 + net/sunrpc/cache.c | 7 ++++++- net/sunrpc/sunrpc_syms.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 6f52b4d7c447..7bf3e84b92f4 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -192,6 +192,7 @@ extern int cache_check(struct cache_detail *detail, extern void cache_flush(void); extern void cache_purge(struct cache_detail *detail); #define NEVER (0x7FFFFFFF) +extern void __init cache_initialize(void); extern int cache_register(struct cache_detail *cd); extern void cache_unregister(struct cache_detail *cd); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 58de76c8540c..939d048ef92b 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -320,7 +320,7 @@ static struct cache_detail *current_detail; static int current_index; static void do_cache_clean(struct work_struct *work); -static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); +static struct delayed_work cache_cleaner; static void sunrpc_init_cache_detail(struct cache_detail *cd) { @@ -1504,6 +1504,11 @@ static int create_cache_proc_entries(struct cache_detail *cd) } #endif +void __init cache_initialize(void) +{ + INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean); +} + int cache_register(struct cache_detail *cd) { int ret; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f438347d817b..c52b18489149 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -43,6 +43,7 @@ init_sunrpc(void) #ifdef CONFIG_PROC_FS rpc_proc_init(); #endif + cache_initialize(); cache_register(&ip_map_cache); cache_register(&unix_gid_cache); svc_init_xprt_sock(); /* svc sock transport */ -- cgit v1.2.3 From de5d9bf6541736dc7ad264d2b5cc99bc1b2ad958 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 2 Jul 2010 13:41:14 -0400 Subject: Move list types from <linux/list.h> to <linux/types.h>. This allows a list_head (or hlist_head, etc.) to be used from places that used to be impractical, in particular <asm/processor.h>, which used to cause include file recursion: <linux/list.h> includes <linux/prefetch.h>, which always includes <asm/processor.h> for the prefetch macros, as well as <asm/system.h>, which often includes <linux/list.h> directly or indirectly. This avoids a lot of painful workaround hackery on the tile architecture, where we use a list_head in the thread_struct to chain together tasks that are activated on a particular hardwall. Signed-off-by: Chris Metcalf Reviewed-by: Matthew Wilcox --- include/linux/list.h | 13 +------------ include/linux/types.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 8392884a2977..bc43e8a0d7f6 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1,6 +1,7 @@ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H +#include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/prefetch.h> @@ -16,10 +17,6 @@ * using the generic single-entry routines. */ -struct list_head { - struct list_head *next, *prev; -}; - #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ @@ -551,14 +548,6 @@ static inline void list_splice_tail_init(struct list_head *list, * You lose the ability to access the tail in O(1).
*/ -struct hlist_head { - struct hlist_node *first; -}; - -struct hlist_node { - struct hlist_node *next, **pprev; -}; - #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) diff --git a/include/linux/types.h b/include/linux/types.h index 23d237a075e2..336cc39c46f1 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -197,6 +197,18 @@ typedef struct { } atomic64_t; #endif +struct list_head { + struct list_head *next, *prev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; -- cgit v1.2.3 From 250b2b6dd421c9f8844a867d2ac06e0661e0ad93 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Mon, 21 Jun 2010 23:24:35 +0200 Subject: firewire: cdev: fix fw_cdev_event_bus_reset.bm_node_id Fix an obscure ABI feature that is a bit of a hassle to implement. However, somebody put it into the ABI, so let's fill in a sensible value there. Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 12 +++++++++--- drivers/firewire/core-cdev.c | 2 +- drivers/firewire/core-topology.c | 1 + include/linux/firewire-cdev.h | 5 +++++ include/linux/firewire.h | 1 + 5 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 11fc81500f82..6c316cfe70c4 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -239,7 +239,7 @@ static void fw_card_bm_work(struct work_struct *work) struct fw_card *card = container_of(work, struct fw_card, work.work); struct fw_device *root_device; struct fw_node *root_node; - int root_id, new_root_id, irm_id, local_id; + int root_id, new_root_id, irm_id, bm_id, local_id; int gap_count, generation, grace, rcode; bool do_reset = false; bool root_device_is_running; @@ -301,9 +301,15 @@ static void fw_card_bm_work(struct work_struct *work) /* Another bus reset, BM work has been rescheduled. */ goto out; - if (rcode == RCODE_COMPLETE && - card->bm_transaction_data[0] != cpu_to_be32(0x3f)) { + bm_id = be32_to_cpu(card->bm_transaction_data[0]); + spin_lock_irq(&card->lock); + if (rcode == RCODE_COMPLETE && generation == card->generation) + card->bm_node_id = + bm_id == 0x3f ? local_id : 0xffc0 | bm_id; + spin_unlock_irq(&card->lock); + + if (rcode == RCODE_COMPLETE && bm_id != 0x3f) { /* Somebody else is BM. Only act as IRM. */ if (local_id == irm_id) allocate_broadcast_channel(card, generation); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 9b8df2039155..d8ac0ce2d6bf 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -318,7 +318,7 @@ static void fill_bus_reset_event(struct fw_cdev_event_bus_reset *event, event->generation = client->device->generation; event->node_id = client->device->node_id; event->local_node_id = card->local_node->node_id; - event->bm_node_id = 0; /* FIXME: We don't track the BM. 
*/ + event->bm_node_id = card->bm_node_id; event->irm_node_id = card->irm_node->node_id; event->root_node_id = card->root_node->node_id; diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 56e908ba43f1..88d5133ae702 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -552,6 +552,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, smp_wmb(); card->generation = generation; card->reset_jiffies = jiffies; + card->bm_node_id = 0xffff; card->bm_abdicate = bm_abdicate; fw_schedule_bm_work(card, 0); diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 8b9b27373219..d31022b05bd9 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -66,6 +66,10 @@ struct fw_cdev_event_common { * This event is sent when the bus the device belongs to goes through a bus * reset. It provides information about the new bus configuration, such as * new node ID for this device, new root ID, and others. + * + * If @bm_node_id is 0xffff right after bus reset it can be reread by an + * %FW_CDEV_IOC_GET_INFO ioctl after bus manager selection was finished. + * Kernels with ABI version < 4 do not set @bm_node_id. */ struct fw_cdev_event_bus_reset { __u64 closure; @@ -348,6 +352,7 @@ union fw_cdev_event { * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2 + * - implemented &fw_cdev_event_bus_reset.bm_node_id */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h index e44b502c8341..db30a752a87a 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -119,6 +119,7 @@ struct fw_card { int bm_retries; int bm_generation; __be32 bm_transaction_data[2]; + int bm_node_id; bool bm_abdicate; bool priority_budget_implemented; /* controller feature */ -- cgit v1.2.3 From ffa71f33a820d1ab3f2fc5723819ac60fb76080b Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:22:40 +0900 Subject: x86, ioremap: Fix incorrect physical address handling in PAE mode The current x86 ioremap() doesn't properly handle physical addresses wider than 32 bits in X86_32 PAE mode. When a physical address above 32 bits is passed to ioremap(), the upper 32 bits of the physical address are wrongly cleared. Due to this bug, ioremap() can map the wrong address into the linear address space. In my case, a 64-bit MMIO region was assigned to a PCI device (an ioat device) on my system. Because of the ioremap() bug, a wrong physical address (instead of the MMIO region) was mapped into the linear address space. As a result, loading the ioatdma driver caused unexpected behavior (kernel panic, kernel hangup, ...). Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE680.7090408@jp.fujitsu.com> Signed-off-by: H.
Peter Anvin --- arch/x86/mm/ioremap.c | 12 +++++------- include/linux/io.h | 4 ++-- include/linux/vmalloc.h | 2 +- lib/ioremap.c | 10 +++++----- mm/vmalloc.c | 2 +- 5 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12e4d2d3c110..754cb4cbce66 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, vaddr; - resource_size_t last_addr; + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; @@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; - (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); - pfn++) { - + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) @@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, diff --git a/include/linux/io.h b/include/linux/io.h index 6c7f0ba0d5fa..7fd2d2138bf3 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -29,10 +29,10 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU int ioremap_page_range(unsigned long addr, unsigned long end, - unsigned long phys_addr, pgprot_t prot); + phys_addr_t phys_addr, pgprot_t prot); #else static inline int ioremap_page_range(unsigned long addr, unsigned long end, - unsigned long phys_addr, pgprot_t prot) + phys_addr_t phys_addr, pgprot_t prot) { return 0; } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 227c2a585e4f..de05e96e0a70 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -30,7 +30,7 @@ struct vm_struct { unsigned long flags; struct page **pages; unsigned int nr_pages; - unsigned long phys_addr; + phys_addr_t phys_addr; void *caller; }; diff --git a/lib/ioremap.c b/lib/ioremap.c index 14c6078f17a2..5730ecd3eb66 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -13,10 +13,10 @@ #include static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pte_t *pte; - unsigned long pfn; + u64 pfn; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel(pmd, addr); @@ -31,7 +31,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pmd_t *pmd; unsigned long next; @@ -49,7 +49,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t 
phys_addr, pgprot_t prot) { pud_t *pud; unsigned long next; @@ -67,7 +67,7 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, } int ioremap_page_range(unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae007462b7f6..b7e314b1009f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2403,7 +2403,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%lx", v->phys_addr); + seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) seq_printf(m, " ioremap"); -- cgit v1.2.3 From a1d75f258230b75d46aecdf28b2e732413028863 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 Jul 2010 14:41:40 +0200 Subject: fuse: add store request Userspace filesystem can request data to be stored in the inode's mapping. This request is synchronous and has no reply. If the write to the fuse device returns an error then the store request was not fully completed (but may have updated some pages). If the stored data overflows the current file size, then the size is extended, similarly to a write(2) on the filesystem. Pages which have been completely stored are marked uptodate. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 2 ++ include/linux/fuse.h | 13 +++++++- 4 files changed, 103 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7eb80d33c4f3..8e01c865586e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1231,6 +1231,91 @@ err: return err; } +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + static int 
fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1244,6 +1329,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INVAL_ENTRY: return fuse_notify_inval_entry(fc, size, cs); + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ada0adeb3bb5..147c1f71bdb9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } -static void fuse_write_update_size(struct inode *inode, loff_t pos) +void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f309f04064e..61267d8d527b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -748,4 +748,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); +void fuse_write_update_size(struct inode *inode, loff_t pos); + #endif /* _FS_FUSE_I_H */ diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 88e0eb596919..a90bd49834aa 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -37,6 +37,9 @@ * * 7.14 * - add splice support to fuse device + * + * 7.15 + * - add store notify */ #ifndef _LINUX_FUSE_H @@ -68,7 +71,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 14 +#define FUSE_KERNEL_MINOR_VERSION 15 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -260,6 +263,7 @@ enum fuse_notify_code { FUSE_NOTIFY_POLL = 1, FUSE_NOTIFY_INVAL_INODE = 2, FUSE_NOTIFY_INVAL_ENTRY = 3, + FUSE_NOTIFY_STORE = 4, FUSE_NOTIFY_CODE_MAX, }; @@ -568,4 +572,11 @@ struct fuse_notify_inval_entry_out { __u32 padding; }; +struct fuse_notify_store_out { + __u64 nodeid; + __u64 offset; + __u32 size; + __u32 padding; +}; + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From 2d45ba381a74a743eeaa2b06c7c5c0d2bf73ba1a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 Jul 2010 14:41:40 +0200 Subject: fuse: add retrieve request Userspace filesystem can request data to be retrieved from the inode's mapping. This request is synchronous and the retrieved data is queued as a new request. If the write to the fuse device returns an error then the retrieve request was not completed and a reply will not be sent. Only present pages are returned in the retrieve reply. Retrieving stops when it finds a non-present page and only data prior to that is returned. This request doesn't change the dirty state of pages. 
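The retrieve path therefore gathers pages until it hits a hole: it starts at an offset within the first page, takes whole pages after that, and returns only the bytes accumulated before the first absent page. Below is a freestanding sketch of that gathering loop, using an in-memory array as a stand-in page cache; the page size, cache contents and names are invented for illustration:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096

/* A tiny page cache: a NULL slot models a page that is not present. */
static char page0[PAGE_SIZE], page1[PAGE_SIZE];
static char *cache[] = { page0, page1, NULL, page0 };

static size_t retrieve(size_t pos, size_t want)
{
	size_t index = pos / PAGE_SIZE;
	size_t offset = pos % PAGE_SIZE;	/* only the first page is partial */
	size_t total = 0;

	while (want && index < sizeof(cache) / sizeof(cache[0])) {
		size_t chunk = PAGE_SIZE - offset;

		if (!cache[index])
			break;		/* hole: stop, return what we have */
		if (chunk > want)
			chunk = want;
		want -= chunk;
		total += chunk;
		offset = 0;		/* later pages are taken from byte 0 */
		index++;
	}
	return total;
}

int main(void)
{
	/* Ask for 3 pages starting 100 bytes in; the hole at page 2 stops us. */
	printf("retrieved %zu bytes\n", retrieve(100, 3 * PAGE_SIZE));
	return 0;
}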
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++- fs/fuse/fuse_i.h | 1 + include/linux/fuse.h | 21 +++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8e01c865586e..69ad053ffd78 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc) static void queue_request(struct fuse_conn *fc, struct fuse_req *req) { - req->in.h.unique = fuse_get_unique(fc); req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); @@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); } } @@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) else if (fc->conn_error) req->out.h.error = -ECONNREFUSED; else { + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); /* acquire extra reference, since request is still needed after request_end() */ @@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + /* * Called under fc->lock * @@ -1316,6 +1334,114 @@ out_finish: return err; } +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + page_cache_release(page); + } +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + offset = 0; + num -= this_num; + total_len += this_num; + index++; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc,
unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1332,6 +1458,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_STORE: return fuse_notify_store(fc, size, cs); + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 61267d8d527b..57d4a3a0f102 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,7 @@ struct fuse_req { struct fuse_write_in in; struct fuse_write_out out; } write; + struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; } misc; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index a90bd49834aa..c3c578e09833 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -40,6 +40,7 @@ * * 7.15 * - add store notify + * - add retrieve notify */ #ifndef _LINUX_FUSE_H @@ -254,6 +255,7 @@ enum fuse_opcode { FUSE_DESTROY = 38, FUSE_IOCTL = 39, FUSE_POLL = 40, + FUSE_NOTIFY_REPLY = 41, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -264,6 +266,7 @@ enum fuse_notify_code { FUSE_NOTIFY_INVAL_INODE = 2, FUSE_NOTIFY_INVAL_ENTRY = 3, FUSE_NOTIFY_STORE = 4, + FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_CODE_MAX, }; @@ -579,4 +582,22 @@ struct fuse_notify_store_out { __u32 padding; }; +struct fuse_notify_retrieve_out { + __u64 notify_unique; + __u64 nodeid; + __u64 offset; + __u32 size; + __u32 padding; +}; + +/* Matches the size of fuse_write_in */ +struct fuse_notify_retrieve_in { + __u64 dummy1; + __u64 offset; + __u32 size; + __u32 dummy2; + __u64 dummy3; + __u64 dummy4; +}; + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From 02d37bed188c500ee7afb0a2dc6b65a80704c58e Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 8 Jul 2010 16:09:06 +0200 Subject: firewire: core: integrate software-forced bus resets with bus management Bus resets which are triggered - by the kernel drivers after updates of the local nodes' config ROM, - by userspace software via ioctl shall be deferred until after >=2 seconds after the last bus reset. If multiple modifications of the local nodes' config ROM happen in a row, only a single bus reset should happen after them. When the local node's link goes from inactive to active or vice versa, and at the two occasions of bus resets mentioned above --- and if the current gap count differs from 63 --- the bus reset should be preceded by a PHY configuration packet that reaffirms the gap count. Otherwise a bus manager would have to reset the bus again right after that. This is necessary to promote bus stability, e.g. leave grace periods for allocations and reallocations of isochronous channels and bandwidth, SBP-2 reconnections etc.; see IEEE 1394 clause 8.2.1. 
This change implements all of the above by moving bus reset initiation into a delayed work (except for bus resets which are triggered by the bus manager workqueue job and are performed there immediately). It comes with a necessary addition to the card driver methods that allows the current gap count to be read from the PHY registers. Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 68 ++++++++++++++++++++++++++++--------- drivers/firewire/core-cdev.c | 3 +- drivers/firewire/core-transaction.c | 18 ++++++++-- drivers/firewire/core.h | 6 +++- drivers/firewire/ohci.c | 53 +++++++++++++++++++++-------- include/linux/firewire.h | 6 ++-- 6 files changed, 116 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 6c316cfe70c4..2bb5c036e806 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -204,6 +204,45 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc) } EXPORT_SYMBOL(fw_core_remove_descriptor); +static int reset_bus(struct fw_card *card, bool short_reset) +{ + int reg = short_reset ? 5 : 1; + int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET; + + return card->driver->update_phy_reg(card, reg, 0, bit); +} + +void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset) +{ + /* We don't try hard to sort out requests of long vs. short resets. */ + card->br_short = short_reset; + + /* Use an arbitrary short delay to combine multiple reset requests. */ + fw_card_get(card); + if (!schedule_delayed_work(&card->br_work, + delayed ? DIV_ROUND_UP(HZ, 100) : 0)) + fw_card_put(card); +} +EXPORT_SYMBOL(fw_schedule_bus_reset); + +static void br_work(struct work_struct *work) +{ + struct fw_card *card = container_of(work, struct fw_card, br_work.work); + + /* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. */ + if (card->reset_jiffies != 0 && + time_is_after_jiffies(card->reset_jiffies + 2 * HZ)) { + if (!schedule_delayed_work(&card->br_work, 2 * HZ)) + fw_card_put(card); + return; + } + + fw_send_phy_config(card, FW_PHY_CONFIG_NO_NODE_ID, card->generation, + FW_PHY_CONFIG_CURRENT_GAP_COUNT); + reset_bus(card, card->br_short); + fw_card_put(card); +} + static void allocate_broadcast_channel(struct fw_card *card, int generation) { int channel, bandwidth = 0; @@ -230,13 +269,13 @@ static const char gap_count_table[] = { void fw_schedule_bm_work(struct fw_card *card, unsigned long delay) { fw_card_get(card); - if (!schedule_delayed_work(&card->work, delay)) + if (!schedule_delayed_work(&card->bm_work, delay)) fw_card_put(card); } -static void fw_card_bm_work(struct work_struct *work) +static void bm_work(struct work_struct *work) { - struct fw_card *card = container_of(work, struct fw_card, work.work); + struct fw_card *card = container_of(work, struct fw_card, bm_work.work); struct fw_device *root_device; struct fw_node *root_node; int root_id, new_root_id, irm_id, bm_id, local_id; @@ -413,7 +452,7 @@ static void fw_card_bm_work(struct work_struct *work) fw_notify("phy config: card %d, new root=%x, gap_count=%d\n", card->index, new_root_id, gap_count); fw_send_phy_config(card, new_root_id, generation, gap_count); - fw_core_initiate_bus_reset(card, 1); + reset_bus(card, true); /* Will allocate broadcast channel after the reset.
*/ goto out; } @@ -465,7 +504,8 @@ void fw_card_initialize(struct fw_card *card, card->local_node = NULL; - INIT_DELAYED_WORK(&card->work, fw_card_bm_work); + INIT_DELAYED_WORK(&card->br_work, br_work); + INIT_DELAYED_WORK(&card->bm_work, bm_work); } EXPORT_SYMBOL(fw_card_initialize); @@ -491,7 +531,6 @@ int fw_card_add(struct fw_card *card, } EXPORT_SYMBOL(fw_card_add); - /* * The next few functions implement a dummy driver that is used once a card * driver shuts down an fw_card. This allows the driver to cleanly unload, @@ -507,6 +546,11 @@ static int dummy_enable(struct fw_card *card, return -1; } +static int dummy_read_phy_reg(struct fw_card *card, int address) +{ + return -ENODEV; +} + static int dummy_update_phy_reg(struct fw_card *card, int address, int clear_bits, int set_bits) { @@ -547,6 +591,7 @@ static int dummy_enable_phys_dma(struct fw_card *card, static const struct fw_card_driver dummy_driver_template = { .enable = dummy_enable, + .read_phy_reg = dummy_read_phy_reg, .update_phy_reg = dummy_update_phy_reg, .set_config_rom = dummy_set_config_rom, .send_request = dummy_send_request, @@ -568,7 +613,7 @@ void fw_core_remove_card(struct fw_card *card) card->driver->update_phy_reg(card, 4, PHY_LINK_ACTIVE | PHY_CONTENDER, 0); - fw_core_initiate_bus_reset(card, 1); + fw_schedule_bus_reset(card, false, true); mutex_lock(&card_mutex); list_del_init(&card->link); @@ -588,12 +633,3 @@ void fw_core_remove_card(struct fw_card *card) WARN_ON(!list_empty(&card->transaction_list)); } EXPORT_SYMBOL(fw_core_remove_card); - -int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset) -{ - int reg = short_reset ? 5 : 1; - int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET; - - return card->driver->update_phy_reg(card, reg, 0, bit); -} -EXPORT_SYMBOL(fw_core_initiate_bus_reset); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 7a690c466ce9..ee2e87353102 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -820,8 +820,9 @@ static int ioctl_send_response(struct client *client, union ioctl_arg *arg) static int ioctl_initiate_bus_reset(struct client *client, union ioctl_arg *arg) { - return fw_core_initiate_bus_reset(client->device->card, + fw_schedule_bus_reset(client->device->card, true, arg->initiate_bus_reset.type == FW_CDEV_SHORT_RESET); + return 0; } static void release_descriptor(struct client *client, diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 7813da8a1293..5f5a7852f7ac 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -426,9 +426,21 @@ void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count) { long timeout = DIV_ROUND_UP(HZ, 10); - u32 data = PHY_IDENTIFIER(PHY_PACKET_CONFIG) | - PHY_CONFIG_ROOT_ID(node_id) | - PHY_CONFIG_GAP_COUNT(gap_count); + u32 data = PHY_IDENTIFIER(PHY_PACKET_CONFIG); + + if (node_id != FW_PHY_CONFIG_NO_NODE_ID) + data |= PHY_CONFIG_ROOT_ID(node_id); + + if (gap_count == FW_PHY_CONFIG_CURRENT_GAP_COUNT) { + gap_count = card->driver->read_phy_reg(card, 1); + if (gap_count < 0) + return; + + gap_count &= 63; + if (gap_count == 63) + return; + } + data |= PHY_CONFIG_GAP_COUNT(gap_count); mutex_lock(&phy_config_mutex); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 3000dd74acfd..ff6c90922001 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -51,6 +51,7 @@ struct fw_card_driver { int (*enable)(struct fw_card *card, const __be32 
*config_rom, size_t length); + int (*read_phy_reg)(struct fw_card *card, int address); int (*update_phy_reg)(struct fw_card *card, int address, int clear_bits, int set_bits); @@ -102,8 +103,8 @@ void fw_card_initialize(struct fw_card *card, int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid); void fw_core_remove_card(struct fw_card *card); -int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset); int fw_compute_block_crc(__be32 *block); +void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset); void fw_schedule_bm_work(struct fw_card *card, unsigned long delay); static inline struct fw_card *fw_card_get(struct fw_card *card) @@ -225,6 +226,9 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *packet); int fw_get_response_length(struct fw_request *request); void fw_fill_response(struct fw_packet *response, u32 *request_header, int rcode, void *payload, size_t length); + +#define FW_PHY_CONFIG_NO_NODE_ID -1 +#define FW_PHY_CONFIG_CURRENT_GAP_COUNT -1 void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index a4bbf3dadf58..bb6a92bc9e6a 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -182,6 +183,8 @@ struct fw_ohci { */ spinlock_t lock; + struct mutex phy_reg_mutex; + struct ar_context ar_request_ctx; struct ar_context ar_response_ctx; struct context at_request_ctx; @@ -517,13 +520,10 @@ static int write_phy_reg(const struct fw_ohci *ohci, int addr, u32 val) return -EBUSY; } -static int ohci_update_phy_reg(struct fw_card *card, int addr, - int clear_bits, int set_bits) +static int update_phy_reg(struct fw_ohci *ohci, int addr, + int clear_bits, int set_bits) { - struct fw_ohci *ohci = fw_ohci(card); - int ret; - - ret = read_phy_reg(ohci, addr); + int ret = read_phy_reg(ohci, addr); if (ret < 0) return ret; @@ -541,13 +541,38 @@ static int read_paged_phy_reg(struct fw_ohci *ohci, int page, int addr) { int ret; - ret = ohci_update_phy_reg(&ohci->card, 7, PHY_PAGE_SELECT, page << 5); + ret = update_phy_reg(ohci, 7, PHY_PAGE_SELECT, page << 5); if (ret < 0) return ret; return read_phy_reg(ohci, addr); } +static int ohci_read_phy_reg(struct fw_card *card, int addr) +{ + struct fw_ohci *ohci = fw_ohci(card); + int ret; + + mutex_lock(&ohci->phy_reg_mutex); + ret = read_phy_reg(ohci, addr); + mutex_unlock(&ohci->phy_reg_mutex); + + return ret; +} + +static int ohci_update_phy_reg(struct fw_card *card, int addr, + int clear_bits, int set_bits) +{ + struct fw_ohci *ohci = fw_ohci(card); + int ret; + + mutex_lock(&ohci->phy_reg_mutex); + ret = update_phy_reg(ohci, addr, clear_bits, set_bits); + mutex_unlock(&ohci->phy_reg_mutex); + + return ret; +} + static int ar_context_add_page(struct ar_context *ctx) { struct device *dev = ctx->ohci->card.device; @@ -1676,7 +1701,7 @@ static int configure_1394a_enhancements(struct fw_ohci *ohci) clear = PHY_ENABLE_ACCEL | PHY_ENABLE_MULTI; set = 0; } - ret = ohci_update_phy_reg(&ohci->card, 5, clear, set); + ret = update_phy_reg(ohci, 5, clear, set); if (ret < 0) return ret; @@ -1856,12 +1881,8 @@ static int ohci_enable(struct fw_card *card, OHCI1394_HCControl_BIBimageValid); flush_writes(ohci); - /* - * We are ready to go, initiate bus reset to finish the - * initialization. 
- */ - - fw_core_initiate_bus_reset(&ohci->card, 1); + /* We are ready to go, reset bus to finish initialization. */ + fw_schedule_bus_reset(&ohci->card, false, true); return 0; } @@ -1936,7 +1957,7 @@ static int ohci_set_config_rom(struct fw_card *card, * takes effect. */ if (ret == 0) - fw_core_initiate_bus_reset(&ohci->card, 1); + fw_schedule_bus_reset(&ohci->card, true, true); else dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE, next_config_rom, next_config_rom_bus); @@ -2570,6 +2591,7 @@ static int ohci_queue_iso(struct fw_iso_context *base, static const struct fw_card_driver ohci_driver = { .enable = ohci_enable, + .read_phy_reg = ohci_read_phy_reg, .update_phy_reg = ohci_update_phy_reg, .set_config_rom = ohci_set_config_rom, .send_request = ohci_send_request, @@ -2645,6 +2667,7 @@ static int __devinit pci_probe(struct pci_dev *dev, pci_set_drvdata(dev, ohci); spin_lock_init(&ohci->lock); + mutex_init(&ohci->phy_reg_mutex); tasklet_init(&ohci->bus_reset_tasklet, bus_reset_tasklet, (unsigned long)ohci); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index db30a752a87a..adc5b55e6e5f 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -114,8 +114,10 @@ struct fw_card { struct list_head link; - /* Work struct for BM duties. */ - struct delayed_work work; + struct delayed_work br_work; /* bus reset job */ + bool br_short; + + struct delayed_work bm_work; /* bus manager job */ int bm_retries; int bm_generation; __be32 bm_transaction_data[2]; -- cgit v1.2.3 From 035ebefc737cce56d3938e9b7eaa5ac0e9c28715 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 13 Jul 2010 09:42:26 +0000 Subject: of/sparc: move is_root_node() to of.h Rename is_root_node() to of_node_is_root() and make it available for all archs to use, as it's not PROM-specific. Signed-off-by: Andres Salomon Acked-by: David S. 
Miller Signed-off-by: Grant Likely --- arch/sparc/kernel/prom.h | 8 -------- arch/sparc/kernel/prom_64.c | 6 +++--- arch/sparc/kernel/prom_common.c | 2 +- include/linux/of.h | 5 +++++ 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/prom.h b/arch/sparc/kernel/prom.h index a8591ef2636d..eeb04a782ec8 100644 --- a/arch/sparc/kernel/prom.h +++ b/arch/sparc/kernel/prom.h @@ -9,14 +9,6 @@ extern void irq_trans_init(struct device_node *dp); extern unsigned int prom_unique_id; -static inline int is_root_node(const struct device_node *dp) -{ - if (!dp) - return 0; - - return (dp->parent == NULL); -} - extern char *build_path_component(struct device_node *dp); extern void of_console_init(void); diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c index fb06ac2bd38f..0bffafdee350 100644 --- a/arch/sparc/kernel/prom_64.c +++ b/arch/sparc/kernel/prom_64.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -81,7 +81,7 @@ static void __init sun4v_path_component(struct device_node *dp, char *tmp_buf) return; regs = rprop->value; - if (!is_root_node(dp->parent)) { + if (!of_node_is_root(dp->parent)) { sprintf(tmp_buf, "%s@%x,%x", dp->name, (unsigned int) (regs->phys_addr >> 32UL), @@ -121,7 +121,7 @@ static void __init sun4u_path_component(struct device_node *dp, char *tmp_buf) return; regs = prop->value; - if (!is_root_node(dp->parent)) { + if (!of_node_is_root(dp->parent)) { sprintf(tmp_buf, "%s@%x,%x", dp->name, (unsigned int) (regs->phys_addr >> 32UL), diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c index 57ac9e28be0c..1f830da2ddf2 100644 --- a/arch/sparc/kernel/prom_common.c +++ b/arch/sparc/kernel/prom_common.c @@ -244,7 +244,7 @@ char * __init build_full_name(struct device_node *dp) n = prom_early_alloc(len); strcpy(n, dp->parent->full_name); - if (!is_root_node(dp->parent)) { + if (!of_node_is_root(dp->parent)) { strcpy(n + plen, "/"); plen++; } diff --git a/include/linux/of.h b/include/linux/of.h index a367e19bb3af..b0756f33249e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -70,6 +70,11 @@ extern struct device_node *allnodes; extern struct device_node *of_chosen; extern rwlock_t devtree_lock; +static inline bool of_node_is_root(const struct device_node *node) +{ + return node && (node->parent == NULL); +} + static inline int of_node_check_flag(struct device_node *n, unsigned long flag) { return test_bit(flag, &n->_flags); -- cgit v1.2.3 From af537b0a6c650ab6ff7104d8163e96866b31c835 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 9 Jul 2010 14:07:14 -0500 Subject: slub: Use kmem_cache flags to detect if slab is in debugging mode. The cacheline with the flags is reachable from the hot paths after the percpu allocator changes went in. So there is no need anymore to put a flag into each slab page. Get rid of the SlubDebug flag and use the flags in kmem_cache instead. 
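Condensed from the hunks below, the hot paths switch from a per-page flag test to a test of the cache's own flags; a sketch of the two patterns rather than the literal surrounding code:

/* Old pattern: debug state tracked by a PG_slub_debug flag on each
 * slab page. */
if (unlikely(SLABDEBUG && PageSlubDebug(page)))
	goto debug;

/* New pattern: derived from the kmem_cache flags, whose cacheline is
 * already hot after the percpu allocator changes. */
if (kmem_cache_debug(s))
	goto debug;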
Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/page-flags.h | 2 -- mm/slub.c | 33 ++++++++++++--------------------- 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5b59f35dcb8f..6fa317801e1c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -128,7 +128,6 @@ enum pageflags { /* SLUB */ PG_slub_frozen = PG_active, - PG_slub_debug = PG_error, }; #ifndef __GENERATING_BOUNDS_H @@ -215,7 +214,6 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) -__PAGEFLAG(SlubDebug, slub_debug) /* * Private page markings that may be used by the filesystem that owns the page diff --git a/mm/slub.c b/mm/slub.c index b89a7c99b2fa..9cf5dae7815e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -107,11 +107,17 @@ * the fast path and disables lockless freelists. */ +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) + +static inline int kmem_cache_debug(struct kmem_cache *s) +{ #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG 1 + return unlikely(s->flags & SLAB_DEBUG_FLAGS); #else -#define SLABDEBUG 0 + return 0; #endif +} /* * Issues still to be resolved: @@ -1157,9 +1163,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - __SetPageSlubDebug(page); start = page_address(page); @@ -1186,14 +1189,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SLABDEBUG && PageSlubDebug(page))) { + if (kmem_cache_debug(s)) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) check_object(s, page, p, 0); - __ClearPageSlubDebug(page); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1415,8 +1417,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(s, DEACTIVATE_FULL); - if (SLABDEBUG && PageSlubDebug(page) && - (s->flags & SLAB_STORE_USER)) + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); @@ -1624,7 +1625,7 @@ load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) + if (kmem_cache_debug(s)) goto debug; c->freelist = get_freepointer(s, object); @@ -1783,7 +1784,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SLABDEBUG && PageSlubDebug(page))) + if (kmem_cache_debug(s)) goto debug; checks_ok: @@ -3398,16 +3399,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); - - if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug not set " - "on slab 0x%p\n", s->name, page); - } else { - if (PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug set on " - "slab 0x%p\n", s->name, page); - } } static int validate_slab_node(struct kmem_cache *s, -- cgit v1.2.3 From 396e894d289d69bacf5acd983c97cd6e21a14c08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jul 2010 15:12:27 +0200 Subject: sched: Revert nohz_ratelimit() for now Norbert reported that nohz_ratelimit() causes his laptop to burn about 4W (40%) extra. For now back out the change and see if we can adjust the power management code to make better decisions. Reported-by: Norbert Preining Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Arjan van de Ven LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ------ kernel/sched.c | 10 ---------- kernel/time/tick-sched.c | 2 +- 3 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 747fcaedddb7..6e0bb86de990 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -273,17 +273,11 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); -extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } - -static inline int nohz_ratelimit(int cpu) -{ - return 0; -} #endif /* diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a2..63b4a14682fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1232,16 +1232,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..f898af608171 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { + arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.2.3 From 323f99cbc35c52a65dea9d072b3ecf1e662240d2 Mon Sep 17 00:00:00 2001 From: Tom Lyon Date: Fri, 2 Jul 2010 16:56:14 -0400 Subject: iommu-api: Extension to check 
for interrupt remapping This patch allows IOMMU users to determine whether the hardware and software support safe, isolated interrupt remapping. Not all Intel IOMMUs have the hardware, and the software for AMD is not there yet. Signed-off-by: Tom Lyon Signed-off-by: Joerg Roedel --- drivers/pci/intel-iommu.c | 2 ++ include/linux/iommu.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index c9171be74564..6a5af18faf68 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3698,6 +3698,8 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain, if (cap == IOMMU_CAP_CACHE_COHERENCY) return dmar_domain->iommu_snooping; + if (cap == IOMMU_CAP_INTR_REMAP) + return intr_remapping_enabled; return 0; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index be22ad83689c..0a2ba4098996 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -30,6 +30,7 @@ struct iommu_domain { }; #define IOMMU_CAP_CACHE_COHERENCY 0x1 +#define IOMMU_CAP_INTR_REMAP 0x2 /* isolates device intrs */ struct iommu_ops { int (*domain_init)(struct iommu_domain *domain); -- cgit v1.2.3 From eb7beb5c09af75494234ea6acd09d0a647cf7338 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 16 Jul 2010 00:50:03 +0200 Subject: tracing: Remove special traces The special trace type was only used by sysprof. Let's remove it now that the sysprof ftrace plugin has been dropped. Signed-off-by: Frederic Weisbecker Acked-by: Soeren Sandmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan --- include/linux/kernel.h | 5 ---- include/linux/sched.h | 12 -------- kernel/trace/trace.c | 55 ------------------------------------ kernel/trace/trace.h | 7 ----- kernel/trace/trace_entries.h | 17 ----------- kernel/trace/trace_output.c | 66 ------------------------------------------- kernel/trace/trace_selftest.c | 1 - 7 files changed, 163 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8317ec4b9f3b..adee958b5989 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -508,9 +508,6 @@ extern void tracing_start(void); extern void tracing_stop(void); extern void ftrace_off_permanent(void); -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); - static inline void __attribute__ ((format (printf, 1, 2))) ____trace_printk_check_format(const char *fmt, ...) { @@ -586,8 +583,6 @@ __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int trace_printk(const char *fmt, ...)
__attribute__ ((format (printf, 1, 2))); diff --git a/include/linux/sched.h b/include/linux/sched.h index 747fcaedddb7..f751ea9dcb7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2434,18 +2434,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -#ifdef CONFIG_TRACING -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); -#else -static inline void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} -#endif - extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78a49e67f7db..d9a4aa02c384 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1331,61 +1331,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2114b4c1150f..638a5887e2ec 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,6 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, @@ -189,7 +188,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -332,11 +330,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, diff --git 
a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 84128371f254..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -150,23 +150,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ) ); -/* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - /* * Stack-trace entry: */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..a46197b80b7f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1069,65 +1069,6 @@ static struct trace_event trace_wake_event = { .funcs = &trace_wake_funcs, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_special_funcs = { - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, -}; - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .funcs = &trace_special_funcs, -}; - /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, @@ -1161,9 +1102,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_stack_event = { @@ -1194,9 +1132,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_user_stack_event = { @@ -1314,7 +1249,6 @@ static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6ed05ee6cbc7..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,7 +13,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case 
TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: -- cgit v1.2.3 From e870e9a1240bcef1157ffaaf71dac63362e71904 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Jul 2010 11:07:32 +0800 Subject: tracing: Allow to disable cmdline recording We found that even enabling a single trace event that will rarely be triggered can add significant overhead to context switches.

(lmbench context switch test)
-------------------------------------------------
 2p/0K  2p/16K  2p/64K  8p/16K  8p/64K  16p/16K  16p/64K
 ctxsw   ctxsw   ctxsw   ctxsw   ctxsw    ctxsw    ctxsw
------  ------  ------  ------  ------  -------  -------
  2.19    2.3    2.21    2.56    2.13     2.54     2.07
  2.39   2.51    2.35    2.75    2.27     2.81     2.24

The overhead is 6% ~ 11%. This is because when a trace event is enabled, 3 tracepoints (sched_switch, sched_wakeup, sched_wakeup_new) are activated to map the pid to the cmdname. We'd like to avoid this overhead, so add a trace option '(no)record-cmd' to allow disabling cmdline recording. Signed-off-by: Li Zefan LKML-Reference: <4C2D57F4.2050204@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 7 +++++-- kernel/trace/trace.c | 6 +++++- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 01df7ca4ead7..2b7b1395b4d5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -152,11 +152,13 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, + TRACE_EVENT_FL_RECORDED_CMD_BIT, }; enum { - TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), }; struct ftrace_event_call { @@ -174,6 +176,7 @@ struct ftrace_event_call { * 32 bit flags: * bit 1: enabled * bit 2: filter_active + * bit 3: enabled cmd record * * Changes to flags must hold the event_mutex.
* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8683dec6946b..af9042977c08 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -344,7 +344,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -428,6 +428,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -2561,6 +2562,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 84d3f123e86f..7778f067fc8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -591,6 +591,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -723,6 +724,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e8e6043f4d29..09b4fa6e4d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -170,6 +170,26 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) } EXPORT_SYMBOL_GPL(ftrace_event_reg); +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -179,13 +199,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; - tracing_stop_cmdline_record(); + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - tracing_start_cmdline_record(); + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); -- cgit v1.2.3 From bc289ae98b75d93228d24f521ef02a076e506e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2010 18:26:24 +0800 Subject: tracing: Reduce latency and remove percpu trace_seq __print_flags() and __print_symbolic() use percpu trace_seq: 1) Its memory is allocated at compile time, it wastes memory if we don't use tracing. 2) It is percpu data and it wastes more memory for multi-cpus system. 
3) It disables preemption when it executes its core routine "trace_seq_printf(s, "%s: ", #call);" and introduces latency. So we move this trace_seq to struct trace_iterator. Signed-off-by: Lai Jiangshan LKML-Reference: <4C078350.7090106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 +++-- include/trace/ftrace.h | 12 +++--------- kernel/trace/trace_output.c | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2b7b1395b4d5..02b8b24f8f51 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -11,8 +11,6 @@ struct trace_array; struct tracer; struct dentry; -DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); - struct trace_print_flags { unsigned long mask; const char *name; @@ -58,6 +56,9 @@ struct trace_iterator { struct ring_buffer_iter *buffer_iter[NR_CPUS]; unsigned long iter_flags; + /* trace_seq for __print_flags() and __print_symbolic() etc. */ + struct trace_seq tmp_seq; + /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 55c1fd1bbc3d..fb783d94fc54 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -145,7 +145,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; - * struct trace_seq *p; + * struct trace_seq *p = &iter->tmp_seq; * int ret; * * entry = iter->ent; @@ -157,12 +157,10 @@ * * field = (typeof(field))entry; * - * p = &get_cpu_var(ftrace_event_seq); * trace_seq_init(p); * ret = trace_seq_printf(s, "%s: ", ); * if (ret) * ret = trace_seq_printf(s, "\n"); - * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -216,7 +214,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ event = container_of(trace_event, struct ftrace_event_call, \ @@ -231,12 +229,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", event->name); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ @@ -255,7 +251,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##template *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ entry = iter->ent; \ @@ -267,12 +263,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", #call); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..1ba64d3cc567 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = 
__TRACE_LAST_TYPE + 1; -- cgit v1.2.3 From ade7ce31c22e961dfbe1a6d57fd362c90c187cbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Jun 2010 10:56:01 +0200 Subject: quota: Clean up the namespace in dqblk_xfs.h Almost all identifiers use the FS_* namespace, so rename the missing few XFS_* ones to FS_* as well. Without this some people might get upset about having too many XFS names in generic code. Acked-by: Steven Whitehouse Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/gfs2/quota.c | 10 +++++----- fs/quota/dquot.c | 2 +- fs/xfs/linux-2.6/xfs_quotaops.c | 10 +++++----- fs/xfs/quota/xfs_qm_syscalls.c | 32 ++++++++++++++++---------------- include/linux/dqblk_xfs.h | 24 ++++++++++++------------ 5 files changed, 39 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index b256d6f24288..ce345f8c69c2 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1455,10 +1455,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb, switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_ON: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); /*FALLTHRU*/ case GFS2_QUOTA_ACCOUNT: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); break; case GFS2_QUOTA_OFF: break; @@ -1504,7 +1504,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; + fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); @@ -1539,12 +1539,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, switch(type) { case USRQUOTA: type = QUOTA_USER; - if (fdq->d_flags != XFS_USER_QUOTA) + if (fdq->d_flags != FS_USER_QUOTA) return -EINVAL; break; case GRPQUOTA: type = QUOTA_GROUP; - if (fdq->d_flags != XFS_GROUP_QUOTA) + if (fdq->d_flags != FS_GROUP_QUOTA) return -EINVAL; break; default: diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a5974c49a78b..2857fd67ff33 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2281,7 +2281,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) memset(di, 0, sizeof(*di)); di->d_version = FS_DQUOT_VERSION; di->d_flags = dquot->dq_type == USRQUOTA ? 
- XFS_USER_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : FS_GROUP_QUOTA; di->d_id = dquot->dq_id; spin_lock(&dq_data_lock); diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 067cafbfc635..b9ba7536f4b4 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -69,15 +69,15 @@ xfs_fs_set_xstate( if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (uflags & XFS_QUOTA_UDQ_ACCT) + if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) + if (uflags & FS_QUOTA_PDQ_ACCT) flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) + if (uflags & FS_QUOTA_GDQ_ACCT) flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) + if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) flags |= XFS_OQUOTA_ENFD; switch (op) { diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index b4487764e923..41b04b96975b 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -786,9 +786,9 @@ xfs_qm_export_dquot( } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && + (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -809,17 +809,17 @@ xfs_qm_export_qtype_flags( /* * Can't be more than one, or none. */ - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); return (flags & XFS_DQ_USER) ? - XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; } STATIC uint @@ -830,16 +830,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) - uflags |= XFS_QUOTA_UDQ_ACCT; + uflags |= FS_QUOTA_UDQ_ACCT; if (flags & XFS_PQUOTA_ACCT) - uflags |= XFS_QUOTA_PDQ_ACCT; + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) - uflags |= XFS_QUOTA_GDQ_ACCT; + uflags |= FS_QUOTA_GDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) - uflags |= XFS_QUOTA_UDQ_ENFD; + uflags |= FS_QUOTA_UDQ_ENFD; if (flags & (XFS_OQUOTA_ENFD)) { uflags |= (flags & XFS_GQUOTA_ACCT) ? 
- XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; + FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; } return (uflags); } diff --git a/include/linux/dqblk_xfs.h b/include/linux/dqblk_xfs.h index 4389ae72024e..86552807aed9 100644 --- a/include/linux/dqblk_xfs.h +++ b/include/linux/dqblk_xfs.h @@ -49,7 +49,7 @@ #define FS_DQUOT_VERSION 1 /* fs_disk_quota.d_version */ typedef struct fs_disk_quota { __s8 d_version; /* version of this structure */ - __s8 d_flags; /* XFS_{USER,PROJ,GROUP}_QUOTA */ + __s8 d_flags; /* FS_{USER,PROJ,GROUP}_QUOTA */ __u16 d_fieldmask; /* field specifier */ __u32 d_id; /* user, project, or group ID */ __u64 d_blk_hardlimit;/* absolute limit on disk blks */ @@ -119,18 +119,18 @@ typedef struct fs_disk_quota { #define FS_DQ_ACCT_MASK (FS_DQ_BCOUNT | FS_DQ_ICOUNT | FS_DQ_RTBCOUNT) /* - * Various flags related to quotactl(2). Only relevant to XFS filesystems. + * Various flags related to quotactl(2). */ -#define XFS_QUOTA_UDQ_ACCT (1<<0) /* user quota accounting */ -#define XFS_QUOTA_UDQ_ENFD (1<<1) /* user quota limits enforcement */ -#define XFS_QUOTA_GDQ_ACCT (1<<2) /* group quota accounting */ -#define XFS_QUOTA_GDQ_ENFD (1<<3) /* group quota limits enforcement */ -#define XFS_QUOTA_PDQ_ACCT (1<<4) /* project quota accounting */ -#define XFS_QUOTA_PDQ_ENFD (1<<5) /* project quota limits enforcement */ +#define FS_QUOTA_UDQ_ACCT (1<<0) /* user quota accounting */ +#define FS_QUOTA_UDQ_ENFD (1<<1) /* user quota limits enforcement */ +#define FS_QUOTA_GDQ_ACCT (1<<2) /* group quota accounting */ +#define FS_QUOTA_GDQ_ENFD (1<<3) /* group quota limits enforcement */ +#define FS_QUOTA_PDQ_ACCT (1<<4) /* project quota accounting */ +#define FS_QUOTA_PDQ_ENFD (1<<5) /* project quota limits enforcement */ -#define XFS_USER_QUOTA (1<<0) /* user quota type */ -#define XFS_PROJ_QUOTA (1<<1) /* project quota type */ -#define XFS_GROUP_QUOTA (1<<2) /* group quota type */ +#define FS_USER_QUOTA (1<<0) /* user quota type */ +#define FS_PROJ_QUOTA (1<<1) /* project quota type */ +#define FS_GROUP_QUOTA (1<<2) /* group quota type */ /* * fs_quota_stat is the struct returned in Q_XGETQSTAT for a given file system. @@ -151,7 +151,7 @@ typedef struct fs_qfilestat { typedef struct fs_quota_stat { __s8 qs_version; /* version number for future changes */ - __u16 qs_flags; /* XFS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */ + __u16 qs_flags; /* FS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */ __s8 qs_pad; /* unused */ fs_qfilestat_t qs_uquota; /* user quota storage information */ fs_qfilestat_t qs_gquota; /* group quota storage information */ -- cgit v1.2.3 From 189eef59e70e3e56edf726864629f310d114eefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Jun 2010 10:56:29 +0200 Subject: quota: clean up quota active checks The various quota operations check for any quota being active on a superblock, and the inode not having the noquota flag. Merge these two checks into a dquot_active check and move that into dquot.c as that's the only place where it's needed.
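Condensed from the hunks below, each quota entry point previously open-coded two tests that now collapse into a single helper private to fs/quota/dquot.c; a sketch of the call-site pattern only:

/* Before: both checks repeated at every call site. */
if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
	return 0;

/* After: one combined test; dquot_active() also folds the
 * loaded-but-suspended distinction into the same expression. */
if (!dquot_active(inode))
	return 0;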
Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/dquot.c | 23 ++++++++++++++++------- include/linux/quotaops.h | 10 ---------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 2857fd67ff33..2eebf72d07c8 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1315,6 +1315,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_NOWARN; } +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + /* * Initialize quota pointers in inode * @@ -1334,7 +1343,7 @@ static void __dquot_initialize(struct inode *inode, int type) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; /* First get references to structures we might need. */ @@ -1518,7 +1527,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; } @@ -1570,7 +1579,7 @@ int dquot_alloc_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1607,7 +1616,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } @@ -1640,7 +1649,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } @@ -1678,7 +1687,7 @@ void dquot_free_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1801,7 +1810,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct super_block *sb = inode->i_sb; int ret; - if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index aa36793b48bd..126193c1a5ce 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -145,11 +145,6 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type) !sb_has_quota_suspended(sb, type); } -static inline unsigned sb_any_quota_active(struct super_block *sb) -{ - return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); -} - /* * Operations supported for diskquotas. 
*/ @@ -194,11 +189,6 @@ static inline int sb_has_quota_active(struct super_block *sb, int type) return 0; } -static inline int sb_any_quota_active(struct super_block *sb) -{ - return 0; -} - static inline void dquot_initialize(struct inode *inode) { } -- cgit v1.2.3 From 4c4d3901225518ed1a4c938ba15ba09842a00770 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Jun 2010 10:20:39 +0200 Subject: ext3: remove vestiges of nobh support The nobh option was only supported for writeback mode, but given that all write paths (except mmapped writes) actually create buffer heads, it effectively was a no-op already. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ext3/inode.c | 16 +--------------- fs/ext3/super.c | 17 ++++------------- include/linux/ext3_fs.h | 1 - 3 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 735f0190ec2a..a786db403efc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1625,10 +1625,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1922,17 +1919,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6c953bb255e7..9650a956fd0e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) */ seq_puts(seq, ",barrier="); seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1255,10 +1252,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); break; default: ext3_msg(sb, KERN_ERR, @@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) break; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - ext3_msg(sb, KERN_WARNING, - "warning: ignoring nobh option - " - "it is supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - } /* * The journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now.
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 7fc62d4550b2..3d3a9915dde2 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -400,7 +400,6 @@ struct ext3_inode { #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -- cgit v1.2.3 From fb5ffb0e160c93c3fe08ab83845eb9a2768af812 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Tue, 20 Jul 2010 16:54:43 +0200 Subject: quota: Change quota error message to print out disk and function name The current quota error message doesn't always print the disk name, so it is hard to identify the "bad" disk when quota error happens. This patch changes the standardized quota error message to print out disk name and function name. It also uses a combination of cpp macro and inline function to provide better type checking and to lower the text size of the message. [Jan Kara: Export __quota_error] Signed-off-by: Jiaying Zhang Signed-off-by: Jan Kara --- fs/quota/dquot.c | 39 +++++++++++++++------- fs/quota/quota_tree.c | 85 ++++++++++++++++++++++++------------------------ fs/quota/quota_tree.h | 6 ---- fs/quota/quota_v1.c | 3 +- fs/quota/quota_v2.c | 11 +++---- include/linux/quotaops.h | 6 ++++ 6 files changed, 80 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 2eebf72d07c8..b171221000fa 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) +{ + va_list args; + + if (printk_ratelimit()) { + va_start(args, fmt); + printk(KERN_ERR "Quota error (device %s): %s: ", + sb->s_id, func); + vprintk(fmt, args); + printk("\n"); + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif @@ -705,11 +721,8 @@ void dqput(struct dquot *dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { - printk("VFS: dqput: trying to free free dquot\n"); - printk("VFS: device %s, dquot of %s %d\n", - dquot->dq_sb->s_id, - quotatypes[dquot->dq_type], - dquot->dq_id); + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_type], dquot->dq_id); BUG(); } #endif @@ -732,9 +745,9 @@ we_slept: /* Commit dquot before releasing */ ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { - printk(KERN_ERR "VFS: cannot write quota structure on " - "device %s (error %d). Quota may get out of " - "sync!\n", dquot->dq_sb->s_id, ret); + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); /* * We clear dirty bit anyway, so that we avoid * infinite loop here @@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type) #ifdef CONFIG_QUOTA_DEBUG if (reserved) { - printk(KERN_WARNING "VFS (%s): Writes happened before quota" - " was turned on thus quota information is probably " - "inconsistent. 
Please run quotacheck(8).\n", sb->s_id); + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. " + "Please run quotacheck(8)"); } #endif } @@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, if (dqput_blocks(dquot)) { #ifdef CONFIG_QUOTA_DEBUG if (atomic_read(&dquot->dq_count) != 1) - printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); + quota_error(inode->i_sb, "Adding dquot with " + "dq_count %d to dispose list", + atomic_read(&dquot->dq_count)); #endif spin_lock(&dq_list_lock); /* As dquot must have currently users it can't be on diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 24f03407eeb5..9e48874eabcc 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { - q_warn(KERN_WARNING "VFS: dquota write failed on " - "dev %s\n", sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } @@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) - q_warn(KERN_ERR - "VFS: Can't write block (%u) with free entries.\n", - blk); + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); return 0; out_buf: kfree(tmpbuf); @@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't " - "remove block (%u) from entry free list.\n", - blk); + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); goto out_buf; } } @@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " - "but it shouldn't.\n"); + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " - "data block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); goto out_buf; } dquot->dq_off = (blk << info->dqi_blocksize_bits) + @@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = read_blk(info, *treeblk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read tree quota block " - "%u.\n", *treeblk); + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); goto out_buf; } } @@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { - printk(KERN_ERR "VFS: Inserting already present quota " - "entry (block %u).\n", - le32_to_cpu(ref[get_index(info, + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; @@ -373,8 +369,8 @@ int 
qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { - q_warn(KERN_ERR "VFS: Error %zd occurred while " - "creating quota.\n", ret); + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); kfree(ddquot); return ret; } } @@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { - q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n", - sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { @@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { - q_warn(KERN_ERR "VFS: Quota structure has offset to other " - "block (%u) than it should (%u).\n", blk, - (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; @@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't move quota data block (%u) " - "to free list.\n", blk); + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); goto out_buf; } } else { @@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't insert quota data " - "block (%u) to free entry list.\n", blk); + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't write quota data " - "block %u\n", blk); + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); goto out_buf; } } @@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + quota_error(dquot->dq_sb, "Can't read quota data " + "block %u", *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - q_warn(KERN_ERR "VFS: Can't write quota tree " - "block %u.\n", *blk); + quota_error(dquot->dq_sb, "Can't write quota " + "tree block %u", *blk); } } out_buf: @@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); @@ -539,8 +537,8 @@ 
ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { - q_warn(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); + quota_error(dquot->dq_sb, "Quota for id %u referenced " + "but not present", dquot->dq_id); ret = -EIO; goto out_buf; } else { @@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); goto out_buf; } ret = 0; @@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) #ifdef __QUOTA_QT_PARANOIA /* Invalidated quota? */ if (!sb_dqopt(dquot->dq_sb)->files[type]) { - printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + quota_error(sb, "Quota invalidated while reading!"); return -EIO; } #endif @@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? */ if (offset < 0) - q_warn(KERN_ERR "VFS: Can't read quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Can't read quota structure " + "for id %u", dquot->dq_id); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); @@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; - q_warn(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Error while reading quota structure for id %u", + dquot->dq_id); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h index ccc3e71fb1d8..a1ab8db81a51 100644 --- a/fs/quota/quota_tree.h +++ b/fs/quota/quota_tree.h @@ -22,10 +22,4 @@ struct qt_disk_dqdbheader { #define QT_TREEOFF 1 /* Offset of tree in file in blocks */ -#define q_warn(fmt, args...) 
\ -do { \ - if (printk_ratelimit()) \ - printk(fmt, ## args); \ -} while(0) - #endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 4af344c5852a..34b37a67bb16 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot) (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); if (ret != sizeof(struct v1_disk_dqblk)) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", - dquot->dq_sb->s_id); + quota_error(dquot->dq_sb, "dquota write failed"); if (ret >= 0) ret = -EIO; goto out; diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 135206af1458..65444d29406b 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type, size = sb->s_op->quota_read(sb, type, (char *)dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - q_warn(KERN_WARNING "quota_v2: Failed header read:" - " expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); return 0; } return 1; @@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", - sb->s_id); + quota_error(sb, "Can't read info structure"); return -1; } info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); @@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type) size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "Can't write info structure on device %s.\n", - sb->s_id); + quota_error(sb, "Can't write info structure"); return -1; } return 0; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 126193c1a5ce..4881b49b1a9a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -28,6 +28,12 @@ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) #if defined(CONFIG_QUOTA) +#define quota_error(sb, fmt, args...) \ + __quota_error((sb), __func__, fmt , ## args) + +extern void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...); + /* * declaration of quota_function calls in kernel. */ -- cgit v1.2.3 From 9849ed4d72251d273524efb8b70be0be9aecb1df Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 20 Jul 2010 03:13:35 -0400 Subject: tracing/documentation: Document dynamic ftracer internals Add more details to the dynamic function tracing design implementation. Signed-off-by: Mike Frysinger LKML-Reference: <1279610015-10250-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 153 ++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 5 ++ 2 files changed, 153 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index f1f81afee8a0..dc52bd442c92 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -13,6 +13,9 @@ Note that this focuses on architecture implementation details only. 
If you want more explanation of a feature in terms of common code, review the common ftrace.txt file. +Ideally, everyone who wishes to retain performance while supporting tracing in +their kernel should make it all the way to dynamic ftrace support. + Prerequisites ------------- @@ -215,7 +218,7 @@ An arch may pass in a unique value (frame pointer) to both the entering and exiting of a function. On exit, the value is compared and if it does not match, then it will panic the kernel. This is largely a sanity check for bad code generation with gcc. If gcc for your port sanely updates the frame -pointer under different opitmization levels, then ignore this option. +pointer under different optimization levels, then ignore this option. However, adding support for it isn't terribly difficult. In your assembly code that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument. @@ -234,7 +237,7 @@ If you can't trace NMI functions, then skip this option. HAVE_SYSCALL_TRACEPOINTS ---------------------- +------------------------ You need very few things to get the syscalls tracing in an arch. @@ -250,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch. HAVE_FTRACE_MCOUNT_RECORD ------------------------- -See scripts/recordmcount.pl for more info. +See scripts/recordmcount.pl for more info. Just fill in the arch-specific +details for how to locate the addresses of mcount call sites via objdump. +This option doesn't make much sense without also implementing dynamic ftrace. + +HAVE_DYNAMIC_FTRACE +------------------- + +You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so +scroll your reader back up if you got over eager. + +Once those are out of the way, you will need to implement: + - asm/ftrace.h: + - MCOUNT_ADDR + - ftrace_call_adjust() + - struct dyn_arch_ftrace{} + - asm code: + - mcount() (new stub) + - ftrace_caller() + - ftrace_call() + - ftrace_stub() + - C code: + - ftrace_dyn_arch_init() + - ftrace_make_nop() + - ftrace_make_call() + - ftrace_update_ftrace_func() + +First you will need to fill out some arch details in your asm/ftrace.h. + +Define MCOUNT_ADDR as the address of your mcount symbol similar to: + #define MCOUNT_ADDR ((unsigned long)mcount) +Since no one else will have a decl for that function, you will need to: + extern void mcount(void); + +You will also need the helper function ftrace_call_adjust(). Most people +will be able to stub it out like so: + static inline unsigned long ftrace_call_adjust(unsigned long addr) + { + return addr; + }
+Lastly you will need the custom dyn_arch_ftrace structure. If you need +some extra state when runtime patching arbitrary call sites, this is the +place. For now though, create an empty struct: + struct dyn_arch_ftrace { + /* No extra data needed */ + }; + +With the header out of the way, we can fill out the assembly code. While we +did already create a mcount() function earlier, dynamic ftrace only wants a +stub function. This is because the mcount() will only be used during boot +and then all references to it will be patched out never to return. Instead, +the guts of the old mcount() will be used to create a new ftrace_caller() +function. Because the two are hard to merge, it will most likely be a lot +easier to have two separate definitions split up by #ifdefs. Same goes for +the ftrace_stub() as that will now be inlined in ftrace_caller(). + +Before we get any more confused, let's check out some pseudo code so you can +implement your own stuff in assembly: -HAVE_DYNAMIC_FTRACE ---------------------- +void mcount(void) +{ + return; +} + +void ftrace_caller(void) +{ + /* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */ + + /* save all state needed by the ABI (see paragraph above) */ + + unsigned long frompc = ...; + unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE; + +ftrace_call: + ftrace_stub(frompc, selfpc); + + /* restore all state needed by the ABI */ + +ftrace_stub: + return; +} + +This might look a little odd at first, but keep in mind that we will be runtime +patching multiple things. First, only functions that we actually want to trace +will be patched to call ftrace_caller(). Second, since we only have one tracer +active at a time, we will patch the ftrace_caller() function itself to call the +specific tracer in question. That is the point of the ftrace_call label. + +With that in mind, let's move on to the C code that will actually be doing the +runtime patching. You'll need a little knowledge of your arch's opcodes in +order to make it through the next section. + +Every arch has an init callback function. If you need to do something early on +to initialize some state, this is the time to do that. Otherwise, this simple +function below should be sufficient for most people: + +int __init ftrace_dyn_arch_init(void *data) +{ + /* return value is done indirectly via data */ + *(unsigned long *)data = 0; + + return 0; +} + +There are two functions that are used to do runtime patching of arbitrary +functions. The first is used to turn the mcount call site into a nop (which +is what helps us retain runtime performance when not tracing). The second is +used to turn the mcount call site into a call to an arbitrary location (but +typically that is ftrace_caller()). See the general function definition in +linux/ftrace.h for the functions: + ftrace_make_nop() + ftrace_make_call() +The rec->ip value is the address of the mcount call site that was collected +by scripts/recordmcount.pl during build time. + +The last function is used to do runtime patching of the active tracer. This +will be modifying the assembly code at the location of the ftrace_call symbol +inside of the ftrace_caller() function. So you should have sufficient padding +at that location to support the new function calls you'll be inserting. Some +people will be using a "call" type instruction while others will be using a +"branch" type instruction. 
Specifically, the function is: + ftrace_update_ftrace_func() + + +HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER +------------------------------------------------ + +The function grapher needs a few tweaks in order to work with dynamic ftrace. +Basically, you will need to: + - update: + - ftrace_caller() + - ftrace_graph_call() + - ftrace_graph_caller() + - implement: + - ftrace_enable_ftrace_graph_caller() + - ftrace_disable_ftrace_graph_caller()
+Quick notes: + - add a nop stub after the ftrace_call location named ftrace_graph_call; + stub needs to be large enough to support a call to ftrace_graph_caller() + - update ftrace_graph_caller() to work with being called by the new + ftrace_caller() since some semantics may have changed + - ftrace_enable_ftrace_graph_caller() will runtime patch the + ftrace_graph_call location with a call to ftrace_graph_caller() + - ftrace_disable_ftrace_graph_caller() will runtime patch the + ftrace_graph_call location with nops diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 41e46330d9be..dcd6a7c3a435 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,3 +1,8 @@ +/* + * Ftrace header. For implementation details beyond the random comments + * scattered below, see: Documentation/trace/ftrace-design.txt + */ + #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -- cgit v1.2.3 From e120153ddf8620fd0a194d301e9c5a8b28483bb5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jul 2010 14:14:25 +0200 Subject: workqueue: fix how cpu number is stored in work->data Once a work starts execution, its data contains the cpu number it was on instead of pointing to cwq. This is added by commit 7a22ad75 (workqueue: carry cpu number in work data once execution starts) to reliably determine the cpu the work was last on even if the workqueue itself was destroyed in between. Whether data points to a cwq or contains a cpu number was distinguished by comparing the value against PAGE_OFFSET. The assumption was that a cpu number should be below PAGE_OFFSET while a pointer to cwq should be above it. However, on architectures which use separate address spaces for user and kernel spaces, this doesn't hold as PAGE_OFFSET is zero. Fix it by using an explicit flag, WORK_STRUCT_CWQ, to mark what the data field contains. If the flag is set, it's pointing to a cwq; otherwise, it contains a cpu number. Reported on s390 and microblaze during linux-next testing. 
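The scheme can be illustrated with a minimal user-space sketch (hypothetical names; only the idea mirrors WORK_STRUCT_CWQ, and it assumes stored pointers are at least 2-byte aligned so bit 0 is free for the tag):

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	#define DATA_IS_PTR 0x1UL /* explicit tag bit, analogous to WORK_STRUCT_CWQ */

	/* One word holds either an aligned pointer or a small integer. */
	static uintptr_t encode_ptr(void *p)
	{
		assert(((uintptr_t)p & DATA_IS_PTR) == 0); /* alignment keeps bit 0 free */
		return (uintptr_t)p | DATA_IS_PTR;
	}

	static uintptr_t encode_cpu(unsigned int cpu)
	{
		return (uintptr_t)cpu << 1; /* bit 0 stays clear */
	}

	static void *decode_ptr(uintptr_t v)
	{
		return (v & DATA_IS_PTR) ? (void *)(v & ~DATA_IS_PTR) : NULL;
	}

	static unsigned int decode_cpu(uintptr_t v)
	{
		return (unsigned int)(v >> 1); /* only meaningful when DATA_IS_PTR is clear */
	}

No comparison against an address boundary is involved, so the encoding keeps working on architectures where PAGE_OFFSET is zero.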
Signed-off-by: Tejun Heo Reported-by: Sachin Sant Reported-by: Michal Simek Reported-by: Martin Schwidefsky Tested-by: Martin Schwidefsky Tested-by: Michal Simek --- include/linux/workqueue.h | 14 ++++++++------ kernel/workqueue.c | 36 +++++++++++++----------------------- 2 files changed, 21 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index d74a529ed13e..5f76001c4e6d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -25,17 +25,19 @@ typedef void (*work_func_t)(struct work_struct *work); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_LINKED_BIT = 1, /* next work is linked to this one */ + WORK_STRUCT_CWQ_BIT = 1, /* data points to cwq */ + WORK_STRUCT_LINKED_BIT = 2, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 2, /* static initializer (debugobjects) */ - WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ + WORK_STRUCT_STATIC_BIT = 3, /* static initializer (debugobjects) */ + WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #else - WORK_STRUCT_COLOR_SHIFT = 2, /* color for workqueue flushing */ + WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_CWQ = 1 << WORK_STRUCT_CWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, @@ -56,8 +58,8 @@ enum { WORK_CPU_LAST = WORK_CPU_NONE, /* - * Reserve 6 bits off of cwq pointer w/ debugobjects turned - * off. This makes cwqs aligned to 64 bytes which isn't too + * Reserve 7 bits off of cwq pointer w/ debugobjects turned + * off. This makes cwqs aligned to 128 bytes which isn't too * excessive while allowing 15 workqueue flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c11edc9c9365..e5cb7faac58e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -468,10 +468,9 @@ static int work_next_color(int color) } /* - * Work data points to the cwq while a work is on queue. Once - * execution starts, it points to the cpu the work was last on. This - * can be distinguished by comparing the data value against - * PAGE_OFFSET. + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. * * set_work_{cwq|cpu}() and clear_work_data() can be used to set the * cwq, cpu or clear work->data. These functions should only be @@ -494,7 +493,7 @@ static void set_work_cwq(struct work_struct *work, unsigned long extra_flags) { set_work_data(work, (unsigned long)cwq, - WORK_STRUCT_PENDING | extra_flags); + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } static void set_work_cpu(struct work_struct *work, unsigned int cpu) @@ -507,25 +506,24 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_CPU, 0); } -static inline unsigned long get_work_data(struct work_struct *work) -{ - return atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK; -} - static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) { - unsigned long data = get_work_data(work); + unsigned long data = atomic_long_read(&work->data); - return data >= PAGE_OFFSET ? 
(void *)data : NULL; + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; } static struct global_cwq *get_work_gcwq(struct work_struct *work) { - unsigned long data = get_work_data(work); + unsigned long data = atomic_long_read(&work->data); unsigned int cpu; - if (data >= PAGE_OFFSET) - return ((struct cpu_workqueue_struct *)data)->gcwq; + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -3501,14 +3499,6 @@ void __init init_workqueues(void) unsigned int cpu; int i; - /* - * The pointer part of work->data is either pointing to the - * cwq or contains the cpu number the work ran last on. Make - * sure cpu number won't overflow into kernel pointer area so - * that they can be distinguished. - */ - BUILD_BUG_ON(WORK_CPU_LAST << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET); - hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ -- cgit v1.2.3 From 8b8edefa2fffbff97f9eec8b70e78ae23abad1a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 22:09:01 +0200 Subject: fscache: convert object to use workqueue instead of slow-work Make fscache object state transition callbacks use workqueue instead of slow-work. A new dedicated unbound CPU workqueue fscache_object_wq is created. get/put callbacks are renamed and modified to take @object and are called directly from the enqueue wrapper and the work function. While at it, make all open coded instances of get/put use fscache_get/put_object(). * Unbound workqueue is used. * work_busy() output is printed instead of slow-work flags in object debugging outputs. They mean basically the same thing bit-for-bit. * sysctl fscache.object_max_active added to control concurrency. The default value is nr_cpus clamped between 4 and WQ_UNBOUND_MAX_ACTIVE. * slow_work_sleep_till_thread_needed() is replaced with a private fscache implementation, fscache_object_sleep_till_congested(), which waits on fscache_object_wq congestion. * debugfs support is dropped for now. A tracing-API-based debug facility is planned to be added. 
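The resulting enqueue pattern can be sketched minimally with a hypothetical object type (standard kref and workqueue primitives only; the real code additionally routes gets and puts through fscache's stat-counting cache ops):

	#include <linux/kref.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct my_object {
		struct kref		ref;
		struct work_struct	work;	/* set up with INIT_WORK() at creation */
	};

	static void my_object_release(struct kref *ref)
	{
		kfree(container_of(ref, struct my_object, ref));
	}

	/* Work function: drops the reference taken when the item was queued. */
	static void my_object_work_func(struct work_struct *work)
	{
		struct my_object *obj = container_of(work, struct my_object, work);

		/* ... run the object state machine ... */
		kref_put(&obj->ref, my_object_release);
	}

	/* Enqueue wrapper: the workqueue owns a reference while the item is pending. */
	static void my_object_enqueue(struct workqueue_struct *wq, struct my_object *obj)
	{
		kref_get(&obj->ref);
		if (!queue_work(wq, &obj->work))	/* already queued: drop the extra ref */
			kref_put(&obj->ref, my_object_release);
	}

Without that reference, the object could be freed while its work item is still pending, which is the lifetime problem the get-before-queue / put-in-handler pairing avoids.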
Signed-off-by: Tejun Heo Acked-by: David Howells --- Documentation/filesystems/caching/fscache.txt | 10 +-- fs/cachefiles/namei.c | 13 ++-- fs/fscache/internal.h | 7 ++ fs/fscache/main.c | 76 ++++++++++++++++++ fs/fscache/object-list.c | 11 ++- fs/fscache/object.c | 106 +++++++++++++------------- include/linux/fscache-cache.h | 9 ++- 7 files changed, 158 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a91e2e2095b0..770267af5b3e 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -343,8 +343,8 @@ This will look something like: [root@andromeda ~]# head /proc/fs/fscache/objects OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ - 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a - 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a + 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a + 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a where the first set of columns before the '|' describe the object: @@ -362,7 +362,7 @@ where the first set of columns before the '|' describe the object: EM Object's event mask EV Events raised on this object F Object flags - S Object slow-work work item flags + S Object work item busy state mask (1:pending 2:running) and the second set of columns describe the object's cookie, if present: @@ -395,8 +395,8 @@ and the following paired letters: w Show objects that don't have pending writes R Show objects that have outstanding reads r Show objects that don't have outstanding reads - S Show objects that have slow work queued - s Show objects that don't have slow work queued + S Show objects that have work queued + s Show objects that don't have work queued If neither side of a letter pair is given, then both are implied. 
For example: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a7840bf42c..42c7fafc8bfe 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], - object->fscache.flags, object->fscache.work.flags, + object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -212,7 +212,7 @@ wait_for_old_object: /* if the object we're waiting for is queued for processing, * then just put ourselves on the queue behind it */ - if (slow_work_is_queued(&xobject->fscache.work)) { + if (work_pending(&xobject->fscache.work)) { _debug("queue OBJ%x behind OBJ%x immediately", object->fscache.debug_id, xobject->fscache.debug_id); @@ -220,8 +220,7 @@ wait_for_old_object: } /* otherwise we sleep until either the object we're waiting for - * is done, or the slow-work facility wants the thread back to - * do other work */ + * is done, or the fscache_object is congested */ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); init_wait(&wait); requeue = false; @@ -229,8 +228,8 @@ wait_for_old_object: prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) break; - requeue = slow_work_sleep_till_thread_needed( - &object->fscache.work, &timeout); + + requeue = fscache_object_sleep_till_congested(&timeout); } while (timeout > 0 && !requeue); finish_wait(wq, &wait); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index edd7434ab6e5..6e0b5fb25231 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -82,6 +82,13 @@ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index add6bdb53f04..bb8d4c35c7a2 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); @@ -40,22 +41,89 @@ MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = 
"object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif /* * initialise the fs caching module */ static int __init fscache_init(void) { + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; int ret; ret = slow_work_register_user(THIS_MODULE); if (ret < 0) goto error_slow_work; + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); + ret = fscache_proc_init(); if (ret < 0) goto error_proc; +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, @@ -78,8 +146,14 @@ static int __init fscache_init(void) error_kobj: kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif fscache_proc_cleanup(); error_proc: + destroy_workqueue(fscache_object_wq); +error_object_wq: slow_work_unregister_user(THIS_MODULE); error_slow_work: return ret; @@ -96,7 +170,9 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); + unregister_sysctl_table(fscache_sysctl_header); fscache_proc_cleanup(); + destroy_workqueue(fscache_object_wq); slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 4a8eb31c5338..ebe29c581380 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -34,8 +34,8 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ #define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ u8 buf[512]; /* key and aux data buffer */ }; @@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) READS, NOREADS); FILTER(obj->events & obj->event_mask, EVENTS, NOEVENTS); - FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), - WORK, NOWORK); + FILTER(work_busy(&obj->work), WORK, NOWORK); } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", obj->debug_id, obj->parent ? 
obj->parent->debug_id : -1, fscache_object_states_short[obj->state], @@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, obj->events, obj->flags, - obj->work.flags); + work_busy(&obj->work)); no_cookie = true; keylen = auxlen = 0; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0b589a9b4ffc..b6b897c550ac 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,7 +14,6 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include -#include #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_DEAD] = "DEAD", }; -static void fscache_object_slow_work_put_ref(struct slow_work *); -static int fscache_object_slow_work_get_ref(struct slow_work *); -static void fscache_object_slow_work_execute(struct slow_work *); -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); -#endif +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); -const struct slow_work_ops fscache_object_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_object_slow_work_get_ref, - .put_ref = fscache_object_slow_work_put_ref, - .execute = fscache_object_slow_work_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_object_slow_work_desc, -#endif -}; -EXPORT_SYMBOL(fscache_object_slow_work_ops); - /* * we need to notify the parent when an op completes that we had outstanding * upon it @@ -345,7 +329,7 @@ unsupported_event: /* * execute an object */ -static void fscache_object_slow_work_execute(struct slow_work *work) +void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work) if (object->events & object->event_mask) fscache_enqueue_object(object); clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); } - -/* - * describe an object for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *work, - struct seq_file *m) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - seq_printf(m, "FSC: OBJ%x: %s", - object->debug_id, - fscache_object_states_short[object->state]); -} -#endif +EXPORT_SYMBOL(fscache_object_work_func); /* * initialise an object @@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object) _enter(""); ASSERT(object->cookie != NULL); ASSERT(object->cookie->parent != NULL); - ASSERT(list_empty(&object->work.link)); if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | (1 << FSCACHE_OBJECT_EV_RELEASE) | @@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object) object->parent = NULL; } - /* this just shifts the object release to the slow work processor */ - fscache_stat(&fscache_n_cop_put_object); - 
object->cache->ops->put_object(object); - fscache_stat_d(&fscache_n_cop_put_object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); _leave(""); } @@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache, } /* - * allow the slow work item processor to get a ref on an object + * get a ref on an object */ -static int fscache_object_slow_work_get_ref(struct slow_work *work) +static int fscache_get_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); int ret; fscache_stat(&fscache_n_cop_grab_object); @@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) } /* - * allow the slow work item processor to discard a ref on a work item + * discard a ref on a work item */ -static void fscache_object_slow_work_put_ref(struct slow_work *work) +static void fscache_put_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); - fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); @@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - slow_work_enqueue(&object->work); + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timeoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. 
+ */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); } +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* * enqueue the dependents of an object for metadata-type processing @@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); - fscache_stat(&fscache_n_cop_put_object); - dep->cache->ops->put_object(dep); - fscache_stat_d(&fscache_n_cop_put_object); + fscache_put_object(dep); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c57db27ac861..27c8df503152 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -21,6 +21,7 @@ #include <linux/fscache.h> #include <linux/sched.h> #include <linux/slow-work.h> +#include <linux/workqueue.h> #define NR_MAXCACHES BITS_PER_LONG @@ -389,7 +390,7 @@ struct fscache_object { struct fscache_cache *cache; /* cache that supplied this object */ struct fscache_cookie *cookie; /* netfs's file/index object */ struct fscache_object *parent; /* parent object */ - struct slow_work work; /* attention scheduling record */ + struct work_struct work; /* attention scheduling record */ struct list_head dependents; /* FIFO of dependent objects */ struct list_head dep_link; /* link in parent's dependents list */ struct list_head pending_ops; /* unstarted operations on this object */ @@ -411,7 +412,7 @@ extern const char *fscache_object_states[]; (test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) && \ (obj)->state >= FSCACHE_OBJECT_DYING) -extern const struct slow_work_ops fscache_object_slow_work_ops; +extern void fscache_object_work_func(struct work_struct *work); /** * fscache_object_init - Initialise a cache object description @@ -433,7 +434,7 @@ void fscache_object_init(struct fscache_object *object, spin_lock_init(&object->lock); INIT_LIST_HEAD(&object->cache_link); INIT_HLIST_NODE(&object->cookie_link); - vslow_work_init(&object->work, &fscache_object_slow_work_ops); + INIT_WORK(&object->work, fscache_object_work_func); INIT_LIST_HEAD(&object->dependents); INIT_LIST_HEAD(&object->dep_link); INIT_LIST_HEAD(&object->pending_ops); @@ -534,6 +535,8 @@ extern void fscache_io_error(struct fscache_cache *cache); extern void fscache_mark_pages_cached(struct fscache_retrieval *op, struct pagevec *pagevec); +extern bool fscache_object_sleep_till_congested(signed long *timeoutp); + extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen); -- cgit v1.2.3 From 8af7c12436803291c90295259db23d371a7ad9cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 22:09:01 +0200 Subject: fscache: convert operation to use workqueue instead of slow-work Make fscache operation use only workqueue instead of a combination of workqueue and slow-work. FSCACHE_OP_SLOW is dropped and FSCACHE_OP_FAST is renamed to FSCACHE_OP_ASYNC and uses the newly added fscache_op_wq workqueue to execute op->processor(). fscache_operation_init_slow() is dropped and fscache_operation_init() now takes the @processor argument directly. * Unbound workqueue is used. 
* fscache_retrieval_work() is no longer necessary as OP_ASYNC now does the equivalent thing. * sysctl fscache.operation_max_active added to control concurrency. The default value is nr_cpus clamped between 2 and WQ_UNBOUND_MAX_ACTIVE. * debugfs support is dropped for now. Tracing API based debug facility is planned to be added. Signed-off-by: Tejun Heo Acked-by: David Howells --- fs/cachefiles/rdwr.c | 4 +-- fs/fscache/internal.h | 1 + fs/fscache/main.c | 23 +++++++++++++++ fs/fscache/operation.c | 67 ++++++------------------------------------- fs/fscache/page.c | 36 ++++++----------------- include/linux/fscache-cache.h | 37 +++++++----------------- 6 files changed, 53 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 0f0d41fbb03f..0e3c0924cc3a 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 6e0b5fb25231..6a026441c5a6 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -83,6 +83,7 @@ extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); static inline bool fscache_object_congested(void) diff --git a/fs/fscache/main.c b/fs/fscache/main.c index bb8d4c35c7a2..44d13ddab2cc 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -42,11 +42,13 @@ MODULE_PARM_DESC(fscache_debug, struct kobject *fscache_root; struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); /* these values serve as lower bounds, will be adjusted in fscache_init() */ static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; #ifdef CONFIG_SYSCTL static struct ctl_table_header *fscache_sysctl_header; @@ -74,6 +76,14 @@ ctl_table fscache_sysctls[] = { .proc_handler = fscache_max_active_sysctl, .extra1 = &fscache_object_wq, }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, {} }; @@ -110,6 +120,16 @@ static int __init fscache_init(void) if (!fscache_object_wq) goto error_object_wq; + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + for_each_possible_cpu(cpu) init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); @@ -152,6 +172,8 @@ error_sysctl: #endif fscache_proc_cleanup(); error_proc: + destroy_workqueue(fscache_op_wq); +error_op_wq: 
destroy_workqueue(fscache_object_wq); error_object_wq: slow_work_unregister_user(THIS_MODULE); @@ -172,6 +194,7 @@ static void __exit fscache_exit(void) kmem_cache_destroy(fscache_cookie_jar); unregister_sysctl_table(fscache_sysctl_header); fscache_proc_cleanup(); + destroy_workqueue(fscache_op_wq); destroy_workqueue(fscache_object_wq); slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f17cecafae44..b9f34eaede09 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); + case FSCACHE_OP_ASYNC: + _debug("queue async"); atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) + if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; case FSCACHE_OP_MYTHREAD: _debug("queue for caller's attention"); break; @@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work) } /* - * allow the slow work item processor to get a ref on an operation - */ -static int fscache_op_get_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - atomic_inc(&op->usage); - return 0; -} - -/* - * allow the slow work item processor to discard a ref on an operation - */ -static void fscache_op_put_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - fscache_put_operation(op); -} - -/* - * execute an operation using the slow thread pool to provide processing context - * - the caller holds a ref to this object, so we don't need to hold one + * execute an operation using fscache_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one */ -static void fscache_op_execute(struct slow_work *work) +void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); + container_of(work, struct fscache_operation, work); unsigned long start; _enter("{OBJ%x OP%x,%d}", @@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work) start = jiffies; op->processor(op); fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); _leave(""); } -/* - * describe an operation for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_op_desc(struct slow_work *work, struct seq_file *m) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", - op->object->debug_id, op->debug_id, - op->name, op->state, op->flags); -} -#endif - -const struct slow_work_ops fscache_op_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_op_get_ref, - .put_ref = fscache_op_put_ref, - .execute = fscache_op_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_op_desc, -#endif -}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 723b889fd219..41c441c2058d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, page_busy: /* we might want to wait here, but that could deadlock the allocator as - * the slow-work threads 
writing to the cache may all end up sleeping + * the work threads writing to the cache may all end up sleeping * on memory allocation */ fscache_stat(&fscache_n_store_vmscan_busy); return false; @@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, NULL); - fscache_operation_init_slow(op, fscache_attr_changed_op); - op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -217,24 +216,6 @@ nobufs: } EXPORT_SYMBOL(__fscache_attr_changed); -/* - * handle secondary execution given to a retrieval op on behalf of the - * cache - */ -static void fscache_retrieval_work(struct work_struct *work) -{ - struct fscache_retrieval *op = - container_of(work, struct fscache_retrieval, op.fast_work); - unsigned long start; - - _enter("{OP%x}", op->op.debug_id); - - start = jiffies; - op->op.processor(&op->op); - fscache_hist(fscache_ops_histogram, start); - fscache_put_operation(&op->op); -} - /* * release a retrieval op reference */ @@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; - INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); fscache_set_op_name(&op->op, "Retr"); return op; @@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_release_write_op); - fscache_operation_init_slow(&op->op, fscache_write_op); - op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); @@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_store_ops); fscache_stat(&fscache_n_stores_ok); - /* the slow work queue now carries its own ref on the object */ + /* the work queue now carries its own ref on the object */ fscache_put_operation(&op->op); _leave(" = 0"); return 0; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 27c8df503152..17ed9c1dbfbe 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -77,18 +77,14 @@ typedef void (*fscache_operation_release_t)(struct fscache_operation *op); typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); struct fscache_operation { - union { - struct work_struct fast_work; /* record for fast ops */ - struct slow_work slow_work; /* record for (very) slow ops */ - }; + struct work_struct work; /* record for async ops */ struct list_head pend_link; /* link in object->pending_ops */ struct fscache_object *object; /* object to be operated upon */ unsigned long flags; #define FSCACHE_OP_TYPE 0x000f /* operation type */ -#define FSCACHE_OP_FAST 0x0001 /* - fast op, processor may not sleep for disk */ -#define FSCACHE_OP_SLOW 0x0002 /* - (very) slow op, processor may sleep for disk */ -#define FSCACHE_OP_MYTHREAD 0x0003 /* - 
processing is done be issuing thread, not pool */ +#define FSCACHE_OP_ASYNC 0x0001 /* - async op, processor may sleep for disk */ +#define FSCACHE_OP_MYTHREAD 0x0002 /* - processing is done by issuing thread, not pool */ #define FSCACHE_OP_WAITING 4 /* cleared when op is woken */ #define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */ #define FSCACHE_OP_DEAD 6 /* op is now dead */ @@ -106,7 +102,8 @@ struct fscache_operation { /* operation releaser */ fscache_operation_release_t release; -#ifdef CONFIG_SLOW_WORK_DEBUG +#ifdef CONFIG_WORKQUEUE_DEBUGFS + struct work_struct put_work; /* work to delay operation put */ const char *name; /* operation name */ const char *state; /* operation state */ #define fscache_set_op_name(OP, N) do { (OP)->name = (N); } while(0) @@ -118,7 +115,7 @@ struct fscache_operation { }; extern atomic_t fscache_op_debug_id; -extern const struct slow_work_ops fscache_op_slow_work_ops; +extern void fscache_op_work_func(struct work_struct *work); extern void fscache_enqueue_operation(struct fscache_operation *); extern void fscache_put_operation(struct fscache_operation *); @@ -129,33 +126,21 @@ extern void fscache_put_operation(struct fscache_operation *); * @release: The release function to assign * * Do basic initialisation of an operation. The caller must still set flags, - * object, either fast_work or slow_work if necessary, and processor if needed. + * object and processor if needed. */ static inline void fscache_operation_init(struct fscache_operation *op, - fscache_operation_release_t release) + fscache_operation_processor_t processor, + fscache_operation_release_t release) { + INIT_WORK(&op->work, fscache_op_work_func); atomic_set(&op->usage, 1); op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; op->release = release; INIT_LIST_HEAD(&op->pend_link); fscache_set_op_state(op, "Init"); } -/** - * fscache_operation_init_slow - Do additional initialisation of a slow op - * @op: The operation to initialise - * @processor: The processor function to assign - * - * Do additional initialisation of an operation as required for slow work. - */ -static inline -void fscache_operation_init_slow(struct fscache_operation *op, - fscache_operation_processor_t processor) -{ - op->processor = processor; - slow_work_init(&op->slow_work, &fscache_op_slow_work_ops); -} - /* * data read operation */ -- cgit v1.2.3 From d098adfb7d281258173a43151483e52e21761021 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 22:09:01 +0200 Subject: fscache: drop references to slow-work fscache no longer uses slow-work. Drop references to it. Signed-off-by: Tejun Heo Acked-by: David Howells --- fs/fscache/Kconfig | 1 - fs/fscache/main.c | 7 ------- include/linux/fscache-cache.h | 1 - 3 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index cc94bb9563f2..3f6dfa989881 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -1,7 +1,6 @@ config FSCACHE tristate "General filesystem local caching manager" - select SLOW_WORK help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. 
diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 44d13ddab2cc..500936d9fff2 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -106,10 +106,6 @@ static int __init fscache_init(void) unsigned int cpu; int ret; - ret = slow_work_register_user(THIS_MODULE); - if (ret < 0) - goto error_slow_work; - fscache_object_max_active = clamp_val(nr_cpus, fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); @@ -176,8 +172,6 @@ error_proc: error_op_wq: destroy_workqueue(fscache_object_wq); error_object_wq: - slow_work_unregister_user(THIS_MODULE); -error_slow_work: return ret; } @@ -196,7 +190,6 @@ static void __exit fscache_exit(void) fscache_proc_cleanup(); destroy_workqueue(fscache_op_wq); destroy_workqueue(fscache_object_wq); - slow_work_unregister_user(THIS_MODULE); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 17ed9c1dbfbe..b8581c09d19f 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -20,7 +20,6 @@ #include <linux/fscache.h> #include <linux/sched.h> -#include <linux/slow-work.h> #include <linux/workqueue.h> #define NR_MAXCACHES BITS_PER_LONG -- cgit v1.2.3 From 183d03cc4ff39e0f0d952c09aa96d0abfd6e0c3c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 17 May 2010 17:08:21 +0100 Subject: xen: Xen PCI platform device driver. Add the Xen PCI platform device driver that is responsible for initializing the grant table and xenbus in PV on HVM mode. A few changes to xenbus and the grant table are necessary to allow the delayed initialization in HVM mode. The grant table needs a few additional modifications to work in HVM mode. The Xen PCI platform device raises an irq every time an event has been delivered to us. However, these interrupts are only delivered to vcpu 0. The Xen PCI platform interrupt handler calls xen_hvm_evtchn_do_upcall, which is a little wrapper around __xen_evtchn_do_upcall, the traditional Xen upcall handler, the very same used with traditional PV guests. When running on HVM the event channel upcall is never called while in progress because it is a normal Linux irq handler (and we cannot switch the irq chip wholesale to the Xen PV ones as we are running QEMU and might have passed in PCI devices), therefore we cannot be sure that evtchn_upcall_pending is 0 when returning. For this reason, if evtchn_upcall_pending is set by Xen, we need to loop again on the event channels set pending, otherwise we might lose some event channel deliveries. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/Kconfig | 9 ++ drivers/xen/Makefile | 3 +- drivers/xen/events.c | 8 +- drivers/xen/grant-table.c | 77 +++++++++++++-- drivers/xen/manage.c | 1 + drivers/xen/platform-pci.c | 181 ++++++++++++++++++++++++++++++++++++ drivers/xen/xenbus/xenbus_probe.c | 22 ++++- include/linux/pci_ids.h | 3 + include/xen/grant_table.h | 4 + include/xen/interface/grant_table.h | 1 + 10 files changed, 291 insertions(+), 18 deletions(-) create mode 100644 drivers/xen/platform-pci.c (limited to 'include/linux') diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index fad3df2c1276..8f84b108b491 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -62,4 +62,13 @@ config XEN_SYS_HYPERVISOR virtual environment, /sys/hypervisor will still be present, but will have no xen contents. 
+config XEN_PLATFORM_PCI + tristate "xen platform pci device driver" + depends on XEN + default m + help + Driver for the Xen PCI Platform device: it is responsible for + initializing xenbus and grant_table when running in a Xen HVM + domain. As a consequence this driver is required to run any Xen PV + frontend on Xen HVM. endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 7c284342f30f..e392fb776af3 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -9,4 +9,5 @@ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o obj-$(CONFIG_XENFS) += xenfs/ -obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o \ No newline at end of file +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o diff --git a/drivers/xen/events.c b/drivers/xen/events.c index d659480125f0..7c64473c9f3f 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -665,7 +665,7 @@ static void __xen_evtchn_do_upcall(void) count = __get_cpu_var(xed_nesting_count); __get_cpu_var(xed_nesting_count) = 0; - } while(count != 1); + } while (count != 1 || vcpu_info->evtchn_upcall_pending); out: @@ -689,6 +689,7 @@ void xen_hvm_evtchn_do_upcall(void) { __xen_evtchn_do_upcall(); } +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); /* Rebind a new event channel to an existing irq. */ void rebind_evtchn_irq(int evtchn, int irq) @@ -725,7 +726,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - if (!VALID_EVTCHN(evtchn)) + /* events delivered via platform PCI interrupts are always + * routed to vcpu 0 */ + if (!VALID_EVTCHN(evtchn) || + (xen_hvm_domain() && !xen_have_vector_callback)) return -1; /* Send future instances of this interrupt to other vcpu. */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index f66db3b91d61..6c4531816496 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -37,11 +37,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -59,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); +unsigned long xen_hvm_resume_frames; +EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); static struct grant_entry *shared; @@ -433,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void) return query.max_nr_frames; } -static inline unsigned int max_nr_grant_frames(void) +unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); @@ -441,6 +445,7 @@ static inline unsigned int max_nr_grant_frames(void) return boot_max_nr_grant_frames; return xen_max; } +EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { @@ -449,6 +454,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; + if (xen_hvm_domain()) { + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + rc = 0; + /* + * Loop backwards, so that the first hypercall has the largest + * index, ensuring that the table will grow only once. 
+ */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc != 0) { + printk(KERN_WARNING + "grant table add_to_physmap failed, err=%d\n", rc); + break; + } + } while (i-- > start_idx); + + return rc; + } + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -465,7 +494,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(), + rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), &shared); BUG_ON(rc); @@ -476,9 +505,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) int gnttab_resume(void) { - if (max_nr_grant_frames() < nr_grant_frames) + unsigned int max_nr_gframes; + + max_nr_gframes = gnttab_max_grant_frames(); + if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - return gnttab_map(0, nr_grant_frames - 1); + + if (xen_pv_domain()) + return gnttab_map(0, nr_grant_frames - 1); + + if (!shared) { + shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); + if (shared == NULL) { + printk(KERN_WARNING + "Failed to ioremap gnttab share frames!"); + return -ENOMEM; + } + } + + gnttab_map(0, nr_grant_frames - 1); + + return 0; } int gnttab_suspend(void) @@ -495,7 +542,7 @@ static int gnttab_expand(unsigned int req_entries) cur = nr_grant_frames; extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / GREFS_PER_GRANT_FRAME); - if (cur + extra > max_nr_grant_frames()) + if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; rc = gnttab_map(cur, cur + extra - 1); @@ -505,15 +552,12 @@ static int gnttab_expand(unsigned int req_entries) return rc; } -static int __devinit gnttab_init(void) +int gnttab_init(void) { int i; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; - if (!xen_domain()) - return -ENODEV; - nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); @@ -556,5 +600,18 @@ static int __devinit gnttab_init(void) kfree(gnttab_list); return -ENOMEM; } +EXPORT_SYMBOL_GPL(gnttab_init); + +static int __devinit __gnttab_init(void) +{ + /* Delay grant-table initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return gnttab_init(); +} -core_initcall(gnttab_init); +core_initcall(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 07e857b0de13..af9c5594d315 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -264,5 +264,6 @@ static int __init setup_shutdown_event(void) return 0; } +EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); subsys_initcall(setup_shutdown_event); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c new file mode 100644 index 000000000000..a0ee5d06f715 --- /dev/null +++ b/drivers/xen/platform-pci.c @@ -0,0 +1,181 @@ +/****************************************************************************** + * platform-pci.c + * + * Xen platform PCI device driver + * Copyright (c) 2005, Intel Corporation. + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DRV_NAME "xen-platform-pci" + +MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); +MODULE_DESCRIPTION("Xen platform PCI device"); +MODULE_LICENSE("GPL"); + +static unsigned long platform_mmio; +static unsigned long platform_mmio_alloc; +static unsigned long platform_mmiolen; + +unsigned long alloc_xen_mmio(unsigned long len) +{ + unsigned long addr; + + addr = platform_mmio + platform_mmio_alloc; + platform_mmio_alloc += len; + BUG_ON(platform_mmio_alloc > platform_mmiolen); + + return addr; +} + +static uint64_t get_callback_via(struct pci_dev *pdev) +{ + u8 pin; + int irq; + + irq = pdev->irq; + if (irq < 16) + return irq; /* ISA IRQ */ + + pin = pdev->pin; + + /* We don't know the GSI. Specify the PCI INTx line instead. */ + return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + ((uint64_t)pci_domain_nr(pdev->bus) << 32) | + ((uint64_t)pdev->bus->number << 16) | + ((uint64_t)(pdev->devfn & 0xff) << 8) | + ((uint64_t)(pin - 1) & 3); +} + +static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) +{ + xen_hvm_evtchn_do_upcall(); + return IRQ_HANDLED; +} + +static int xen_allocate_irq(struct pci_dev *pdev) +{ + return request_irq(pdev->irq, do_hvm_evtchn_intr, + IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + "xen-platform-pci", pdev); +} + +static int __devinit platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i, ret; + long ioaddr, iolen; + long mmio_addr, mmio_len; + uint64_t callback_via; + unsigned int max_nr_gframes; + + i = pci_enable_device(pdev); + if (i) + return i; + + ioaddr = pci_resource_start(pdev, 0); + iolen = pci_resource_len(pdev, 0); + + mmio_addr = pci_resource_start(pdev, 1); + mmio_len = pci_resource_len(pdev, 1); + + if (mmio_addr == 0 || ioaddr == 0) { + dev_err(&pdev->dev, "no resources found\n"); + ret = -ENOENT; + goto pci_out; + } + + if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", + mmio_addr, mmio_len); + ret = -EBUSY; + goto pci_out; + } + + if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", + iolen, ioaddr); + ret = -EBUSY; + goto mem_out; + } + + platform_mmio = mmio_addr; + platform_mmiolen = mmio_len; + + if (!xen_have_vector_callback) { + ret = xen_allocate_irq(pdev); + if (ret) { + dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); + goto out; + } + callback_via = get_callback_via(pdev); + ret = xen_set_callback_via(callback_via); + if (ret) { + dev_warn(&pdev->dev, "Unable to set the evtchn callback " + "err=%d\n", ret); + goto out; + } + } + + max_nr_gframes = gnttab_max_grant_frames(); + xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_init(); + if (ret) + goto out; + xenbus_probe(NULL); + return 0; + +out: + release_region(ioaddr, iolen); +mem_out: + release_mem_region(mmio_addr, mmio_len); +pci_out: + pci_disable_device(pdev); + 
return ret; +} + +static struct pci_device_id platform_pci_tbl[] __devinitdata = { + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, platform_pci_tbl); + +static struct pci_driver platform_driver = { + .name = DRV_NAME, + .probe = platform_pci_init, + .id_table = platform_pci_tbl, +}; + +static int __init platform_pci_module_init(void) +{ + return pci_register_driver(&platform_driver); +} + +module_init(platform_pci_module_init); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d96fa75b45ec..a9e83c438cbb 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -781,8 +781,23 @@ void xenbus_probe(struct work_struct *unused) /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } +EXPORT_SYMBOL_GPL(xenbus_probe); -static int __init xenbus_probe_init(void) +static int __init xenbus_probe_initcall(void) +{ + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain() || xen_hvm_domain()) + return 0; + + xenbus_probe(NULL); + return 0; +} + +device_initcall(xenbus_probe_initcall); + +static int __init xenbus_init(void) { int err = 0; @@ -834,9 +849,6 @@ static int __init xenbus_probe_init(void) goto out_unreg_back; } - if (!xen_initial_domain()) - xenbus_probe(NULL); - #ifdef CONFIG_XEN_COMPAT_XENFS /* * Create xenfs mountpoint in /proc for compatibility with @@ -857,7 +869,7 @@ static int __init xenbus_probe_init(void) return err; } -postcore_initcall(xenbus_probe_init); +postcore_initcall(xenbus_init); MODULE_LICENSE("GPL"); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3bedcc149c84..cca2526f28d7 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2772,3 +2772,6 @@ #define PCI_DEVICE_ID_RME_DIGI32 0x9896 #define PCI_DEVICE_ID_RME_DIGI32_PRO 0x9897 #define PCI_DEVICE_ID_RME_DIGI32_8 0x9898 + +#define PCI_VENDOR_ID_XEN 0x5853 +#define PCI_DEVICE_ID_XEN_PLATFORM 0x0001 diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index a40f1cd91be1..9a731706a016 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -51,6 +51,7 @@ struct gnttab_free_callback { u16 count; }; +int gnttab_init(void); int gnttab_suspend(void); int gnttab_resume(void); @@ -112,6 +113,9 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, void arch_gnttab_unmap_shared(struct grant_entry *shared, unsigned long nr_gframes); +extern unsigned long xen_hvm_resume_frames; +unsigned int gnttab_max_grant_frames(void); + #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) #endif /* __ASM_GNTTAB_H__ */ diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h index 39da93c21de0..39e571796e32 100644 --- a/include/xen/interface/grant_table.h +++ b/include/xen/interface/grant_table.h @@ -28,6 +28,7 @@ #ifndef __XEN_PUBLIC_GRANT_TABLE_H__ #define __XEN_PUBLIC_GRANT_TABLE_H__ +#include /*********************************** * GRANT TABLE REPRESENTATION -- cgit v1.2.3 From 2f1b7cd29fa4917f19d2624afc773d941684c5df Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 22 Jul 2010 03:22:18 +0900 Subject: nilfs2: clarify byte offset in super block format This inserts comments indicating the hexadecimal offset in the declaration of the nilfs_super_block structure so that people can know the offset of its fields without counting from the head.
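For illustration, the annotated offsets can be cross-checked from user space with offsetof(); a minimal sketch (not part of the patch itself), assuming the structure declaration is reachable through <linux/nilfs2_fs.h>:

	/* Print the byte offset of a few fields; each value should match
	 * the hexadecimal comment added to the declaration below. */
	#include <stdio.h>
	#include <stddef.h>
	#include <linux/nilfs2_fs.h>

	int main(void)
	{
		printf("s_sum:         0x%02zx\n", offsetof(struct nilfs_super_block, s_sum));
		printf("s_last_pseg:   0x%02zx\n", offsetof(struct nilfs_super_block, s_last_pseg));
		printf("s_inode_size:  0x%02zx\n", offsetof(struct nilfs_super_block, s_inode_size));
		printf("s_volume_name: 0x%02zx\n", offsetof(struct nilfs_super_block, s_volume_name));
		return 0;
	}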
Signed-off-by: Ryusuke Konishi --- include/linux/nilfs2_fs.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 8c2c6116e788..cc3465e5be38 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -160,7 +160,7 @@ struct nilfs_super_root { * struct nilfs_super_block - structure of super block on disk */ struct nilfs_super_block { - __le32 s_rev_level; /* Revision level */ +/*00*/ __le32 s_rev_level; /* Revision level */ __le16 s_minor_rev_level; /* minor revision level */ __le16 s_magic; /* Magic signature */ @@ -169,47 +169,47 @@ struct nilfs_super_block { is excluded. */ __le16 s_flags; /* flags */ __le32 s_crc_seed; /* Seed value of CRC calculation */ - __le32 s_sum; /* Check sum of super block */ +/*10*/ __le32 s_sum; /* Check sum of super block */ __le32 s_log_block_size; /* Block size represented as follows blocksize = 1 << (s_log_block_size + 10) */ __le64 s_nsegments; /* Number of segments in filesystem */ - __le64 s_dev_size; /* block device size in bytes */ +/*20*/ __le64 s_dev_size; /* block device size in bytes */ __le64 s_first_data_block; /* 1st seg disk block number */ - __le32 s_blocks_per_segment; /* number of blocks per full segment */ +/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ __le32 s_r_segments_percentage; /* Reserved segments percentage */ __le64 s_last_cno; /* Last checkpoint number */ - __le64 s_last_pseg; /* disk block addr pseg written last */ +/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ __le64 s_last_seq; /* seq. number of seg written last */ - __le64 s_free_blocks_count; /* Free blocks count */ +/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ __le64 s_ctime; /* Creation time (execution time of newfs) */ - __le64 s_mtime; /* Mount time */ +/*60*/ __le64 s_mtime; /* Mount time */ __le64 s_wtime; /* Write time */ - __le16 s_mnt_count; /* Mount count */ +/*70*/ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le64 s_lastcheck; /* time of last check */ - __le32 s_checkinterval; /* max. time between checks */ +/*80*/ __le32 s_checkinterval; /* max. 
time between checks */ __le32 s_creator_os; /* OS */ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ __le32 s_first_ino; /* First non-reserved inode */ - __le16 s_inode_size; /* Size of an inode */ +/*90*/ __le16 s_inode_size; /* Size of an inode */ __le16 s_dat_entry_size; /* Size of a dat entry */ __le16 s_checkpoint_size; /* Size of a checkpoint */ __le16 s_segment_usage_size; /* Size of a segment usage */ - __u8 s_uuid[16]; /* 128-bit uuid for volume */ - char s_volume_name[80]; /* volume name */ +/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*A8*/ char s_volume_name[80]; /* volume name */ - __le32 s_c_interval; /* Commit interval of segment */ +/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ __le32 s_c_block_max; /* Threshold of data amount for the segment construction */ __u32 s_reserved[192]; /* padding to the end of the block */ -- cgit v1.2.3 From 1a80a1763fb760b3a84a28df87515f7cdc07a4f4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 22 Jul 2010 03:22:19 +0900 Subject: nilfs2: add feature set fields to super block This adds three new fields to the nilfs_super_block structure: a compatible feature set, a read-only compatible feature set, and an incompatible feature set, in order to prepare for future disk format modifications. The role of these fields conforms to those of ext3 and other filesystems. The most important flags are in the incompatible feature set; it is used to refuse to mount a filesystem that sets an incompatible feature the kernel doesn't know about. Signed-off-by: Ryusuke Konishi --- include/linux/nilfs2_fs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index cc3465e5be38..7dd4cd494490 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -212,7 +212,10 @@ struct nilfs_super_block { /*F8*/ __le32 s_c_interval; /* Commit interval of segment */ __le32 s_c_block_max; /* Threshold of data amount for the segment construction */ - __u32 s_reserved[192]; /* padding to the end of the block */ +/*100*/ __le64 s_feature_compat; /* Compatible feature set */ + __le64 s_feature_compat_ro; /* Read-only compatible feature set */ + __le64 s_feature_incompat; /* Incompatible feature set */ + __u32 s_reserved[186]; /* padding to the end of the block */ }; /* @@ -227,6 +230,16 @@ struct nilfs_super_block { #define NILFS_CURRENT_REV 2 /* current major revision */ #define NILFS_MINOR_REV 0 /* minor revision */ +/* + * Feature set definitions + * + * If there is a bit set in the incompatible feature set that the kernel + * doesn't know about, it should refuse to mount the filesystem. + */ +#define NILFS_FEATURE_COMPAT_SUPP 0ULL +#define NILFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define NILFS_FEATURE_INCOMPAT_SUPP 0ULL + /* * Bytes count of super_block for CRC-calculation */ -- cgit v1.2.3 From 43d2932d88e4ab776dd388c20b003ebd5e1d1f1f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Jul 2010 14:22:21 +0200 Subject: quota: Use mark_inode_dirty_sync instead of mark_inode_dirty Quota code never touches file data. It just modifies i_blocks + i_bytes of inodes and inode flags of quota files. So use mark_inode_dirty_sync instead of mark_inode_dirty.
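As background, the two helpers differ only in the dirty flags they pass down, which is why this change avoids needless data writeback; a sketch of the definitions being relied on here (see include/linux/fs.h for the authoritative versions):

	/* Dirties the inode and its data; I_DIRTY expands to
	 * I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES. */
	static inline void mark_inode_dirty(struct inode *inode)
	{
		__mark_inode_dirty(inode, I_DIRTY);
	}

	/* Dirties only the inode itself, which is all that is needed
	 * when just i_blocks, i_bytes or inode flags have changed. */
	static inline void mark_inode_dirty_sync(struct inode *inode)
	{
		__mark_inode_dirty(inode, I_DIRTY_SYNC);
	}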
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 +- include/linux/quotaops.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b171221000fa..a7023bcfae4f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1992,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) truncate_inode_pages(&toputinode[cnt]->i_data, 0); mutex_unlock(&toputinode[cnt]->i_mutex); - mark_inode_dirty(toputinode[cnt]); + mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); } diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 4881b49b1a9a..d50ba858cfe0 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -266,7 +266,7 @@ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); - mark_inode_dirty(inode); + mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) @@ -275,7 +275,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) - mark_inode_dirty(inode); + mark_inode_dirty_sync(inode); return ret; } @@ -305,7 +305,7 @@ static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) - mark_inode_dirty(inode); + mark_inode_dirty_sync(inode); return ret; } @@ -321,7 +321,7 @@ static inline int dquot_claim_block(struct inode *inode, qsize_t nr) ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); if (!ret) - mark_inode_dirty(inode); + mark_inode_dirty_sync(inode); return ret; } @@ -333,7 +333,7 @@ static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); + mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) -- cgit v1.2.3 From 181a51f6e040d0ac006d6adaf4a031ffa440f41c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 22:09:02 +0200 Subject: slow-work: kill it slow-work doesn't have any user left. Kill it. Signed-off-by: Tejun Heo Acked-by: David Howells --- Documentation/slow-work.txt | 322 ------------- include/linux/slow-work.h | 163 ------- init/Kconfig | 24 - kernel/Makefile | 2 - kernel/slow-work-debugfs.c | 227 --------- kernel/slow-work.c | 1068 ------------------------------------------- kernel/slow-work.h | 72 --- kernel/sysctl.c | 8 - 8 files changed, 1886 deletions(-) delete mode 100644 Documentation/slow-work.txt delete mode 100644 include/linux/slow-work.h delete mode 100644 kernel/slow-work-debugfs.c delete mode 100644 kernel/slow-work.c delete mode 100644 kernel/slow-work.h (limited to 'include/linux') diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt deleted file mode 100644 index 9dbf4470c7e1..000000000000 --- a/Documentation/slow-work.txt +++ /dev/null @@ -1,322 +0,0 @@ - ==================================== - SLOW WORK ITEM EXECUTION THREAD POOL - ==================================== - -By: David Howells - -The slow work item execution thread pool is a pool of threads for performing -things that take a relatively long time, such as making mkdir calls. 
-Typically, when processing something, these items will spend a lot of time -blocking a thread on I/O, thus making that thread unavailable for doing other -work. - -The standard workqueue model is unsuitable for this class of work item as that -limits the owner to a single thread or a single thread per CPU. For some -tasks, however, more threads - or fewer - are required. - -There is just one pool per system. It contains no threads unless something -wants to use it - and that something must register its interest first. When -the pool is active, the number of threads it contains is dynamic, varying -between a maximum and minimum setting, depending on the load. - - -==================== -CLASSES OF WORK ITEM -==================== - -This pool support two classes of work items: - - (*) Slow work items. - - (*) Very slow work items. - -The former are expected to finish much quicker than the latter. - -An operation of the very slow class may do a batch combination of several -lookups, mkdirs, and a create for instance. - -An operation of the ordinarily slow class may, for example, write stuff or -expand files, provided the time taken to do so isn't too long. - -Operations of both types may sleep during execution, thus tying up the thread -loaned to it. - -A further class of work item is available, based on the slow work item class: - - (*) Delayed slow work items. - -These are slow work items that have a timer to defer queueing of the item for -a while. - - -THREAD-TO-CLASS ALLOCATION --------------------------- - -Not all the threads in the pool are available to work on very slow work items. -The number will be between one and one fewer than the number of active threads. -This is configurable (see the "Pool Configuration" section). - -All the threads are available to work on ordinarily slow work items, but a -percentage of the threads will prefer to work on very slow work items. - -The configuration ensures that at least one thread will be available to work on -very slow work items, and at least one thread will be available that won't work -on very slow work items at all. - - -===================== -USING SLOW WORK ITEMS -===================== - -Firstly, a module or subsystem wanting to make use of slow work items must -register its interest: - - int ret = slow_work_register_user(struct module *module); - -This will return 0 if successful, or a -ve error upon failure. The module -pointer should be the module interested in using this facility (almost -certainly THIS_MODULE). - - -Slow work items may then be set up by: - - (1) Declaring a slow_work struct type variable: - - #include - - struct slow_work myitem; - - (2) Declaring the operations to be used for this item: - - struct slow_work_ops myitem_ops = { - .get_ref = myitem_get_ref, - .put_ref = myitem_put_ref, - .execute = myitem_execute, - }; - - [*] For a description of the ops, see section "Item Operations". - - (3) Initialising the item: - - slow_work_init(&myitem, &myitem_ops); - - or: - - delayed_slow_work_init(&myitem, &myitem_ops); - - or: - - vslow_work_init(&myitem, &myitem_ops); - - depending on its class. - -A suitably set up work item can then be enqueued for processing: - - int ret = slow_work_enqueue(&myitem); - -This will return a -ve error if the thread pool is unable to gain a reference -on the item, 0 otherwise, or (for delayed work): - - int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay); - - -The items are reference counted, so there ought to be no need for a flush -operation. 
But as the reference counting is optional, means to cancel -existing work items are also included: - - cancel_slow_work(&myitem); - cancel_delayed_slow_work(&myitem); - -can be used to cancel pending work. The above cancel function waits for -existing work to have been executed (or prevent execution of them, depending -on timing). - - -When all a module's slow work items have been processed, and the -module has no further interest in the facility, it should unregister its -interest: - - slow_work_unregister_user(struct module *module); - -The module pointer is used to wait for all outstanding work items for that -module before completing the unregistration. This prevents the put_ref() code -from being taken away before it completes. module should almost certainly be -THIS_MODULE. - - -================ -HELPER FUNCTIONS -================ - -The slow-work facility provides a function by which it can be determined -whether or not an item is queued for later execution: - - bool queued = slow_work_is_queued(struct slow_work *work); - -If it returns false, then the item is not on the queue (it may be executing -with a requeue pending). This can be used to work out whether an item on which -another depends is on the queue, thus allowing a dependent item to be queued -after it. - -If the above shows an item on which another depends not to be queued, then the -owner of the dependent item might need to wait. However, to avoid locking up -the threads unnecessarily be sleeping in them, it can make sense under some -circumstances to return the work item to the queue, thus deferring it until -some other items have had a chance to make use of the yielded thread. - -To yield a thread and defer an item, the work function should simply enqueue -the work item again and return. However, this doesn't work if there's nothing -actually on the queue, as the thread just vacated will jump straight back into -the item's work function, thus busy waiting on a CPU. - -Instead, the item should use the thread to wait for the dependency to go away, -but rather than using schedule() or schedule_timeout() to sleep, it should use -the following function: - - bool requeue = slow_work_sleep_till_thread_needed( - struct slow_work *work, - signed long *_timeout); - -This will add a second wait and then sleep, such that it will be woken up if -either something appears on the queue that could usefully make use of the -thread - and behind which this item can be queued, or if the event the caller -set up to wait for happens. True will be returned if something else appeared -on the queue and this work function should perhaps return, of false if -something else woke it up. The timeout is as for schedule_timeout(). - -For example: - - wq = bit_waitqueue(&my_flags, MY_BIT); - init_wait(&wait); - requeue = false; - do { - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - if (!test_bit(MY_BIT, &my_flags)) - break; - requeue = slow_work_sleep_till_thread_needed(&my_work, - &timeout); - } while (timeout > 0 && !requeue); - finish_wait(wq, &wait); - if (!test_bit(MY_BIT, &my_flags) - goto do_my_thing; - if (requeue) - return; // to slow_work - - -=============== -ITEM OPERATIONS -=============== - -Each work item requires a table of operations of type struct slow_work_ops. -Only ->execute() is required; the getting and putting of a reference and the -describing of an item are all optional. 
- - (*) Get a reference on an item: - - int (*get_ref)(struct slow_work *work); - - This allows the thread pool to attempt to pin an item by getting a - reference on it. This function should return 0 if the reference was - granted, or a -ve error otherwise. If an error is returned, - slow_work_enqueue() will fail. - - The reference is held whilst the item is queued and whilst it is being - executed. The item may then be requeued with the same reference held, or - the reference will be released. - - (*) Release a reference on an item: - - void (*put_ref)(struct slow_work *work); - - This allows the thread pool to unpin an item by releasing the reference on - it. The thread pool will not touch the item again once this has been - called. - - (*) Execute an item: - - void (*execute)(struct slow_work *work); - - This should perform the work required of the item. It may sleep, it may - perform disk I/O and it may wait for locks. - - (*) View an item through /proc: - - void (*desc)(struct slow_work *work, struct seq_file *m); - - If supplied, this should print to 'm' a small string describing the work - the item is to do. This should be no more than about 40 characters, and - shouldn't include a newline character. - - See the 'Viewing executing and queued items' section below. - - -================== -POOL CONFIGURATION -================== - -The slow-work thread pool has a number of configurables: - - (*) /proc/sys/kernel/slow-work/min-threads - - The minimum number of threads that should be in the pool whilst it is in - use. This may be anywhere between 2 and max-threads. - - (*) /proc/sys/kernel/slow-work/max-threads - - The maximum number of threads that should in the pool. This may be - anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. - - (*) /proc/sys/kernel/slow-work/vslow-percentage - - The percentage of active threads in the pool that may be used to execute - very slow work items. This may be between 1 and 99. The resultant number - is bounded to between 1 and one fewer than the number of active threads. - This ensures there is always at least one thread that can process very - slow work items, and always at least one thread that won't. - - -================================== -VIEWING EXECUTING AND QUEUED ITEMS -================================== - -If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available: - - /sys/kernel/debug/slow_work/runqueue - -through which the list of work items being executed and the queues of items to -be executed may be viewed. The owner of a work item is given the chance to -add some information of its own. 
- -The contents look something like the following: - - THR PID ITEM ADDR FL MARK DESC - === ===== ================ == ===== ========== - 0 3005 ffff880023f52348 a 952ms FSC: OBJ17d3: LOOK - 1 3006 ffff880024e33668 2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2 - 2 3165 ffff8800296dd180 a 424ms FSC: OBJ17e4: LOOK - 3 4089 ffff8800262c8d78 a 212ms FSC: OBJ17ea: CRTN - 4 4090 ffff88002792bed8 2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2 - 5 4092 ffff88002a0ef308 2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2 - 6 4094 ffff88002abaf4b8 2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2 - 7 4095 ffff88002bb188e0 a 388ms FSC: OBJ17e9: CRTN - vsq - ffff880023d99668 1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2 - vsq - ffff8800295d1740 1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2 - vsq - ffff880025ba3308 1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2 - vsq - ffff880024ec83e0 1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2 - vsq - ffff880026618e00 1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2 - vsq - ffff880025a2a4b8 1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2 - vsq - ffff880023cbe6d8 9 212ms FSC: OBJ17eb: LOOK - vsq - ffff880024d37590 9 212ms FSC: OBJ17ec: LOOK - vsq - ffff880027746cb0 9 212ms FSC: OBJ17ed: LOOK - vsq - ffff880024d37ae8 9 212ms FSC: OBJ17ee: LOOK - vsq - ffff880024d37cb0 9 212ms FSC: OBJ17ef: LOOK - vsq - ffff880025036550 9 212ms FSC: OBJ17f0: LOOK - vsq - ffff8800250368e0 9 212ms FSC: OBJ17f1: LOOK - vsq - ffff880025036aa8 9 212ms FSC: OBJ17f2: LOOK - -In the 'THR' column, executing items show the thread they're occupying and -queued threads indicate which queue they're on. 'PID' shows the process ID of -a slow-work thread that's executing something. 'FL' shows the work item flags. -'MARK' indicates how long since an item was queued or began executing. Lastly, -the 'DESC' column permits the owner of an item to give some information. - diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h deleted file mode 100644 index 13337bf6c3f5..000000000000 --- a/include/linux/slow-work.h +++ /dev/null @@ -1,163 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- * - * See Documentation/slow-work.txt - */ - -#ifndef _LINUX_SLOW_WORK_H -#define _LINUX_SLOW_WORK_H - -#ifdef CONFIG_SLOW_WORK - -#include -#include - -struct slow_work; -#ifdef CONFIG_SLOW_WORK_DEBUG -struct seq_file; -#endif - -/* - * The operations used to support slow work items - */ -struct slow_work_ops { - /* owner */ - struct module *owner; - - /* get a ref on a work item - * - return 0 if successful, -ve if not - */ - int (*get_ref)(struct slow_work *work); - - /* discard a ref to a work item */ - void (*put_ref)(struct slow_work *work); - - /* execute a work item */ - void (*execute)(struct slow_work *work); - -#ifdef CONFIG_SLOW_WORK_DEBUG - /* describe a work item for debugfs */ - void (*desc)(struct slow_work *work, struct seq_file *m); -#endif -}; - -/* - * A slow work item - * - A reference is held on the parent object by the thread pool when it is - * queued - */ -struct slow_work { - struct module *owner; /* the owning module */ - unsigned long flags; -#define SLOW_WORK_PENDING 0 /* item pending (further) execution */ -#define SLOW_WORK_EXECUTING 1 /* item currently executing */ -#define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ -#define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ -#define SLOW_WORK_CANCELLING 4 /* item is being cancelled, don't enqueue */ -#define SLOW_WORK_DELAYED 5 /* item is struct delayed_slow_work with active timer */ - const struct slow_work_ops *ops; /* operations table for this item */ - struct list_head link; /* link in queue */ -#ifdef CONFIG_SLOW_WORK_DEBUG - struct timespec mark; /* jiffies at which queued or exec begun */ -#endif -}; - -struct delayed_slow_work { - struct slow_work work; - struct timer_list timer; -}; - -/** - * slow_work_init - Initialise a slow work item - * @work: The work item to initialise - * @ops: The operations to use to handle the slow work item - * - * Initialise a slow work item. - */ -static inline void slow_work_init(struct slow_work *work, - const struct slow_work_ops *ops) -{ - work->flags = 0; - work->ops = ops; - INIT_LIST_HEAD(&work->link); -} - -/** - * slow_work_init - Initialise a delayed slow work item - * @work: The work item to initialise - * @ops: The operations to use to handle the slow work item - * - * Initialise a delayed slow work item. - */ -static inline void delayed_slow_work_init(struct delayed_slow_work *dwork, - const struct slow_work_ops *ops) -{ - init_timer(&dwork->timer); - slow_work_init(&dwork->work, ops); -} - -/** - * vslow_work_init - Initialise a very slow work item - * @work: The work item to initialise - * @ops: The operations to use to handle the slow work item - * - * Initialise a very slow work item. This item will be restricted such that - * only a certain number of the pool threads will be able to execute items of - * this type. - */ -static inline void vslow_work_init(struct slow_work *work, - const struct slow_work_ops *ops) -{ - work->flags = 1 << SLOW_WORK_VERY_SLOW; - work->ops = ops; - INIT_LIST_HEAD(&work->link); -} - -/** - * slow_work_is_queued - Determine if a slow work item is on the work queue - * work: The work item to test - * - * Determine if the specified slow-work item is on the work queue. This - * returns true if it is actually on the queue. - * - * If the item is executing and has been marked for requeue when execution - * finishes, then false will be returned. - * - * Anyone wishing to wait for completion of execution can wait on the - * SLOW_WORK_EXECUTING bit. 
- */ -static inline bool slow_work_is_queued(struct slow_work *work) -{ - unsigned long flags = work->flags; - return flags & SLOW_WORK_PENDING && !(flags & SLOW_WORK_EXECUTING); -} - -extern int slow_work_enqueue(struct slow_work *work); -extern void slow_work_cancel(struct slow_work *work); -extern int slow_work_register_user(struct module *owner); -extern void slow_work_unregister_user(struct module *owner); - -extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay); - -static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork) -{ - slow_work_cancel(&dwork->work); -} - -extern bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout); - -#ifdef CONFIG_SYSCTL -extern ctl_table slow_work_sysctls[]; -#endif - -#endif /* CONFIG_SLOW_WORK */ -#endif /* _LINUX_SLOW_WORK_H */ diff --git a/init/Kconfig b/init/Kconfig index 5cff9a980c39..cb64c5889e02 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1143,30 +1143,6 @@ config TRACEPOINTS source "arch/Kconfig" -config SLOW_WORK - default n - bool - help - The slow work thread pool provides a number of dynamically allocated - threads that can be used by the kernel to perform operations that - take a relatively long time. - - An example of this would be CacheFiles doing a path lookup followed - by a series of mkdirs and a create call, all of which have to touch - disk. - - See Documentation/slow-work.txt. - -config SLOW_WORK_DEBUG - bool "Slow work debugging through debugfs" - default n - depends on SLOW_WORK && DEBUG_FS - help - Display the contents of the slow work run queue through debugfs, - including items currently executing. - - See Documentation/slow-work.txt. - endmenu # General setup config HAVE_GENERIC_DMA_COHERENT diff --git a/kernel/Makefile b/kernel/Makefile index 057472fbc272..2484ac39b2e2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -99,8 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? 
"vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- * - * See Documentation/slow-work.txt - */ - -#include -#include -#include -#include -#include -#include -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct 
slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. 
- */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * 
that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. 
- * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. 
- */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - 
wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = slow_work_get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - 
!slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 
0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. 
- * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f4876..5821365b9605 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -906,13 +905,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", -- cgit v1.2.3 From 18d0cdfd1a4cc9028c0ef80f94538b31541f8fe5 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 18 Jul 2010 12:44:01 +0200 Subject: firewire: normalize status values in packet callbacks core-transaction.c transmit_complete_callback() and close_transaction() expect packet callback status to be an ACK or RCODE, and ACKs get translated to RCODEs for transaction callbacks. An old comment on the packet callback API (been there from the initial submission of the stack) and the dummy_driver implementation of send_request/send_response deviated from this as they also included -ERRNO in the range of status values. Let's narrow status values down to ACK and RCODE to prevent surprises. RCODE_CANCELLED is chosen as the dummy_driver's RCODE as its meaning of "transaction timed out" comes closest to what happens when a transaction coincides with card removal. 
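
For illustration only, not part of this patch: after this change, a consumer
of the packet callback can switch over ACK_* and RCODE_* values and never has
to expect a negative errno. A hypothetical callback, assuming the usual
constants from <linux/firewire.h>:

        #include <linux/firewire.h>

        static void my_packet_callback(struct fw_packet *packet,
                                       struct fw_card *card, int status)
        {
                /* status is always an ACK_* or RCODE_* value, never -ERRNO */
                switch (status) {
                case ACK_COMPLETE:
                case ACK_PENDING:
                        /* the packet reached the target */
                        break;
                case RCODE_CANCELLED:
                        /* e.g. transaction coincided with card removal */
                        break;
                default:
                        /* other ACKs or firewire-core specific rcodes */
                        break;
                }
        }
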
Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 4 ++-- drivers/firewire/core-transaction.c | 5 ++++- include/linux/firewire.h | 8 ++++---- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 2bb5c036e806..0c312c4bb4bd 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -570,12 +570,12 @@ static int dummy_set_config_rom(struct fw_card *card, static void dummy_send_request(struct fw_card *card, struct fw_packet *packet) { - packet->callback(packet, card, -ENODEV); + packet->callback(packet, card, RCODE_CANCELLED); } static void dummy_send_response(struct fw_card *card, struct fw_packet *packet) { - packet->callback(packet, card, -ENODEV); + packet->callback(packet, card, RCODE_CANCELLED); } static int dummy_cancel_packet(struct fw_card *card, struct fw_packet *packet) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 5f5a7852f7ac..e2e4dc624fb6 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -310,7 +310,10 @@ static int allocate_tlabel(struct fw_card *card) * After the transaction is completed successfully or unsuccessfully, the * @callback will be called. Among its parameters is the response code which * is either one of the rcodes per IEEE 1394 or, in case of internal errors, - * the firewire-core specific %RCODE_SEND_ERROR. + * the firewire-core specific %RCODE_SEND_ERROR. The other firewire-core + * specific rcodes (%RCODE_CANCELLED, %RCODE_BUSY, %RCODE_GENERATION, + * %RCODE_NO_ACK) denote transaction timeout, busy responder, stale request + * generation, or missing ACK respectively. * * Note some timing corner cases: fw_send_request() may complete much earlier * than when the request packet actually hits the wire. On the other hand, diff --git a/include/linux/firewire.h b/include/linux/firewire.h index adc5b55e6e5f..0c38b8e97722 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -285,10 +285,10 @@ struct fw_packet { u32 timestamp; /* - * This callback is called when the packet transmission has - * completed; for successful transmission, the status code is - * the ack received from the destination, otherwise it's a - * negative errno: ENOMEM, ESTALE, ETIMEDOUT, ENODEV, EIO. + * This callback is called when the packet transmission has completed. + * For successful transmission, the status code is the ack received + * from the destination. Otherwise it is one of the juju-specific + * rcodes: RCODE_SEND_ERROR, _CANCELLED, _BUSY, _GENERATION, _NO_ACK. * The callback can be called from tasklet context and thus * must never block. */ -- cgit v1.2.3 From d505e6e87127d4dbdaa5d91561eed810c180ca23 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 17 Jul 2010 21:36:02 +0200 Subject: firewire: cdev: some clarifications to the API documentation Response events: - are generated on more occasions than their documentation claimed. CSR allocation: - An already occupied CSR can be determined from errno==EBUSY. Bus resets: - Note that FW_CDEV_IOC_INITIATE_BUS_RESET is nonblocking and that the client is not required to observe a grace period since kernels 2.6.36+ will enforce it now (commit 02d37bed). - The possible values of fw_cdev_initiate_bus_reset.type are listed in the kerneldoc comment already. - Clarify that an application that uses FW_CDEV_IOC_ADD_DESCRIPTOR and FW_CDEV_IOC_REMOVE_DESCRIPTOR does not have to issue a bus reset. 
Isochronous I/O contexts: - At most one can be created per open file descriptor. Signed-off-by: Stefan Richter --- include/linux/firewire-cdev.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index d31022b05bd9..fde9568151d5 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -84,8 +84,9 @@ struct fw_cdev_event_bus_reset { /** * struct fw_cdev_event_response - Sent when a response packet was received - * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_SEND_REQUEST ioctl + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_REQUEST + * or %FW_CDEV_IOC_SEND_BROADCAST_REQUEST + * or %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl * @type: See &fw_cdev_event_common; always %FW_CDEV_EVENT_RESPONSE * @rcode: Response code returned by the remote node * @length: Data length, i.e. the response's payload size in bytes @@ -95,6 +96,11 @@ struct fw_cdev_event_bus_reset { * sent by %FW_CDEV_IOC_SEND_REQUEST ioctl. The payload data for responses * carrying data (read and lock responses) follows immediately and can be * accessed through the @data field. + * + * The event is also generated after conclusions of transactions that do not + * involve response packets. This includes unified write transactions, + * broadcast write transactions, and transmission of asynchronous stream + * packets. @rcode indicates success or failure of such transmissions. */ struct fw_cdev_event_response { __u64 closure; @@ -447,7 +453,9 @@ struct fw_cdev_send_response { * range to be used for later deallocation of the range. * * The address range is allocated on all local nodes. The address allocation - * is exclusive except for the FCP command and response registers. + * is exclusive except for the FCP command and response registers. If an + * exclusive address region is already in use, the ioctl fails with errno set + * to %EBUSY. */ struct fw_cdev_allocate { __u64 offset; @@ -475,9 +483,14 @@ struct fw_cdev_deallocate { * Initiate a bus reset for the bus this device is on. The bus reset can be * either the original (long) bus reset or the arbitrated (short) bus reset * introduced in 1394a-2000. + * + * The ioctl returns immediately. A subsequent &fw_cdev_event_bus_reset + * indicates when the reset actually happened. Since ABI v4, this may be + * considerably later than the ioctl because the kernel ensures a grace period + * between subsequent bus resets as per IEEE 1394 bus management specification. */ struct fw_cdev_initiate_bus_reset { - __u32 type; /* FW_CDEV_SHORT_RESET or FW_CDEV_LONG_RESET */ + __u32 type; }; /** @@ -501,9 +514,10 @@ struct fw_cdev_initiate_bus_reset { * * @immediate, @key, and @data array elements are CPU-endian quadlets. * - * If successful, the kernel adds the descriptor and writes back a handle to the - * kernel-side object to be used for later removal of the descriptor block and - * immediate key. + * If successful, the kernel adds the descriptor and writes back a @handle to + * the kernel-side object to be used for later removal of the descriptor block + * and immediate key. The kernel will also generate a bus reset to signal the + * change of the configuration ROM to other nodes. * * This ioctl affects the configuration ROMs of all local nodes. * The ioctl only succeeds on device files which represent a local node. 
@@ -522,7 +536,8 @@ struct fw_cdev_add_descriptor { * descriptor was added * * Remove a descriptor block and accompanying immediate key from the local - * nodes' configuration ROMs. + * nodes' configuration ROMs. The kernel will also generate a bus reset to + * signal the change of the configuration ROM to other nodes. */ struct fw_cdev_remove_descriptor { __u32 handle; @@ -554,6 +569,8 @@ struct fw_cdev_remove_descriptor { * * Note that the effect of a @header_size > 4 depends on * &fw_cdev_get_info.version, as documented at &fw_cdev_event_iso_interrupt. + * + * No more than one iso context can be created per fd. */ struct fw_cdev_create_iso_context { __u32 type; -- cgit v1.2.3 From 850bb6f23b93c04ce1e4509a87fa607dc17d97c1 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 16 Jul 2010 22:25:14 +0200 Subject: firewire: cdev: add PHY packet transmission Add an FW_CDEV_IOC_SEND_PHY_PACKET ioctl() for /dev/fw* which can be used to implement bus management related functionality in userspace. This is also half of the functionality (the transmit part) that is needed to support a userspace implementation of a VersaPHY transaction layer. Safety considerations: - PHY packets are generally broadcasts and may have interesting effects on PHYs and the bus, e.g. make asynchronous arbitration impossible due to too low gap count. Hence some kind of elevated privileges should be required of a process to be able to send PHY packets. This implementation assumes that a process that is allowed to open the /dev/fw* of a local node does have this privilege. There was an inconclusive discussion about introducing POSIX capabilities as a means to check for user privileges for these kinds of operations. - The kernel does not check integrity of the supplied packet data. That would be far too much code, considering the many kinds of PHY packets. A process which got the privilege to send these packets is trusted to do it correctly. Just like with the other "send packet" ioctls, a non-blocking API is chosen; i.e. the ioctl may return even before AT DMA started. After transmission, an event for poll()/read() is enqueued. Most users are going to need a blocking API, but a blocking userspace wrapper is easy to implement, and the second of the two existing libraw1394 calls raw1394_phy_packet_write() and raw1394_start_phy_packet_write() can be better supported that way. 
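
A minimal sketch of such a blocking wrapper (hypothetical userspace helper;
it assumes the caller is not interested in other event types and keeps error
handling to the essentials):

        #include <stdint.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/firewire-cdev.h>

        static int send_phy_packet_blocking(int fd, uint32_t generation,
                                            uint32_t q0, uint32_t q1)
        {
                struct fw_cdev_send_phy_packet sp = {
                        .closure    = 0,
                        .data       = { q0, q1 },
                        .generation = generation,
                };
                char buf[256];
                union fw_cdev_event *ev = (union fw_cdev_event *)buf;

                /* nonblocking ioctl; may return before AT DMA started */
                if (ioctl(fd, FW_CDEV_IOC_SEND_PHY_PACKET, &sp) < 0)
                        return -1;

                /* block until the transmit-complete event arrives; a real
                 * client would demultiplex other events here, not skip them */
                do {
                        if (read(fd, buf, sizeof(buf)) < 0)
                                return -1;
                } while (ev->common.type != FW_CDEV_EVENT_PHY_PACKET_SENT);

                return ev->phy_packet.rcode == RCODE_COMPLETE ? 0 : -1;
        }
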
Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 64 +++++++++++++++++++++++++++++++++++++++++++ include/linux/firewire-cdev.h | 44 ++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index acf4fa1f3f8c..f95719926487 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -194,6 +194,13 @@ struct iso_resource_event { struct fw_cdev_event_iso_resource iso_resource; }; +struct outbound_phy_packet_event { + struct event event; + struct client *client; + struct fw_packet p; + struct fw_cdev_event_phy_packet phy_packet; +}; + static inline void __user *u64_to_uptr(__u64 value) { return (void __user *)(unsigned long)value; @@ -396,6 +403,7 @@ union ioctl_arg { struct fw_cdev_allocate_iso_resource allocate_iso_resource; struct fw_cdev_send_stream_packet send_stream_packet; struct fw_cdev_get_cycle_timer2 get_cycle_timer2; + struct fw_cdev_send_phy_packet send_phy_packet; }; static int ioctl_get_info(struct client *client, union ioctl_arg *arg) @@ -1384,6 +1392,61 @@ static int ioctl_send_stream_packet(struct client *client, union ioctl_arg *arg) return init_request(client, &request, dest, a->speed); } +static void outbound_phy_packet_callback(struct fw_packet *packet, + struct fw_card *card, int status) +{ + struct outbound_phy_packet_event *e = + container_of(packet, struct outbound_phy_packet_event, p); + + switch (status) { + /* expected: */ + case ACK_COMPLETE: e->phy_packet.rcode = RCODE_COMPLETE; break; + /* should never happen with PHY packets: */ + case ACK_PENDING: e->phy_packet.rcode = RCODE_COMPLETE; break; + case ACK_BUSY_X: + case ACK_BUSY_A: + case ACK_BUSY_B: e->phy_packet.rcode = RCODE_BUSY; break; + case ACK_DATA_ERROR: e->phy_packet.rcode = RCODE_DATA_ERROR; break; + case ACK_TYPE_ERROR: e->phy_packet.rcode = RCODE_TYPE_ERROR; break; + /* stale generation; cancelled; on certain controllers: no ack */ + default: e->phy_packet.rcode = status; break; + } + + queue_event(e->client, &e->event, + &e->phy_packet, sizeof(e->phy_packet), NULL, 0); + client_put(e->client); +} + +static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) +{ + struct fw_cdev_send_phy_packet *a = &arg->send_phy_packet; + struct fw_card *card = client->device->card; + struct outbound_phy_packet_event *e; + + /* Access policy: Allow this ioctl only on local nodes' device files. 
*/ + if (!client->device->is_local) + return -ENOSYS; + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (e == NULL) + return -ENOMEM; + + client_get(client); + e->client = client; + e->p.speed = SCODE_100; + e->p.generation = a->generation; + e->p.header[0] = a->data[0]; + e->p.header[1] = a->data[1]; + e->p.header_length = 8; + e->p.callback = outbound_phy_packet_callback; + e->phy_packet.closure = a->closure; + e->phy_packet.type = FW_CDEV_EVENT_PHY_PACKET_SENT; + + card->driver->send_request(card, &e->p); + + return 0; +} + static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = { [0x00] = ioctl_get_info, [0x01] = ioctl_send_request, @@ -1406,6 +1469,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = { [0x12] = ioctl_send_broadcast_request, [0x13] = ioctl_send_stream_packet, [0x14] = ioctl_get_cycle_timer2, + [0x15] = ioctl_send_phy_packet, }; static int dispatch_ioctl(struct client *client, diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index fde9568151d5..5bc051b9a013 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -34,6 +34,7 @@ /* available since kernel version 2.6.36 */ #define FW_CDEV_EVENT_REQUEST2 0x06 +#define FW_CDEV_EVENT_PHY_PACKET_SENT 0x07 /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types @@ -283,6 +284,19 @@ struct fw_cdev_event_iso_resource { __s32 bandwidth; }; +/** + * struct fw_cdev_event_phy_packet - A PHY packet was transmitted + * @closure: See &fw_cdev_event_common; + * set by %FW_CDEV_IOC_SEND_PHY_PACKET ioctl + * @type: %FW_CDEV_EVENT_PHY_PACKET_SENT + * @rcode: %RCODE_..., indicates success or failure of transmission + */ +struct fw_cdev_event_phy_packet { + __u64 closure; + __u32 type; + __u32 rcode; +}; + /** * union fw_cdev_event - Convenience union of fw_cdev_event_ types * @common: Valid for all types @@ -294,6 +308,7 @@ struct fw_cdev_event_iso_resource { * @iso_resource: Valid if @common.type == * %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED + * @phy_packet: Valid if @common.type == %FW_CDEV_EVENT_PHY_PACKET_SENT * * Convenience union for userspace use. Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further @@ -311,6 +326,7 @@ union fw_cdev_event { struct fw_cdev_event_request2 request2; /* added in 2.6.36 */ struct fw_cdev_event_iso_interrupt iso_interrupt; struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ + struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ }; /* available since kernel version 2.6.22 */ @@ -342,6 +358,9 @@ union fw_cdev_event { /* available since kernel version 2.6.34 */ #define FW_CDEV_IOC_GET_CYCLE_TIMER2 _IOWR('#', 0x14, struct fw_cdev_get_cycle_timer2) +/* available since kernel version 2.6.36 */ +#define FW_CDEV_IOC_SEND_PHY_PACKET _IOWR('#', 0x15, struct fw_cdev_send_phy_packet) + /* * ABI version history * 1 (2.6.22) - initial version @@ -357,8 +376,9 @@ union fw_cdev_event { * - shared use and auto-response for FCP registers * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 - * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2 + * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_SENT * - implemented &fw_cdev_event_bus_reset.bm_node_id + * - added %FW_CDEV_IOC_SEND_PHY_PACKET */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. 
*/ @@ -808,4 +828,26 @@ struct fw_cdev_send_stream_packet { __u32 speed; }; +/** + * struct fw_cdev_send_phy_packet - send a PHY packet + * @closure: Passed back to userspace in the PHY-packet-sent event + * @data: First and second quadlet of the PHY packet + * @generation: The bus generation where packet is valid + * + * The %FW_CDEV_IOC_SEND_PHY_PACKET ioctl sends a PHY packet to all nodes + * on the same card as this device. After transmission, an + * %FW_CDEV_EVENT_PHY_PACKET_SENT event is generated. + * + * The payload @data[] shall be specified in host byte order. Usually, + * @data[1] needs to be the bitwise inverse of @data[0]. VersaPHY packets + * are an exception to this rule. + * + * The ioctl is only permitted on device files which represent a local node. + */ +struct fw_cdev_send_phy_packet { + __u64 closure; + __u32 data[2]; + __u32 generation; +}; + #endif /* _LINUX_FIREWIRE_CDEV_H */ -- cgit v1.2.3 From bf54e1462b9192fdef7ea9e2bc44fdc16a4b87bc Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 16 Jul 2010 22:25:51 +0200 Subject: firewire: cdev: add PHY packet reception Add an FW_CDEV_IOC_RECEIVE_PHY_PACKETS ioctl() and FW_CDEV_EVENT_PHY_PACKET_RECEIVED poll()/read() event for /dev/fw*. This can be used to get information from remote PHYs by remote access PHY packets. This is also the 2nd half of the functionality (the receive part) to support a userspace implementation of a VersaPHY transaction layer. Safety considerations: - PHY packets are generally broadcasts, hence some kind of elevated privileges should be required of a process to be able to listen in on PHY packets. This implementation assumes that a process that is allowed to open the /dev/fw* of a local node does have this privilege. There was an inconclusive discussion about introducing POSIX capabilities as a means to check for user privileges for these kinds of operations. Other limitations: - PHY packet reception may be switched on by ioctl() but cannot be switched off again. It would be trivial to provide an off switch, but this is not worth the code. The client should simply close() the fd then, or just ignore further events. - For sake of simplicity of API and kernel-side implementation, no filter per packet content is provided. 
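
To illustrate the intended use, a hypothetical userspace snippet (not part of
this patch) which switches reception on and dumps incoming PHY packets:

        #include <stdio.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/firewire-cdev.h>

        static void dump_phy_packets(int fd)
        {
                struct fw_cdev_receive_phy_packets rp = { .closure = 0 };
                char buf[256];
                union fw_cdev_event *ev = (union fw_cdev_event *)buf;

                /* switch reception on; there is no off switch - close()
                 * the fd or ignore further events to stop listening */
                if (ioctl(fd, FW_CDEV_IOC_RECEIVE_PHY_PACKETS, &rp) < 0)
                        return;

                for (;;) {
                        if (read(fd, buf, sizeof(buf)) < 0)
                                break;
                        if (ev->common.type !=
                            FW_CDEV_EVENT_PHY_PACKET_RECEIVED)
                                continue;       /* unrelated event */
                        /* two quadlets in host byte order */
                        printf("PHY packet: %08x %08x\n",
                               ev->phy_packet.data[0],
                               ev->phy_packet.data[1]);
                }
        }
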
Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 1 + drivers/firewire/core-cdev.c | 73 ++++++++++++++++++++++++++++++++++--- drivers/firewire/core-transaction.c | 5 +++ drivers/firewire/core.h | 2 + drivers/firewire/ohci.c | 3 +- include/linux/firewire-cdev.h | 39 ++++++++++++++++---- include/linux/firewire.h | 3 +- 7 files changed, 111 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 0c312c4bb4bd..6d1cfae6aad4 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -500,6 +500,7 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); INIT_LIST_HEAD(&card->transaction_list); + INIT_LIST_HEAD(&card->phy_receiver_list); spin_lock_init(&card->lock); card->local_node = NULL; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index f95719926487..0425dd5dfcd3 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -69,6 +69,9 @@ struct client { struct fw_iso_buffer buffer; unsigned long vm_start; + struct list_head phy_receiver_link; + u64 phy_receiver_closure; + struct list_head link; struct kref kref; }; @@ -201,6 +204,11 @@ struct outbound_phy_packet_event { struct fw_cdev_event_phy_packet phy_packet; }; +struct inbound_phy_packet_event { + struct event event; + struct fw_cdev_event_phy_packet phy_packet; +}; + static inline void __user *u64_to_uptr(__u64 value) { return (void __user *)(unsigned long)value; @@ -236,6 +244,7 @@ static int fw_device_op_open(struct inode *inode, struct file *file) idr_init(&client->resource_idr); INIT_LIST_HEAD(&client->event_list); init_waitqueue_head(&client->wait); + INIT_LIST_HEAD(&client->phy_receiver_link); kref_init(&client->kref); file->private_data = client; @@ -357,7 +366,7 @@ static void queue_bus_reset_event(struct client *client) e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) { - fw_notify("Out of memory when allocating bus reset event\n"); + fw_notify("Out of memory when allocating event\n"); return; } @@ -404,6 +413,7 @@ union ioctl_arg { struct fw_cdev_send_stream_packet send_stream_packet; struct fw_cdev_get_cycle_timer2 get_cycle_timer2; struct fw_cdev_send_phy_packet send_phy_packet; + struct fw_cdev_receive_phy_packets receive_phy_packets; }; static int ioctl_get_info(struct client *client, union ioctl_arg *arg) @@ -671,9 +681,10 @@ static void handle_request(struct fw_card *card, struct fw_request *request, r = kmalloc(sizeof(*r), GFP_ATOMIC); e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (r == NULL || e == NULL) + if (r == NULL || e == NULL) { + fw_notify("Out of memory when allocating event\n"); goto failed; - + } r->card = card; r->request = request; r->data = payload; @@ -902,9 +913,10 @@ static void iso_callback(struct fw_iso_context *context, u32 cycle, struct iso_interrupt_event *e; e = kmalloc(sizeof(*e) + header_length, GFP_ATOMIC); - if (e == NULL) + if (e == NULL) { + fw_notify("Out of memory when allocating event\n"); return; - + } e->interrupt.type = FW_CDEV_EVENT_ISO_INTERRUPT; e->interrupt.closure = client->iso_closure; e->interrupt.cycle = cycle; @@ -1447,6 +1459,52 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) return 0; } +static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg) +{ + struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets; + struct fw_card *card = client->device->card; + + /* Access policy: Allow this ioctl only on 
local nodes' device files. */ + if (!client->device->is_local) + return -ENOSYS; + + spin_lock_irq(&card->lock); + + list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list); + client->phy_receiver_closure = a->closure; + + spin_unlock_irq(&card->lock); + + return 0; +} + +void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p) +{ + struct client *client; + struct inbound_phy_packet_event *e; + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + + list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) { + e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); + if (e == NULL) { + fw_notify("Out of memory when allocating event\n"); + break; + } + e->phy_packet.closure = client->phy_receiver_closure; + e->phy_packet.type = FW_CDEV_EVENT_PHY_PACKET_RECEIVED; + e->phy_packet.rcode = RCODE_COMPLETE; + e->phy_packet.length = 8; + e->phy_packet.data[0] = p->header[1]; + e->phy_packet.data[1] = p->header[2]; + queue_event(client, &e->event, + &e->phy_packet, sizeof(e->phy_packet) + 8, NULL, 0); + } + + spin_unlock_irqrestore(&card->lock, flags); +} + static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = { [0x00] = ioctl_get_info, [0x01] = ioctl_send_request, @@ -1470,6 +1528,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = { [0x13] = ioctl_send_stream_packet, [0x14] = ioctl_get_cycle_timer2, [0x15] = ioctl_send_phy_packet, + [0x16] = ioctl_receive_phy_packets, }; static int dispatch_ioctl(struct client *client, @@ -1577,6 +1636,10 @@ static int fw_device_op_release(struct inode *inode, struct file *file) struct client *client = file->private_data; struct event *event, *next_event; + spin_lock_irq(&client->device->card->lock); + list_del(&client->phy_receiver_link); + spin_unlock_irq(&client->device->card->lock); + mutex_lock(&client->device->client_list_mutex); list_del(&client->link); mutex_unlock(&client->device->client_list_mutex); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index e2e4dc624fb6..6f225cacbc3d 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -883,6 +883,11 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p) if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE) return; + if (TCODE_IS_LINK_INTERNAL(HEADER_GET_TCODE(p->header[0]))) { + fw_cdev_handle_phy_packet(card, p); + return; + } + request = allocate_request(card, p); if (request == NULL) { /* FIXME: send statically allocated busy packet. 
*/ diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index ff6c90922001..3102b6b63438 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -128,6 +128,7 @@ extern const struct file_operations fw_device_ops; void fw_device_cdev_update(struct fw_device *device); void fw_device_cdev_remove(struct fw_device *device); +void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p); /* -device */ @@ -214,6 +215,7 @@ static inline bool is_next_generation(int new_generation, int old_generation) #define TCODE_IS_READ_REQUEST(tcode) (((tcode) & ~1) == 4) #define TCODE_IS_BLOCK_PACKET(tcode) (((tcode) & 1) != 0) +#define TCODE_IS_LINK_INTERNAL(tcode) ((tcode) == 0xe) #define TCODE_IS_REQUEST(tcode) (((tcode) & 2) == 0) #define TCODE_IS_RESPONSE(tcode) (((tcode) & 2) != 0) #define TCODE_HAS_REQUEST_DATA(tcode) (((tcode) & 12) != 4) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index bb6a92bc9e6a..08afccc66333 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1759,10 +1759,9 @@ static int ohci_enable(struct fw_card *card, OHCI1394_HCControl_noByteSwapData); reg_write(ohci, OHCI1394_SelfIDBuffer, ohci->self_id_bus); - reg_write(ohci, OHCI1394_LinkControlClear, - OHCI1394_LinkControl_rcvPhyPkt); reg_write(ohci, OHCI1394_LinkControlSet, OHCI1394_LinkControl_rcvSelfID | + OHCI1394_LinkControl_rcvPhyPkt | OHCI1394_LinkControl_cycleTimerEnable | OHCI1394_LinkControl_cycleMaster); diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 5bc051b9a013..b87409160794 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -35,6 +35,7 @@ /* available since kernel version 2.6.36 */ #define FW_CDEV_EVENT_REQUEST2 0x06 #define FW_CDEV_EVENT_PHY_PACKET_SENT 0x07 +#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED 0x08 /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types @@ -285,16 +286,24 @@ struct fw_cdev_event_iso_resource { }; /** - * struct fw_cdev_event_phy_packet - A PHY packet was transmitted - * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_SEND_PHY_PACKET ioctl - * @type: %FW_CDEV_EVENT_PHY_PACKET_SENT + * struct fw_cdev_event_phy_packet - A PHY packet was transmitted or received + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_PHY_PACKET + * or %FW_CDEV_IOC_RECEIVE_PHY_PACKETS ioctl + * @type: %FW_CDEV_EVENT_PHY_PACKET_SENT or %..._RECEIVED * @rcode: %RCODE_..., indicates success or failure of transmission + * @length: Data length in bytes + * @data: Incoming data + * + * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty. + * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED, @length is 8 and @data + * consists of the two PHY packet quadlets, in host byte order. */ struct fw_cdev_event_phy_packet { __u64 closure; __u32 type; __u32 rcode; + __u32 length; + __u32 data[0]; }; /** @@ -308,7 +317,9 @@ struct fw_cdev_event_phy_packet { * @iso_resource: Valid if @common.type == * %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED - * @phy_packet: Valid if @common.type == %FW_CDEV_EVENT_PHY_PACKET_SENT + * @phy_packet: Valid if @common.type == + * %FW_CDEV_EVENT_PHY_PACKET_SENT or + * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED * * Convenience union for userspace use. 
Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further @@ -360,6 +371,7 @@ union fw_cdev_event { /* available since kernel version 2.6.36 */ #define FW_CDEV_IOC_SEND_PHY_PACKET _IOWR('#', 0x15, struct fw_cdev_send_phy_packet) +#define FW_CDEV_IOC_RECEIVE_PHY_PACKETS _IOW('#', 0x16, struct fw_cdev_receive_phy_packets) /* * ABI version history @@ -376,9 +388,9 @@ union fw_cdev_event { * - shared use and auto-response for FCP registers * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 - * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_SENT + * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_* * - implemented &fw_cdev_event_bus_reset.bm_node_id - * - added %FW_CDEV_IOC_SEND_PHY_PACKET + * - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */ @@ -850,4 +862,17 @@ struct fw_cdev_send_phy_packet { __u32 generation; }; +/** + * struct fw_cdev_receive_phy_packets - start reception of PHY packets + * @closure: Passed back to userspace in phy packet events + * + * This ioctl activates issuing of %FW_CDEV_EVENT_PHY_PACKET_RECEIVED due to + * incoming PHY packets from any node on the same bus as the device. + * + * The ioctl is only permitted on device files which represent a local node. + */ +struct fw_cdev_receive_phy_packets { + __u64 closure; +}; + #endif /* _LINUX_FIREWIRE_CDEV_H */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 0c38b8e97722..d974aa4a24c9 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -111,9 +111,10 @@ struct fw_card { bool beta_repeaters_present; int index; - struct list_head link; + struct list_head phy_receiver_list; + struct delayed_work br_work; /* bus reset job */ bool br_short; -- cgit v1.2.3 From cc550216ae9a2993ef3973464714dc1a39ab1f86 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sun, 18 Jul 2010 13:00:50 +0200 Subject: firewire: cdev: add PHY pinging This extends the FW_CDEV_IOC_SEND_PHY_PACKET ioctl() for /dev/fw* to be useful for ping time measurements. One application for it would be gap count optimization in userspace that is based on ping times rather than hop count. (The latter is implemented in firewire-core itself but is not applicable to beta PHYs that act as repeater.) 
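
A hypothetical client-side sketch, reusing the blocking pattern from the PHY
packet transmission patch. It assumes, consistent with the is_ping_packet()
check added below, that a ping packet carries only the target's PHY ID in
bits 24..29 of the first quadlet, with the second quadlet its bitwise
inverse; the generation would come from e.g. &fw_cdev_event_bus_reset:

        #include <stdint.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/firewire-cdev.h>

        /* returns the ping time in 49.152 MHz ticks, or -1 on failure */
        static int ping_phy(int fd, uint32_t generation, uint32_t phy_id)
        {
                uint32_t q0 = (phy_id & 0x3f) << 24;
                struct fw_cdev_send_phy_packet sp = {
                        .data       = { q0, ~q0 },
                        .generation = generation,
                };
                char buf[256];
                union fw_cdev_event *ev = (union fw_cdev_event *)buf;

                if (ioctl(fd, FW_CDEV_IOC_SEND_PHY_PACKET, &sp) < 0)
                        return -1;
                do {
                        if (read(fd, buf, sizeof(buf)) < 0)
                                return -1;
                } while (ev->common.type != FW_CDEV_EVENT_PHY_PACKET_SENT);

                if (ev->phy_packet.rcode != RCODE_COMPLETE ||
                    ev->phy_packet.length != 4)
                        return -1;
                return ev->phy_packet.data[0];  /* 49.152 MHz ticks */
        }
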
Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 9 ++++++--- drivers/firewire/core.h | 5 +++++ drivers/firewire/ohci.c | 3 +++ include/linux/firewire-cdev.h | 5 ++++- 4 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 0425dd5dfcd3..31863cf8b6c4 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1423,9 +1423,10 @@ static void outbound_phy_packet_callback(struct fw_packet *packet, /* stale generation; cancelled; on certain controllers: no ack */ default: e->phy_packet.rcode = status; break; } + e->phy_packet.data[0] = packet->timestamp; - queue_event(e->client, &e->event, - &e->phy_packet, sizeof(e->phy_packet), NULL, 0); + queue_event(e->client, &e->event, &e->phy_packet, + sizeof(e->phy_packet) + e->phy_packet.length, NULL, 0); client_put(e->client); } @@ -1439,7 +1440,7 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) if (!client->device->is_local) return -ENOSYS; - e = kzalloc(sizeof(*e), GFP_KERNEL); + e = kzalloc(sizeof(*e) + 4, GFP_KERNEL); if (e == NULL) return -ENOMEM; @@ -1453,6 +1454,8 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) e->p.callback = outbound_phy_packet_callback; e->phy_packet.closure = a->closure; e->phy_packet.type = FW_CDEV_EVENT_PHY_PACKET_SENT; + if (is_ping_packet(a->data)) + e->phy_packet.length = 4; card->driver->send_request(card, &e->p); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 3102b6b63438..28621e44b111 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -234,4 +234,9 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header, void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count); +static inline bool is_ping_packet(u32 *data) +{ + return (data[0] & 0xc0ffffff) == 0 && ~data[0] == data[1]; +} + #endif /* _FIREWIRE_CORE_H */ diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 08afccc66333..5f6bb2c53808 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1068,6 +1068,9 @@ static int at_context_queue_packet(struct context *ctx, header[1] = cpu_to_le32(packet->header[0]); header[2] = cpu_to_le32(packet->header[1]); d[0].req_count = cpu_to_le16(12); + + if (is_ping_packet(packet->header)) + d[0].control |= cpu_to_le16(DESCRIPTOR_PING); break; case 4: diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index b87409160794..da0fec7e8dc0 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -294,7 +294,10 @@ struct fw_cdev_event_iso_resource { * @length: Data length in bytes * @data: Incoming data * - * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty. + * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty, + * except in case of a ping packet: Then, @length is 4, and @data[0] is the + * ping time in 49.152MHz clocks if @rcode is %RCODE_COMPLETE. + * * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED, @length is 8 and @data * consists of the two PHY packet quadlets, in host byte order. 
*/ -- cgit v1.2.3 From 8e2b2b46ea4ca5ef790dddf78b360ed736a62d7c Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Fri, 23 Jul 2010 13:05:39 +0200 Subject: firewire: cdev: improve FW_CDEV_IOC_ALLOCATE In both the ieee1394 stack and the firewire stack, the core treats kernelspace drivers better than userspace drivers when it comes to CSR address range allocation: The former may request a register to be placed automatically at a free spot anywhere inside a specified address range. The latter may only request a register at a fixed offset. Hence, userspace drivers which do not require a fixed offset potentially need to implement a retry loop with an incremented offset in each retry until the kernel no longer fails the allocation with EBUSY. This awkward procedure is not fundamentally necessary as the core already provides a superior allocation API to kernelspace drivers. Therefore, change the ioctl() ABI by adding a region_end member to the existing struct fw_cdev_allocate. Userspace and kernelspace APIs work the same way now. There is a small cost for clients, though: If client source code is required to compile with older kernel headers too, then any use of the new member fw_cdev_allocate.region_end needs to be enclosed by #ifdef/#endif directives. However, any client program that seriously wants to use address range allocations will require a kernel of cdev ABI version >= 4 at runtime and a linux/firewire-cdev.h header of >= 4 anyway. This is because v4 brings FW_CDEV_EVENT_REQUEST2. The only client program in which build-time compatibility with struct fw_cdev_allocate as found in older kernel headers makes sense is libraw1394. (libraw1394 uses the older, broken FW_CDEV_EVENT_REQUEST to implement a makeshift, incorrect transaction responder that does at least work somewhat in many simple scenarios, relying on guesswork by libraw1394 and by libraw1394-based applications. Plus, the address range allocation and transaction responder is only one of many features that libraw1394 needs to provide, and these other features need to work with kernels and kernel headers as old as possible. Any new linux/firewire-cdev.h-based client that implements a transaction responder should never attempt to do it like libraw1394; instead it should make a header and kernel of v4 or later a hard requirement.) While we are at it, update the struct fw_cdev_allocate documentation to better reflect the recent fw_cdev_event_request2 ABI addition. Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 12 +++++++++--- include/linux/firewire-cdev.h | 29 ++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 31863cf8b6c4..f40098dec14b 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -50,8 +50,9 @@ /* * ABI version history is documented in linux/firewire-cdev.h.
*/ -#define FW_CDEV_KERNEL_VERSION 4 -#define FW_CDEV_VERSION_EVENT_REQUEST2 4 +#define FW_CDEV_KERNEL_VERSION 4 +#define FW_CDEV_VERSION_EVENT_REQUEST2 4 +#define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 struct client { u32 version; @@ -773,7 +774,11 @@ static int ioctl_allocate(struct client *client, union ioctl_arg *arg) return -ENOMEM; region.start = a->offset; - region.end = a->offset + a->length; + if (client->version < FW_CDEV_VERSION_ALLOCATE_REGION_END) + region.end = a->offset + a->length; + else + region.end = a->region_end; + r->handler.length = a->length; r->handler.address_callback = handle_request; r->handler.callback_data = r; @@ -785,6 +790,7 @@ static int ioctl_allocate(struct client *client, union ioctl_arg *arg) kfree(r); return ret; } + a->offset = r->handler.offset; r->resource.release = release_address_handler; ret = add_client_resource(client, &r->resource, GFP_KERNEL); diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index da0fec7e8dc0..14831119ff71 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -394,6 +394,7 @@ union fw_cdev_event { * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_* * - implemented &fw_cdev_event_bus_reset.bm_node_id * - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS + * - added &fw_cdev_allocate.region_end */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */ @@ -473,17 +474,21 @@ struct fw_cdev_send_response { }; /** - * struct fw_cdev_allocate - Allocate a CSR address range + * struct fw_cdev_allocate - Allocate a CSR in an address range * @offset: Start offset of the address range * @closure: To be passed back to userspace in request events - * @length: Length of the address range, in bytes + * @length: Length of the CSR, in bytes * @handle: Handle to the allocation, written by the kernel + * @region_end: First address above the address range (added in ABI v4, 2.6.36) * * Allocate an address range in the 48-bit address space on the local node * (the controller). This allows userspace to listen for requests with an - * offset within that address range. When the kernel receives a request - * within the range, an &fw_cdev_event_request event will be written back. - * The @closure field is passed back to userspace in the response event. + * offset within that address range. Every time when the kernel receives a + * request within the range, an &fw_cdev_event_request2 event will be emitted. + * (If the kernel or the client implements ABI version <= 3, an + * &fw_cdev_event_request will be generated instead.) + * + * The @closure field is passed back to userspace in these request events. * The @handle field is an out parameter, returning a handle to the allocated * range to be used for later deallocation of the range. * @@ -491,12 +496,26 @@ struct fw_cdev_send_response { * is exclusive except for the FCP command and response registers. If an * exclusive address region is already in use, the ioctl fails with errno set * to %EBUSY. + * + * If kernel and client implement ABI version >= 4, the kernel looks up a free + * spot of size @length inside [@offset..@region_end) and, if found, writes + * the start address of the new CSR back in @offset. I.e. @offset is an + * in and out parameter. If this automatic placement of a CSR in a bigger + * address range is not desired, the client simply needs to set @region_end + * = @offset + @length. 
+ * + * If the kernel or the client implements ABI version <= 3, @region_end is + * ignored and effectively assumed to be @offset + @length. + * + * @region_end is only present in a kernel header >= 2.6.36. If necessary, + * this can for example be tested by #ifdef FW_CDEV_EVENT_REQUEST2. */ struct fw_cdev_allocate { __u64 offset; __u64 closure; __u32 length; __u32 handle; + __u64 region_end; /* available since kernel version 2.6.36 */ }; /** -- cgit v1.2.3 From 22b8f15c2f7130bb0386f548428df2ffd4e81903 Mon Sep 17 00:00:00 2001 From: Patrick Pannuto Date: Mon, 19 Jul 2010 15:09:26 -0700 Subject: timer: Added usleep[_range] timer usleep[_range] are finer-precision implementations of msleep and are designed as drop-in replacements for udelay wherever a precise sleep or busy-wait is unnecessary. They also provide an easy interface for specifying slack when only an approximate wakeup is needed, which helps minimize wakeups. Signed-off-by: Patrick Pannuto Cc: akinobu.mita@gmail.com Cc: sboyd@codeaurora.org Acked-by: Arjan van de Ven LKML-Reference: <4C44CDD2.1070708@codeaurora.org> Signed-off-by: Thomas Gleixner --- include/linux/delay.h | 6 ++++++ kernel/timer.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index fd832c6d419e..0e303d1aacd8 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -45,6 +45,12 @@ extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); +void usleep_range(unsigned long min, unsigned long max); + +static inline void usleep(unsigned long usecs) +{ + usleep_range(usecs, usecs); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/timer.c b/kernel/timer.c index ce98685cd1cb..f110f241ab66 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = max - min; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); -- cgit v1.2.3 From eca3930163ba8884060ce9d9ff5ef0d9b7c7b00f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 8 Jun 2010 07:48:21 -0600 Subject: of: Merge of_platform_bus_type with platform_bus_type of_platform_bus was being used in the same manner as the platform_bus; the only difference is that of_platform_bus devices are generated from data in the device tree, while platform_bus devices are usually statically allocated in platform code. Keeping them separate means that device drivers have to be registered twice if the same device can appear on either bus. This patch removes of_platform_bus_type and registers all of_platform bus devices and drivers on the platform bus instead. A previous patch made the of_device structure an alias for the platform_device structure, and a shim is used to adapt of_platform_drivers to the platform bus.
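As a usage sketch for the usleep_range() interface added in the timer patch above (a hypothetical driver fragment; the register offsets, bit meanings, and delay bounds are invented):

	/* Hypothetical caller: let a device settle for 100-200 us without busy-waiting. */
	#include <linux/delay.h>
	#include <linux/errno.h>
	#include <linux/io.h>

	static int example_reset(void __iomem *regs)
	{
		writel(1, regs + 0x04);			/* made-up soft-reset register */
		usleep_range(100, 200);			/* at least 100 us, up to 100 us slack */
		return (readl(regs + 0x00) & 1) ? 0 : -EIO;	/* made-up ready bit */
	}

Unlike udelay(), this yields the CPU, and the slack lets the scheduler coalesce the wakeup with other pending timers.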
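To illustrate what the shim adapts, here is a hypothetical of_platform_driver as it looks during this transition; the driver name and compatible string are invented, and registration goes through the helper that this patch routes onto the platform bus:

	/* Hypothetical of_platform_driver; all names and the compatible string are invented. */
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/of_device.h>
	#include <linux/of_platform.h>

	static int example_probe(struct of_device *op, const struct of_device_id *match)
	{
		return 0;	/* struct of_device is now an alias for struct platform_device */
	}

	static const struct of_device_id example_match[] = {
		{ .compatible = "acme,example-device", },
		{},
	};

	static struct of_platform_driver example_driver = {
		.probe	= example_probe,
		.driver	= {
			.name		= "example",
			.owner		= THIS_MODULE,
			.of_match_table	= example_match,
		},
	};

	static int __init example_init(void)
	{
		/* lands on platform_bus_type, via the probe/shutdown shims shown below */
		return of_register_platform_driver(&example_driver);
	}
	module_init(example_init);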
After all of of_platform_bus drivers are converted to be normal platform drivers, the shim code can be removed. Signed-off-by: Grant Likely Acked-by: David S. Miller --- arch/microblaze/kernel/of_platform.c | 11 ------ arch/microblaze/kernel/setup.c | 6 --- arch/powerpc/kernel/dma-swiotlb.c | 8 ---- arch/powerpc/kernel/of_platform.c | 12 ------ arch/powerpc/kernel/setup-common.c | 7 ---- arch/powerpc/platforms/cell/beat_iommu.c | 2 +- arch/powerpc/platforms/cell/iommu.c | 2 +- arch/powerpc/sysdev/mv64x60_dev.c | 7 +--- arch/sparc/kernel/of_device_32.c | 21 +++------- arch/sparc/kernel/of_device_64.c | 21 +++------- arch/sparc/kernel/of_device_common.c | 3 -- drivers/base/platform.c | 6 +++ drivers/of/device.c | 5 +++ drivers/of/platform.c | 67 ++++++++++++++++++++++++++++++-- include/linux/of_device.h | 6 +++ include/linux/of_platform.h | 21 ++++------ 16 files changed, 102 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c index da79edf45420..fb2866104331 100644 --- a/arch/microblaze/kernel/of_platform.c +++ b/arch/microblaze/kernel/of_platform.c @@ -26,17 +26,6 @@ #include #include -struct bus_type of_platform_bus_type = { - .uevent = of_device_uevent, -}; -EXPORT_SYMBOL(of_platform_bus_type); - -static int __init of_bus_driver_init(void) -{ - return of_bus_type_init(&of_platform_bus_type, "of_platform"); -} -postcore_initcall(of_bus_driver_init); - /* * The list of OF IDs below is used for matching bus types in the * system whose devices are to be exposed as of_platform_devices. diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 17c98dbcec88..f5f768842354 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -213,15 +213,9 @@ static struct notifier_block dflt_plat_bus_notifier = { .priority = INT_MAX, }; -static struct notifier_block dflt_of_bus_notifier = { - .notifier_call = dflt_bus_notify, - .priority = INT_MAX, -}; - static int __init setup_bus_notifier(void) { bus_register_notifier(&platform_bus_type, &dflt_plat_bus_notifier); - bus_register_notifier(&of_platform_bus_type, &dflt_of_bus_notifier); return 0; } diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 02f724f36753..4295e0b94b2d 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -82,17 +82,9 @@ static struct notifier_block ppc_swiotlb_plat_bus_notifier = { .priority = 0, }; -static struct notifier_block ppc_swiotlb_of_bus_notifier = { - .notifier_call = ppc_swiotlb_bus_notify, - .priority = 0, -}; - int __init swiotlb_setup_bus_notifier(void) { bus_register_notifier(&platform_bus_type, &ppc_swiotlb_plat_bus_notifier); - bus_register_notifier(&of_platform_bus_type, - &ppc_swiotlb_of_bus_notifier); - return 0; } diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 4e0a2f7c1dd3..d3497cd81e8a 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -52,18 +52,6 @@ const struct of_device_id of_default_bus_ids[] = { {}, }; -struct bus_type of_platform_bus_type = { - .uevent = of_device_uevent, -}; -EXPORT_SYMBOL(of_platform_bus_type); - -static int __init of_bus_driver_init(void) -{ - return of_bus_type_init(&of_platform_bus_type, "of_platform"); -} - -postcore_initcall(of_bus_driver_init); - static int of_dev_node_match(struct device *dev, void *data) { return to_of_device(dev)->dev.of_node == data; diff --git 
a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index b7e6c7e193ae..d1a5304b3ddd 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -701,16 +701,9 @@ static struct notifier_block ppc_dflt_plat_bus_notifier = { .priority = INT_MAX, }; -static struct notifier_block ppc_dflt_of_bus_notifier = { - .notifier_call = ppc_dflt_bus_notify, - .priority = INT_MAX, -}; - static int __init setup_bus_notifier(void) { bus_register_notifier(&platform_bus_type, &ppc_dflt_plat_bus_notifier); - bus_register_notifier(&of_platform_bus_type, &ppc_dflt_of_bus_notifier); - return 0; } diff --git a/arch/powerpc/platforms/cell/beat_iommu.c b/arch/powerpc/platforms/cell/beat_iommu.c index 39d361c5c6d2..beec405eb6f8 100644 --- a/arch/powerpc/platforms/cell/beat_iommu.c +++ b/arch/powerpc/platforms/cell/beat_iommu.c @@ -108,7 +108,7 @@ static int __init celleb_init_iommu(void) celleb_init_direct_mapping(); set_pci_dma_ops(&dma_direct_ops); ppc_md.pci_dma_dev_setup = celleb_pci_dma_dev_setup; - bus_register_notifier(&of_platform_bus_type, &celleb_of_bus_notifier); + bus_register_notifier(&platform_bus_type, &celleb_of_bus_notifier); return 0; } diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 3712900471ba..58b13ce3847e 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -1204,7 +1204,7 @@ static int __init cell_iommu_init(void) /* Register callbacks on OF platform device addition/removal * to handle linking them to the right DMA operations */ - bus_register_notifier(&of_platform_bus_type, &cell_of_bus_notifier); + bus_register_notifier(&platform_bus_type, &cell_of_bus_notifier); return 0; } diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c index 31acd3b1718b..1398bc454999 100644 --- a/arch/powerpc/sysdev/mv64x60_dev.c +++ b/arch/powerpc/sysdev/mv64x60_dev.c @@ -20,12 +20,7 @@ #include -/* - * These functions provide the necessary setup for the mv64x60 drivers. - * These drivers are unusual in that they work on both the MIPS and PowerPC - * architectures. Because of that, the drivers do not support the normal - * PowerPC of_platform_bus_type. They support platform_bus_type instead. - */ +/* These functions provide the necessary setup for the mv64x60 drivers. 
*/ static struct of_device_id __initdata of_mv64x60_devices[] = { { .compatible = "marvell,mv64306-devctrl", }, diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c index 331de91ad2bc..75fc9d5cd7e6 100644 --- a/arch/sparc/kernel/of_device_32.c +++ b/arch/sparc/kernel/of_device_32.c @@ -424,7 +424,7 @@ build_resources: build_device_resources(op, parent); op->dev.parent = parent; - op->dev.bus = &of_platform_bus_type; + op->dev.bus = &platform_bus_type; if (!parent) dev_set_name(&op->dev, "root"); else @@ -452,30 +452,19 @@ static void __init scan_tree(struct device_node *dp, struct device *parent) } } -static void __init scan_of_devices(void) +static int __init scan_of_devices(void) { struct device_node *root = of_find_node_by_path("/"); struct of_device *parent; parent = scan_one_device(root, NULL); if (!parent) - return; + return 0; scan_tree(root->child, &parent->dev); + return 0; } - -static int __init of_bus_driver_init(void) -{ - int err; - - err = of_bus_type_init(&of_platform_bus_type, "of"); - if (!err) - scan_of_devices(); - - return err; -} - -postcore_initcall(of_bus_driver_init); +postcore_initcall(scan_of_devices); static int __init of_debug(char *str) { diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 5e8cbb942d3d..9743d1d9fa03 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -667,7 +667,7 @@ static struct of_device * __init scan_one_device(struct device_node *dp, op->archdata.irqs[i] = build_one_device_irq(op, parent, op->archdata.irqs[i]); op->dev.parent = parent; - op->dev.bus = &of_platform_bus_type; + op->dev.bus = &platform_bus_type; if (!parent) dev_set_name(&op->dev, "root"); else @@ -695,30 +695,19 @@ static void __init scan_tree(struct device_node *dp, struct device *parent) } } -static void __init scan_of_devices(void) +static int __init scan_of_devices(void) { struct device_node *root = of_find_node_by_path("/"); struct of_device *parent; parent = scan_one_device(root, NULL); if (!parent) - return; + return 0; scan_tree(root->child, &parent->dev); + return 0; } - -static int __init of_bus_driver_init(void) -{ - int err; - - err = of_bus_type_init(&of_platform_bus_type, "of"); - if (!err) - scan_of_devices(); - - return err; -} - -postcore_initcall(of_bus_driver_init); +postcore_initcall(scan_of_devices); static int __init of_debug(char *str) { diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c index 016c947d4cae..01f380c7995c 100644 --- a/arch/sparc/kernel/of_device_common.c +++ b/arch/sparc/kernel/of_device_common.c @@ -64,9 +64,6 @@ void of_propagate_archdata(struct of_device *bus) } } -struct bus_type of_platform_bus_type; -EXPORT_SYMBOL(of_platform_bus_type); - static void get_cells(struct device_node *dp, int *addrc, int *sizec) { if (addrc) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index fac3633c7223..f699fabf403b 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -636,6 +636,12 @@ static struct device_attribute platform_dev_attrs[] = { static int platform_uevent(struct device *dev, struct kobj_uevent_env *env) { struct platform_device *pdev = to_platform_device(dev); + int rc; + + /* Some devices have extra OF data and an OF-style MODALIAS */ + rc = of_device_uevent(dev,env); + if (rc != -ENODEV) + return rc; add_uevent_var(env, "MODALIAS=%s%s", PLATFORM_MODULE_PREFIX, (pdev->id_entry) ? 
pdev->id_entry->name : pdev->name); diff --git a/drivers/of/device.c b/drivers/of/device.c index 5282a202f5a9..12a44b493511 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -104,6 +104,11 @@ int of_device_register(struct of_device *ofdev) device_initialize(&ofdev->dev); + /* name and id have to be set so that the platform bus doesn't get + * confused on matching */ + ofdev->name = dev_name(&ofdev->dev); + ofdev->id = -1; + /* device_add will assume that this device is on the same node as * the parent. If there is no parent defined, set the node * explicitly */ diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 9d3d932bcb6f..712dfd866df0 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -20,6 +20,54 @@ #include #include #include +#include + +static int platform_driver_probe_shim(struct platform_device *pdev) +{ + struct platform_driver *pdrv; + struct of_platform_driver *ofpdrv; + const struct of_device_id *match; + + pdrv = container_of(pdev->dev.driver, struct platform_driver, driver); + ofpdrv = container_of(pdrv, struct of_platform_driver, platform_driver); + match = of_match_device(pdev->dev.driver->of_match_table, &pdev->dev); + return ofpdrv->probe(pdev, match); +} + +static void platform_driver_shutdown_shim(struct platform_device *pdev) +{ + struct platform_driver *pdrv; + struct of_platform_driver *ofpdrv; + + pdrv = container_of(pdev->dev.driver, struct platform_driver, driver); + ofpdrv = container_of(pdrv, struct of_platform_driver, platform_driver); + ofpdrv->shutdown(pdev); +} + +/** + * of_register_platform_driver + */ +int of_register_platform_driver(struct of_platform_driver *drv) +{ + /* setup of_platform_driver to platform_driver adaptors */ + drv->platform_driver.driver = drv->driver; + if (drv->probe) + drv->platform_driver.probe = platform_driver_probe_shim; + drv->platform_driver.remove = drv->remove; + if (drv->shutdown) + drv->platform_driver.shutdown = platform_driver_shutdown_shim; + drv->platform_driver.suspend = drv->suspend; + drv->platform_driver.resume = drv->resume; + + return platform_driver_register(&drv->platform_driver); +} +EXPORT_SYMBOL(of_register_platform_driver); + +void of_unregister_platform_driver(struct of_platform_driver *drv) +{ + platform_driver_unregister(&drv->platform_driver); +} +EXPORT_SYMBOL(of_unregister_platform_driver); #if defined(CONFIG_PPC_DCR) #include @@ -392,16 +440,29 @@ int of_bus_type_init(struct bus_type *bus, const char *name) int of_register_driver(struct of_platform_driver *drv, struct bus_type *bus) { - drv->driver.bus = bus; + /* + * Temporary: of_platform_bus used to be distinct from the platform + * bus. It isn't anymore, and so drivers on the platform bus need + * to be registered in a special way. + * + * After all of_platform_bus_type drivers are converted to + * platform_drivers, this exception can be removed. 
+ */ + if (bus == &platform_bus_type) + return of_register_platform_driver(drv); /* register with core */ + drv->driver.bus = bus; return driver_register(&drv->driver); } EXPORT_SYMBOL(of_register_driver); void of_unregister_driver(struct of_platform_driver *drv) { - driver_unregister(&drv->driver); + if (drv->driver.bus == &platform_bus_type) + of_unregister_platform_driver(drv); + else + driver_unregister(&drv->driver); } EXPORT_SYMBOL(of_unregister_driver); @@ -548,7 +609,7 @@ struct of_device *of_platform_device_create(struct device_node *np, dev->archdata.dma_mask = 0xffffffffUL; #endif dev->dev.coherent_dma_mask = DMA_BIT_MASK(32); - dev->dev.bus = &of_platform_bus_type; + dev->dev.bus = &platform_bus_type; /* We do not fill the DMA ops for platform devices by default. * This is currently the responsibility of the platform code diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 7d27f5a878f6..8cd1fe7864e3 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -65,6 +65,12 @@ static inline int of_driver_match_device(struct device *dev, return 0; } +static inline int of_device_uevent(struct device *dev, + struct kobj_uevent_env *env) +{ + return -ENODEV; +} + #endif /* CONFIG_OF_DEVICE */ #endif /* _LINUX_OF_DEVICE_H */ diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a51fd30176aa..133ecf31a60f 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -17,19 +17,19 @@ #include #include #include +#include /* - * The of_platform_bus_type is a bus type used by drivers that do not - * attach to a macio or similar bus but still use OF probing - * mechanism + * of_platform_bus_type isn't it's own bus anymore. It's now just an alias + * for the platform bus. */ -extern struct bus_type of_platform_bus_type; +#define of_platform_bus_type platform_bus_type extern const struct of_device_id of_default_bus_ids[]; /* * An of_platform_driver driver is attached to a basic of_device on - * the "platform bus" (of_platform_bus_type). + * the "platform bus" (platform_bus_type). */ struct of_platform_driver { @@ -42,6 +42,7 @@ struct of_platform_driver int (*shutdown)(struct of_device* dev); struct device_driver driver; + struct platform_driver platform_driver; }; #define to_of_platform_driver(drv) \ container_of(drv,struct of_platform_driver, driver) @@ -51,14 +52,8 @@ extern int of_register_driver(struct of_platform_driver *drv, extern void of_unregister_driver(struct of_platform_driver *drv); /* Platform drivers register/unregister */ -static inline int of_register_platform_driver(struct of_platform_driver *drv) -{ - return of_register_driver(drv, &of_platform_bus_type); -} -static inline void of_unregister_platform_driver(struct of_platform_driver *drv) -{ - of_unregister_driver(drv); -} +extern int of_register_platform_driver(struct of_platform_driver *drv); +extern void of_unregister_platform_driver(struct of_platform_driver *drv); extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, -- cgit v1.2.3 From 1ab1d63a85cee2545272f63a7644e9f855cb65d0 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 24 Jun 2010 15:14:37 -0600 Subject: of/platform: remove all of_bus_type and of_platform_bus_type references Both of_bus_type and of_platform_bus_type are just #define aliases for the platform bus. This patch removes all references to them and switches to the of_register_platform_driver()/of_unregister_platform_driver() API for registering. 
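The conversion itself is mechanical; with an invented example driver, each call site changes roughly like this:

	/* before: the bus had to be named explicitly */
	err = of_register_driver(&example_driver, &of_bus_type);
	of_unregister_driver(&example_driver);

	/* after: the platform-bus-specific helpers */
	err = of_register_platform_driver(&example_driver);
	of_unregister_platform_driver(&example_driver);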
Subsequent patches will convert each user of of_register_platform_driver() into plain platform_drivers without the of_platform_driver shim. At which point the of_register_platform_driver()/of_unregister_platform_driver() functions can be removed. Signed-off-by: Grant Likely Acked-by: David S. Miller --- arch/microblaze/kernel/of_platform.c | 3 +-- arch/powerpc/kernel/of_platform.c | 3 +-- arch/sparc/include/asm/of_platform.h | 2 -- arch/sparc/include/asm/parport.h | 4 +--- arch/sparc/kernel/apc.c | 2 +- arch/sparc/kernel/auxio_64.c | 2 +- arch/sparc/kernel/central.c | 4 ++-- arch/sparc/kernel/chmc.c | 4 ++-- arch/sparc/kernel/of_device_common.c | 2 +- arch/sparc/kernel/pci_fire.c | 2 +- arch/sparc/kernel/pci_psycho.c | 2 +- arch/sparc/kernel/pci_sabre.c | 2 +- arch/sparc/kernel/pci_schizo.c | 2 +- arch/sparc/kernel/pci_sun4v.c | 2 +- arch/sparc/kernel/pmc.c | 2 +- arch/sparc/kernel/power.c | 2 +- arch/sparc/kernel/time_32.c | 2 +- arch/sparc/kernel/time_64.c | 6 +++--- drivers/atm/fore200e.c | 6 +++--- drivers/char/hw_random/n2-drv.c | 4 ++-- drivers/crypto/n2_core.c | 10 +++++----- drivers/hwmon/ultra45_env.c | 4 ++-- drivers/input/misc/sparcspkr.c | 12 +++++------- drivers/input/serio/i8042-sparcio.h | 5 ++--- drivers/mtd/maps/sun_uflash.c | 4 ++-- drivers/net/ibm_newemac/core.c | 4 ++-- drivers/net/myri_sbus.c | 4 ++-- drivers/net/niu.c | 6 +++--- drivers/net/sunbmac.c | 4 ++-- drivers/net/sunhme.c | 4 ++-- drivers/net/sunlance.c | 4 ++-- drivers/net/sunqe.c | 4 ++-- drivers/parport/parport_sunbpp.c | 4 ++-- drivers/sbus/char/bbc_i2c.c | 4 ++-- drivers/sbus/char/display7seg.c | 4 ++-- drivers/sbus/char/envctrl.c | 4 ++-- drivers/sbus/char/flash.c | 4 ++-- drivers/sbus/char/uctrl.c | 4 ++-- drivers/scsi/qlogicpti.c | 4 ++-- drivers/scsi/sun_esp.c | 4 ++-- drivers/serial/sunhv.c | 4 ++-- drivers/serial/sunsab.c | 4 ++-- drivers/serial/sunsu.c | 2 +- drivers/serial/sunzilog.c | 6 +++--- drivers/video/bw2.c | 4 ++-- drivers/video/cg14.c | 4 ++-- drivers/video/cg3.c | 4 ++-- drivers/video/cg6.c | 4 ++-- drivers/video/ffb.c | 4 ++-- drivers/video/leo.c | 4 ++-- drivers/video/p9100.c | 4 ++-- drivers/video/sunxvr1000.c | 4 ++-- drivers/video/tcx.c | 4 ++-- drivers/watchdog/cpwd.c | 4 ++-- drivers/watchdog/riowd.c | 4 ++-- include/linux/of_platform.h | 6 ------ sound/sparc/amd7930.c | 4 ++-- sound/sparc/cs4231.c | 4 ++-- sound/sparc/dbri.c | 4 ++-- 59 files changed, 109 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c index fb2866104331..80c9c493cb25 100644 --- a/arch/microblaze/kernel/of_platform.c +++ b/arch/microblaze/kernel/of_platform.c @@ -57,8 +57,7 @@ struct of_device *of_find_device_by_node(struct device_node *np) { struct device *dev; - dev = bus_find_device(&of_platform_bus_type, - NULL, np, of_dev_node_match); + dev = bus_find_device(&platform_bus_type, NULL, np, of_dev_node_match); if (dev) return to_of_device(dev); return NULL; diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index d3497cd81e8a..b093d4b1f09c 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -61,8 +61,7 @@ struct of_device *of_find_device_by_node(struct device_node *np) { struct device *dev; - dev = bus_find_device(&of_platform_bus_type, - NULL, np, of_dev_node_match); + dev = bus_find_device(&platform_bus_type, NULL, np, of_dev_node_match); if (dev) return to_of_device(dev); return NULL; diff --git a/arch/sparc/include/asm/of_platform.h 
b/arch/sparc/include/asm/of_platform.h index 90da99059f83..26540ddfc511 100644 --- a/arch/sparc/include/asm/of_platform.h +++ b/arch/sparc/include/asm/of_platform.h @@ -13,6 +13,4 @@ * */ -#define of_bus_type of_platform_bus_type /* for compatibility */ - #endif diff --git a/arch/sparc/include/asm/parport.h b/arch/sparc/include/asm/parport.h index 0c34a8792fc7..4891fbce1114 100644 --- a/arch/sparc/include/asm/parport.h +++ b/arch/sparc/include/asm/parport.h @@ -243,9 +243,7 @@ static struct of_platform_driver ecpp_driver = { static int parport_pc_find_nonpci_ports(int autoirq, int autodma) { - of_register_driver(&ecpp_driver, &of_bus_type); - - return 0; + return of_register_platform_driver(&ecpp_driver); } #endif /* !(_ASM_SPARC64_PARPORT_H */ diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index b27476caa133..c471251cd3f1 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -184,7 +184,7 @@ static struct of_platform_driver apc_driver = { static int __init apc_init(void) { - return of_register_driver(&apc_driver, &of_bus_type); + return of_register_platform_driver(&apc_driver); } /* This driver is not critical to the boot process diff --git a/arch/sparc/kernel/auxio_64.c b/arch/sparc/kernel/auxio_64.c index ddc84128b3c2..46ba58a8510f 100644 --- a/arch/sparc/kernel/auxio_64.c +++ b/arch/sparc/kernel/auxio_64.c @@ -142,7 +142,7 @@ static struct of_platform_driver auxio_driver = { static int __init auxio_init(void) { - return of_register_driver(&auxio_driver, &of_platform_bus_type); + return of_register_platform_driver(&auxio_driver); } /* Must be after subsys_initcall() so that busses are probed. Must diff --git a/arch/sparc/kernel/central.c b/arch/sparc/kernel/central.c index 434335f65823..b6080c39ed4b 100644 --- a/arch/sparc/kernel/central.c +++ b/arch/sparc/kernel/central.c @@ -265,8 +265,8 @@ static struct of_platform_driver fhc_driver = { static int __init sunfire_init(void) { - (void) of_register_driver(&fhc_driver, &of_platform_bus_type); - (void) of_register_driver(&clock_board_driver, &of_platform_bus_type); + (void) of_register_platform_driver(&fhc_driver); + (void) of_register_platform_driver(&clock_board_driver); return 0; } diff --git a/arch/sparc/kernel/chmc.c b/arch/sparc/kernel/chmc.c index 870cb65b3f21..04bb7df9f716 100644 --- a/arch/sparc/kernel/chmc.c +++ b/arch/sparc/kernel/chmc.c @@ -848,7 +848,7 @@ static int __init us3mc_init(void) ret = register_dimm_printer(us3mc_dimm_printer); if (!ret) { - ret = of_register_driver(&us3mc_driver, &of_bus_type); + ret = of_register_platform_driver(&us3mc_driver); if (ret) unregister_dimm_printer(us3mc_dimm_printer); } @@ -859,7 +859,7 @@ static void __exit us3mc_cleanup(void) { if (us3mc_platform()) { unregister_dimm_printer(us3mc_dimm_printer); - of_unregister_driver(&us3mc_driver); + of_unregister_platform_driver(&us3mc_driver); } } diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c index 01f380c7995c..2a5c639e4c3f 100644 --- a/arch/sparc/kernel/of_device_common.c +++ b/arch/sparc/kernel/of_device_common.c @@ -21,7 +21,7 @@ static int node_match(struct device *dev, void *data) struct of_device *of_find_device_by_node(struct device_node *dp) { - struct device *dev = bus_find_device(&of_platform_bus_type, NULL, + struct device *dev = bus_find_device(&platform_bus_type, NULL, dp, node_match); if (dev) diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c index 51cfa09e392a..885f10b742e2 100644 --- a/arch/sparc/kernel/pci_fire.c +++ 
b/arch/sparc/kernel/pci_fire.c @@ -518,7 +518,7 @@ static struct of_platform_driver fire_driver = { static int __init fire_init(void) { - return of_register_driver(&fire_driver, &of_bus_type); + return of_register_platform_driver(&fire_driver); } subsys_initcall(fire_init); diff --git a/arch/sparc/kernel/pci_psycho.c b/arch/sparc/kernel/pci_psycho.c index 93011e6e7ddc..71550a7aacd8 100644 --- a/arch/sparc/kernel/pci_psycho.c +++ b/arch/sparc/kernel/pci_psycho.c @@ -612,7 +612,7 @@ static struct of_platform_driver psycho_driver = { static int __init psycho_init(void) { - return of_register_driver(&psycho_driver, &of_bus_type); + return of_register_platform_driver(&psycho_driver); } subsys_initcall(psycho_init); diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c index 99c6dba7d4fd..2d7bf30552dd 100644 --- a/arch/sparc/kernel/pci_sabre.c +++ b/arch/sparc/kernel/pci_sabre.c @@ -606,7 +606,7 @@ static struct of_platform_driver sabre_driver = { static int __init sabre_init(void) { - return of_register_driver(&sabre_driver, &of_bus_type); + return of_register_platform_driver(&sabre_driver); } subsys_initcall(sabre_init); diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index 9041dae7aaca..04f29c46bfa0 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c @@ -1501,7 +1501,7 @@ static struct of_platform_driver schizo_driver = { static int __init schizo_init(void) { - return of_register_driver(&schizo_driver, &of_bus_type); + return of_register_platform_driver(&schizo_driver); } subsys_initcall(schizo_init); diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index a24af6f7e17f..18ee8b6f4036 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -1019,7 +1019,7 @@ static struct of_platform_driver pci_sun4v_driver = { static int __init pci_sun4v_init(void) { - return of_register_driver(&pci_sun4v_driver, &of_bus_type); + return of_register_platform_driver(&pci_sun4v_driver); } subsys_initcall(pci_sun4v_init); diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index 9589d8b9b0c1..a4c73edc8975 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -89,7 +89,7 @@ static struct of_platform_driver pmc_driver = { static int __init pmc_init(void) { - return of_register_driver(&pmc_driver, &of_bus_type); + return of_register_platform_driver(&pmc_driver); } /* This driver is not critical to the boot process diff --git a/arch/sparc/kernel/power.c b/arch/sparc/kernel/power.c index 1cfee577f6b5..abc194ed5a78 100644 --- a/arch/sparc/kernel/power.c +++ b/arch/sparc/kernel/power.c @@ -70,7 +70,7 @@ static struct of_platform_driver power_driver = { static int __init power_init(void) { - return of_register_driver(&power_driver, &of_platform_bus_type); + return of_register_platform_driver(&power_driver); } device_initcall(power_init); diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index e404b063be2c..5dc20216952e 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -189,7 +189,7 @@ static struct of_platform_driver clock_driver = { /* Probe for the mostek real time clock chip. */ static int __init clock_init(void) { - return of_register_driver(&clock_driver, &of_platform_bus_type); + return of_register_platform_driver(&clock_driver); } /* Must be after subsys_initcall() so that busses are probed. 
Must * be before device_initcall() because things like the RTC driver diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 21e9fcae0668..2423b336a717 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -586,9 +586,9 @@ static int __init clock_init(void) if (tlb_type == hypervisor) return platform_device_register(&rtc_sun4v_device); - (void) of_register_driver(&rtc_driver, &of_platform_bus_type); - (void) of_register_driver(&mostek_driver, &of_platform_bus_type); - (void) of_register_driver(&bq4802_driver, &of_platform_bus_type); + (void) of_register_platform_driver(&rtc_driver); + (void) of_register_platform_driver(&mostek_driver); + (void) of_register_platform_driver(&bq4802_driver); return 0; } diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 38df87b198d5..b7385e077717 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -2795,7 +2795,7 @@ static int __init fore200e_module_init(void) printk(FORE200E "FORE Systems 200E-series ATM driver - version " FORE200E_VERSION "\n"); #ifdef CONFIG_SBUS - err = of_register_driver(&fore200e_sba_driver, &of_bus_type); + err = of_register_platform_driver(&fore200e_sba_driver); if (err) return err; #endif @@ -2806,7 +2806,7 @@ static int __init fore200e_module_init(void) #ifdef CONFIG_SBUS if (err) - of_unregister_driver(&fore200e_sba_driver); + of_unregister_platform_driver(&fore200e_sba_driver); #endif return err; @@ -2818,7 +2818,7 @@ static void __exit fore200e_module_cleanup(void) pci_unregister_driver(&fore200e_pca_driver); #endif #ifdef CONFIG_SBUS - of_unregister_driver(&fore200e_sba_driver); + of_unregister_platform_driver(&fore200e_sba_driver); #endif } diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c index 0f9cbf1aaf15..d8a4ca87987d 100644 --- a/drivers/char/hw_random/n2-drv.c +++ b/drivers/char/hw_random/n2-drv.c @@ -762,12 +762,12 @@ static struct of_platform_driver n2rng_driver = { static int __init n2rng_init(void) { - return of_register_driver(&n2rng_driver, &of_bus_type); + return of_register_platform_driver(&n2rng_driver); } static void __exit n2rng_exit(void) { - of_unregister_driver(&n2rng_driver); + of_unregister_platform_driver(&n2rng_driver); } module_init(n2rng_init); diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 23163fda5035..34ac8ac8aaec 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -2070,20 +2070,20 @@ static struct of_platform_driver n2_mau_driver = { static int __init n2_init(void) { - int err = of_register_driver(&n2_crypto_driver, &of_bus_type); + int err = of_register_platform_driver(&n2_crypto_driver); if (!err) { - err = of_register_driver(&n2_mau_driver, &of_bus_type); + err = of_register_platform_driver(&n2_mau_driver); if (err) - of_unregister_driver(&n2_crypto_driver); + of_unregister_platform_driver(&n2_crypto_driver); } return err; } static void __exit n2_exit(void) { - of_unregister_driver(&n2_mau_driver); - of_unregister_driver(&n2_crypto_driver); + of_unregister_platform_driver(&n2_mau_driver); + of_unregister_platform_driver(&n2_crypto_driver); } module_init(n2_init); diff --git a/drivers/hwmon/ultra45_env.c b/drivers/hwmon/ultra45_env.c index 5da5942cf970..89643261ccdb 100644 --- a/drivers/hwmon/ultra45_env.c +++ b/drivers/hwmon/ultra45_env.c @@ -311,12 +311,12 @@ static struct of_platform_driver env_driver = { static int __init env_init(void) { - return of_register_driver(&env_driver, &of_bus_type); + return of_register_platform_driver(&env_driver); 
} static void __exit env_exit(void) { - of_unregister_driver(&env_driver); + of_unregister_platform_driver(&env_driver); } module_init(env_init); diff --git a/drivers/input/misc/sparcspkr.c b/drivers/input/misc/sparcspkr.c index 1dacae4b43f0..f3bb92e9755f 100644 --- a/drivers/input/misc/sparcspkr.c +++ b/drivers/input/misc/sparcspkr.c @@ -353,14 +353,12 @@ static struct of_platform_driver grover_beep_driver = { static int __init sparcspkr_init(void) { - int err = of_register_driver(&bbc_beep_driver, - &of_platform_bus_type); + int err = of_register_platform_driver(&bbc_beep_driver); if (!err) { - err = of_register_driver(&grover_beep_driver, - &of_platform_bus_type); + err = of_register_platform_driver(&grover_beep_driver); if (err) - of_unregister_driver(&bbc_beep_driver); + of_unregister_platform_driver(&bbc_beep_driver); } return err; @@ -368,8 +366,8 @@ static int __init sparcspkr_init(void) static void __exit sparcspkr_exit(void) { - of_unregister_driver(&bbc_beep_driver); - of_unregister_driver(&grover_beep_driver); + of_unregister_platform_driver(&bbc_beep_driver); + of_unregister_platform_driver(&grover_beep_driver); } module_init(sparcspkr_init); diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h index c7d50ff43fca..cb2a24b94746 100644 --- a/drivers/input/serio/i8042-sparcio.h +++ b/drivers/input/serio/i8042-sparcio.h @@ -116,8 +116,7 @@ static int __init i8042_platform_init(void) if (!kbd_iobase) return -ENODEV; } else { - int err = of_register_driver(&sparc_i8042_driver, - &of_bus_type); + int err = of_register_platform_driver(&sparc_i8042_driver); if (err) return err; @@ -141,7 +140,7 @@ static inline void i8042_platform_exit(void) struct device_node *root = of_find_node_by_path("/"); if (strcmp(root->name, "SUNW,JavaStation-1")) - of_unregister_driver(&sparc_i8042_driver); + of_unregister_platform_driver(&sparc_i8042_driver); } #else /* !CONFIG_PCI */ diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c index 0391c2527bd7..8984236a8d0a 100644 --- a/drivers/mtd/maps/sun_uflash.c +++ b/drivers/mtd/maps/sun_uflash.c @@ -160,12 +160,12 @@ static struct of_platform_driver uflash_driver = { static int __init uflash_init(void) { - return of_register_driver(&uflash_driver, &of_bus_type); + return of_register_platform_driver(&uflash_driver); } static void __exit uflash_exit(void) { - of_unregister_driver(&uflash_driver); + of_unregister_platform_driver(&uflash_driver); } module_init(uflash_init); diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c index b150c102ca5a..f10476fb2623 100644 --- a/drivers/net/ibm_newemac/core.c +++ b/drivers/net/ibm_newemac/core.c @@ -2339,11 +2339,11 @@ static int __devinit emac_wait_deps(struct emac_instance *dev) deps[EMAC_DEP_MDIO_IDX].phandle = dev->mdio_ph; if (dev->blist && dev->blist > emac_boot_list) deps[EMAC_DEP_PREV_IDX].phandle = 0xffffffffu; - bus_register_notifier(&of_platform_bus_type, &emac_of_bus_notifier); + bus_register_notifier(&platform_bus_type, &emac_of_bus_notifier); wait_event_timeout(emac_probe_wait, emac_check_deps(dev, deps), EMAC_PROBE_DEP_TIMEOUT); - bus_unregister_notifier(&of_platform_bus_type, &emac_of_bus_notifier); + bus_unregister_notifier(&platform_bus_type, &emac_of_bus_notifier); err = emac_check_deps(dev, deps) ? 
0 : -ENODEV; for (i = 0; i < EMAC_DEP_COUNT; i++) { if (deps[i].node) diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c index 370d3c17f24c..04e552aa14ec 100644 --- a/drivers/net/myri_sbus.c +++ b/drivers/net/myri_sbus.c @@ -1172,12 +1172,12 @@ static struct of_platform_driver myri_sbus_driver = { static int __init myri_sbus_init(void) { - return of_register_driver(&myri_sbus_driver, &of_bus_type); + return of_register_platform_driver(&myri_sbus_driver); } static void __exit myri_sbus_exit(void) { - of_unregister_driver(&myri_sbus_driver); + of_unregister_platform_driver(&myri_sbus_driver); } module_init(myri_sbus_init); diff --git a/drivers/net/niu.c b/drivers/net/niu.c index f6ecf6180f72..8cb2b30eccae 100644 --- a/drivers/net/niu.c +++ b/drivers/net/niu.c @@ -10251,14 +10251,14 @@ static int __init niu_init(void) niu_debug = netif_msg_init(debug, NIU_MSG_DEFAULT); #ifdef CONFIG_SPARC64 - err = of_register_driver(&niu_of_driver, &of_bus_type); + err = of_register_platform_driver(&niu_of_driver); #endif if (!err) { err = pci_register_driver(&niu_pci_driver); #ifdef CONFIG_SPARC64 if (err) - of_unregister_driver(&niu_of_driver); + of_unregister_platform_driver(&niu_of_driver); #endif } @@ -10269,7 +10269,7 @@ static void __exit niu_exit(void) { pci_unregister_driver(&niu_pci_driver); #ifdef CONFIG_SPARC64 - of_unregister_driver(&niu_of_driver); + of_unregister_platform_driver(&niu_of_driver); #endif } diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c index 0b10d24de051..09c071bd6ad4 100644 --- a/drivers/net/sunbmac.c +++ b/drivers/net/sunbmac.c @@ -1301,12 +1301,12 @@ static struct of_platform_driver bigmac_sbus_driver = { static int __init bigmac_init(void) { - return of_register_driver(&bigmac_sbus_driver, &of_bus_type); + return of_register_platform_driver(&bigmac_sbus_driver); } static void __exit bigmac_exit(void) { - of_unregister_driver(&bigmac_sbus_driver); + of_unregister_platform_driver(&bigmac_sbus_driver); } module_init(bigmac_init); diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index 0a63ebef86a0..eec443f64079 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -3304,7 +3304,7 @@ static int __init happy_meal_sbus_init(void) { int err; - err = of_register_driver(&hme_sbus_driver, &of_bus_type); + err = of_register_platform_driver(&hme_sbus_driver); if (!err) err = quattro_sbus_register_irqs(); @@ -3313,7 +3313,7 @@ static int __init happy_meal_sbus_init(void) static void happy_meal_sbus_exit(void) { - of_unregister_driver(&hme_sbus_driver); + of_unregister_platform_driver(&hme_sbus_driver); quattro_sbus_free_irqs(); while (qfe_sbus_list) { diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c index c6bfdad6c0c8..ee364fa75634 100644 --- a/drivers/net/sunlance.c +++ b/drivers/net/sunlance.c @@ -1558,12 +1558,12 @@ static struct of_platform_driver sunlance_sbus_driver = { /* Find all the lance cards on the system and initialize them */ static int __init sparc_lance_init(void) { - return of_register_driver(&sunlance_sbus_driver, &of_bus_type); + return of_register_platform_driver(&sunlance_sbus_driver); } static void __exit sparc_lance_exit(void) { - of_unregister_driver(&sunlance_sbus_driver); + of_unregister_platform_driver(&sunlance_sbus_driver); } module_init(sparc_lance_init); diff --git a/drivers/net/sunqe.c b/drivers/net/sunqe.c index 446517487084..5f84a5dadedd 100644 --- a/drivers/net/sunqe.c +++ b/drivers/net/sunqe.c @@ -988,12 +988,12 @@ static struct of_platform_driver qec_sbus_driver = { static int __init qec_init(void) 
{ - return of_register_driver(&qec_sbus_driver, &of_bus_type); + return of_register_platform_driver(&qec_sbus_driver); } static void __exit qec_exit(void) { - of_unregister_driver(&qec_sbus_driver); + of_unregister_platform_driver(&qec_sbus_driver); while (root_qec_dev) { struct sunqec *next = root_qec_dev->next_module; diff --git a/drivers/parport/parport_sunbpp.c b/drivers/parport/parport_sunbpp.c index 3cdfe96e8999..210a6441a066 100644 --- a/drivers/parport/parport_sunbpp.c +++ b/drivers/parport/parport_sunbpp.c @@ -393,12 +393,12 @@ static struct of_platform_driver bpp_sbus_driver = { static int __init parport_sunbpp_init(void) { - return of_register_driver(&bpp_sbus_driver, &of_bus_type); + return of_register_platform_driver(&bpp_sbus_driver); } static void __exit parport_sunbpp_exit(void) { - of_unregister_driver(&bpp_sbus_driver); + of_unregister_platform_driver(&bpp_sbus_driver); } MODULE_AUTHOR("Derrick J Brashear"); diff --git a/drivers/sbus/char/bbc_i2c.c b/drivers/sbus/char/bbc_i2c.c index 40d7a1fc69af..3e89c313e98d 100644 --- a/drivers/sbus/char/bbc_i2c.c +++ b/drivers/sbus/char/bbc_i2c.c @@ -425,12 +425,12 @@ static struct of_platform_driver bbc_i2c_driver = { static int __init bbc_i2c_init(void) { - return of_register_driver(&bbc_i2c_driver, &of_bus_type); + return of_register_platform_driver(&bbc_i2c_driver); } static void __exit bbc_i2c_exit(void) { - of_unregister_driver(&bbc_i2c_driver); + of_unregister_platform_driver(&bbc_i2c_driver); } module_init(bbc_i2c_init); diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index 7baf1b644039..8fd362e7fa65 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -277,12 +277,12 @@ static struct of_platform_driver d7s_driver = { static int __init d7s_init(void) { - return of_register_driver(&d7s_driver, &of_bus_type); + return of_register_platform_driver(&d7s_driver); } static void __exit d7s_exit(void) { - of_unregister_driver(&d7s_driver); + of_unregister_platform_driver(&d7s_driver); } module_init(d7s_init); diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index c8166ecf5276..2c76f700265b 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -1142,12 +1142,12 @@ static struct of_platform_driver envctrl_driver = { static int __init envctrl_init(void) { - return of_register_driver(&envctrl_driver, &of_bus_type); + return of_register_platform_driver(&envctrl_driver); } static void __exit envctrl_exit(void) { - of_unregister_driver(&envctrl_driver); + of_unregister_platform_driver(&envctrl_driver); } module_init(envctrl_init); diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c index 368d66294d83..d79f386c3486 100644 --- a/drivers/sbus/char/flash.c +++ b/drivers/sbus/char/flash.c @@ -218,12 +218,12 @@ static struct of_platform_driver flash_driver = { static int __init flash_init(void) { - return of_register_driver(&flash_driver, &of_bus_type); + return of_register_platform_driver(&flash_driver); } static void __exit flash_cleanup(void) { - of_unregister_driver(&flash_driver); + of_unregister_platform_driver(&flash_driver); } module_init(flash_init); diff --git a/drivers/sbus/char/uctrl.c b/drivers/sbus/char/uctrl.c index b8b40e9eca79..57f0612bb013 100644 --- a/drivers/sbus/char/uctrl.c +++ b/drivers/sbus/char/uctrl.c @@ -437,12 +437,12 @@ static struct of_platform_driver uctrl_driver = { static int __init uctrl_init(void) { - return of_register_driver(&uctrl_driver, &of_bus_type); + return 
of_register_platform_driver(&uctrl_driver); } static void __exit uctrl_exit(void) { - of_unregister_driver(&uctrl_driver); + of_unregister_platform_driver(&uctrl_driver); } module_init(uctrl_init); diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c index 3f5b5411e6bc..53d7ed0dc169 100644 --- a/drivers/scsi/qlogicpti.c +++ b/drivers/scsi/qlogicpti.c @@ -1467,12 +1467,12 @@ static struct of_platform_driver qpti_sbus_driver = { static int __init qpti_init(void) { - return of_register_driver(&qpti_sbus_driver, &of_bus_type); + return of_register_platform_driver(&qpti_sbus_driver); } static void __exit qpti_exit(void) { - of_unregister_driver(&qpti_sbus_driver); + of_unregister_platform_driver(&qpti_sbus_driver); } MODULE_DESCRIPTION("QlogicISP SBUS driver"); diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c index ddc221acd14c..89ba6fe02f80 100644 --- a/drivers/scsi/sun_esp.c +++ b/drivers/scsi/sun_esp.c @@ -644,12 +644,12 @@ static struct of_platform_driver esp_sbus_driver = { static int __init sunesp_init(void) { - return of_register_driver(&esp_sbus_driver, &of_bus_type); + return of_register_platform_driver(&esp_sbus_driver); } static void __exit sunesp_exit(void) { - of_unregister_driver(&esp_sbus_driver); + of_unregister_platform_driver(&esp_sbus_driver); } MODULE_DESCRIPTION("Sun ESP SCSI driver"); diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c index 36e244867dd8..a779e22d213e 100644 --- a/drivers/serial/sunhv.c +++ b/drivers/serial/sunhv.c @@ -644,12 +644,12 @@ static int __init sunhv_init(void) if (tlb_type != hypervisor) return -ENODEV; - return of_register_driver(&hv_driver, &of_bus_type); + return of_register_platform_driver(&hv_driver); } static void __exit sunhv_exit(void) { - of_unregister_driver(&hv_driver); + of_unregister_platform_driver(&hv_driver); } module_init(sunhv_init); diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c index 0a7dd6841ff8..9845fb1cfb1f 100644 --- a/drivers/serial/sunsab.c +++ b/drivers/serial/sunsab.c @@ -1130,12 +1130,12 @@ static int __init sunsab_init(void) } } - return of_register_driver(&sab_driver, &of_bus_type); + return of_register_platform_driver(&sab_driver); } static void __exit sunsab_exit(void) { - of_unregister_driver(&sab_driver); + of_unregister_platform_driver(&sab_driver); if (sunsab_reg.nr) { sunserial_unregister_minors(&sunsab_reg, sunsab_reg.nr); } diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c index 5deafc8180b5..3cdf74822db5 100644 --- a/drivers/serial/sunsu.c +++ b/drivers/serial/sunsu.c @@ -1586,7 +1586,7 @@ static int __init sunsu_init(void) return err; } - err = of_register_driver(&su_driver, &of_bus_type); + err = of_register_platform_driver(&su_driver); if (err && num_uart) sunserial_unregister_minors(&sunsu_reg, num_uart); diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c index fcbe20d48039..d1e6bcb59546 100644 --- a/drivers/serial/sunzilog.c +++ b/drivers/serial/sunzilog.c @@ -1576,7 +1576,7 @@ static int __init sunzilog_init(void) goto out_free_tables; } - err = of_register_driver(&zs_driver, &of_bus_type); + err = of_register_platform_driver(&zs_driver); if (err) goto out_unregister_uart; @@ -1604,7 +1604,7 @@ out: return err; out_unregister_driver: - of_unregister_driver(&zs_driver); + of_unregister_platform_driver(&zs_driver); out_unregister_uart: if (num_sunzilog) { @@ -1619,7 +1619,7 @@ out_free_tables: static void __exit sunzilog_exit(void) { - of_unregister_driver(&zs_driver); + of_unregister_platform_driver(&zs_driver); if (zilog_irq 
!= -1) { struct uart_sunzilog_port *up = sunzilog_irq_chain; diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c index 09f1b9b462f4..c7796637bafd 100644 --- a/drivers/video/bw2.c +++ b/drivers/video/bw2.c @@ -390,12 +390,12 @@ static int __init bw2_init(void) if (fb_get_options("bw2fb", NULL)) return -ENODEV; - return of_register_driver(&bw2_driver, &of_bus_type); + return of_register_platform_driver(&bw2_driver); } static void __exit bw2_exit(void) { - of_unregister_driver(&bw2_driver); + of_unregister_platform_driver(&bw2_driver); } module_init(bw2_init); diff --git a/drivers/video/cg14.c b/drivers/video/cg14.c index e5dc2241194f..d09fde8beb69 100644 --- a/drivers/video/cg14.c +++ b/drivers/video/cg14.c @@ -610,12 +610,12 @@ static int __init cg14_init(void) if (fb_get_options("cg14fb", NULL)) return -ENODEV; - return of_register_driver(&cg14_driver, &of_bus_type); + return of_register_platform_driver(&cg14_driver); } static void __exit cg14_exit(void) { - of_unregister_driver(&cg14_driver); + of_unregister_platform_driver(&cg14_driver); } module_init(cg14_init); diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c index 558d73a948a0..64aa29809fb9 100644 --- a/drivers/video/cg3.c +++ b/drivers/video/cg3.c @@ -477,12 +477,12 @@ static int __init cg3_init(void) if (fb_get_options("cg3fb", NULL)) return -ENODEV; - return of_register_driver(&cg3_driver, &of_bus_type); + return of_register_platform_driver(&cg3_driver); } static void __exit cg3_exit(void) { - of_unregister_driver(&cg3_driver); + of_unregister_platform_driver(&cg3_driver); } module_init(cg3_init); diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c index 480d761a27a8..2389a719dcc7 100644 --- a/drivers/video/cg6.c +++ b/drivers/video/cg6.c @@ -870,12 +870,12 @@ static int __init cg6_init(void) if (fb_get_options("cg6fb", NULL)) return -ENODEV; - return of_register_driver(&cg6_driver, &of_bus_type); + return of_register_platform_driver(&cg6_driver); } static void __exit cg6_exit(void) { - of_unregister_driver(&cg6_driver); + of_unregister_platform_driver(&cg6_driver); } module_init(cg6_init); diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c index 95c0227f47fc..f6ecfab296d3 100644 --- a/drivers/video/ffb.c +++ b/drivers/video/ffb.c @@ -1067,12 +1067,12 @@ static int __init ffb_init(void) if (fb_get_options("ffb", NULL)) return -ENODEV; - return of_register_driver(&ffb_driver, &of_bus_type); + return of_register_platform_driver(&ffb_driver); } static void __exit ffb_exit(void) { - of_unregister_driver(&ffb_driver); + of_unregister_platform_driver(&ffb_driver); } module_init(ffb_init); diff --git a/drivers/video/leo.c b/drivers/video/leo.c index 9e8bf7d5e249..ad677637ffbb 100644 --- a/drivers/video/leo.c +++ b/drivers/video/leo.c @@ -677,12 +677,12 @@ static int __init leo_init(void) if (fb_get_options("leofb", NULL)) return -ENODEV; - return of_register_driver(&leo_driver, &of_bus_type); + return of_register_platform_driver(&leo_driver); } static void __exit leo_exit(void) { - of_unregister_driver(&leo_driver); + of_unregister_platform_driver(&leo_driver); } module_init(leo_init); diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c index 6552751e81aa..688b055abab2 100644 --- a/drivers/video/p9100.c +++ b/drivers/video/p9100.c @@ -367,12 +367,12 @@ static int __init p9100_init(void) if (fb_get_options("p9100fb", NULL)) return -ENODEV; - return of_register_driver(&p9100_driver, &of_bus_type); + return of_register_platform_driver(&p9100_driver); } static void __exit p9100_exit(void) { - 
of_unregister_driver(&p9100_driver); + of_unregister_platform_driver(&p9100_driver); } module_init(p9100_init); diff --git a/drivers/video/sunxvr1000.c b/drivers/video/sunxvr1000.c index 489b44e8db81..7288934c0d49 100644 --- a/drivers/video/sunxvr1000.c +++ b/drivers/video/sunxvr1000.c @@ -213,12 +213,12 @@ static int __init gfb_init(void) if (fb_get_options("gfb", NULL)) return -ENODEV; - return of_register_driver(&gfb_driver, &of_bus_type); + return of_register_platform_driver(&gfb_driver); } static void __exit gfb_exit(void) { - of_unregister_driver(&gfb_driver); + of_unregister_platform_driver(&gfb_driver); } module_init(gfb_init); diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c index cc039b33d2d8..f375e0db6776 100644 --- a/drivers/video/tcx.c +++ b/drivers/video/tcx.c @@ -526,12 +526,12 @@ static int __init tcx_init(void) if (fb_get_options("tcxfb", NULL)) return -ENODEV; - return of_register_driver(&tcx_driver, &of_bus_type); + return of_register_platform_driver(&tcx_driver); } static void __exit tcx_exit(void) { - of_unregister_driver(&tcx_driver); + of_unregister_platform_driver(&tcx_driver); } module_init(tcx_init); diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index 8c03fd71693e..30a2512fd52e 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -688,12 +688,12 @@ static struct of_platform_driver cpwd_driver = { static int __init cpwd_init(void) { - return of_register_driver(&cpwd_driver, &of_bus_type); + return of_register_platform_driver(&cpwd_driver); } static void __exit cpwd_exit(void) { - of_unregister_driver(&cpwd_driver); + of_unregister_platform_driver(&cpwd_driver); } module_init(cpwd_init); diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c index 5dceeddc8859..4082b4ace1fc 100644 --- a/drivers/watchdog/riowd.c +++ b/drivers/watchdog/riowd.c @@ -250,12 +250,12 @@ static struct of_platform_driver riowd_driver = { static int __init riowd_init(void) { - return of_register_driver(&riowd_driver, &of_bus_type); + return of_register_platform_driver(&riowd_driver); } static void __exit riowd_exit(void) { - of_unregister_driver(&riowd_driver); + of_unregister_platform_driver(&riowd_driver); } module_init(riowd_init); diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 133ecf31a60f..429513ae8f6e 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -19,12 +19,6 @@ #include #include -/* - * of_platform_bus_type isn't it's own bus anymore. It's now just an alias - * for the platform bus. 
- */ -#define of_platform_bus_type platform_bus_type - extern const struct of_device_id of_default_bus_ids[]; /* diff --git a/sound/sparc/amd7930.c b/sound/sparc/amd7930.c index 43c63d441087..9eb1a4e0363b 100644 --- a/sound/sparc/amd7930.c +++ b/sound/sparc/amd7930.c @@ -1075,7 +1075,7 @@ static struct of_platform_driver amd7930_sbus_driver = { static int __init amd7930_init(void) { - return of_register_driver(&amd7930_sbus_driver, &of_bus_type); + return of_register_platform_driver(&amd7930_sbus_driver); } static void __exit amd7930_exit(void) @@ -1092,7 +1092,7 @@ static void __exit amd7930_exit(void) amd7930_list = NULL; - of_unregister_driver(&amd7930_sbus_driver); + of_unregister_platform_driver(&amd7930_sbus_driver); } module_init(amd7930_init); diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c index f7f05c246303..68570ee2c9bb 100644 --- a/sound/sparc/cs4231.c +++ b/sound/sparc/cs4231.c @@ -2120,12 +2120,12 @@ static struct of_platform_driver cs4231_driver = { static int __init cs4231_init(void) { - return of_register_driver(&cs4231_driver, &of_bus_type); + return of_register_platform_driver(&cs4231_driver); } static void __exit cs4231_exit(void) { - of_unregister_driver(&cs4231_driver); + of_unregister_platform_driver(&cs4231_driver); } module_init(cs4231_init); diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c index 491ce71c84b6..c421901c48d0 100644 --- a/sound/sparc/dbri.c +++ b/sound/sparc/dbri.c @@ -2699,12 +2699,12 @@ static struct of_platform_driver dbri_sbus_driver = { /* Probe for the dbri chip and then attach the driver. */ static int __init dbri_init(void) { - return of_register_driver(&dbri_sbus_driver, &of_bus_type); + return of_register_platform_driver(&dbri_sbus_driver); } static void __exit dbri_exit(void) { - of_unregister_driver(&dbri_sbus_driver); + of_unregister_platform_driver(&dbri_sbus_driver); } module_init(dbri_init); -- cgit v1.2.3 From 129ac799ad627b1e08382739f9e8cd75d7477fa3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 29 Jun 2010 09:26:53 -0700 Subject: of: remove asm/of_platform.h Only thing left in it is of_instantiate_rtc() which can be moved to asm/prom.h on PowerPC and is unused in microblaze. Signed-off-by: Grant Likely Acked-by: David S. Miller --- arch/microblaze/include/asm/of_platform.h | 19 ------------------- arch/powerpc/include/asm/of_platform.h | 16 ---------------- arch/powerpc/include/asm/prom.h | 2 ++ arch/sparc/include/asm/of_platform.h | 16 ---------------- include/linux/of_platform.h | 2 -- 5 files changed, 2 insertions(+), 53 deletions(-) delete mode 100644 arch/microblaze/include/asm/of_platform.h delete mode 100644 arch/powerpc/include/asm/of_platform.h delete mode 100644 arch/sparc/include/asm/of_platform.h (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_platform.h b/arch/microblaze/include/asm/of_platform.h deleted file mode 100644 index 353d8f651e30..000000000000 --- a/arch/microblaze/include/asm/of_platform.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _ASM_MICROBLAZE_OF_PLATFORM_H -#define _ASM_MICROBLAZE_OF_PLATFORM_H - -/* This is just here during the transition */ -#include - -extern void of_instantiate_rtc(void); - -#endif /* _ASM_MICROBLAZE_OF_PLATFORM_H */ diff --git a/arch/powerpc/include/asm/of_platform.h b/arch/powerpc/include/asm/of_platform.h deleted file mode 100644 index d506aa61db83..000000000000 --- a/arch/powerpc/include/asm/of_platform.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _ASM_POWERPC_OF_PLATFORM_H -#define _ASM_POWERPC_OF_PLATFORM_H -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -extern void of_instantiate_rtc(void); - -#endif /* _ASM_POWERPC_OF_PLATFORM_H */ diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index f864722679e8..da7dd634e7ce 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -117,5 +117,7 @@ extern const void *of_get_mac_address(struct device_node *np); struct pci_dev; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); +extern void of_instantiate_rtc(void); + #endif /* __KERNEL__ */ #endif /* _POWERPC_PROM_H */ diff --git a/arch/sparc/include/asm/of_platform.h b/arch/sparc/include/asm/of_platform.h deleted file mode 100644 index 26540ddfc511..000000000000 --- a/arch/sparc/include/asm/of_platform.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef ___ASM_SPARC_OF_PLATFORM_H -#define ___ASM_SPARC_OF_PLATFORM_H -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp. - * - * Modified for Sparc by merging parts of asm/of_device.h - * by Stephen Rothwell - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#endif diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 429513ae8f6e..a79f59bf61a0 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -52,8 +52,6 @@ extern void of_unregister_platform_driver(struct of_platform_driver *drv); extern struct of_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); -#include - extern struct of_device *of_find_device_by_node(struct device_node *np); extern int of_bus_type_init(struct bus_type *bus, const char *name); -- cgit v1.2.3 From 295960429675e17ec658320ebb24385727032bed Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 29 Jun 2010 11:15:54 -0600 Subject: of: remove asm/of_device.h It is mostly unused now. Sparc has a few defines left in it, but they can be moved to other headers. Removing this header means that new architectures adding CONFIG_OF support don't need to also add this header file. Signed-off-by: Grant Likely Acked-by: David S. 
Miller --- arch/microblaze/include/asm/of_device.h | 13 ------------- arch/powerpc/include/asm/of_device.h | 3 --- arch/sparc/include/asm/device.h | 2 ++ arch/sparc/include/asm/of_device.h | 19 ------------------- arch/sparc/include/asm/prom.h | 4 ++++ include/linux/of_device.h | 2 +- 6 files changed, 7 insertions(+), 36 deletions(-) delete mode 100644 arch/microblaze/include/asm/of_device.h delete mode 100644 arch/powerpc/include/asm/of_device.h delete mode 100644 arch/sparc/include/asm/of_device.h (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h deleted file mode 100644 index 47e8d42aee8f..000000000000 --- a/arch/microblaze/include/asm/of_device.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (C) 2007-2008 Michal Simek - * - * based on PowerPC of_device.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#ifndef _ASM_MICROBLAZE_OF_DEVICE_H -#define _ASM_MICROBLAZE_OF_DEVICE_H -#endif /* _ASM_MICROBLAZE_OF_DEVICE_H */ diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h deleted file mode 100644 index 04f76717f82c..000000000000 --- a/arch/powerpc/include/asm/of_device.h +++ /dev/null @@ -1,3 +0,0 @@ -#ifndef _ASM_POWERPC_OF_DEVICE_H -#define _ASM_POWERPC_OF_DEVICE_H -#endif /* _ASM_POWERPC_OF_DEVICE_H */ diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h index fb220e482039..daa6a8a5e9cd 100644 --- a/arch/sparc/include/asm/device.h +++ b/arch/sparc/include/asm/device.h @@ -19,6 +19,8 @@ struct dev_archdata { int numa_node; }; +extern void of_propagate_archdata(struct platform_device *bus); + struct pdev_archdata { struct resource resource[PROMREG_MAX]; unsigned int irqs[PROMINTR_MAX]; diff --git a/arch/sparc/include/asm/of_device.h b/arch/sparc/include/asm/of_device.h deleted file mode 100644 index 22b9828fe693..000000000000 --- a/arch/sparc/include/asm/of_device.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _ASM_SPARC_OF_DEVICE_H -#define _ASM_SPARC_OF_DEVICE_H -#ifdef __KERNEL__ - -#include -#include -#include -#include - -extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name); -extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size); - -extern void of_propagate_archdata(struct of_device *bus); - -/* This is just here during the transition */ -#include - -#endif /* __KERNEL__ */ -#endif /* _ASM_SPARC_OF_DEVICE_H */ diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index ac695742df85..d35df5ace18a 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -51,6 +51,10 @@ extern void prom_build_devicetree(void); extern void of_populate_present_mask(void); extern void of_fill_in_cpu_data(void); +struct resource; +extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name); +extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size); + /* These routines are here to provide compatibility with how powerpc * handles IRQ mapping for OF device nodes. 
We precompute and permanently * register them in the of_device objects, whereas powerpc computes them diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8cd1fe7864e3..0f1911994559 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -16,12 +16,12 @@ */ #define of_device platform_device #include +#include /* temporary until merge */ #ifdef CONFIG_OF_DEVICE #include #include #include -#include #define to_of_device(d) container_of(d, struct of_device, dev) -- cgit v1.2.3 From 94a0cb1fc61ab7a0d47d268a7764374efeb2160b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 22 Jul 2010 13:59:23 -0600 Subject: of/device: Replace of_device with platform_device in includes and core code of_device is currently just an #define alias to platform_device until it gets removed entirely. This patch removes references to it from the include directories and the core drivers/of code. Signed-off-by: Grant Likely Acked-by: David S. Miller --- arch/powerpc/include/asm/macio.h | 2 +- arch/sparc/include/asm/floppy_64.h | 6 +++--- arch/sparc/include/asm/parport.h | 4 ++-- drivers/of/device.c | 22 +++++++++++----------- drivers/of/platform.c | 24 ++++++++++++------------ include/linux/of_device.h | 10 +++++----- include/linux/of_platform.h | 16 ++++++++-------- 7 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h index 675e159b5ef4..7ab82c825a03 100644 --- a/arch/powerpc/include/asm/macio.h +++ b/arch/powerpc/include/asm/macio.h @@ -38,7 +38,7 @@ struct macio_dev { struct macio_bus *bus; /* macio bus this device is on */ struct macio_dev *media_bay; /* Device is part of a media bay */ - struct of_device ofdev; + struct platform_device ofdev; struct device_dma_parameters dma_parms; /* ide needs that */ int n_resources; struct resource resource[MACIO_DEV_COUNT_RESOURCES]; diff --git a/arch/sparc/include/asm/floppy_64.h b/arch/sparc/include/asm/floppy_64.h index 4f5bde638f72..6597ce874d78 100644 --- a/arch/sparc/include/asm/floppy_64.h +++ b/arch/sparc/include/asm/floppy_64.h @@ -43,7 +43,7 @@ struct sun_flpy_controller { /* You'll only ever find one controller on an Ultra anyways. 
*/ static struct sun_flpy_controller *sun_fdc = (struct sun_flpy_controller *)-1; unsigned long fdc_status; -static struct of_device *floppy_op = NULL; +static struct platform_device *floppy_op = NULL; struct sun_floppy_ops { unsigned char (*fd_inb) (unsigned long port); @@ -548,7 +548,7 @@ static unsigned long __init sun_floppy_init(void) { static int initialized = 0; struct device_node *dp; - struct of_device *op; + struct platform_device *op; const char *prop; char state[128]; @@ -661,7 +661,7 @@ static unsigned long __init sun_floppy_init(void) config = 0; for (dp = ebus_dp->child; dp; dp = dp->sibling) { if (!strcmp(dp->name, "ecpp")) { - struct of_device *ecpp_op; + struct platform_device *ecpp_op; ecpp_op = of_find_device_by_node(dp); if (ecpp_op) diff --git a/arch/sparc/include/asm/parport.h b/arch/sparc/include/asm/parport.h index 4891fbce1114..4f7afa01b2ae 100644 --- a/arch/sparc/include/asm/parport.h +++ b/arch/sparc/include/asm/parport.h @@ -103,7 +103,7 @@ static inline unsigned int get_dma_residue(unsigned int dmanr) return ebus_dma_residue(&sparc_ebus_dmas[dmanr].info); } -static int __devinit ecpp_probe(struct of_device *op, const struct of_device_id *match) +static int __devinit ecpp_probe(struct platform_device *op, const struct of_device_id *match) { unsigned long base = op->resource[0].start; unsigned long config = op->resource[1].start; @@ -192,7 +192,7 @@ out_err: return err; } -static int __devexit ecpp_remove(struct of_device *op) +static int __devexit ecpp_remove(struct platform_device *op) { struct parport *p = dev_get_drvdata(&op->dev); int slot = p->dma; diff --git a/drivers/of/device.c b/drivers/of/device.c index 12a44b493511..0d8a0644f540 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -26,7 +26,7 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches, } EXPORT_SYMBOL(of_match_device); -struct of_device *of_dev_get(struct of_device *dev) +struct platform_device *of_dev_get(struct platform_device *dev) { struct device *tmp; @@ -34,13 +34,13 @@ struct of_device *of_dev_get(struct of_device *dev) return NULL; tmp = get_device(&dev->dev); if (tmp) - return to_of_device(tmp); + return to_platform_device(tmp); else return NULL; } EXPORT_SYMBOL(of_dev_get); -void of_dev_put(struct of_device *dev) +void of_dev_put(struct platform_device *dev) { if (dev) put_device(&dev->dev); @@ -50,18 +50,18 @@ EXPORT_SYMBOL(of_dev_put); static ssize_t devspec_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct of_device *ofdev; + struct platform_device *ofdev; - ofdev = to_of_device(dev); + ofdev = to_platform_device(dev); return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name); } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct of_device *ofdev; + struct platform_device *ofdev; - ofdev = to_of_device(dev); + ofdev = to_platform_device(dev); return sprintf(buf, "%s\n", ofdev->dev.of_node->name); } @@ -90,15 +90,15 @@ struct device_attribute of_platform_device_attrs[] = { */ void of_release_dev(struct device *dev) { - struct of_device *ofdev; + struct platform_device *ofdev; - ofdev = to_of_device(dev); + ofdev = to_platform_device(dev); of_node_put(ofdev->dev.of_node); kfree(ofdev); } EXPORT_SYMBOL(of_release_dev); -int of_device_register(struct of_device *ofdev) +int of_device_register(struct platform_device *ofdev) { BUG_ON(ofdev->dev.of_node == NULL); @@ -119,7 +119,7 @@ int of_device_register(struct of_device *ofdev) } EXPORT_SYMBOL(of_device_register); -void 
of_device_unregister(struct of_device *ofdev) +void of_device_unregister(struct platform_device *ofdev) { device_unregister(&ofdev->dev); } diff --git a/drivers/of/platform.c b/drivers/of/platform.c index f3f1ec81ef48..9f3840cdcde5 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -94,11 +94,11 @@ static int of_platform_device_probe(struct device *dev) { int error = -ENODEV; struct of_platform_driver *drv; - struct of_device *of_dev; + struct platform_device *of_dev; const struct of_device_id *match; drv = to_of_platform_driver(dev->driver); - of_dev = to_of_device(dev); + of_dev = to_platform_device(dev); if (!drv->probe) return error; @@ -116,7 +116,7 @@ static int of_platform_device_probe(struct device *dev) static int of_platform_device_remove(struct device *dev) { - struct of_device *of_dev = to_of_device(dev); + struct platform_device *of_dev = to_platform_device(dev); struct of_platform_driver *drv = to_of_platform_driver(dev->driver); if (dev->driver && drv->remove) @@ -126,7 +126,7 @@ static int of_platform_device_remove(struct device *dev) static void of_platform_device_shutdown(struct device *dev) { - struct of_device *of_dev = to_of_device(dev); + struct platform_device *of_dev = to_platform_device(dev); struct of_platform_driver *drv = to_of_platform_driver(dev->driver); if (dev->driver && drv->shutdown) @@ -137,7 +137,7 @@ static void of_platform_device_shutdown(struct device *dev) static int of_platform_legacy_suspend(struct device *dev, pm_message_t mesg) { - struct of_device *of_dev = to_of_device(dev); + struct platform_device *of_dev = to_platform_device(dev); struct of_platform_driver *drv = to_of_platform_driver(dev->driver); int ret = 0; @@ -148,7 +148,7 @@ static int of_platform_legacy_suspend(struct device *dev, pm_message_t mesg) static int of_platform_legacy_resume(struct device *dev) { - struct of_device *of_dev = to_of_device(dev); + struct platform_device *of_dev = to_platform_device(dev); struct of_platform_driver *drv = to_of_platform_driver(dev->driver); int ret = 0; @@ -543,11 +543,11 @@ static void of_device_make_bus_id(struct device *dev) * @bus_id: Name to assign to the device. May be null to use default name. * @parent: Parent device. */ -struct of_device *of_device_alloc(struct device_node *np, +struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent) { - struct of_device *dev; + struct platform_device *dev; int rc, i, num_reg = 0, num_irq = 0; struct resource *res, temp_res; @@ -600,11 +600,11 @@ EXPORT_SYMBOL(of_device_alloc); * @bus_id: name to assign device * @parent: Linux device model parent device. 
*/ -struct of_device *of_platform_device_create(struct device_node *np, +struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent) { - struct of_device *dev; + struct platform_device *dev; dev = of_device_alloc(np, bus_id, parent); if (!dev) @@ -642,7 +642,7 @@ static int of_platform_bus_create(const struct device_node *bus, struct device *parent) { struct device_node *child; - struct of_device *dev; + struct platform_device *dev; int rc = 0; for_each_child_of_node(bus, child) { @@ -678,7 +678,7 @@ int of_platform_bus_probe(struct device_node *root, struct device *parent) { struct device_node *child; - struct of_device *dev; + struct platform_device *dev; int rc = 0; if (matches == NULL) diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 0f1911994559..e11a0be7893a 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -39,14 +39,14 @@ static inline int of_driver_match_device(const struct device *dev, return of_match_device(drv->of_match_table, dev) != NULL; } -extern struct of_device *of_dev_get(struct of_device *dev); -extern void of_dev_put(struct of_device *dev); +extern struct platform_device *of_dev_get(struct platform_device *dev); +extern void of_dev_put(struct platform_device *dev); -extern int of_device_register(struct of_device *ofdev); -extern void of_device_unregister(struct of_device *ofdev); +extern int of_device_register(struct platform_device *ofdev); +extern void of_device_unregister(struct platform_device *ofdev); extern void of_release_dev(struct device *dev); -static inline void of_device_free(struct of_device *dev) +static inline void of_device_free(struct platform_device *dev) { of_release_dev(&dev->dev); } diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a79f59bf61a0..b24c5a5b0426 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -27,13 +27,13 @@ extern const struct of_device_id of_default_bus_ids[]; */ struct of_platform_driver { - int (*probe)(struct of_device* dev, + int (*probe)(struct platform_device* dev, const struct of_device_id *match); - int (*remove)(struct of_device* dev); + int (*remove)(struct platform_device* dev); - int (*suspend)(struct of_device* dev, pm_message_t state); - int (*resume)(struct of_device* dev); - int (*shutdown)(struct of_device* dev); + int (*suspend)(struct platform_device* dev, pm_message_t state); + int (*resume)(struct platform_device* dev); + int (*shutdown)(struct platform_device* dev); struct device_driver driver; struct platform_driver platform_driver; @@ -49,16 +49,16 @@ extern void of_unregister_driver(struct of_platform_driver *drv); extern int of_register_platform_driver(struct of_platform_driver *drv); extern void of_unregister_platform_driver(struct of_platform_driver *drv); -extern struct of_device *of_device_alloc(struct device_node *np, +extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); -extern struct of_device *of_find_device_by_node(struct device_node *np); +extern struct platform_device *of_find_device_by_node(struct device_node *np); extern int of_bus_type_init(struct bus_type *bus, const char *name); #if !defined(CONFIG_SPARC) /* SPARC has its own device registration method */ /* Platform devices and busses creation */ -extern struct of_device *of_platform_device_create(struct device_node *np, +extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, 
struct device *parent); -- cgit v1.2.3 From c0dd394ca5e78649b7013c3ce2d6338af9f228f0 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Fri, 23 Jul 2010 20:19:24 +0200 Subject: of: remove of_default_bus_ids This list was used by only two platforms, with all other platforms defining their own list of valid bus ids to pass to of_platform_bus_probe. This patch: i) copies the default list to the two platforms that depended on it (powerpc) ii) removes the usage of of_default_bus_ids in of_platform_bus_probe iii) removes the definition of the list from all architectures that defined it Passing a NULL 'matches' parameter to of_platform_bus_probe is still valid; the function returns no error in that case as the NULL value is equivalent to an empty list. Signed-off-by: Jonas Bonn [grant.likely@secretlab.ca: added __initdata annotations, warn on and return error on missing match table, and fix whitespace errors] Signed-off-by: Grant Likely --- arch/microblaze/kernel/Makefile | 2 +- arch/microblaze/kernel/of_platform.c | 49 ------------------------------- arch/powerpc/kernel/of_platform.c | 24 --------------- arch/powerpc/platforms/cell/qpace_setup.c | 14 ++++++++- arch/powerpc/platforms/cell/setup.c | 14 ++++++++- drivers/of/platform.c | 4 +-- include/linux/of_platform.h | 2 -- 7 files changed, 28 insertions(+), 81 deletions(-) delete mode 100644 arch/microblaze/kernel/of_platform.c (limited to 'include/linux') diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile index 727e2cbff9c6..7fcc5f7b5a46 100644 --- a/arch/microblaze/kernel/Makefile +++ b/arch/microblaze/kernel/Makefile @@ -16,7 +16,7 @@ extra-y := head.o vmlinux.lds obj-y += dma.o exceptions.o \ hw_exception_handler.o init_task.o intc.o irq.o \ - of_platform.o process.o prom.o prom_parse.o ptrace.o \ + process.o prom.o prom_parse.o ptrace.o \ setup.o signal.o sys_microblaze.o timer.o traps.o reset.o obj-y += cpu/ diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c deleted file mode 100644 index 6cffadbe2fcd..000000000000 --- a/arch/microblaze/kernel/of_platform.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp. - * - * and Arnd Bergmann, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * The list of OF IDs below is used for matching bus types in the - * system whose devices are to be exposed as of_platform_devices. - * - * This is the default list valid for most platforms. 
This file provides - * functions who can take an explicit list if necessary though - * - * The search is always performed recursively looking for children of - * the provided device_node and recursively if such a children matches - * a bus type in the list - */ - -const struct of_device_id of_default_bus_ids[] = { - { .type = "soc", }, - { .compatible = "soc", }, - { .type = "plb5", }, - { .type = "plb4", }, - { .type = "opb", }, - { .type = "simple", }, - {}, -}; diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 760a7af7fdb5..b2c363ef38ad 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -28,30 +28,6 @@ #include #include -/* - * The list of OF IDs below is used for matching bus types in the - * system whose devices are to be exposed as of_platform_devices. - * - * This is the default list valid for most platforms. This file provides - * functions who can take an explicit list if necessary though - * - * The search is always performed recursively looking for children of - * the provided device_node and recursively if such a children matches - * a bus type in the list - */ - -const struct of_device_id of_default_bus_ids[] = { - { .type = "soc", }, - { .compatible = "soc", }, - { .type = "spider", }, - { .type = "axon", }, - { .type = "plb5", }, - { .type = "plb4", }, - { .type = "opb", }, - { .type = "ebc", }, - {}, -}; - #ifdef CONFIG_PPC_OF_PLATFORM_PCI /* The probing of PCI controllers from of_platform is currently diff --git a/arch/powerpc/platforms/cell/qpace_setup.c b/arch/powerpc/platforms/cell/qpace_setup.c index c5ce02e84c8e..1b5749042756 100644 --- a/arch/powerpc/platforms/cell/qpace_setup.c +++ b/arch/powerpc/platforms/cell/qpace_setup.c @@ -61,12 +61,24 @@ static void qpace_progress(char *s, unsigned short hex) printk("*** %04x : %s\n", hex, s ? s : ""); } +static const struct of_device_id qpace_bus_ids[] __initdata = { + { .type = "soc", }, + { .compatible = "soc", }, + { .type = "spider", }, + { .type = "axon", }, + { .type = "plb5", }, + { .type = "plb4", }, + { .type = "opb", }, + { .type = "ebc", }, + {}, +}; + static int __init qpace_publish_devices(void) { int node; /* Publish OF platform devices for southbridge IOs */ - of_platform_bus_probe(NULL, NULL, NULL); + of_platform_bus_probe(NULL, qpace_bus_ids, NULL); /* There is no device for the MIC memory controller, thus we create * a platform device for it to attach the EDAC driver to. 
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index 50385db586bd..691995761b3d 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -141,6 +141,18 @@ static int __devinit cell_setup_phb(struct pci_controller *phb) return 0; } +static const struct of_device_id cell_bus_ids[] __initdata = { + { .type = "soc", }, + { .compatible = "soc", }, + { .type = "spider", }, + { .type = "axon", }, + { .type = "plb5", }, + { .type = "plb4", }, + { .type = "opb", }, + { .type = "ebc", }, + {}, +}; + static int __init cell_publish_devices(void) { struct device_node *root = of_find_node_by_path("/"); @@ -148,7 +160,7 @@ static int __init cell_publish_devices(void) int node; /* Publish OF platform devices for southbridge IOs */ - of_platform_bus_probe(NULL, NULL, NULL); + of_platform_bus_probe(NULL, cell_bus_ids, NULL); /* On spider based blades, we need to manually create the OF * platform devices for the PCI host bridges diff --git a/drivers/of/platform.c b/drivers/of/platform.c index f79f40b516c6..033a224a9fda 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -701,9 +701,7 @@ int of_platform_bus_probe(struct device_node *root, struct platform_device *dev; int rc = 0; - if (matches == NULL) - matches = of_default_bus_ids; - if (matches == OF_NO_DEEP_PROBE) + if (WARN_ON(!matches || matches == OF_NO_DEEP_PROBE)) return -EINVAL; if (root == NULL) root = of_find_node_by_path("/"); diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index b24c5a5b0426..4e6d989c06df 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -19,8 +19,6 @@ #include #include -extern const struct of_device_id of_default_bus_ids[]; - /* * An of_platform_driver driver is attached to a basic of_device on * the "platform bus" (platform_bus_type). -- cgit v1.2.3 From 6cda9fa2575ec0869fe77b0bdf295c0e51868cab Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 25 Jul 2010 20:39:03 +0900 Subject: nilfs2: avoid rec_len overflow with 64KB block size With 64KB blocksize, a directory entry can have size 64KB which does not fit into 16 bits we have for entry length. So this patch stores 0xffff instead and converts value when read from / written to disk. Nilfs derives its directory implementation from ext2 filesystem, and this draws upon the corresponding change on ext2. 
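The overflow itself is easy to see in a standalone sketch (illustration only, not part of this patch): with a 64KB block size an empty directory chunk has rec_len == 65536, one more than a 16-bit field can hold, so a naive store truncates it to 0; 0xffff is therefore reserved as an on-disk escape value meaning 1 << 16.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int chunk = 64 * 1024;		/* one directory chunk, 64KB blocks */
	uint16_t on_disk = (uint16_t)chunk;	/* naive store into a 16-bit rec_len */

	/* 65536 truncates to 0 in 16 bits, hence the 0xffff escape below */
	printf("rec_len %u naively stored as %u\n", chunk, (unsigned int)on_disk);

	/* the escape encoding used by this patch: 0xffff on disk means 1 << 16 */
	uint16_t escaped = 0xffff;
	unsigned int decoded = (escaped == 0xffff) ? (1U << 16) : escaped;
	printf("escape 0x%x decodes to %u\n", (unsigned int)escaped, decoded);
	return 0;
}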
Signed-off-by: Ryusuke Konishi --- fs/nilfs2/dir.c | 26 ++++++++++++++------------ include/linux/nilfs2_fs.h | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index d8d183e6d095..b60277b44468 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page) } for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { p = (struct nilfs_dir_entry *)(kaddr + offs); - rec_len = le16_to_cpu(p->rec_len); + rec_len = nilfs_rec_len_from_disk(p->rec_len); if (rec_len < NILFS_DIR_REC_LEN(1)) goto Eshort; @@ -235,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) */ static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) { - return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); } static unsigned char @@ -326,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto success; } } - filp->f_pos += le16_to_cpu(de->rec_len); + filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } @@ -441,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + le16_to_cpu(de->rec_len); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -497,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) /* We hit i_size */ name_len = 0; rec_len = chunk_size; - de->rec_len = cpu_to_le16(chunk_size); + de->rec_len = nilfs_rec_len_to_disk(chunk_size); de->inode = 0; goto got_it; } @@ -511,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) if (nilfs_match(namelen, name, de)) goto out_unlock; name_len = NILFS_DIR_REC_LEN(de->name_len); - rec_len = le16_to_cpu(de->rec_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); if (!de->inode && rec_len >= reclen) goto got_it; if (rec_len >= name_len + reclen) @@ -534,8 +535,8 @@ got_it: struct nilfs_dir_entry *de1; de1 = (struct nilfs_dir_entry *)((char *)de + name_len); - de1->rec_len = cpu_to_le16(rec_len - name_len); - de->rec_len = cpu_to_le16(name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); de = de1; } de->name_len = namelen; @@ -566,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); struct nilfs_dir_entry *pde = NULL; struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); int err; @@ -587,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) err = nilfs_prepare_chunk(page, mapping, from, to); BUG_ON(err); if (pde) - pde->rec_len = cpu_to_le16(to - from); + pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -621,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memset(kaddr, 0, 
chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; - de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); memcpy(de->name, ".\0\0", 4); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); de->name_len = 2; - de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 7dd4cd494490..970828a5ffc5 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -326,7 +326,25 @@ enum { #define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) #define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ ~NILFS_DIR_ROUND) +#define NILFS_MAX_REC_LEN ((1<<16)-1) +static inline unsigned nilfs_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == NILFS_MAX_REC_LEN) + return 1 << 16; + return len; +} + +static inline __le16 nilfs_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(NILFS_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} /** * struct nilfs_finfo - file information -- cgit v1.2.3 From 89c0fd014d34d409a7b196667c2b9a4813b6c968 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 25 Jul 2010 22:44:53 +0900 Subject: nilfs2: reject filesystem with unsupported block size This inserts a sanity check that refuses to mount a filesystem with an unsupported block size. Previously, the nilfs kernel code checked only device limitations, even though mkfs.nilfs2 limits the range of block sizes; there was no check preventing rec_len overflow with larger block sizes. With this change, block sizes larger than 64KB or smaller than 1KB are rejected explicitly by the kernel. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 9 ++++++++- include/linux/nilfs2_fs.h | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index da67b560f3c3..37de1f062d81 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -671,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } - blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { printk(KERN_ERR "NILFS: unable to set blocksize\n"); err = -EINVAL; @@ -690,6 +690,13 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto failed_sbh; blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } if (sb->s_blocksize != blocksize) { int hw_blocksize = bdev_logical_block_size(sb->s_bdev); diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 970828a5ffc5..f5487b6f91ed 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -286,6 +286,12 @@ struct nilfs_super_block { #define NILFS_NAME_LEN 255 +/* + * Block size limitations + */ +#define NILFS_MIN_BLOCK_SIZE 1024 +#define NILFS_MAX_BLOCK_SIZE 65536 + /* * The new version of the directory entry. 
Since V0 structures are * stored in intel byte order, and the name_len field could never be -- cgit v1.2.3 From ce3bf7ab22527183634a76512d9854a38615e4d5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:19 -0700 Subject: time: Implement timespec_add After accidentally misusing timespec_add_safe, I wanted to make sure we don't accidentally trip over that issue again, so I created a simple timespec_add() function which we can use to replace the instances of timespec_add_safe() that don't want the overflow detection. Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-3-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 16 ++++++++++++++++ kernel/time/timekeeping.c | 6 +++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index ea3559f0b3f2..9072df83de1f 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -76,9 +76,25 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int min, const unsigned int sec); extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); + +/* + * timespec_add_safe assumes both values are positive and checks + * for overflow. It will return TIME_T_MAX if the return would be + * smaller than either of the arguments. + */ extern struct timespec timespec_add_safe(const struct timespec lhs, const struct timespec rhs); + +static inline struct timespec timespec_add(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + return ts_delta; +} + /* * sub = lhs - rhs, in normalized form */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..623fe3d504dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -579,9 +579,9 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -887,7 +887,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); -- cgit v1.2.3 From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:23 -0700 Subject: timekeeping: Fix update_vsyscall to provide wall_to_monotonic offset update_vsyscall() did not provide the wall_to_monotonic offset, so arch specific implementations tend to reference wall_to_monotonic directly. 
This limits future cleanups in the timekeeping core, so this patch fixes the update_vsyscall interface to provide wall_to_monotonic, allowing wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: Anton Blanchard Cc: Paul Mackerras Cc: Tony Luck LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/time.c | 7 ++++--- arch/powerpc/kernel/time.c | 8 ++++---- arch/s390/kernel/time.c | 8 ++++---- arch/x86/kernel/vsyscall_64.c | 6 +++--- include/linux/clocksource.h | 6 ++++-- kernel/time/timekeeping.c | 9 ++++++--- 6 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 653b3c46ea82..ed6f22eb5b12 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -471,7 +471,8 @@ void update_vsyscall_tz(void) { } -void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) +void update_vsyscall(struct timespec *wall, struct timespec *wtm, + struct clocksource *c, u32 mult) { unsigned long flags; @@ -487,9 +488,9 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) /* copy kernel time structures */ fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec; fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec; - fsyscall_gtod_data.monotonic_time.tv_sec = wall_to_monotonic.tv_sec + fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec + wall->tv_sec; - fsyscall_gtod_data.monotonic_time.tv_nsec = wall_to_monotonic.tv_nsec + fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec + wall->tv_nsec; /* normalize */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0711d60f40b0..e215f76bba1c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -849,8 +849,8 @@ static cycle_t timebase_read(struct clocksource *cs) return (cycle_t)get_tb(); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { u64 new_tb_to_xs, new_stamp_xsec; @@ -882,8 +882,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vdso_data->tb_orig_stamp = clock->cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + vdso_data->wtom_clock_sec = wtm->tv_sec; + vdso_data->wtom_clock_nsec = wtm->tv_nsec; vdso_data->stamp_xtime = *wall_time; smp_wmb(); ++(vdso_data->tb_update_count); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index a2163c95eb98..aeb30c6f279c 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -207,8 +207,8 @@ struct clocksource * __init clocksource_default_clock(void) return &clocksource_tod; } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { if (clock != &clocksource_tod) return; @@ -219,8 +219,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vdso_data->xtime_tod_stamp = clock->cycle_last; vdso_data->xtime_clock_sec = wall_time->tv_sec; vdso_data->xtime_clock_nsec = wall_time->tv_nsec; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - 
vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + vdso_data->wtom_clock_sec = wtm->tv_sec; + vdso_data->wtom_clock_nsec = wtm->tv_nsec; vdso_data->ntp_mult = mult; smp_wmb(); ++vdso_data->tb_update_count; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dce0c3c5a787..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 5ea3c60c160c..21677d99a161 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -313,11 +313,13 @@ clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec) #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void -update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult); +update_vsyscall(struct timespec *ts, struct timespec *wtm, + struct clocksource *c, u32 mult); extern void update_vsyscall_tz(void); #else static inline void -update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult) +update_vsyscall(struct timespec *ts, struct timespec *wtm, + struct clocksource *c, u32 mult) { } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 73edd4074b50..b15c3acafd5a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -170,7 +170,8 @@ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -326,7 +327,8 @@ int do_settimeofday(struct timespec *tv) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -809,7 +811,8 @@ void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** -- cgit v1.2.3 From 8ab4351a4c888016620f43bde605b3d0964af339 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:25 -0700 Subject: hrtimer: Cleanup direct access to wall_to_monotonic Provides an accessor function to replace hrtimer.c's direct access of wall_to_monotonic. 
This will allow wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-9-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 3 ++- kernel/hrtimer.c | 9 ++++----- kernel/time/timekeeping.c | 5 +++++ 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 9072df83de1f..a57e0f67b3d0 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -126,7 +126,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); -struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec __current_kernel_time(void); /* does not take xtime_lock */ +struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */ struct timespec get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..809f48c70553 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); xts = __current_kernel_time(); - tom = wall_to_monotonic; + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -612,7 +612,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +620,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b15c3acafd5a..fb61c2ed3660 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -858,6 +858,11 @@ struct timespec __current_kernel_time(void) return xtime; } +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; +} + struct timespec current_kernel_time(void) { struct timespec now; -- cgit v1.2.3 From 0fb86b06298b6cd3205cac2e68a499f269282dac Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:26 -0700 Subject: timekeeping: Make xtime and wall_to_monotonic static This patch makes xtime and wall_to_monotonic static, as planned in Documentation/feature-removal-schedule.txt. This will allow for further cleanups to the timekeeping core. 
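Code outside the timekeeping core now has to read these values through the accessors; a minimal sketch of the read pattern (a kernel-context fragment, not a standalone program, mirroring the hrtimer_get_softirq_time() change in the previous patch):

	struct timespec xts, tom;
	unsigned long seq;

	/* snapshot wall time and the wall->monotonic offset consistently,
	 * without touching the now-static xtime / wall_to_monotonic */
	do {
		seq = read_seqbegin(&xtime_lock);
		xts = __current_kernel_time();		/* xtime snapshot, xtime_lock not taken */
		tom = __get_wall_to_monotonic();	/* offset snapshot, xtime_lock not taken */
	} while (read_seqretry(&xtime_lock, seq));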
Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-10-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 10 ---------- include/linux/time.h | 2 -- kernel/time/timekeeping.c | 4 ++-- 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 1571c0c83dba..cd648dbb5146 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -549,16 +549,6 @@ Who: Avi Kivity ---------------------------- -What: xtime, wall_to_monotonic -When: 2.6.36+ -Files: kernel/time/timekeeping.c include/linux/time.h -Why: Cleaning up timekeeping internal values. Please use - existing timekeeping accessor functions to access - the equivalent functionality. -Who: John Stultz - ----------------------------- - What: KVM kernel-allocated memory slots When: July 2010 Why: Since 2.6.25, kvm supports user-allocated memory slots, which are diff --git a/include/linux/time.h b/include/linux/time.h index a57e0f67b3d0..cb34e35fabac 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs, #define timespec_valid(ts) \ (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) -extern struct timespec xtime; -extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; extern void read_persistent_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb61c2ed3660..e14c839e9faa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* -- cgit v1.2.3 From 852db46d55e85b475a72e665ca08d3317769ceef Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:28 -0700 Subject: clocksource: Add __clocksource_updatefreq_hz/khz methods To properly handle clocksources that change frequencies at the clocksource->enable() point, this patch adds a method that will update the clocksource's mult/shift and max_idle_ns values. 
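The intended call site is a clocksource's ->enable() hook; a hypothetical driver sketch follows (all foo_* names are invented for illustration, and only __clocksource_updatefreq_hz() comes from this patch):

/* Hypothetical sketch: a clocksource whose input clock rate is only
 * known once the hardware is enabled. All foo_* names are made up.
 */
static cycle_t foo_clocksource_read(struct clocksource *cs)
{
	return (cycle_t)foo_read_counter();	/* assumed hardware counter read */
}

static int foo_clocksource_enable(struct clocksource *cs)
{
	u32 rate = foo_read_rate_hz();		/* assumed hardware rate query */

	/* recompute mult/shift and max_idle_ns now that the rate is known */
	__clocksource_updatefreq_hz(cs, rate);
	return 0;
}

static struct clocksource foo_clocksource = {
	.name	= "foo",
	.rating	= 200,
	.read	= foo_clocksource_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.enable	= foo_clocksource_enable,
};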
Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-12-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 11 +++++++++++ kernel/time/clocksource.c | 29 ++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 21677d99a161..c37b21ad5a3b 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -292,6 +292,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); */ extern int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); +extern void +__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq); static inline int clocksource_register_hz(struct clocksource *cs, u32 hz) { @@ -303,6 +305,15 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz) return __clocksource_register_scale(cs, 1000, khz); } +static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz) +{ + __clocksource_updatefreq_scale(cs, 1, hz); +} + +static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz) +{ + __clocksource_updatefreq_scale(cs, 1000, khz); +} static inline void clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c543d21b4e54..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) #define MAX_UPDATE_LENGTH 5 /* Seconds */ /** - * __clocksource_register_scale - Used to install new clocksources + * __clocksource_updatefreq_scale - Used to update a clocksource with a new freq * @t: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * - * Returns -EBUSY if registration fails, zero otherwise. + * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* * Ideally we want to use some of the limits used in * clocksource_max_deferment, to provide a more informed @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC/scale, MAX_UPDATE_LENGTH*scale); cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. 
+ */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); -- cgit v1.2.3 From 47def82672b3ba4e7c5e9a4fe48a556f8684d0d6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 27 Jul 2010 11:56:05 -0400 Subject: jbd2: Remove __GFP_NOFAIL from jbd2 layer __GFP_NOFAIL is going away, so add our own retry loop. Also add jbd2__journal_start() and jbd2__journal_restart() which take a gfp mask, so that file systems can optionally (re)start transaction handles using GFP_KERNEL. If they do this, then they need to be prepared to handle receiving a PTR_ERR(-ENOMEM) error, and be ready to reflect that error up to userspace. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 15 ++++++++++--- fs/jbd2/transaction.c | 61 +++++++++++++++++++++++++++++++++++---------------- include/linux/jbd2.h | 4 +++- 3 files changed, 57 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f7bf15787d68..a79d3345b55a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -41,6 +41,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -48,8 +49,6 @@ #include #include -EXPORT_SYMBOL(jbd2_journal_start); -EXPORT_SYMBOL(jbd2_journal_restart); EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -311,7 +310,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68620ac..001e95fb0fe1 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -83,30 +85,38 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * transaction's buffer credits. */ -static int start_this_handle(journal_t *journal, handle_t *handle) +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) { transaction_t *transaction; int needed; int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; - int ret = 0; unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; + return -ENOSPC; } alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + if (!new_transaction) { - ret = -ENOMEM; - goto out; + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves.
+ */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; } } @@ -123,8 +133,8 @@ repeat_locked: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; + kfree(new_transaction); + return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ @@ -240,10 +250,8 @@ repeat_locked: spin_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; + kfree(new_transaction); + return 0; } static struct lock_class_key jbd2_handle_key; @@ -278,7 +286,7 @@ static handle_t *new_handle(int nblocks) * * Return a pointer to a newly allocated handle, or NULL on failure */ -handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -298,7 +306,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) current->journal_info = handle; - err = start_this_handle(journal, handle); + err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; @@ -308,6 +316,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) out: return handle; } +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + /** * int jbd2_journal_extend() - extend buffer credits. @@ -394,8 +411,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ - -int jbd2_journal_restart(handle_t *handle, int nblocks) +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -428,10 +444,17 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); + ret = start_this_handle(journal, handle, gfp_mask); return ret; } +EXPORT_SYMBOL(jbd2__journal_restart); + +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); /** * void jbd2_journal_lock_updates () - establish a transaction barrier. 
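With the two exported variants above in place, a filesystem that opts in to GFP_KERNEL allocation would start a handle roughly like this. This is an illustrative caller sketch, not code from the patch; the example_ wrapper is hypothetical.

#include <linux/jbd2.h>

/* Sketch: opt in to GFP_KERNEL and propagate -ENOMEM instead of
 * relying on the old __GFP_NOFAIL behaviour. */
static int example_start_handle(journal_t *journal, int nblocks)
{
	handle_t *handle;

	handle = jbd2__journal_start(journal, nblocks, GFP_KERNEL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);	/* may be -ENOMEM; reflect up to userspace */

	/* ... do work under the handle ... */
	return jbd2_journal_stop(handle);
}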
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a4d2e9f7088a..5a72bc75b273 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1081,7 +1081,9 @@ static inline handle_t *journal_current_handle(void) */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); -extern int jbd2_journal_restart (handle_t *, int nblocks); +extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask); +extern int jbd2_journal_restart(handle_t *, int nblocks); +extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask); extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); -- cgit v1.2.3 From 552ef8024f909d9b3a7442d0ab0d48a22de24e9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2010 11:56:06 -0400 Subject: direct-io: move aio_complete into ->end_io Filesystems with unwritten extent support must not complete an AIO request until the transaction to convert the extent has been committed. That means the aio_complete call needs to be moved into the ->end_io callback so that the filesystem can control exactly when to call it. This makes a bit of a mess out of dio_complete and makes the ->end_io callback prototype even more complicated. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/direct-io.c | 26 ++++++++++++++------------ fs/ext4/inode.c | 10 +++++++--- fs/ocfs2/aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.h | 2 ++ include/linux/fs.h | 3 ++- 6 files changed, 37 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aacf531d..a10cb91cadea 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete.
*/ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 699d1d01c5df..609159e990de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3775,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - return; + goto out; } io_end->offset = offset; @@ -3812,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca20cc18..1d2b1f156bcf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -609,7 +609,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -623,6 +625,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c 
index 34640d6dbdcb..5895aaf62ace 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1599,7 +1599,9 @@ xfs_end_io_direct( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private) + void *private, + int ret, + bool is_async) { xfs_ioend_t *ioend = iocb->private; @@ -1645,6 +1647,9 @@ xfs_end_io_direct( * against double-freeing. */ iocb->private = NULL; + + if (is_async) + aio_complete(iocb, ret, 0); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea87df8..9f566d92ae3a 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; diff --git a/include/linux/fs.h b/include/linux/fs.h index 471e1ff5079a..a0912f6075e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -415,7 +415,8 @@ struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - ssize_t bytes, void *private); + ssize_t bytes, void *private, int ret, + bool is_async); /* * Attribute flags. These should be or-ed together to figure out what -- cgit v1.2.3 From b3c567e474b5ba4447b6e16063a3b0cffc22d205 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Wed, 21 Jul 2010 13:28:10 +0530 Subject: intel_mid: Add Mrst & Mfld DMA Drivers This patch adds DMA drivers for the DMA controllers in the Langwell chipset of the Intel(R) Moorestown platform and the DMA controllers in Penwell of the Intel(R) Medfield platform. It adds support for the Moorestown DMAC1 and DMAC2 controllers, and also for the Medfield GP DMA and DMAC1 controllers. These controllers support memory to peripheral and peripheral to memory transfers, and only single block transfers. This driver is based on the kernel DMA engine; anyone who wishes to use these controllers should use the DMA engine APIs. The controller exposes DMA_SLAVE capabilities and notifies the client drivers of DMA transaction completion. The config option CONFIG_INTEL_MID_DMAC=y is required to enable it. Signed-off-by: Vinod Koul Signed-off-by: Alan Cox Signed-off-by: Dan Williams --- drivers/dma/Kconfig | 13 + drivers/dma/Makefile | 1 + drivers/dma/intel_mid_dma.c | 1143 ++++++++++++++++++++++++++++++++++++++ drivers/dma/intel_mid_dma_regs.h | 260 +++++++++ include/linux/intel_mid_dma.h | 86 +++ 5 files changed, 1503 insertions(+) create mode 100644 drivers/dma/intel_mid_dma.c create mode 100644 drivers/dma/intel_mid_dma_regs.h create mode 100644 include/linux/intel_mid_dma.h (limited to 'include/linux') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 9e01e96fee94..6a3e5bfa6742 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -33,6 +33,19 @@ if DMADEVICES comment "DMA Devices" +config INTEL_MID_DMAC + tristate "Intel MID DMA support for Peripheral DMA controllers" + depends on PCI && X86 + select DMA_ENGINE + default n + help + Enable support for the Intel(R) MID DMA engine present + in Intel MID chipsets. + + Say Y here if you have such a chipset. + + If unsure, say N.
+ config ASYNC_TX_DISABLE_CHANNEL_SWITCH bool diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 0fe5ebbfda5d..27b6c69ae639 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -7,6 +7,7 @@ endif obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o +obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o obj-$(CONFIG_DMATEST) += dmatest.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c new file mode 100644 index 000000000000..c2591e8d9b6e --- /dev/null +++ b/drivers/dma/intel_mid_dma.c @@ -0,0 +1,1143 @@ +/* + * intel_mid_dma.c - Intel Langwell DMA Drivers + * + * Copyright (C) 2008-10 Intel Corp + * Author: Vinod Koul + * The driver design is based on dw_dmac driver + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * + */ +#include +#include +#include + +#define MAX_CHAN 4 /*max ch across controllers*/ +#include "intel_mid_dma_regs.h" + +#define INTEL_MID_DMAC1_ID 0x0814 +#define INTEL_MID_DMAC2_ID 0x0813 +#define INTEL_MID_GP_DMAC2_ID 0x0827 +#define INTEL_MFLD_DMAC1_ID 0x0830 +#define LNW_PERIPHRAL_MASK_BASE 0xFFAE8008 +#define LNW_PERIPHRAL_MASK_SIZE 0x10 +#define LNW_PERIPHRAL_STATUS 0x0 +#define LNW_PERIPHRAL_MASK 0x8 + +struct intel_mid_dma_probe_info { + u8 max_chan; + u8 ch_base; + u16 block_size; + u32 pimr_mask; +}; + +#define INFO(_max_chan, _ch_base, _block_size, _pimr_mask) \ + ((kernel_ulong_t)&(struct intel_mid_dma_probe_info) { \ + .max_chan = (_max_chan), \ + .ch_base = (_ch_base), \ + .block_size = (_block_size), \ + .pimr_mask = (_pimr_mask), \ + }) + +/***************************************************************************** +Utility Functions*/ +/** + * get_ch_index - convert status to channel + * @status: status mask + * @base: dma ch base value + * + * Modify the status mask and return the channel index needing + * attention (or -1 if neither) + */ +static int get_ch_index(int *status, unsigned int base) +{ + int i; + for (i = 0; i < MAX_CHAN; i++) { + if (*status & (1 << (i + base))) { + *status = *status & ~(1 << (i + base)); + pr_debug("MDMA: index %d New status %x\n", i, *status); + return i; + } + } + return -1; +} + +/** + * get_block_ts - calculates dma transaction length + * @len: dma transfer length + * @tx_width: dma transfer src width + * @block_size: dma controller max block size + * + * Based on src width calculate the DMA trsaction length in data items + * return data items or FFFF if exceeds max length for block + */ +static int get_block_ts(int len, int tx_width, int block_size) +{ + int byte_width = 0, block_ts = 0; + + switch (tx_width) { + case LNW_DMA_WIDTH_8BIT: + byte_width = 1; + break; + case LNW_DMA_WIDTH_16BIT: + 
byte_width = 2; + break; + case LNW_DMA_WIDTH_32BIT: + default: + byte_width = 4; + break; + } + + block_ts = len/byte_width; + if (block_ts > block_size) + block_ts = 0xFFFF; + return block_ts; +} + +/***************************************************************************** +DMAC1 interrupt Functions*/ + +/** + * dmac1_mask_periphral_intr - mask the periphral interrupt + * @midc: dma channel for which masking is required + * + * Masks the DMA periphral interrupt + * this is valid for DMAC1 family controllers only + * This controller should have periphral mask registers already mapped + */ +static void dmac1_mask_periphral_intr(struct intel_mid_dma_chan *midc) +{ + u32 pimr; + struct middma_device *mid = to_middma_device(midc->chan.device); + + if (mid->pimr_mask) { + pimr = readl(mid->mask_reg + LNW_PERIPHRAL_MASK); + pimr |= mid->pimr_mask; + writel(pimr, mid->mask_reg + LNW_PERIPHRAL_MASK); + } + return; +} + +/** + * dmac1_unmask_periphral_intr - unmask the periphral interrupt + * @midc: dma channel for which masking is required + * + * UnMasks the DMA periphral interrupt, + * this is valid for DMAC1 family controllers only + * This controller should have periphral mask registers already mapped + */ +static void dmac1_unmask_periphral_intr(struct intel_mid_dma_chan *midc) +{ + u32 pimr; + struct middma_device *mid = to_middma_device(midc->chan.device); + + if (mid->pimr_mask) { + pimr = readl(mid->mask_reg + LNW_PERIPHRAL_MASK); + pimr &= ~mid->pimr_mask; + writel(pimr, mid->mask_reg + LNW_PERIPHRAL_MASK); + } + return; +} + +/** + * enable_dma_interrupt - enable the periphral interrupt + * @midc: dma channel for which enable interrupt is required + * + * Enable the DMA periphral interrupt, + * this is valid for DMAC1 family controllers only + * This controller should have periphral mask registers already mapped + */ +static void enable_dma_interrupt(struct intel_mid_dma_chan *midc) +{ + dmac1_unmask_periphral_intr(midc); + + /*en ch interrupts*/ + iowrite32(UNMASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_TFR); + iowrite32(UNMASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_ERR); + return; +} + +/** + * disable_dma_interrupt - disable the periphral interrupt + * @midc: dma channel for which disable interrupt is required + * + * Disable the DMA periphral interrupt, + * this is valid for DMAC1 family controllers only + * This controller should have periphral mask registers already mapped + */ +static void disable_dma_interrupt(struct intel_mid_dma_chan *midc) +{ + /*Check LPE PISR, make sure fwd is disabled*/ + dmac1_mask_periphral_intr(midc); + iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_BLOCK); + iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_TFR); + iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_ERR); + return; +} + +/***************************************************************************** +DMA channel helper Functions*/ +/** + * mid_desc_get - get a descriptor + * @midc: dma channel for which descriptor is required + * + * Obtain a descriptor for the channel. Returns NULL if none are free. 
+ * Once the descriptor is returned it is private until put on another + * list or freed + */ +static struct intel_mid_dma_desc *midc_desc_get(struct intel_mid_dma_chan *midc) +{ + struct intel_mid_dma_desc *desc, *_desc; + struct intel_mid_dma_desc *ret = NULL; + + spin_lock_bh(&midc->lock); + list_for_each_entry_safe(desc, _desc, &midc->free_list, desc_node) { + if (async_tx_test_ack(&desc->txd)) { + list_del(&desc->desc_node); + ret = desc; + break; + } + } + spin_unlock_bh(&midc->lock); + return ret; +} + +/** + * mid_desc_put - put a descriptor + * @midc: dma channel for which descriptor is required + * @desc: descriptor to put + * + * Return a descriptor from lwn_desc_get back to the free pool + */ +static void midc_desc_put(struct intel_mid_dma_chan *midc, + struct intel_mid_dma_desc *desc) +{ + if (desc) { + spin_lock_bh(&midc->lock); + list_add_tail(&desc->desc_node, &midc->free_list); + spin_unlock_bh(&midc->lock); + } +} +/** + * midc_dostart - begin a DMA transaction + * @midc: channel for which txn is to be started + * @first: first descriptor of series + * + * Load a transaction into the engine. This must be called with midc->lock + * held and bh disabled. + */ +static void midc_dostart(struct intel_mid_dma_chan *midc, + struct intel_mid_dma_desc *first) +{ + struct middma_device *mid = to_middma_device(midc->chan.device); + + /* channel is idle */ + if (midc->in_use && test_ch_en(midc->dma_base, midc->ch_id)) { + /*error*/ + pr_err("ERR_MDMA: channel is busy in start\n"); + /* The tasklet will hopefully advance the queue... */ + return; + } + + /*write registers and en*/ + iowrite32(first->sar, midc->ch_regs + SAR); + iowrite32(first->dar, midc->ch_regs + DAR); + iowrite32(first->cfg_hi, midc->ch_regs + CFG_HIGH); + iowrite32(first->cfg_lo, midc->ch_regs + CFG_LOW); + iowrite32(first->ctl_lo, midc->ch_regs + CTL_LOW); + iowrite32(first->ctl_hi, midc->ch_regs + CTL_HIGH); + pr_debug("MDMA:TX SAR %x,DAR %x,CFGL %x,CFGH %x,CTLH %x, CTLL %x\n", + (int)first->sar, (int)first->dar, first->cfg_hi, + first->cfg_lo, first->ctl_hi, first->ctl_lo); + + iowrite32(ENABLE_CHANNEL(midc->ch_id), mid->dma_base + DMA_CHAN_EN); + first->status = DMA_IN_PROGRESS; +} + +/** + * midc_descriptor_complete - process completed descriptor + * @midc: channel owning the descriptor + * @desc: the descriptor itself + * + * Process a completed descriptor and perform any callbacks upon + * the completion. The completion handling drops the lock during the + * callbacks but must be called with the lock held. + */ +static void midc_descriptor_complete(struct intel_mid_dma_chan *midc, + struct intel_mid_dma_desc *desc) +{ + struct dma_async_tx_descriptor *txd = &desc->txd; + dma_async_tx_callback callback_txd = NULL; + void *param_txd = NULL; + + midc->completed = txd->cookie; + callback_txd = txd->callback; + param_txd = txd->callback_param; + + list_move(&desc->desc_node, &midc->free_list); + + spin_unlock_bh(&midc->lock); + if (callback_txd) { + pr_debug("MDMA: TXD callback set ... calling\n"); + callback_txd(param_txd); + spin_lock_bh(&midc->lock); + return; + } + spin_lock_bh(&midc->lock); + +} +/** + * midc_scan_descriptors - check the descriptors in channel + * mark completed when tx is completete + * @mid: device + * @midc: channel to scan + * + * Walk the descriptor chain for the device and process any entries + * that are complete. 
+ */ +static void midc_scan_descriptors(struct middma_device *mid, + struct intel_mid_dma_chan *midc) +{ + struct intel_mid_dma_desc *desc = NULL, *_desc = NULL; + + /*tx is complete*/ + list_for_each_entry_safe(desc, _desc, &midc->active_list, desc_node) { + if (desc->status == DMA_IN_PROGRESS) { + desc->status = DMA_SUCCESS; + midc_descriptor_complete(midc, desc); + } + } + return; +} + +/***************************************************************************** +DMA engine callback Functions*/ +/** + * intel_mid_dma_tx_submit - callback to submit DMA transaction + * @tx: dma engine descriptor + * + * Submit the DMA trasaction for this descriptor, start if ch idle + */ +static dma_cookie_t intel_mid_dma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct intel_mid_dma_desc *desc = to_intel_mid_dma_desc(tx); + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(tx->chan); + dma_cookie_t cookie; + + spin_lock_bh(&midc->lock); + cookie = midc->chan.cookie; + + if (++cookie < 0) + cookie = 1; + + midc->chan.cookie = cookie; + desc->txd.cookie = cookie; + + + if (list_empty(&midc->active_list)) { + midc_dostart(midc, desc); + list_add_tail(&desc->desc_node, &midc->active_list); + } else { + list_add_tail(&desc->desc_node, &midc->queue); + } + spin_unlock_bh(&midc->lock); + + return cookie; +} + +/** + * intel_mid_dma_issue_pending - callback to issue pending txn + * @chan: chan where pending trascation needs to be checked and submitted + * + * Call for scan to issue pending descriptors + */ +static void intel_mid_dma_issue_pending(struct dma_chan *chan) +{ + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(chan); + + spin_lock_bh(&midc->lock); + if (!list_empty(&midc->queue)) + midc_scan_descriptors(to_middma_device(chan->device), midc); + spin_unlock_bh(&midc->lock); +} + +/** + * intel_mid_dma_tx_status - Return status of txn + * @chan: chan for where status needs to be checked + * @cookie: cookie for txn + * @txstate: DMA txn state + * + * Return status of DMA txn + */ +static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + int ret; + + last_complete = midc->completed; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret != DMA_SUCCESS) { + midc_scan_descriptors(to_middma_device(chan->device), midc); + + last_complete = midc->completed; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + } + + if (txstate) { + txstate->last = last_complete; + txstate->used = last_used; + txstate->residue = 0; + } + return ret; +} + +/** + * intel_mid_dma_device_control - DMA device control + * @chan: chan for DMA control + * @cmd: control cmd + * @arg: cmd arg value + * + * Perform DMA control command + */ +static int intel_mid_dma_device_control(struct dma_chan *chan, + enum dma_ctrl_cmd cmd, unsigned long arg) +{ + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(chan); + struct middma_device *mid = to_middma_device(chan->device); + struct intel_mid_dma_desc *desc, *_desc; + LIST_HEAD(list); + + if (cmd != DMA_TERMINATE_ALL) + return -ENXIO; + + spin_lock_bh(&midc->lock); + if (midc->in_use == false) { + spin_unlock_bh(&midc->lock); + return 0; + } + list_splice_init(&midc->free_list, &list); + midc->descs_allocated = 0; + midc->slave = NULL; + + /* Disable interrupts */ + 
disable_dma_interrupt(midc); + + spin_unlock_bh(&midc->lock); + list_for_each_entry_safe(desc, _desc, &list, desc_node) { + pr_debug("MDMA: freeing descriptor %p\n", desc); + pci_pool_free(mid->dma_pool, desc, desc->txd.phys); + } + return 0; +} + +/** + * intel_mid_dma_prep_slave_sg - Prep slave sg txn + * @chan: chan for DMA transfer + * @sgl: scatter gather list + * @sg_len: length of sg txn + * @direction: DMA transfer dirtn + * @flags: DMA flags + * + * Do DMA sg txn: NOT supported now + */ +static struct dma_async_tx_descriptor *intel_mid_dma_prep_slave_sg( + struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_data_direction direction, + unsigned long flags) +{ + /*not supported now*/ + return NULL; +} + +/** + * intel_mid_dma_prep_memcpy - Prep memcpy txn + * @chan: chan for DMA transfer + * @dest: destn address + * @src: src address + * @len: DMA transfer len + * @flags: DMA flags + * + * Perform a DMA memcpy. Note we support slave periphral DMA transfers only + * The periphral txn details should be filled in slave structure properly + * Returns the descriptor for this txn + */ +static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy( + struct dma_chan *chan, dma_addr_t dest, + dma_addr_t src, size_t len, unsigned long flags) +{ + struct intel_mid_dma_chan *midc; + struct intel_mid_dma_desc *desc = NULL; + struct intel_mid_dma_slave *mids; + union intel_mid_dma_ctl_lo ctl_lo; + union intel_mid_dma_ctl_hi ctl_hi; + union intel_mid_dma_cfg_lo cfg_lo; + union intel_mid_dma_cfg_hi cfg_hi; + enum intel_mid_dma_width width = 0; + + pr_debug("MDMA: Prep for memcpy\n"); + WARN_ON(!chan); + if (!len) + return NULL; + + mids = chan->private; + WARN_ON(!mids); + + midc = to_intel_mid_dma_chan(chan); + WARN_ON(!midc); + + pr_debug("MDMA:called for DMA %x CH %d Length %zu\n", + midc->dma->pci_id, midc->ch_id, len); + pr_debug("MDMA:Cfg passed Mode %x, Dirn %x, HS %x, Width %x\n", + mids->cfg_mode, mids->dirn, mids->hs_mode, mids->src_width); + + /*calculate CFG_LO*/ + if (mids->hs_mode == LNW_DMA_SW_HS) { + cfg_lo.cfg_lo = 0; + cfg_lo.cfgx.hs_sel_dst = 1; + cfg_lo.cfgx.hs_sel_src = 1; + } else if (mids->hs_mode == LNW_DMA_HW_HS) + cfg_lo.cfg_lo = 0x00000; + + /*calculate CFG_HI*/ + if (mids->cfg_mode == LNW_DMA_MEM_TO_MEM) { + /*SW HS only*/ + cfg_hi.cfg_hi = 0; + } else { + cfg_hi.cfg_hi = 0; + if (midc->dma->pimr_mask) { + cfg_hi.cfgx.protctl = 0x0; /*default value*/ + cfg_hi.cfgx.fifo_mode = 1; + if (mids->dirn == DMA_TO_DEVICE) { + cfg_hi.cfgx.src_per = 0; + if (mids->device_instance == 0) + cfg_hi.cfgx.dst_per = 3; + if (mids->device_instance == 1) + cfg_hi.cfgx.dst_per = 1; + } else if (mids->dirn == DMA_FROM_DEVICE) { + if (mids->device_instance == 0) + cfg_hi.cfgx.src_per = 2; + if (mids->device_instance == 1) + cfg_hi.cfgx.src_per = 0; + cfg_hi.cfgx.dst_per = 0; + } + } else { + cfg_hi.cfgx.protctl = 0x1; /*default value*/ + cfg_hi.cfgx.src_per = cfg_hi.cfgx.dst_per = + midc->ch_id - midc->dma->chan_base; + } + } + + /*calculate CTL_HI*/ + ctl_hi.ctlx.reser = 0; + width = mids->src_width; + + ctl_hi.ctlx.block_ts = get_block_ts(len, width, midc->dma->block_size); + pr_debug("MDMA:calc len %d for block size %d\n", + ctl_hi.ctlx.block_ts, midc->dma->block_size); + /*calculate CTL_LO*/ + ctl_lo.ctl_lo = 0; + ctl_lo.ctlx.int_en = 1; + ctl_lo.ctlx.dst_tr_width = mids->dst_width; + ctl_lo.ctlx.src_tr_width = mids->src_width; + ctl_lo.ctlx.dst_msize = mids->src_msize; + ctl_lo.ctlx.src_msize = mids->dst_msize; + + if (mids->cfg_mode == LNW_DMA_MEM_TO_MEM) 
{ + ctl_lo.ctlx.tt_fc = 0; + ctl_lo.ctlx.sinc = 0; + ctl_lo.ctlx.dinc = 0; + } else { + if (mids->dirn == DMA_TO_DEVICE) { + ctl_lo.ctlx.sinc = 0; + ctl_lo.ctlx.dinc = 2; + ctl_lo.ctlx.tt_fc = 1; + } else if (mids->dirn == DMA_FROM_DEVICE) { + ctl_lo.ctlx.sinc = 2; + ctl_lo.ctlx.dinc = 0; + ctl_lo.ctlx.tt_fc = 2; + } + } + + pr_debug("MDMA:Calc CTL LO %x, CTL HI %x, CFG LO %x, CFG HI %x\n", + ctl_lo.ctl_lo, ctl_hi.ctl_hi, cfg_lo.cfg_lo, cfg_hi.cfg_hi); + + enable_dma_interrupt(midc); + + desc = midc_desc_get(midc); + if (desc == NULL) + goto err_desc_get; + desc->sar = src; + desc->dar = dest ; + desc->len = len; + desc->cfg_hi = cfg_hi.cfg_hi; + desc->cfg_lo = cfg_lo.cfg_lo; + desc->ctl_lo = ctl_lo.ctl_lo; + desc->ctl_hi = ctl_hi.ctl_hi; + desc->width = width; + desc->dirn = mids->dirn; + return &desc->txd; + +err_desc_get: + pr_err("ERR_MDMA: Failed to get desc\n"); + midc_desc_put(midc, desc); + return NULL; +} + +/** + * intel_mid_dma_free_chan_resources - Frees dma resources + * @chan: chan requiring attention + * + * Frees the allocated resources on this DMA chan + */ +static void intel_mid_dma_free_chan_resources(struct dma_chan *chan) +{ + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(chan); + struct middma_device *mid = to_middma_device(chan->device); + struct intel_mid_dma_desc *desc, *_desc; + + if (true == midc->in_use) { + /*trying to free ch in use!!!!!*/ + pr_err("ERR_MDMA: trying to free ch in use\n"); + } + + spin_lock_bh(&midc->lock); + midc->descs_allocated = 0; + list_for_each_entry_safe(desc, _desc, &midc->active_list, desc_node) { + list_del(&desc->desc_node); + pci_pool_free(mid->dma_pool, desc, desc->txd.phys); + } + list_for_each_entry_safe(desc, _desc, &midc->free_list, desc_node) { + list_del(&desc->desc_node); + pci_pool_free(mid->dma_pool, desc, desc->txd.phys); + } + list_for_each_entry_safe(desc, _desc, &midc->queue, desc_node) { + list_del(&desc->desc_node); + pci_pool_free(mid->dma_pool, desc, desc->txd.phys); + } + spin_unlock_bh(&midc->lock); + midc->in_use = false; + /* Disable CH interrupts */ + iowrite32(MASK_INTR_REG(midc->ch_id), mid->dma_base + MASK_BLOCK); + iowrite32(MASK_INTR_REG(midc->ch_id), mid->dma_base + MASK_ERR); +} + +/** + * intel_mid_dma_alloc_chan_resources - Allocate dma resources + * @chan: chan requiring attention + * + * Allocates DMA resources on this chan + * Return the descriptors allocated + */ +static int intel_mid_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct intel_mid_dma_chan *midc = to_intel_mid_dma_chan(chan); + struct middma_device *mid = to_middma_device(chan->device); + struct intel_mid_dma_desc *desc; + dma_addr_t phys; + int i = 0; + + + /* ASSERT: channel is idle */ + if (test_ch_en(mid->dma_base, midc->ch_id)) { + /*ch is not idle*/ + pr_err("ERR_MDMA: ch not idle\n"); + return -EIO; + } + midc->completed = chan->cookie = 1; + + spin_lock_bh(&midc->lock); + while (midc->descs_allocated < DESCS_PER_CHANNEL) { + spin_unlock_bh(&midc->lock); + desc = pci_pool_alloc(mid->dma_pool, GFP_KERNEL, &phys); + if (!desc) { + pr_err("ERR_MDMA: desc failed\n"); + return -ENOMEM; + /*check*/ + } + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = intel_mid_dma_tx_submit; + desc->txd.flags = DMA_CTRL_ACK; + desc->txd.phys = phys; + spin_lock_bh(&midc->lock); + i = ++midc->descs_allocated; + list_add_tail(&desc->desc_node, &midc->free_list); + } + spin_unlock_bh(&midc->lock); + midc->in_use = false; + pr_debug("MID_DMA: Desc alloc done ret: %d desc\n", i); + return i; +} + +/** + * 
midc_handle_error - Handle DMA txn error + * @mid: controller where error occured + * @midc: chan where error occured + * + * Scan the descriptor for error + */ +static void midc_handle_error(struct middma_device *mid, + struct intel_mid_dma_chan *midc) +{ + midc_scan_descriptors(mid, midc); +} + +/** + * dma_tasklet - DMA interrupt tasklet + * @data: tasklet arg (the controller structure) + * + * Scan the controller for interrupts for completion/error + * Clear the interrupt and call for handling completion/error + */ +static void dma_tasklet(unsigned long data) +{ + struct middma_device *mid = NULL; + struct intel_mid_dma_chan *midc = NULL; + u32 status; + int i; + + mid = (struct middma_device *)data; + if (mid == NULL) { + pr_err("ERR_MDMA: tasklet Null param\n"); + return; + } + pr_debug("MDMA: in tasklet for device %x\n", mid->pci_id); + status = ioread32(mid->dma_base + RAW_TFR); + pr_debug("MDMA:RAW_TFR %x\n", status); + status &= mid->intr_mask; + while (status) { + /*txn interrupt*/ + i = get_ch_index(&status, mid->chan_base); + if (i < 0) { + pr_err("ERR_MDMA:Invalid ch index %x\n", i); + return; + } + midc = &mid->ch[i]; + if (midc == NULL) { + pr_err("ERR_MDMA:Null param midc\n"); + return; + } + pr_debug("MDMA:Tx complete interrupt %x, Ch No %d Index %d\n", + status, midc->ch_id, i); + /*clearing this interrupts first*/ + iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_TFR); + iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_BLOCK); + + spin_lock_bh(&midc->lock); + midc_scan_descriptors(mid, midc); + pr_debug("MDMA:Scan of desc... complete, unmasking\n"); + iowrite32(UNMASK_INTR_REG(midc->ch_id), + mid->dma_base + MASK_TFR); + spin_unlock_bh(&midc->lock); + } + + status = ioread32(mid->dma_base + RAW_ERR); + status &= mid->intr_mask; + while (status) { + /*err interrupt*/ + i = get_ch_index(&status, mid->chan_base); + if (i < 0) { + pr_err("ERR_MDMA:Invalid ch index %x\n", i); + return; + } + midc = &mid->ch[i]; + if (midc == NULL) { + pr_err("ERR_MDMA:Null param midc\n"); + return; + } + pr_debug("MDMA:Tx complete interrupt %x, Ch No %d Index %d\n", + status, midc->ch_id, i); + + iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_ERR); + spin_lock_bh(&midc->lock); + midc_handle_error(mid, midc); + iowrite32(UNMASK_INTR_REG(midc->ch_id), + mid->dma_base + MASK_ERR); + spin_unlock_bh(&midc->lock); + } + pr_debug("MDMA:Exiting takslet...\n"); + return; +} + +static void dma_tasklet1(unsigned long data) +{ + pr_debug("MDMA:in takslet1...\n"); + return dma_tasklet(data); +} + +static void dma_tasklet2(unsigned long data) +{ + pr_debug("MDMA:in takslet2...\n"); + return dma_tasklet(data); +} + +/** + * intel_mid_dma_interrupt - DMA ISR + * @irq: IRQ where interrupt occurred + * @data: ISR cllback data (the controller structure) + * + * See if this is our interrupt if so then schedule the tasklet + * otherwise ignore + */ +static irqreturn_t intel_mid_dma_interrupt(int irq, void *data) +{ + struct middma_device *mid = data; + u32 status; + int call_tasklet = 0; + + /*DMA Interrupt*/ + pr_debug("MDMA:Got an interrupt on irq %d\n", irq); + if (!mid) { + pr_err("ERR_MDMA:null pointer mid\n"); + return -EINVAL; + } + + status = ioread32(mid->dma_base + RAW_TFR); + pr_debug("MDMA: Status %x, Mask %x\n", status, mid->intr_mask); + status &= mid->intr_mask; + if (status) { + /*need to disable intr*/ + iowrite32((status << 8), mid->dma_base + MASK_TFR); + pr_debug("MDMA: Calling tasklet %x\n", status); + call_tasklet = 1; + } + status = ioread32(mid->dma_base + RAW_ERR); + status 
&= mid->intr_mask; + if (status) { + iowrite32(MASK_INTR_REG(status), mid->dma_base + MASK_ERR); + call_tasklet = 1; + } + if (call_tasklet) + tasklet_schedule(&mid->tasklet); + + return IRQ_HANDLED; +} + +static irqreturn_t intel_mid_dma_interrupt1(int irq, void *data) +{ + return intel_mid_dma_interrupt(irq, data); +} + +static irqreturn_t intel_mid_dma_interrupt2(int irq, void *data) +{ + return intel_mid_dma_interrupt(irq, data); +} + +/** + * mid_setup_dma - Setup the DMA controller + * @pdev: Controller PCI device structure + * + * Initilize the DMA controller, channels, registers with DMA engine, + * ISR. Initilize DMA controller channels. + */ +static int mid_setup_dma(struct pci_dev *pdev) +{ + struct middma_device *dma = pci_get_drvdata(pdev); + int err, i; + unsigned int irq_level; + + /* DMA coherent memory pool for DMA descriptor allocations */ + dma->dma_pool = pci_pool_create("intel_mid_dma_desc_pool", pdev, + sizeof(struct intel_mid_dma_desc), + 32, 0); + if (NULL == dma->dma_pool) { + pr_err("ERR_MDMA:pci_pool_create failed\n"); + err = -ENOMEM; + kfree(dma); + goto err_dma_pool; + } + + INIT_LIST_HEAD(&dma->common.channels); + dma->pci_id = pdev->device; + if (dma->pimr_mask) { + dma->mask_reg = ioremap(LNW_PERIPHRAL_MASK_BASE, + LNW_PERIPHRAL_MASK_SIZE); + if (dma->mask_reg == NULL) { + pr_err("ERR_MDMA:Cant map periphral intr space !!\n"); + return -ENOMEM; + } + } else + dma->mask_reg = NULL; + + pr_debug("MDMA:Adding %d channel for this controller\n", dma->max_chan); + /*init CH structures*/ + dma->intr_mask = 0; + for (i = 0; i < dma->max_chan; i++) { + struct intel_mid_dma_chan *midch = &dma->ch[i]; + + midch->chan.device = &dma->common; + midch->chan.cookie = 1; + midch->chan.chan_id = i; + midch->ch_id = dma->chan_base + i; + pr_debug("MDMA:Init CH %d, ID %d\n", i, midch->ch_id); + + midch->dma_base = dma->dma_base; + midch->ch_regs = dma->dma_base + DMA_CH_SIZE * midch->ch_id; + midch->dma = dma; + dma->intr_mask |= 1 << (dma->chan_base + i); + spin_lock_init(&midch->lock); + + INIT_LIST_HEAD(&midch->active_list); + INIT_LIST_HEAD(&midch->queue); + INIT_LIST_HEAD(&midch->free_list); + /*mask interrupts*/ + iowrite32(MASK_INTR_REG(midch->ch_id), + dma->dma_base + MASK_BLOCK); + iowrite32(MASK_INTR_REG(midch->ch_id), + dma->dma_base + MASK_SRC_TRAN); + iowrite32(MASK_INTR_REG(midch->ch_id), + dma->dma_base + MASK_DST_TRAN); + iowrite32(MASK_INTR_REG(midch->ch_id), + dma->dma_base + MASK_ERR); + iowrite32(MASK_INTR_REG(midch->ch_id), + dma->dma_base + MASK_TFR); + + disable_dma_interrupt(midch); + list_add_tail(&midch->chan.device_node, &dma->common.channels); + } + pr_debug("MDMA: Calc Mask as %x for this controller\n", dma->intr_mask); + + /*init dma structure*/ + dma_cap_zero(dma->common.cap_mask); + dma_cap_set(DMA_MEMCPY, dma->common.cap_mask); + dma_cap_set(DMA_SLAVE, dma->common.cap_mask); + dma_cap_set(DMA_PRIVATE, dma->common.cap_mask); + dma->common.dev = &pdev->dev; + dma->common.chancnt = dma->max_chan; + + dma->common.device_alloc_chan_resources = + intel_mid_dma_alloc_chan_resources; + dma->common.device_free_chan_resources = + intel_mid_dma_free_chan_resources; + + dma->common.device_tx_status = intel_mid_dma_tx_status; + dma->common.device_prep_dma_memcpy = intel_mid_dma_prep_memcpy; + dma->common.device_issue_pending = intel_mid_dma_issue_pending; + dma->common.device_prep_slave_sg = intel_mid_dma_prep_slave_sg; + dma->common.device_control = intel_mid_dma_device_control; + + /*enable dma cntrl*/ + iowrite32(REG_BIT0, dma->dma_base + DMA_CFG); + + 
/*register irq */ + if (dma->pimr_mask) { + irq_level = IRQF_SHARED; + pr_debug("MDMA:Requesting irq shared for DMAC1\n"); + err = request_irq(pdev->irq, intel_mid_dma_interrupt1, + IRQF_SHARED, "INTEL_MID_DMAC1", dma); + if (0 != err) + goto err_irq; + } else { + dma->intr_mask = 0x03; + irq_level = 0; + pr_debug("MDMA:Requesting irq for DMAC2\n"); + err = request_irq(pdev->irq, intel_mid_dma_interrupt2, + 0, "INTEL_MID_DMAC2", dma); + if (0 != err) + goto err_irq; + } + /*register device w/ engine*/ + err = dma_async_device_register(&dma->common); + if (0 != err) { + pr_err("ERR_MDMA:device_register failed: %d\n", err); + goto err_engine; + } + if (dma->pimr_mask) { + pr_debug("setting up tasklet1 for DMAC1\n"); + tasklet_init(&dma->tasklet, dma_tasklet1, (unsigned long)dma); + } else { + pr_debug("setting up tasklet2 for DMAC2\n"); + tasklet_init(&dma->tasklet, dma_tasklet2, (unsigned long)dma); + } + return 0; + +err_engine: + free_irq(pdev->irq, dma); +err_irq: + pci_pool_destroy(dma->dma_pool); + kfree(dma); +err_dma_pool: + pr_err("ERR_MDMA:setup_dma failed: %d\n", err); + return err; + +} + +/** + * middma_shutdown - Shutdown the DMA controller + * @pdev: Controller PCI device structure + * + * Called by remove + * Unregister DMa controller, clear all structures and free interrupt + */ +static void middma_shutdown(struct pci_dev *pdev) +{ + struct middma_device *device = pci_get_drvdata(pdev); + + dma_async_device_unregister(&device->common); + pci_pool_destroy(device->dma_pool); + if (device->mask_reg) + iounmap(device->mask_reg); + if (device->dma_base) + iounmap(device->dma_base); + free_irq(pdev->irq, device); + return; +} + +/** + * intel_mid_dma_probe - PCI Probe + * @pdev: Controller PCI device structure + * @id: pci device id structure + * + * Initilize the PCI device, map BARs, query driver data. 
+ * Call setup_dma to complete contoller and chan initilzation + */ +static int __devinit intel_mid_dma_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct middma_device *device; + u32 base_addr, bar_size; + struct intel_mid_dma_probe_info *info; + int err; + + pr_debug("MDMA: probe for %x\n", pdev->device); + info = (void *)id->driver_data; + pr_debug("MDMA: CH %d, base %d, block len %d, Periphral mask %x\n", + info->max_chan, info->ch_base, + info->block_size, info->pimr_mask); + + err = pci_enable_device(pdev); + if (err) + goto err_enable_device; + + err = pci_request_regions(pdev, "intel_mid_dmac"); + if (err) + goto err_request_regions; + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) + goto err_set_dma_mask; + + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) + goto err_set_dma_mask; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + pr_err("ERR_MDMA:kzalloc failed probe\n"); + err = -ENOMEM; + goto err_kzalloc; + } + device->pdev = pci_dev_get(pdev); + + base_addr = pci_resource_start(pdev, 0); + bar_size = pci_resource_len(pdev, 0); + device->dma_base = ioremap_nocache(base_addr, DMA_REG_SIZE); + if (!device->dma_base) { + pr_err("ERR_MDMA:ioremap failed\n"); + err = -ENOMEM; + goto err_ioremap; + } + pci_set_drvdata(pdev, device); + pci_set_master(pdev); + device->max_chan = info->max_chan; + device->chan_base = info->ch_base; + device->block_size = info->block_size; + device->pimr_mask = info->pimr_mask; + + err = mid_setup_dma(pdev); + if (err) + goto err_dma; + + return 0; + +err_dma: + iounmap(device->dma_base); +err_ioremap: + pci_dev_put(pdev); + kfree(device); +err_kzalloc: +err_set_dma_mask: + pci_release_regions(pdev); + pci_disable_device(pdev); +err_request_regions: +err_enable_device: + pr_err("ERR_MDMA:Probe failed %d\n", err); + return err; +} + +/** + * intel_mid_dma_remove - PCI remove + * @pdev: Controller PCI device structure + * + * Free up all resources and data + * Call shutdown_dma to complete contoller and chan cleanup + */ +static void __devexit intel_mid_dma_remove(struct pci_dev *pdev) +{ + struct middma_device *device = pci_get_drvdata(pdev); + middma_shutdown(pdev); + pci_dev_put(pdev); + kfree(device); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +/****************************************************************************** +* PCI stuff +*/ +static struct pci_device_id intel_mid_dma_ids[] = { + { PCI_VDEVICE(INTEL, INTEL_MID_DMAC1_ID), INFO(2, 6, 4095, 0x200020)}, + { PCI_VDEVICE(INTEL, INTEL_MID_DMAC2_ID), INFO(2, 0, 2047, 0)}, + { PCI_VDEVICE(INTEL, INTEL_MID_GP_DMAC2_ID), INFO(2, 0, 2047, 0)}, + { PCI_VDEVICE(INTEL, INTEL_MFLD_DMAC1_ID), INFO(4, 0, 4095, 0x400040)}, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, intel_mid_dma_ids); + +static struct pci_driver intel_mid_dma_pci = { + .name = "Intel MID DMA", + .id_table = intel_mid_dma_ids, + .probe = intel_mid_dma_probe, + .remove = __devexit_p(intel_mid_dma_remove), +}; + +static int __init intel_mid_dma_init(void) +{ + pr_debug("INFO_MDMA: LNW DMA Driver Version %s\n", + INTEL_MID_DMA_DRIVER_VERSION); + return pci_register_driver(&intel_mid_dma_pci); +} +fs_initcall(intel_mid_dma_init); + +static void __exit intel_mid_dma_exit(void) +{ + pci_unregister_driver(&intel_mid_dma_pci); +} +module_exit(intel_mid_dma_exit); + +MODULE_AUTHOR("Vinod Koul "); +MODULE_DESCRIPTION("Intel (R) MID DMAC Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(INTEL_MID_DMA_DRIVER_VERSION); diff --git 
a/drivers/dma/intel_mid_dma_regs.h b/drivers/dma/intel_mid_dma_regs.h new file mode 100644 index 000000000000..d81aa658ab09 --- /dev/null +++ b/drivers/dma/intel_mid_dma_regs.h @@ -0,0 +1,260 @@ +/* + * intel_mid_dma_regs.h - Intel MID DMA Drivers + * + * Copyright (C) 2008-10 Intel Corp + * Author: Vinod Koul + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * + */ +#ifndef __INTEL_MID_DMAC_REGS_H__ +#define __INTEL_MID_DMAC_REGS_H__ + +#include +#include +#include + +#define INTEL_MID_DMA_DRIVER_VERSION "1.0.5" + +#define REG_BIT0 0x00000001 +#define REG_BIT8 0x00000100 + +#define UNMASK_INTR_REG(chan_num) \ + ((REG_BIT0 << chan_num) | (REG_BIT8 << chan_num)) +#define MASK_INTR_REG(chan_num) (REG_BIT8 << chan_num) + +#define ENABLE_CHANNEL(chan_num) \ + ((REG_BIT0 << chan_num) | (REG_BIT8 << chan_num)) + +#define DESCS_PER_CHANNEL 16 +/*DMA Registers*/ +/*registers associated with channel programming*/ +#define DMA_REG_SIZE 0x400 +#define DMA_CH_SIZE 0x58 + +/*CH X REG = (DMA_CH_SIZE)*CH_NO + REG*/ +#define SAR 0x00 /* Source Address Register*/ +#define DAR 0x08 /* Destination Address Register*/ +#define CTL_LOW 0x18 /* Control Register*/ +#define CTL_HIGH 0x1C /* Control Register*/ +#define CFG_LOW 0x40 /* Configuration Register Low*/ +#define CFG_HIGH 0x44 /* Configuration Register high*/ + +#define STATUS_TFR 0x2E8 +#define STATUS_BLOCK 0x2F0 +#define STATUS_ERR 0x308 + +#define RAW_TFR 0x2C0 +#define RAW_BLOCK 0x2C8 +#define RAW_ERR 0x2E0 + +#define MASK_TFR 0x310 +#define MASK_BLOCK 0x318 +#define MASK_SRC_TRAN 0x320 +#define MASK_DST_TRAN 0x328 +#define MASK_ERR 0x330 + +#define CLEAR_TFR 0x338 +#define CLEAR_BLOCK 0x340 +#define CLEAR_SRC_TRAN 0x348 +#define CLEAR_DST_TRAN 0x350 +#define CLEAR_ERR 0x358 + +#define INTR_STATUS 0x360 +#define DMA_CFG 0x398 +#define DMA_CHAN_EN 0x3A0 + +/*DMA channel control registers*/ +union intel_mid_dma_ctl_lo { + struct { + u32 int_en:1; /*enable or disable interrupts*/ + /*should be 0*/ + u32 dst_tr_width:3; /*destination transfer width*/ + /*usually 32 bits = 010*/ + u32 src_tr_width:3; /*source transfer width*/ + /*usually 32 bits = 010*/ + u32 dinc:2; /*destination address inc/dec*/ + /*For mem:INC=00, Periphral NoINC=11*/ + u32 sinc:2; /*source address inc or dec, as above*/ + u32 dst_msize:3; /*destination burst transaction length*/ + /*always = 16 ie 011*/ + u32 src_msize:3; /*source burst transaction length*/ + /*always = 16 ie 011*/ + u32 reser1:3; + u32 tt_fc:3; /*transfer type and flow controller*/ + /*M-M = 000 + P-M = 010 + M-P = 001*/ + u32 dms:2; /*destination master select = 0*/ + u32 sms:2; /*source master select = 0*/ + u32 llp_dst_en:1; /*enable/disable destination LLP = 0*/ + u32 llp_src_en:1; /*enable/disable source LLP = 0*/ + u32 reser2:3; + 
} ctlx; + u32 ctl_lo; +}; + +union intel_mid_dma_ctl_hi { + struct { + u32 block_ts:12; /*block transfer size*/ + /*configured by DMAC*/ + u32 reser:20; + } ctlx; + u32 ctl_hi; + +}; + +/*DMA channel configuration registers*/ +union intel_mid_dma_cfg_lo { + struct { + u32 reser1:5; + u32 ch_prior:3; /*channel priority = 0*/ + u32 ch_susp:1; /*channel suspend = 0*/ + u32 fifo_empty:1; /*FIFO empty or not R bit = 0*/ + u32 hs_sel_dst:1; /*select HW/SW destn handshaking*/ + /*HW = 0, SW = 1*/ + u32 hs_sel_src:1; /*select HW/SW src handshaking*/ + u32 reser2:6; + u32 dst_hs_pol:1; /*dest HS interface polarity*/ + u32 src_hs_pol:1; /*src HS interface polarity*/ + u32 max_abrst:10; /*max AMBA burst len = 0 (no sw limit*/ + u32 reload_src:1; /*auto reload src addr =1 if src is P*/ + u32 reload_dst:1; /*AR destn addr =1 if dstn is P*/ + } cfgx; + u32 cfg_lo; +}; + +union intel_mid_dma_cfg_hi { + struct { + u32 fcmode:1; /*flow control mode = 1*/ + u32 fifo_mode:1; /*FIFO mode select = 1*/ + u32 protctl:3; /*protection control = 0*/ + u32 rsvd:2; + u32 src_per:4; /*src hw HS interface*/ + u32 dst_per:4; /*dstn hw HS interface*/ + u32 reser2:17; + } cfgx; + u32 cfg_hi; +}; + +/** + * struct intel_mid_dma_chan - internal mid representation of a DMA channel + * @chan: dma_chan strcture represetation for mid chan + * @ch_regs: MMIO register space pointer to channel register + * @dma_base: MMIO register space DMA engine base pointer + * @ch_id: DMA channel id + * @lock: channel spinlock + * @completed: DMA cookie + * @active_list: current active descriptors + * @queue: current queued up descriptors + * @free_list: current free descriptors + * @slave: dma slave struture + * @descs_allocated: total number of decsiptors allocated + * @dma: dma device struture pointer + * @in_use: bool representing if ch is in use or not + */ +struct intel_mid_dma_chan { + struct dma_chan chan; + void __iomem *ch_regs; + void __iomem *dma_base; + int ch_id; + spinlock_t lock; + dma_cookie_t completed; + struct list_head active_list; + struct list_head queue; + struct list_head free_list; + struct intel_mid_dma_slave *slave; + unsigned int descs_allocated; + struct middma_device *dma; + bool in_use; +}; + +static inline struct intel_mid_dma_chan *to_intel_mid_dma_chan( + struct dma_chan *chan) +{ + return container_of(chan, struct intel_mid_dma_chan, chan); +} + +/** + * struct middma_device - internal representation of a DMA device + * @pdev: PCI device + * @dma_base: MMIO register space pointer of DMA + * @dma_pool: for allocating DMA descriptors + * @common: embedded struct dma_device + * @tasklet: dma tasklet for processing interrupts + * @ch: per channel data + * @pci_id: DMA device PCI ID + * @intr_mask: Interrupt mask to be used + * @mask_reg: MMIO register for periphral mask + * @chan_base: Base ch index (read from driver data) + * @max_chan: max number of chs supported (from drv_data) + * @block_size: Block size of DMA transfer supported (from drv_data) + * @pimr_mask: MMIO register addr for periphral interrupt (from drv_data) + */ +struct middma_device { + struct pci_dev *pdev; + void __iomem *dma_base; + struct pci_pool *dma_pool; + struct dma_device common; + struct tasklet_struct tasklet; + struct intel_mid_dma_chan ch[MAX_CHAN]; + unsigned int pci_id; + unsigned int intr_mask; + void __iomem *mask_reg; + int chan_base; + int max_chan; + int block_size; + unsigned int pimr_mask; +}; + +static inline struct middma_device *to_middma_device(struct dma_device *common) +{ + return container_of(common, struct 
middma_device, common); +} + +struct intel_mid_dma_desc { + void __iomem *block; /*ch ptr*/ + struct list_head desc_node; + struct dma_async_tx_descriptor txd; + size_t len; + dma_addr_t sar; + dma_addr_t dar; + u32 cfg_hi; + u32 cfg_lo; + u32 ctl_lo; + u32 ctl_hi; + dma_addr_t next; + enum dma_data_direction dirn; + enum dma_status status; + enum intel_mid_dma_width width; /*width of DMA txn*/ + enum intel_mid_dma_mode cfg_mode; /*mode configuration*/ + +}; + +static inline int test_ch_en(void __iomem *dma, u32 ch_no) +{ + u32 en_reg = ioread32(dma + DMA_CHAN_EN); + return (en_reg >> ch_no) & 0x1; +} + +static inline struct intel_mid_dma_desc *to_intel_mid_dma_desc + (struct dma_async_tx_descriptor *txd) +{ + return container_of(txd, struct intel_mid_dma_desc, txd); +} +#endif /*__INTEL_MID_DMAC_REGS_H__*/ diff --git a/include/linux/intel_mid_dma.h b/include/linux/intel_mid_dma.h new file mode 100644 index 000000000000..d9d08b6269b6 --- /dev/null +++ b/include/linux/intel_mid_dma.h @@ -0,0 +1,86 @@ +/* + * intel_mid_dma.h - Intel MID DMA Drivers + * + * Copyright (C) 2008-10 Intel Corp + * Author: Vinod Koul + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * + */ +#ifndef __INTEL_MID_DMA_H__ +#define __INTEL_MID_DMA_H__ + +#include <linux/dmaengine.h> + +/*DMA transaction width; src and dst width are the same. +The DMA length must be width aligned; +for 32 bit width the length must be 32 bit (4 bytes) aligned*/ +enum intel_mid_dma_width { + LNW_DMA_WIDTH_8BIT = 0x0, + LNW_DMA_WIDTH_16BIT = 0x1, + LNW_DMA_WIDTH_32BIT = 0x2, +}; + +/*DMA mode configurations*/ +enum intel_mid_dma_mode { + LNW_DMA_PER_TO_MEM = 0, /*peripheral to memory configuration*/ + LNW_DMA_MEM_TO_PER, /*memory to peripheral configuration*/ + LNW_DMA_MEM_TO_MEM, /*mem to mem config (testing only)*/ +}; + +/*DMA handshaking*/ +enum intel_mid_dma_hs_mode { + LNW_DMA_HW_HS = 0, /*HW Handshaking only*/ + LNW_DMA_SW_HS = 1, /*SW Handshaking not recommended*/ +}; + +/*Burst size configuration*/ +enum intel_mid_dma_msize { + LNW_DMA_MSIZE_1 = 0x0, + LNW_DMA_MSIZE_4 = 0x1, + LNW_DMA_MSIZE_8 = 0x2, + LNW_DMA_MSIZE_16 = 0x3, + LNW_DMA_MSIZE_32 = 0x4, + LNW_DMA_MSIZE_64 = 0x5, +}; + +/** + * struct intel_mid_dma_slave - DMA slave structure + * + * @dirn: DMA trf direction + * @src_width: tx register width + * @dst_width: rx register width + * @hs_mode: HW/SW handshaking mode + * @cfg_mode: DMA data transfer mode (per-per/mem-per/mem-mem) + * @src_msize: Source DMA burst size + * @dst_msize: Dst DMA burst size + * @device_instance: DMA peripheral device instance, we can have multiple + * peripheral devices connected to a single DMAC + */ +struct intel_mid_dma_slave { + enum dma_data_direction dirn; + enum intel_mid_dma_width src_width; /*width of DMA src txn*/ + enum intel_mid_dma_width dst_width; /*width of DMA dst txn*/ + enum intel_mid_dma_hs_mode hs_mode; /*handshaking*/ + enum intel_mid_dma_mode cfg_mode; /*mode configuration*/ + enum intel_mid_dma_msize src_msize; /*size of src burst*/ + enum intel_mid_dma_msize dst_msize; /*size of dst burst*/ + unsigned int device_instance; /*0, 1 for peripheral instance*/ +}; + +#endif /*__INTEL_MID_DMA_H__*/ -- cgit v1.2.3 From 685fd0b4ea3f0f1d5385610b0d5b57775a8d5842 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 29 Jul 2010 11:16:32 +0100 Subject: irq: Add new IRQ flag IRQF_NO_SUSPEND A small number of users of IRQF_TIMER are using it for the implied no-suspend behaviour on interrupts which are not timer interrupts. Therefore add a new IRQF_NO_SUSPEND flag, rename IRQF_TIMER to __IRQF_TIMER and redefine IRQF_TIMER in terms of these new flags. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Cc: Dmitry Torokhov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Grant Likely Cc: xen-devel@lists.xensource.com Cc: linux-input@vger.kernel.org Cc: linuxppc-dev@ozlabs.org Cc: devicetree-discuss@lists.ozlabs.org LKML-Reference: <1280398595-29708-1-git-send-email-ian.campbell@citrix.com> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 7 ++++++- kernel/irq/manage.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c2331138ca1b..a0384a4d1e6f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -53,16 +53,21 @@ * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run.
+ * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend + * */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 +#define IRQF_NO_SUSPEND 0x00004000 + +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) /* * Bits used by threaded handlers: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..c3003e9d91a3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } -- cgit v1.2.3 From 872e330e38806d835bd6c311c93ab998e2fb9058 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 29 Jul 2010 18:19:22 +0200 Subject: firewire: add isochronous multichannel reception This adds the DMA context programming and userspace ABI for multichannel reception, i.e. for listening on multiple channel numbers by means of a single DMA context. The use case is reception of more streams than there are IR DMA units offered by the link layer. This is already implemented by the older ohci1394 + ieee1394 + raw1394 stack, and as discussed recently on linux1394-devel, this feature is occasionally used in practice. The big drawbacks of this mode are that buffer layout and interrupt generation necessarily differ from single-channel reception: Headers and trailers are not stripped from packets, packets are not aligned with buffer chunks, and interrupts are per buffer chunk, not per packet. These drawbacks also cause a rather hefty code footprint to support this rarely used OHCI-1394 feature. (367 lines added, among them 94 lines of added userspace ABI documentation.) This implementation enforces that a multichannel reception context may only listen to channels to which no single-channel context on the same link layer is presently listening. OHCI-1394 would allow single-channel contexts to be overlaid by the multichannel context, but this would be a departure from the present first-come-first-served policy of IR context creation. The implementation is heavily based on an earlier one by Jay Fenlason. Thanks Jay.
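As an illustration of the in-kernel API this patch adds (not part of the patch itself): a minimal sketch of setting up a multichannel receive context. The function names, channel numbers, and handler body are hypothetical; the callback cast mirrors the one used in core-cdev.c below.

#include <linux/firewire.h>
#include <linux/err.h>

/* Called once per completed buffer chunk, not per packet. */
static void handle_mc_chunk(struct fw_iso_context *ctx,
			    dma_addr_t completed, void *data)
{
	/* parse packets up to 'completed', then requeue the chunk */
}

static int listen_multichannel(struct fw_card *card)
{
	struct fw_iso_context *ctx;
	u64 channels = (1ULL << 3) | (1ULL << 4);	/* channels 3 and 4 */
	int err;

	/* channel/speed/header_size do not apply to multichannel reception */
	ctx = fw_iso_context_create(card, FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL,
				    0, 0, 0,
				    (fw_iso_callback_t)handle_mc_chunk, NULL);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	err = fw_iso_context_set_channels(ctx, &channels);
	if (err)
		fw_iso_context_destroy(ctx);
	return err;
}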
Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 93 ++++++++++--- drivers/firewire/core-iso.c | 32 ++++- drivers/firewire/core.h | 2 + drivers/firewire/ohci.c | 316 +++++++++++++++++++++++++++++++++--------- include/linux/firewire-cdev.h | 281 +++++++++++++++++++++++++------------ include/linux/firewire.h | 29 ++-- 6 files changed, 560 insertions(+), 193 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index cf989e1635e1..ba23646bb108 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -193,6 +193,11 @@ struct iso_interrupt_event { struct fw_cdev_event_iso_interrupt interrupt; }; +struct iso_interrupt_mc_event { + struct event event; + struct fw_cdev_event_iso_interrupt_mc interrupt; +}; + struct iso_resource_event { struct event event; struct fw_cdev_event_iso_resource iso_resource; @@ -415,6 +420,7 @@ union ioctl_arg { struct fw_cdev_get_cycle_timer2 get_cycle_timer2; struct fw_cdev_send_phy_packet send_phy_packet; struct fw_cdev_receive_phy_packets receive_phy_packets; + struct fw_cdev_set_iso_channels set_iso_channels; }; static int ioctl_get_info(struct client *client, union ioctl_arg *arg) @@ -932,26 +938,54 @@ static void iso_callback(struct fw_iso_context *context, u32 cycle, sizeof(e->interrupt) + header_length, NULL, 0); } +static void iso_mc_callback(struct fw_iso_context *context, + dma_addr_t completed, void *data) +{ + struct client *client = data; + struct iso_interrupt_mc_event *e; + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + fw_notify("Out of memory when allocating event\n"); + return; + } + e->interrupt.type = FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL; + e->interrupt.closure = client->iso_closure; + e->interrupt.completed = fw_iso_buffer_lookup(&client->buffer, + completed); + queue_event(client, &e->event, &e->interrupt, + sizeof(e->interrupt), NULL, 0); +} + static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg) { struct fw_cdev_create_iso_context *a = &arg->create_iso_context; struct fw_iso_context *context; + fw_iso_callback_t cb; BUILD_BUG_ON(FW_CDEV_ISO_CONTEXT_TRANSMIT != FW_ISO_CONTEXT_TRANSMIT || - FW_CDEV_ISO_CONTEXT_RECEIVE != FW_ISO_CONTEXT_RECEIVE); - - if (a->channel > 63) - return -EINVAL; + FW_CDEV_ISO_CONTEXT_RECEIVE != FW_ISO_CONTEXT_RECEIVE || + FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL != + FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL); switch (a->type) { - case FW_ISO_CONTEXT_RECEIVE: - if (a->header_size < 4 || (a->header_size & 3)) + case FW_ISO_CONTEXT_TRANSMIT: + if (a->speed > SCODE_3200 || a->channel > 63) return -EINVAL; + + cb = iso_callback; break; - case FW_ISO_CONTEXT_TRANSMIT: - if (a->speed > SCODE_3200) + case FW_ISO_CONTEXT_RECEIVE: + if (a->header_size < 4 || (a->header_size & 3) || + a->channel > 63) return -EINVAL; + + cb = iso_callback; + break; + + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + cb = (fw_iso_callback_t)iso_mc_callback; break; default: @@ -959,8 +993,7 @@ static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg) } context = fw_iso_context_create(client->device->card, a->type, - a->channel, a->speed, a->header_size, - iso_callback, client); + a->channel, a->speed, a->header_size, cb, client); if (IS_ERR(context)) return PTR_ERR(context); @@ -980,6 +1013,17 @@ static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg) return 0; } +static int ioctl_set_iso_channels(struct client *client, union ioctl_arg *arg) +{ + struct 
fw_cdev_set_iso_channels *a = &arg->set_iso_channels; + struct fw_iso_context *ctx = client->iso_context; + + if (ctx == NULL || a->handle != 0) + return -EINVAL; + + return fw_iso_context_set_channels(ctx, &a->channels); +} + /* Macros for decoding the iso packet control header. */ #define GET_PAYLOAD_LENGTH(v) ((v) & 0xffff) #define GET_INTERRUPT(v) (((v) >> 16) & 0x01) @@ -993,7 +1037,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) struct fw_cdev_queue_iso *a = &arg->queue_iso; struct fw_cdev_iso_packet __user *p, *end, *next; struct fw_iso_context *ctx = client->iso_context; - unsigned long payload, buffer_end, transmit_header_bytes; + unsigned long payload, buffer_end, transmit_header_bytes = 0; u32 control; int count; struct { @@ -1013,7 +1057,6 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) * use the indirect payload, the iso buffer need not be mapped * and the a->data pointer is ignored. */ - payload = (unsigned long)a->data - client->vm_start; buffer_end = client->buffer.page_count << PAGE_SHIFT; if (a->data == 0 || client->buffer.pages == NULL || @@ -1022,8 +1065,10 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) buffer_end = 0; } - p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets); + if (ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL && payload & 3) + return -EINVAL; + p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets); if (!access_ok(VERIFY_READ, p, a->size)) return -EFAULT; @@ -1039,19 +1084,24 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) u.packet.sy = GET_SY(control); u.packet.header_length = GET_HEADER_LENGTH(control); - if (ctx->type == FW_ISO_CONTEXT_TRANSMIT) { - if (u.packet.header_length % 4 != 0) + switch (ctx->type) { + case FW_ISO_CONTEXT_TRANSMIT: + if (u.packet.header_length & 3) return -EINVAL; transmit_header_bytes = u.packet.header_length; - } else { - /* - * We require that header_length is a multiple of - * the fixed header size, ctx->header_size. - */ + break; + + case FW_ISO_CONTEXT_RECEIVE: if (u.packet.header_length == 0 || u.packet.header_length % ctx->header_size != 0) return -EINVAL; - transmit_header_bytes = 0; + break; + + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + if (u.packet.payload_length == 0 || + u.packet.payload_length & 3) + return -EINVAL; + break; } next = (struct fw_cdev_iso_packet __user *) @@ -1534,6 +1584,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = { [0x14] = ioctl_get_cycle_timer2, [0x15] = ioctl_send_phy_packet, [0x16] = ioctl_receive_phy_packets, + [0x17] = ioctl_set_iso_channels, }; static int dispatch_ioctl(struct client *client, diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c index 4fe932e60fb8..0c8e662a5daf 100644 --- a/drivers/firewire/core-iso.c +++ b/drivers/firewire/core-iso.c @@ -117,6 +117,23 @@ void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, } EXPORT_SYMBOL(fw_iso_buffer_destroy); +/* Convert DMA address to offset into virtually contiguous buffer. 
*/ +size_t fw_iso_buffer_lookup(struct fw_iso_buffer *buffer, dma_addr_t completed) +{ + int i; + dma_addr_t address; + ssize_t offset; + + for (i = 0; i < buffer->page_count; i++) { + address = page_private(buffer->pages[i]); + offset = (ssize_t)completed - (ssize_t)address; + if (offset > 0 && offset <= PAGE_SIZE) + return (i << PAGE_SHIFT) + offset; + } + + return 0; +} + struct fw_iso_context *fw_iso_context_create(struct fw_card *card, int type, int channel, int speed, size_t header_size, fw_iso_callback_t callback, void *callback_data) @@ -133,7 +150,7 @@ struct fw_iso_context *fw_iso_context_create(struct fw_card *card, ctx->channel = channel; ctx->speed = speed; ctx->header_size = header_size; - ctx->callback = callback; + ctx->callback.sc = callback; ctx->callback_data = callback_data; return ctx; @@ -142,9 +159,7 @@ EXPORT_SYMBOL(fw_iso_context_create); void fw_iso_context_destroy(struct fw_iso_context *ctx) { - struct fw_card *card = ctx->card; - - card->driver->free_iso_context(ctx); + ctx->card->driver->free_iso_context(ctx); } EXPORT_SYMBOL(fw_iso_context_destroy); @@ -155,14 +170,17 @@ int fw_iso_context_start(struct fw_iso_context *ctx, } EXPORT_SYMBOL(fw_iso_context_start); +int fw_iso_context_set_channels(struct fw_iso_context *ctx, u64 *channels) +{ + return ctx->card->driver->set_iso_channels(ctx, channels); +} + int fw_iso_context_queue(struct fw_iso_context *ctx, struct fw_iso_packet *packet, struct fw_iso_buffer *buffer, unsigned long payload) { - struct fw_card *card = ctx->card; - - return card->driver->queue_iso(ctx, packet, buffer, payload); + return ctx->card->driver->queue_iso(ctx, packet, buffer, payload); } EXPORT_SYMBOL(fw_iso_context_queue); diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 28621e44b111..e6239f971be6 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -90,6 +90,8 @@ struct fw_card_driver { int (*start_iso)(struct fw_iso_context *ctx, s32 cycle, u32 sync, u32 tags); + int (*set_iso_channels)(struct fw_iso_context *ctx, u64 *channels); + int (*queue_iso)(struct fw_iso_context *ctx, struct fw_iso_packet *packet, struct fw_iso_buffer *buffer, diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 2e4b425847a7..4bda1c1b74ba 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -190,11 +190,13 @@ struct fw_ohci { struct context at_request_ctx; struct context at_response_ctx; - u32 it_context_mask; + u32 it_context_mask; /* unoccupied IT contexts */ struct iso_context *it_context_list; - u64 ir_context_channels; - u32 ir_context_mask; + u64 ir_context_channels; /* unoccupied channels */ + u32 ir_context_mask; /* unoccupied IR contexts */ struct iso_context *ir_context_list; + u64 mc_channels; /* channels in use by the multichannel IR context */ + bool mc_allocated; __be32 *config_rom; dma_addr_t config_rom_bus; @@ -2197,10 +2199,9 @@ static int handle_ir_packet_per_buffer(struct context *context, __le32 *ir_header; void *p; - for (pd = d; pd <= last; pd++) { + for (pd = d; pd <= last; pd++) if (pd->transfer_status) break; - } if (pd > last) /* Descriptor(s) not done yet, stop iteration */ return 0; @@ -2210,16 +2211,38 @@ static int handle_ir_packet_per_buffer(struct context *context, if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) { ir_header = (__le32 *) p; - ctx->base.callback(&ctx->base, - le32_to_cpu(ir_header[0]) & 0xffff, - ctx->header_length, ctx->header, - ctx->base.callback_data); + ctx->base.callback.sc(&ctx->base, + le32_to_cpu(ir_header[0]) & 0xffff, + 
ctx->header_length, ctx->header, + ctx->base.callback_data); ctx->header_length = 0; } return 1; } +/* d == last because each descriptor block is only a single descriptor. */ +static int handle_ir_buffer_fill(struct context *context, + struct descriptor *d, + struct descriptor *last) +{ + struct iso_context *ctx = + container_of(context, struct iso_context, context); + + if (!last->transfer_status) + /* Descriptor(s) not done yet, stop iteration */ + return 0; + + if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) + ctx->base.callback.mc(&ctx->base, + le32_to_cpu(last->data_address) + + le16_to_cpu(last->req_count) - + le16_to_cpu(last->res_count), + ctx->base.callback_data); + + return 1; +} + static int handle_it_packet(struct context *context, struct descriptor *d, struct descriptor *last) @@ -2245,72 +2268,118 @@ static int handle_it_packet(struct context *context, ctx->header_length += 4; } if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) { - ctx->base.callback(&ctx->base, le16_to_cpu(last->res_count), - ctx->header_length, ctx->header, - ctx->base.callback_data); + ctx->base.callback.sc(&ctx->base, le16_to_cpu(last->res_count), + ctx->header_length, ctx->header, + ctx->base.callback_data); ctx->header_length = 0; } return 1; } +static void set_multichannel_mask(struct fw_ohci *ohci, u64 channels) +{ + u32 hi = channels >> 32, lo = channels; + + reg_write(ohci, OHCI1394_IRMultiChanMaskHiClear, ~hi); + reg_write(ohci, OHCI1394_IRMultiChanMaskLoClear, ~lo); + reg_write(ohci, OHCI1394_IRMultiChanMaskHiSet, hi); + reg_write(ohci, OHCI1394_IRMultiChanMaskLoSet, lo); + mmiowb(); + ohci->mc_channels = channels; +} + static struct fw_iso_context *ohci_allocate_iso_context(struct fw_card *card, int type, int channel, size_t header_size) { struct fw_ohci *ohci = fw_ohci(card); - struct iso_context *ctx, *list; - descriptor_callback_t callback; - u64 *channels, dont_care = ~0ULL; - u32 *mask, regs; + struct iso_context *uninitialized_var(ctx); + descriptor_callback_t uninitialized_var(callback); + u64 *uninitialized_var(channels); + u32 *uninitialized_var(mask), uninitialized_var(regs); unsigned long flags; - int index, ret = -ENOMEM; + int index, ret = -EBUSY; - if (type == FW_ISO_CONTEXT_TRANSMIT) { - channels = &dont_care; - mask = &ohci->it_context_mask; - list = ohci->it_context_list; + spin_lock_irqsave(&ohci->lock, flags); + + switch (type) { + case FW_ISO_CONTEXT_TRANSMIT: + mask = &ohci->it_context_mask; callback = handle_it_packet; - } else { + index = ffs(*mask) - 1; + if (index >= 0) { + *mask &= ~(1 << index); + regs = OHCI1394_IsoXmitContextBase(index); + ctx = &ohci->it_context_list[index]; + } + break; + + case FW_ISO_CONTEXT_RECEIVE: channels = &ohci->ir_context_channels; - mask = &ohci->ir_context_mask; - list = ohci->ir_context_list; + mask = &ohci->ir_context_mask; callback = handle_ir_packet_per_buffer; - } + index = *channels & 1ULL << channel ? ffs(*mask) - 1 : -1; + if (index >= 0) { + *channels &= ~(1ULL << channel); + *mask &= ~(1 << index); + regs = OHCI1394_IsoRcvContextBase(index); + ctx = &ohci->ir_context_list[index]; + } + break; - spin_lock_irqsave(&ohci->lock, flags); - index = *channels & 1ULL << channel ? ffs(*mask) - 1 : -1; - if (index >= 0) { - *channels &= ~(1ULL << channel); - *mask &= ~(1 << index); + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + mask = &ohci->ir_context_mask; + callback = handle_ir_buffer_fill; + index = !ohci->mc_allocated ? 
ffs(*mask) - 1 : -1; + if (index >= 0) { + ohci->mc_allocated = true; + *mask &= ~(1 << index); + regs = OHCI1394_IsoRcvContextBase(index); + ctx = &ohci->ir_context_list[index]; + } + break; + + default: + index = -1; + ret = -ENOSYS; } + spin_unlock_irqrestore(&ohci->lock, flags); if (index < 0) - return ERR_PTR(-EBUSY); - - if (type == FW_ISO_CONTEXT_TRANSMIT) - regs = OHCI1394_IsoXmitContextBase(index); - else - regs = OHCI1394_IsoRcvContextBase(index); + return ERR_PTR(ret); - ctx = &list[index]; memset(ctx, 0, sizeof(*ctx)); ctx->header_length = 0; ctx->header = (void *) __get_free_page(GFP_KERNEL); - if (ctx->header == NULL) + if (ctx->header == NULL) { + ret = -ENOMEM; goto out; - + } ret = context_init(&ctx->context, ohci, regs, callback); if (ret < 0) goto out_with_header; + if (type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL) + set_multichannel_mask(ohci, 0); + return &ctx->base; out_with_header: free_page((unsigned long)ctx->header); out: spin_lock_irqsave(&ohci->lock, flags); - *channels |= 1ULL << channel; + + switch (type) { + case FW_ISO_CONTEXT_RECEIVE: + *channels |= 1ULL << channel; + break; + + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + ohci->mc_allocated = false; + break; + } *mask |= 1 << index; + spin_unlock_irqrestore(&ohci->lock, flags); return ERR_PTR(ret); @@ -2321,10 +2390,11 @@ static int ohci_start_iso(struct fw_iso_context *base, { struct iso_context *ctx = container_of(base, struct iso_context, base); struct fw_ohci *ohci = ctx->context.ohci; - u32 control, match; + u32 control = IR_CONTEXT_ISOCH_HEADER, match; int index; - if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) { + switch (ctx->base.type) { + case FW_ISO_CONTEXT_TRANSMIT: index = ctx - ohci->it_context_list; match = 0; if (cycle >= 0) @@ -2334,9 +2404,13 @@ static int ohci_start_iso(struct fw_iso_context *base, reg_write(ohci, OHCI1394_IsoXmitIntEventClear, 1 << index); reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, 1 << index); context_run(&ctx->context, match); - } else { + break; + + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + control |= IR_CONTEXT_BUFFER_FILL|IR_CONTEXT_MULTI_CHANNEL_MODE; + /* fall through */ + case FW_ISO_CONTEXT_RECEIVE: index = ctx - ohci->ir_context_list; - control = IR_CONTEXT_ISOCH_HEADER; match = (tags << 28) | (sync << 8) | ctx->base.channel; if (cycle >= 0) { match |= (cycle & 0x07fff) << 12; @@ -2347,6 +2421,7 @@ static int ohci_start_iso(struct fw_iso_context *base, reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, 1 << index); reg_write(ohci, CONTEXT_MATCH(ctx->context.regs), match); context_run(&ctx->context, control); + break; } return 0; @@ -2358,12 +2433,17 @@ static int ohci_stop_iso(struct fw_iso_context *base) struct iso_context *ctx = container_of(base, struct iso_context, base); int index; - if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) { + switch (ctx->base.type) { + case FW_ISO_CONTEXT_TRANSMIT: index = ctx - ohci->it_context_list; reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, 1 << index); - } else { + break; + + case FW_ISO_CONTEXT_RECEIVE: + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: index = ctx - ohci->ir_context_list; reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, 1 << index); + break; } flush_writes(ohci); context_stop(&ctx->context); @@ -2384,24 +2464,65 @@ static void ohci_free_iso_context(struct fw_iso_context *base) spin_lock_irqsave(&ohci->lock, flags); - if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) { + switch (base->type) { + case FW_ISO_CONTEXT_TRANSMIT: index = ctx - ohci->it_context_list; ohci->it_context_mask |= 1 << index; - } else { + break; + + 
case FW_ISO_CONTEXT_RECEIVE: index = ctx - ohci->ir_context_list; ohci->ir_context_mask |= 1 << index; ohci->ir_context_channels |= 1ULL << base->channel; + break; + + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + index = ctx - ohci->ir_context_list; + ohci->ir_context_mask |= 1 << index; + ohci->ir_context_channels |= ohci->mc_channels; + ohci->mc_channels = 0; + ohci->mc_allocated = false; + break; } spin_unlock_irqrestore(&ohci->lock, flags); } -static int ohci_queue_iso_transmit(struct fw_iso_context *base, - struct fw_iso_packet *packet, - struct fw_iso_buffer *buffer, - unsigned long payload) +static int ohci_set_iso_channels(struct fw_iso_context *base, u64 *channels) +{ + struct fw_ohci *ohci = fw_ohci(base->card); + unsigned long flags; + int ret; + + switch (base->type) { + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + + spin_lock_irqsave(&ohci->lock, flags); + + /* Don't allow multichannel to grab other contexts' channels. */ + if (~ohci->ir_context_channels & ~ohci->mc_channels & *channels) { + *channels = ohci->ir_context_channels; + ret = -EBUSY; + } else { + set_multichannel_mask(ohci, *channels); + ret = 0; + } + + spin_unlock_irqrestore(&ohci->lock, flags); + + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int queue_iso_transmit(struct iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload) { - struct iso_context *ctx = container_of(base, struct iso_context, base); struct descriptor *d, *last, *pd; struct fw_iso_packet *p; __le32 *header; @@ -2497,14 +2618,12 @@ static int ohci_queue_iso_transmit(struct fw_iso_context *base, return 0; } -static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base, - struct fw_iso_packet *packet, - struct fw_iso_buffer *buffer, - unsigned long payload) +static int queue_iso_packet_per_buffer(struct iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload) { - struct iso_context *ctx = container_of(base, struct iso_context, base); struct descriptor *d, *pd; - struct fw_iso_packet *p = packet; dma_addr_t d_bus, page_bus; u32 z, header_z, rest; int i, j, length; @@ -2514,14 +2633,14 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base, * The OHCI controller puts the isochronous header and trailer in the * buffer, so we need at least 8 bytes. */ - packet_count = p->header_length / ctx->base.header_size; + packet_count = packet->header_length / ctx->base.header_size; header_size = max(ctx->base.header_size, (size_t)8); /* Get header size in number of descriptors. 
*/ header_z = DIV_ROUND_UP(header_size, sizeof(*d)); page = payload >> PAGE_SHIFT; offset = payload & ~PAGE_MASK; - payload_per_buffer = p->payload_length / packet_count; + payload_per_buffer = packet->payload_length / packet_count; for (i = 0; i < packet_count; i++) { /* d points to the header descriptor */ @@ -2533,7 +2652,7 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base, d->control = cpu_to_le16(DESCRIPTOR_STATUS | DESCRIPTOR_INPUT_MORE); - if (p->skip && i == 0) + if (packet->skip && i == 0) d->control |= cpu_to_le16(DESCRIPTOR_WAIT); d->req_count = cpu_to_le16(header_size); d->res_count = d->req_count; @@ -2566,7 +2685,7 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base, pd->control = cpu_to_le16(DESCRIPTOR_STATUS | DESCRIPTOR_INPUT_LAST | DESCRIPTOR_BRANCH_ALWAYS); - if (p->interrupt && i == packet_count - 1) + if (packet->interrupt && i == packet_count - 1) pd->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS); context_append(&ctx->context, d, z, header_z); @@ -2575,6 +2694,58 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base, return 0; } +static int queue_iso_buffer_fill(struct iso_context *ctx, + struct fw_iso_packet *packet, + struct fw_iso_buffer *buffer, + unsigned long payload) +{ + struct descriptor *d; + dma_addr_t d_bus, page_bus; + int page, offset, rest, z, i, length; + + page = payload >> PAGE_SHIFT; + offset = payload & ~PAGE_MASK; + rest = packet->payload_length; + + /* We need one descriptor for each page in the buffer. */ + z = DIV_ROUND_UP(offset + rest, PAGE_SIZE); + + if (WARN_ON(offset & 3 || rest & 3 || page + z > buffer->page_count)) + return -EFAULT; + + for (i = 0; i < z; i++) { + d = context_get_descriptors(&ctx->context, 1, &d_bus); + if (d == NULL) + return -ENOMEM; + + d->control = cpu_to_le16(DESCRIPTOR_INPUT_MORE | + DESCRIPTOR_BRANCH_ALWAYS); + if (packet->skip && i == 0) + d->control |= cpu_to_le16(DESCRIPTOR_WAIT); + if (packet->interrupt && i == z - 1) + d->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS); + + if (offset + rest < PAGE_SIZE) + length = rest; + else + length = PAGE_SIZE - offset; + d->req_count = cpu_to_le16(length); + d->res_count = d->req_count; + d->transfer_status = 0; + + page_bus = page_private(buffer->pages[page]); + d->data_address = cpu_to_le32(page_bus + offset); + + rest -= length; + offset = 0; + page++; + + context_append(&ctx->context, d, 1, 0); + } + + return 0; +} + static int ohci_queue_iso(struct fw_iso_context *base, struct fw_iso_packet *packet, struct fw_iso_buffer *buffer, @@ -2582,14 +2753,20 @@ static int ohci_queue_iso(struct fw_iso_context *base, { struct iso_context *ctx = container_of(base, struct iso_context, base); unsigned long flags; - int ret; + int ret = -ENOSYS; spin_lock_irqsave(&ctx->context.ohci->lock, flags); - if (base->type == FW_ISO_CONTEXT_TRANSMIT) - ret = ohci_queue_iso_transmit(base, packet, buffer, payload); - else - ret = ohci_queue_iso_receive_packet_per_buffer(base, packet, - buffer, payload); + switch (base->type) { + case FW_ISO_CONTEXT_TRANSMIT: + ret = queue_iso_transmit(ctx, packet, buffer, payload); + break; + case FW_ISO_CONTEXT_RECEIVE: + ret = queue_iso_packet_per_buffer(ctx, packet, buffer, payload); + break; + case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + ret = queue_iso_buffer_fill(ctx, packet, buffer, payload); + break; + } spin_unlock_irqrestore(&ctx->context.ohci->lock, flags); return ret; @@ -2609,6 +2786,7 @@ static const struct fw_card_driver ohci_driver = { 
.allocate_iso_context = ohci_allocate_iso_context, .free_iso_context = ohci_free_iso_context, + .set_iso_channels = ohci_set_iso_channels, .queue_iso = ohci_queue_iso, .start_iso = ohci_start_iso, .stop_iso = ohci_stop_iso, diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h index 14831119ff71..bc5c26fc1c64 100644 --- a/include/linux/firewire-cdev.h +++ b/include/linux/firewire-cdev.h @@ -25,17 +25,18 @@ #include #include -#define FW_CDEV_EVENT_BUS_RESET 0x00 -#define FW_CDEV_EVENT_RESPONSE 0x01 -#define FW_CDEV_EVENT_REQUEST 0x02 -#define FW_CDEV_EVENT_ISO_INTERRUPT 0x03 -#define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED 0x04 -#define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED 0x05 +#define FW_CDEV_EVENT_BUS_RESET 0x00 +#define FW_CDEV_EVENT_RESPONSE 0x01 +#define FW_CDEV_EVENT_REQUEST 0x02 +#define FW_CDEV_EVENT_ISO_INTERRUPT 0x03 +#define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED 0x04 +#define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED 0x05 /* available since kernel version 2.6.36 */ -#define FW_CDEV_EVENT_REQUEST2 0x06 -#define FW_CDEV_EVENT_PHY_PACKET_SENT 0x07 -#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED 0x08 +#define FW_CDEV_EVENT_REQUEST2 0x06 +#define FW_CDEV_EVENT_PHY_PACKET_SENT 0x07 +#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED 0x08 +#define FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL 0x09 /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types @@ -218,35 +219,41 @@ struct fw_cdev_event_request2 { * This event is sent when the controller has completed an &fw_cdev_iso_packet * with the %FW_CDEV_ISO_INTERRUPT bit set. * - * Isochronous transmit events: + * Isochronous transmit events (context type %FW_CDEV_ISO_CONTEXT_TRANSMIT): * - * In version 1 of the ABI, &header_length is 0. In version 3 and some - * implementations of version 2 of the ABI, &header_length is a multiple of 4 - * and &header contains timestamps of all packets up until the interrupt packet. - * The format of the timestamps is as described below for isochronous reception. + * In version 3 and some implementations of version 2 of the ABI, &header_length + * is a multiple of 4 and &header contains timestamps of all packets up until + * the interrupt packet. The format of the timestamps is as described below for + * isochronous reception. In version 1 of the ABI, &header_length was 0. * - * Isochronous receive events: + * Isochronous receive events (context type %FW_CDEV_ISO_CONTEXT_RECEIVE): * * The headers stripped of all packets up until and including the interrupt * packet are returned in the @header field. The amount of header data per * packet is as specified at iso context creation by * &fw_cdev_create_iso_context.header_size. * - * In version 1 of this ABI, header data consisted of the 1394 isochronous - * packet header, followed by quadlets from the packet payload if - * &fw_cdev_create_iso_context.header_size > 4. + * Hence, _interrupt.header_length / _context.header_size is the number of + * packets received in this interrupt event. The client can now iterate + * through the mmap()'ed DMA buffer according to this number of packets and + * to the buffer sizes as the client specified in &fw_cdev_queue_iso. * - * In version 2 of this ABI, header data consist of the 1394 isochronous - * packet header, followed by a timestamp quadlet if - * &fw_cdev_create_iso_context.header_size > 4, followed by quadlets from the - * packet payload if &fw_cdev_create_iso_context.header_size > 8. 
+ * Since version 2 of this ABI, the portion for each packet in _interrupt.header + * consists of the 1394 isochronous packet header, followed by a timestamp + * quadlet if &fw_cdev_create_iso_context.header_size > 4, followed by quadlets + * from the packet payload if &fw_cdev_create_iso_context.header_size > 8. * - * Behaviour of ver. 1 of this ABI is no longer available since ABI ver. 2. + * Format of 1394 iso packet header: 16 bits data_length, 2 bits tag, 6 bits + * channel, 4 bits tcode, 4 bits sy, in big endian byte order. + * data_length is the actual received size of the packet without the four + * 1394 iso packet header bytes. + * + * Format of timestamp: 16 bits invalid, 3 bits cycleSeconds, 13 bits + * cycleCount, in big endian byte order. * - * Format of 1394 iso packet header: 16 bits len, 2 bits tag, 6 bits channel, - * 4 bits tcode, 4 bits sy, in big endian byte order. Format of timestamp: - * 16 bits invalid, 3 bits cycleSeconds, 13 bits cycleCount, in big endian byte - * order. + * In version 1 of the ABI, no timestamp quadlet was inserted; instead, payload + * data followed directly after the 1394 is header if header_size > 4. + * Behaviour of ver. 1 of this ABI is no longer available since ABI ver. 2. */ struct fw_cdev_event_iso_interrupt { __u64 closure; @@ -256,6 +263,43 @@ struct fw_cdev_event_iso_interrupt { __u32 header[0]; }; +/** + * struct fw_cdev_event_iso_interrupt_mc - An iso buffer chunk was completed + * @closure: See &fw_cdev_event_common; + * set by %FW_CDEV_CREATE_ISO_CONTEXT ioctl + * @type: %FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL + * @completed: Offset into the receive buffer; data before this offest is valid + * + * This event is sent in multichannel contexts (context type + * %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL) for &fw_cdev_iso_packet buffer + * chunks that have the %FW_CDEV_ISO_INTERRUPT bit set. Whether this happens + * when a packet is completed and/or when a buffer chunk is completed depends + * on the hardware implementation. + * + * The buffer is continuously filled with the following data, per packet: + * - the 1394 iso packet header as described at &fw_cdev_event_iso_interrupt, + * but in little endian byte order, + * - packet payload (as many bytes as specified in the data_length field of + * the 1394 iso packet header) in big endian byte order, + * - 0...3 padding bytes as needed to align the following trailer quadlet, + * - trailer quadlet, containing the reception timestamp as described at + * &fw_cdev_event_iso_interrupt, but in little endian byte order. + * + * Hence the per-packet size is data_length (rounded up to a multiple of 4) + 8. + * When processing the data, stop before a packet that would cross the + * @completed offset. + * + * A packet near the end of a buffer chunk will typically spill over into the + * next queued buffer chunk. It is the responsibility of the client to check + * for this condition, assemble a broken-up packet from its parts, and not to + * re-queue any buffer chunks in which as yet unread packet parts reside. 
+ */ +struct fw_cdev_event_iso_interrupt_mc { + __u64 closure; + __u32 type; + __u32 completed; +}; + /** * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed * @closure: See &fw_cdev_event_common; @@ -311,16 +355,18 @@ struct fw_cdev_event_phy_packet { /** * union fw_cdev_event - Convenience union of fw_cdev_event_ types - * @common: Valid for all types - * @bus_reset: Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET - * @response: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE - * @request: Valid if @common.type == %FW_CDEV_EVENT_REQUEST - * @request2: Valid if @common.type == %FW_CDEV_EVENT_REQUEST2 - * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT - * @iso_resource: Valid if @common.type == + * @common: Valid for all types + * @bus_reset: Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET + * @response: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE + * @request: Valid if @common.type == %FW_CDEV_EVENT_REQUEST + * @request2: Valid if @common.type == %FW_CDEV_EVENT_REQUEST2 + * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT + * @iso_interrupt_mc: Valid if @common.type == + * %FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL + * @iso_resource: Valid if @common.type == * %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED - * @phy_packet: Valid if @common.type == + * @phy_packet: Valid if @common.type == * %FW_CDEV_EVENT_PHY_PACKET_SENT or * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED * @@ -337,10 +383,11 @@ union fw_cdev_event { struct fw_cdev_event_bus_reset bus_reset; struct fw_cdev_event_response response; struct fw_cdev_event_request request; - struct fw_cdev_event_request2 request2; /* added in 2.6.36 */ + struct fw_cdev_event_request2 request2; /* added in 2.6.36 */ struct fw_cdev_event_iso_interrupt iso_interrupt; - struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ - struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ + struct fw_cdev_event_iso_interrupt_mc iso_interrupt_mc; /* added in 2.6.36 */ + struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ + struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ }; /* available since kernel version 2.6.22 */ @@ -375,6 +422,7 @@ union fw_cdev_event { /* available since kernel version 2.6.36 */ #define FW_CDEV_IOC_SEND_PHY_PACKET _IOWR('#', 0x15, struct fw_cdev_send_phy_packet) #define FW_CDEV_IOC_RECEIVE_PHY_PACKETS _IOW('#', 0x16, struct fw_cdev_receive_phy_packets) +#define FW_CDEV_IOC_SET_ISO_CHANNELS _IOW('#', 0x17, struct fw_cdev_set_iso_channels) /* * ABI version history @@ -391,10 +439,13 @@ union fw_cdev_event { * - shared use and auto-response for FCP registers * 3 (2.6.34) - made &fw_cdev_get_cycle_timer reliable * - added %FW_CDEV_IOC_GET_CYCLE_TIMER2 - * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_* + * 4 (2.6.36) - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_*, + * and &fw_cdev_allocate.region_end * - implemented &fw_cdev_event_bus_reset.bm_node_id * - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS - * - added &fw_cdev_allocate.region_end + * - added %FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL, + * %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL, and + * %FW_CDEV_IOC_SET_ISO_CHANNELS */ #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. 
*/ @@ -597,34 +648,43 @@ struct fw_cdev_remove_descriptor { __u32 handle; }; -#define FW_CDEV_ISO_CONTEXT_TRANSMIT 0 -#define FW_CDEV_ISO_CONTEXT_RECEIVE 1 +#define FW_CDEV_ISO_CONTEXT_TRANSMIT 0 +#define FW_CDEV_ISO_CONTEXT_RECEIVE 1 +#define FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL 2 /* added in 2.6.36 */ /** - * struct fw_cdev_create_iso_context - Create a context for isochronous IO - * @type: %FW_CDEV_ISO_CONTEXT_TRANSMIT or %FW_CDEV_ISO_CONTEXT_RECEIVE - * @header_size: Header size to strip for receive contexts - * @channel: Channel to bind to - * @speed: Speed for transmit contexts - * @closure: To be returned in &fw_cdev_event_iso_interrupt + * struct fw_cdev_create_iso_context - Create a context for isochronous I/O + * @type: %FW_CDEV_ISO_CONTEXT_TRANSMIT or %FW_CDEV_ISO_CONTEXT_RECEIVE or + * %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL + * @header_size: Header size to strip in single-channel reception + * @channel: Channel to bind to in single-channel reception or transmission + * @speed: Transmission speed + * @closure: To be returned in &fw_cdev_event_iso_interrupt or + * &fw_cdev_event_iso_interrupt_multichannel + * @handle: Handle to context, written back by kernel * * Prior to sending or receiving isochronous I/O, a context must be created. * The context records information about the transmit or receive configuration * and typically maps to an underlying hardware resource. A context is set up * for either sending or receiving. It is bound to a specific isochronous - * channel. + * @channel. * - * If a context was successfully created, the kernel writes back a handle to the - * context, which must be passed in for subsequent operations on that context. + * In case of multichannel reception, @header_size and @channel are ignored + * and the channels are selected by %FW_CDEV_IOC_SET_ISO_CHANNELS. + * + * For %FW_CDEV_ISO_CONTEXT_RECEIVE contexts, @header_size must be at least 4 + * and must be a multiple of 4. It is ignored in other context types. * - * For receive contexts, @header_size must be at least 4 and must be a multiple - * of 4. + * @speed is ignored in receive context types. * - * Note that the effect of a @header_size > 4 depends on - * &fw_cdev_get_info.version, as documented at &fw_cdev_event_iso_interrupt. + * If a context was successfully created, the kernel writes back a handle to the + * context, which must be passed in for subsequent operations on that context. * + * Limitations: * No more than one iso context can be created per fd. + * The total number of contexts that all userspace and kernelspace drivers can + * create on a card at a time is a hardware limit, typically 4 or 8 contexts per + * direction, and of them at most one multichannel receive context. */ struct fw_cdev_create_iso_context { __u32 type; @@ -635,6 +695,22 @@ struct fw_cdev_create_iso_context { __u32 handle; }; +/** + * struct fw_cdev_set_iso_channels - Select channels in multichannel reception + * @channels: Bitmask of channels to listen to + * @handle: Handle of the multichannel receive context + * + * @channels is the bitwise or of 1ULL << n for each channel n to listen to. + * + * The ioctl fails with errno %EBUSY if there is already another receive context + * on a channel in @channels. In that case, the bitmask of all unoccupied + * channels is returned in @channels.
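A hypothetical userspace call of the new ioctl, for illustration: 'fd' is an open /dev/fw* descriptor on which a multichannel receive context was created; since only one iso context exists per file descriptor, @handle is 0 here.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/firewire-cdev.h>

static int select_channels(int fd)
{
	struct fw_cdev_set_iso_channels req = {
		.channels = (1ULL << 3) | (1ULL << 4),	/* listen on 3 and 4 */
		.handle   = 0,
	};

	/* fails with EBUSY if another receive context owns one of these */
	return ioctl(fd, FW_CDEV_IOC_SET_ISO_CHANNELS, &req);
}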
+ */ +struct fw_cdev_set_iso_channels { + __u64 channels; + __u32 handle; +}; + #define FW_CDEV_ISO_PAYLOAD_LENGTH(v) (v) #define FW_CDEV_ISO_INTERRUPT (1 << 16) #define FW_CDEV_ISO_SKIP (1 << 17) @@ -645,42 +721,72 @@ struct fw_cdev_create_iso_context { /** * struct fw_cdev_iso_packet - Isochronous packet - * @control: Contains the header length (8 uppermost bits), the sy field - * (4 bits), the tag field (2 bits), a sync flag (1 bit), - * a skip flag (1 bit), an interrupt flag (1 bit), and the + * @control: Contains the header length (8 uppermost bits), + * the sy field (4 bits), the tag field (2 bits), a sync flag + * or a skip flag (1 bit), an interrupt flag (1 bit), and the * payload length (16 lowermost bits) - * @header: Header and payload + * @header: Header and payload in case of a transmit context. * * &struct fw_cdev_iso_packet is used to describe isochronous packet queues. - * * Use the FW_CDEV_ISO_ macros to fill in @control. + * The @header array is empty in case of receive contexts. + * + * Context type %FW_CDEV_ISO_CONTEXT_TRANSMIT: + * + * @control.HEADER_LENGTH must be a multiple of 4. It specifies the numbers of + * bytes in @header that will be prepended to the packet's payload. These bytes + * are copied into the kernel and will not be accessed after the ioctl has + * returned. + * + * The @control.SY and TAG fields are copied to the iso packet header. These + * fields are specified by IEEE 1394a and IEC 61883-1. + * + * The @control.SKIP flag specifies that no packet is to be sent in a frame. + * When using this, all other fields except @control.INTERRUPT must be zero. + * + * When a packet with the @control.INTERRUPT flag set has been completed, an + * &fw_cdev_event_iso_interrupt event will be sent. + * + * Context type %FW_CDEV_ISO_CONTEXT_RECEIVE: * - * For transmit packets, the header length must be a multiple of 4 and specifies - * the numbers of bytes in @header that will be prepended to the packet's - * payload; these bytes are copied into the kernel and will not be accessed - * after the ioctl has returned. The sy and tag fields are copied to the iso - * packet header (these fields are specified by IEEE 1394a and IEC 61883-1). - * The skip flag specifies that no packet is to be sent in a frame; when using - * this, all other fields except the interrupt flag must be zero. - * - * For receive packets, the header length must be a multiple of the context's - * header size; if the header length is larger than the context's header size, - * multiple packets are queued for this entry. The sy and tag fields are - * ignored. If the sync flag is set, the context drops all packets until - * a packet with a matching sy field is received (the sync value to wait for is - * specified in the &fw_cdev_start_iso structure). The payload length defines - * how many payload bytes can be received for one packet (in addition to payload - * quadlets that have been defined as headers and are stripped and returned in - * the &fw_cdev_event_iso_interrupt structure). If more bytes are received, the - * additional bytes are dropped. If less bytes are received, the remaining - * bytes in this part of the payload buffer will not be written to, not even by - * the next packet, i.e., packets received in consecutive frames will not - * necessarily be consecutive in memory. If an entry has queued multiple - * packets, the payload length is divided equally among them. 
- * - * When a packet with the interrupt flag set has been completed, the + * @control.HEADER_LENGTH must be a multiple of the context's header_size. + * If the HEADER_LENGTH is larger than the context's header_size, multiple + * packets are queued for this entry. + * + * The @control.SY and TAG fields are ignored. + * + * If the @control.SYNC flag is set, the context drops all packets until a + * packet with a sy field is received which matches &fw_cdev_start_iso.sync. + * + * @control.PAYLOAD_LENGTH defines how many payload bytes can be received for + * one packet (in addition to payload quadlets that have been defined as headers + * and are stripped and returned in the &fw_cdev_event_iso_interrupt structure). + * If more bytes are received, the additional bytes are dropped. If less bytes + * are received, the remaining bytes in this part of the payload buffer will not + * be written to, not even by the next packet. I.e., packets received in + * consecutive frames will not necessarily be consecutive in memory. If an + * entry has queued multiple packets, the PAYLOAD_LENGTH is divided equally + * among them. + * + * When a packet with the @control.INTERRUPT flag set has been completed, an * &fw_cdev_event_iso_interrupt event will be sent. An entry that has queued * multiple receive packets is completed when its last packet is completed. + * + * Context type %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL: + * + * Here, &fw_cdev_iso_packet would be more aptly named _iso_buffer_chunk since + * it specifies a chunk of the mmap()'ed buffer, while the number and alignment + * of packets to be placed into the buffer chunk is not known beforehand. + * + * @control.PAYLOAD_LENGTH is the size of the buffer chunk and specifies room + * for header, payload, padding, and trailer bytes of one or more packets. + * It must be a multiple of 4. + * + * @control.HEADER_LENGTH, TAG and SY are ignored. SYNC is treated as described + * for single-channel reception. + * + * When a buffer chunk with the @control.INTERRUPT flag set has been filled + * entirely, an &fw_cdev_event_iso_interrupt_mc event will be sent. */ struct fw_cdev_iso_packet { __u32 control; @@ -689,9 +795,9 @@ struct fw_cdev_iso_packet { /** * struct fw_cdev_queue_iso - Queue isochronous packets for I/O - * @packets: Userspace pointer to packet data + * @packets: Userspace pointer to an array of &fw_cdev_iso_packet * @data: Pointer into mmap()'ed payload buffer - * @size: Size of packet data in bytes + * @size: Size of the @packets array, in bytes * @handle: Isochronous context handle * * Queue a number of isochronous packets for reception or transmission. @@ -704,6 +810,9 @@ struct fw_cdev_iso_packet { * The kernel may or may not queue all packets, but will write back updated * values of the @packets, @data and @size fields, so the ioctl can be * resubmitted easily. + * + * In case of a multichannel receive context, @data must be quadlet-aligned + * relative to the buffer start. */ struct fw_cdev_queue_iso { __u64 packets; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d974aa4a24c9..1cd637ef62d2 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -372,17 +372,19 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc); * scatter-gather streaming (e.g. assembling video frame automatically). */ struct fw_iso_packet { - u16 payload_length; /* Length of indirect payload. */ - u32 interrupt:1; /* Generate interrupt on this packet */ - u32 skip:1; /* Set to not send packet at all. 
*/ - u32 tag:2; - u32 sy:4; - u32 header_length:8; /* Length of immediate header. */ - u32 header[0]; + u16 payload_length; /* Length of indirect payload */ + u32 interrupt:1; /* Generate interrupt on this packet */ + u32 skip:1; /* tx: Set to not send packet at all */ + /* rx: Sync bit, wait for matching sy */ + u32 tag:2; /* tx: Tag in packet header */ + u32 sy:4; /* tx: Sy in packet header */ + u32 header_length:8; /* Length of immediate header */ + u32 header[0]; /* tx: Top of 1394 isoch. data_block */ }; -#define FW_ISO_CONTEXT_TRANSMIT 0 -#define FW_ISO_CONTEXT_RECEIVE 1 +#define FW_ISO_CONTEXT_TRANSMIT 0 +#define FW_ISO_CONTEXT_RECEIVE 1 +#define FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL 2 #define FW_ISO_CONTEXT_MATCH_TAG0 1 #define FW_ISO_CONTEXT_MATCH_TAG1 2 @@ -406,24 +408,31 @@ struct fw_iso_buffer { int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card, int page_count, enum dma_data_direction direction); void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card); +size_t fw_iso_buffer_lookup(struct fw_iso_buffer *buffer, dma_addr_t completed); struct fw_iso_context; typedef void (*fw_iso_callback_t)(struct fw_iso_context *context, u32 cycle, size_t header_length, void *header, void *data); +typedef void (*fw_iso_mc_callback_t)(struct fw_iso_context *context, + dma_addr_t completed, void *data); struct fw_iso_context { struct fw_card *card; int type; int channel; int speed; size_t header_size; - fw_iso_callback_t callback; + union { + fw_iso_callback_t sc; + fw_iso_mc_callback_t mc; + } callback; void *callback_data; }; struct fw_iso_context *fw_iso_context_create(struct fw_card *card, int type, int channel, int speed, size_t header_size, fw_iso_callback_t callback, void *callback_data); +int fw_iso_context_set_channels(struct fw_iso_context *ctx, u64 *channels); int fw_iso_context_queue(struct fw_iso_context *ctx, struct fw_iso_packet *packet, struct fw_iso_buffer *buffer, -- cgit v1.2.3 From c6601225380088018ae93df2ba7f0bb65334d63b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 23 Jul 2010 15:04:01 -0600 Subject: of/device: Make of_device_make_bus_id() usable by other code. The AMBA bus should also use of_device_make_bus_id() when populating device out of device tree data. This patch makes the function non-static, and adds a suitable prototype in of_device.h Signed-off-by: Grant Likely --- drivers/of/platform.c | 2 +- include/linux/of_device.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 033a224a9fda..30a4641e798a 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -508,7 +508,7 @@ EXPORT_SYMBOL(of_unregister_driver); * value to derive a unique name. As a last resort it will use the node * name followed by a unique number. */ -static void of_device_make_bus_id(struct device *dev) +void of_device_make_bus_id(struct device *dev) { static atomic_t bus_no_reg_magic; struct device_node *node = dev->of_node; diff --git a/include/linux/of_device.h b/include/linux/of_device.h index e11a0be7893a..35aa44ad9f2c 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -27,6 +27,7 @@ extern const struct of_device_id *of_match_device( const struct of_device_id *matches, const struct device *dev); +extern void of_device_make_bus_id(struct device *dev); /** * of_driver_match_device - Tell if a driver's of_match_table matches a device. 
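A sketch of the intended use from other bus code such as AMBA (an illustration, not from this patch; the allocation pattern and function name are hypothetical):

#include <linux/device.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/slab.h>

static struct device *example_create_of_dev(struct device_node *np,
					    struct device *parent)
{
	struct device *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;

	device_initialize(dev);
	dev->parent = parent;
	dev->of_node = of_node_get(np);
	/* bus id derived from reg/ranges, or node name plus a unique number */
	of_device_make_bus_id(dev);
	return dev;
}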
-- cgit v1.2.3 From 559e2b7ee7a1c7753d534abcb2742a4775339293 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 23 Jul 2010 20:11:18 -0600 Subject: of: Provide default of_node_to_nid() implementation. of_node_to_nid() is only relevant in a few architectures. Don't force everyone to implement it anyway. Signed-off-by: Grant Likely --- arch/microblaze/include/asm/topology.h | 10 ---------- arch/powerpc/include/asm/prom.h | 7 +++++++ arch/powerpc/include/asm/topology.h | 7 ------- arch/sparc/include/asm/prom.h | 3 +-- include/linux/of.h | 5 +++++ 5 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/topology.h b/arch/microblaze/include/asm/topology.h index 96bcea5a9920..5428f333a02c 100644 --- a/arch/microblaze/include/asm/topology.h +++ b/arch/microblaze/include/asm/topology.h @@ -1,11 +1 @@ #include - -#ifndef _ASM_MICROBLAZE_TOPOLOGY_H -#define _ASM_MICROBLAZE_TOPOLOGY_H - -struct device_node; -static inline int of_node_to_nid(struct device_node *device) -{ - return 0; -} -#endif /* _ASM_MICROBLAZE_TOPOLOGY_H */ diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index da7dd634e7ce..55bccc0a21c3 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -103,6 +103,13 @@ struct device_node *of_find_next_cache_node(struct device_node *np); /* Get the MAC address */ extern const void *of_get_mac_address(struct device_node *np); +#ifdef CONFIG_NUMA +extern int of_node_to_nid(struct device_node *device); +#else +static inline int of_node_to_nid(struct device_node *device) { return 0; } +#endif +#define of_node_to_nid of_node_to_nid + /** * of_irq_map_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 32adf7280720..09dd38c8882c 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -41,8 +41,6 @@ static inline int cpu_to_node(int cpu) cpu_all_mask : \ node_to_cpumask_map[node]) -int of_node_to_nid(struct device_node *device); - struct pci_bus; #ifdef CONFIG_PCI extern int pcibus_to_node(struct pci_bus *bus); @@ -94,11 +92,6 @@ extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid); #else -static inline int of_node_to_nid(struct device_node *device) -{ - return 0; -} - static inline void dump_numa_cpu_topology(void) {} static inline int sysfs_add_device_to_node(struct sys_device *dev, int nid) diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index c82a7da25f9f..291f12575edd 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -43,8 +43,7 @@ extern int of_getintprop_default(struct device_node *np, extern int of_find_in_proplist(const char *list, const char *match, int len); #ifdef CONFIG_NUMA extern int of_node_to_nid(struct device_node *dp); -#else -#define of_node_to_nid(dp) (-1) +#define of_node_to_nid of_node_to_nid #endif extern void prom_build_devicetree(void); diff --git a/include/linux/of.h b/include/linux/of.h index b0756f33249e..cad7cf0ab278 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -146,6 +146,11 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size) #define OF_BAD_ADDR ((u64)-1) +#ifndef of_node_to_nid +static inline int of_node_to_nid(struct device_node *np) { return -1; } +#define of_node_to_nid of_node_to_nid +#endif + extern struct device_node 
*of_find_node_by_name(struct device_node *from, const char *name); #define for_each_node_by_name(dn, name) \ -- cgit v1.2.3 From 12b15e83289bc7cf2ec9a342412e0c955beeb395 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Tue, 27 Jul 2010 22:35:58 +0200 Subject: of/spi: call of_register_spi_devices() from spi core code Move of_register_spi_devices() call from drivers to spi_register_master(). Also change the function to use the struct device_node pointer from master spi device instead of passing it as function argument. Signed-off-by: Anatolij Gustschin Signed-off-by: Grant Likely --- drivers/of/of_spi.c | 10 ++++++---- drivers/spi/mpc512x_psc_spi.c | 1 + drivers/spi/mpc52xx_psc_spi.c | 10 ++-------- drivers/spi/mpc52xx_spi.c | 3 +-- drivers/spi/spi.c | 4 ++++ drivers/spi/spi_mpc8xxx.c | 4 +--- drivers/spi/spi_ppc4xx.c | 2 +- drivers/spi/xilinx_spi.c | 3 +++ drivers/spi/xilinx_spi_of.c | 3 --- include/linux/of_spi.h | 11 ++++++++--- 10 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/of_spi.c b/drivers/of/of_spi.c index d504f1d1324b..1dbce58a58b0 100644 --- a/drivers/of/of_spi.c +++ b/drivers/of/of_spi.c @@ -15,12 +15,11 @@ /** * of_register_spi_devices - Register child devices onto the SPI bus * @master: Pointer to spi_master device - * @np: parent node of SPI device nodes * - * Registers an spi_device for each child node of 'np' which has a 'reg' + * Registers an spi_device for each child node of master node which has a 'reg' * property. */ -void of_register_spi_devices(struct spi_master *master, struct device_node *np) +void of_register_spi_devices(struct spi_master *master) { struct spi_device *spi; struct device_node *nc; @@ -28,7 +27,10 @@ void of_register_spi_devices(struct spi_master *master, struct device_node *np) int rc; int len; - for_each_child_of_node(np, nc) { + if (!master->dev.of_node) + return; + + for_each_child_of_node(master->dev.of_node, nc) { /* Alloc an spi_device */ spi = spi_alloc_device(master); if (!spi) { diff --git a/drivers/spi/mpc512x_psc_spi.c b/drivers/spi/mpc512x_psc_spi.c index 2534b1ec3edd..1bb4315f5f84 100644 --- a/drivers/spi/mpc512x_psc_spi.c +++ b/drivers/spi/mpc512x_psc_spi.c @@ -440,6 +440,7 @@ static int __init mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr, master->setup = mpc512x_psc_spi_setup; master->transfer = mpc512x_psc_spi_transfer; master->cleanup = mpc512x_psc_spi_cleanup; + master->dev.of_node = dev->of_node; tempp = ioremap(regaddr, size); if (!tempp) { diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c index 7104cb739da7..bd81ff90cfb3 100644 --- a/drivers/spi/mpc52xx_psc_spi.c +++ b/drivers/spi/mpc52xx_psc_spi.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -398,6 +397,7 @@ static int __init mpc52xx_psc_spi_do_probe(struct device *dev, u32 regaddr, master->setup = mpc52xx_psc_spi_setup; master->transfer = mpc52xx_psc_spi_transfer; master->cleanup = mpc52xx_psc_spi_cleanup; + master->dev.of_node = dev->of_node; mps->psc = ioremap(regaddr, size); if (!mps->psc) { @@ -470,7 +470,6 @@ static int __init mpc52xx_psc_spi_of_probe(struct of_device *op, const u32 *regaddr_p; u64 regaddr64, size64; s16 id = -1; - int rc; regaddr_p = of_get_address(op->dev.of_node, 0, &size64, NULL); if (!regaddr_p) { @@ -491,13 +490,8 @@ static int __init mpc52xx_psc_spi_of_probe(struct of_device *op, id = *psc_nump + 1; } - rc = mpc52xx_psc_spi_do_probe(&op->dev, (u32)regaddr64, (u32)size64, + return 
mpc52xx_psc_spi_do_probe(&op->dev, (u32)regaddr64, (u32)size64, irq_of_parse_and_map(op->dev.of_node, 0), id); - if (rc == 0) - of_register_spi_devices(dev_get_drvdata(&op->dev), - op->dev.of_node); - - return rc; } static int __exit mpc52xx_psc_spi_of_remove(struct of_device *op) diff --git a/drivers/spi/mpc52xx_spi.c b/drivers/spi/mpc52xx_spi.c index b1a76bff775f..56136ff00e01 100644 --- a/drivers/spi/mpc52xx_spi.c +++ b/drivers/spi/mpc52xx_spi.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -439,6 +438,7 @@ static int __devinit mpc52xx_spi_probe(struct of_device *op, master->setup = mpc52xx_spi_setup; master->transfer = mpc52xx_spi_transfer; master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST; + master->dev.of_node = op->dev.of_node; dev_set_drvdata(&op->dev, master); @@ -512,7 +512,6 @@ static int __devinit mpc52xx_spi_probe(struct of_device *op, if (rc) goto err_register; - of_register_spi_devices(master, op->dev.of_node); dev_info(&ms->master->dev, "registered MPC5200 SPI bus\n"); return rc; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index b3a1f9259b62..1bb1b88780ce 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -26,6 +26,7 @@ #include #include #include +#include /* SPI bustype and spi_master class are registered after board init code @@ -540,6 +541,9 @@ int spi_register_master(struct spi_master *master) /* populate children from any spi device tables */ scan_boardinfo(master); status = 0; + + /* Register devices from the device tree */ + of_register_spi_devices(master); done: return status; } diff --git a/drivers/spi/spi_mpc8xxx.c b/drivers/spi/spi_mpc8xxx.c index 97ab0a81338a..aad9ae1b9c69 100644 --- a/drivers/spi/spi_mpc8xxx.c +++ b/drivers/spi/spi_mpc8xxx.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -1009,6 +1008,7 @@ mpc8xxx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq) master->setup = mpc8xxx_spi_setup; master->transfer = mpc8xxx_spi_transfer; master->cleanup = mpc8xxx_spi_cleanup; + master->dev.of_node = dev->of_node; mpc8xxx_spi = spi_master_get_devdata(master); mpc8xxx_spi->dev = dev; @@ -1299,8 +1299,6 @@ static int __devinit of_mpc8xxx_spi_probe(struct of_device *ofdev, goto err; } - of_register_spi_devices(master, np); - return 0; err: diff --git a/drivers/spi/spi_ppc4xx.c b/drivers/spi/spi_ppc4xx.c index d53466a249d9..0f5fa7e2a550 100644 --- a/drivers/spi/spi_ppc4xx.c +++ b/drivers/spi/spi_ppc4xx.c @@ -407,6 +407,7 @@ static int __init spi_ppc4xx_of_probe(struct of_device *op, master = spi_alloc_master(dev, sizeof *hw); if (master == NULL) return -ENOMEM; + master->dev.of_node = np; dev_set_drvdata(dev, master); hw = spi_master_get_devdata(master); hw->master = spi_master_get(master); @@ -545,7 +546,6 @@ static int __init spi_ppc4xx_of_probe(struct of_device *op, } dev_info(dev, "driver initialized\n"); - of_register_spi_devices(master, np); return 0; diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c index 1b47363cb73f..80f2db5bcfd6 100644 --- a/drivers/spi/xilinx_spi.c +++ b/drivers/spi/xilinx_spi.c @@ -390,6 +390,9 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, master->bus_num = bus_num; master->num_chipselect = pdata->num_chipselect; +#ifdef CONFIG_OF + master->dev.of_node = dev->of_node; +#endif xspi->mem = *mem; xspi->irq = irq; diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c index 4654805b08d8..87cda0956a83 100644 --- a/drivers/spi/xilinx_spi_of.c +++ 
b/drivers/spi/xilinx_spi_of.c @@ -80,9 +80,6 @@ static int __devinit xilinx_spi_of_probe(struct of_device *ofdev, dev_set_drvdata(&ofdev->dev, master); - /* Add any subnodes on the SPI bus */ - of_register_spi_devices(master, ofdev->dev.of_node); - return 0; } diff --git a/include/linux/of_spi.h b/include/linux/of_spi.h index 5f71ee8c0868..9e3e70f78ae6 100644 --- a/include/linux/of_spi.h +++ b/include/linux/of_spi.h @@ -9,10 +9,15 @@ #ifndef __LINUX_OF_SPI_H #define __LINUX_OF_SPI_H -#include #include -extern void of_register_spi_devices(struct spi_master *master, - struct device_node *np); +#if defined(CONFIG_OF_SPI) || defined(CONFIG_OF_SPI_MODULE) +extern void of_register_spi_devices(struct spi_master *master); +#else +static inline void of_register_spi_devices(struct spi_master *master) +{ + return; +} +#endif /* CONFIG_OF_SPI */ #endif /* __LINUX_OF_SPI */ -- cgit v1.2.3 From 253d2e549818f5a4a52e2db0aba3dacee21e5b38 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 16 Jul 2010 10:19:22 -0700 Subject: PCI: disable mmio during bar sizing It is a known issue that mmio decoding should be disabled while doing PCI BAR sizing. The host bridge and some other devices (PCI PIC) must be excluded on certain platforms. This patch mainly comes from Matthew Wilcox's patch in http://kerneltrap.org/mailarchive/linux-kernel/2007/9/13/258969. A new flag bit "mmio_always_on" is added to pci_dev; devices whose mmio decoding must not be disabled during BAR sizing should have this bit set, preferably in their quirks. Without this patch, the Intel Moorestown platform graphics unit will be corrupted during BAR sizing. Signed-off-by: Jacob Pan Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 10 ++++++++++ drivers/pci/quirks.c | 13 +++++++++++++ include/linux/pci.h | 2 ++ 3 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f4adba2d1dd3..12625d90f8b5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -163,9 +163,16 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int pos) { u32 l, sz, mask; + u16 orig_cmd; mask = type ? PCI_ROM_ADDRESS_MASK : ~0; + if (!dev->mmio_always_on) { + pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); + pci_write_config_word(dev, PCI_COMMAND, + orig_cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO)); + } + res->name = pci_name(dev); pci_read_config_dword(dev, pos, &l); @@ -173,6 +180,9 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, pci_read_config_dword(dev, pos, &sz); pci_write_config_dword(dev, pos, l); + if (!dev->mmio_always_on) + pci_write_config_word(dev, PCI_COMMAND, orig_cmd); + /* * All bits set in sz means the device isn't working properly. * If the BAR isn't implemented, all bits must be 0. If it's a diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 3a81d9d44019..202efa6f57c4 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -91,6 +91,19 @@ static void __devinit quirk_resource_alignment(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_resource_alignment); +/* + * Decoding should be disabled for a PCI device during BAR sizing to avoid + * conflict. But doing so may cause problems on host bridge and perhaps other + * key system devices. For devices that need to have mmio decoding always-on, + * we need to set the dev->mmio_always_on bit. 
+ */ +static void __devinit quirk_mmio_always_on(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_HOST) + dev->mmio_always_on = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, quirk_mmio_always_on); + /* The Mellanox Tavor device gives false positive parity errors * Mark this device with a broken_parity_status, to allow * PCI scanning code to "skip" this now blacklisted device. diff --git a/include/linux/pci.h b/include/linux/pci.h index f26fda76b87f..b1d17956a153 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -270,6 +270,8 @@ struct pci_dev { unsigned int d1_support:1; /* Low power state D1 is supported */ unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* Only allow D0 and D3 */ + unsigned int mmio_always_on:1; /* disallow turning off io/mem + decoding during bar sizing */ unsigned int wakeup_prepared:1; unsigned int d3_delay; /* D3->D0 transition time in ms */ -- cgit v1.2.3 From 911e1c9b05a8e3559a7aa89083930700a0b9e7ee Mon Sep 17 00:00:00 2001 From: Narendra K Date: Mon, 26 Jul 2010 05:56:50 -0500 Subject: PCI: export SMBIOS provided firmware instance and label to sysfs This patch exports SMBIOS provided firmware instance and label of onboard PCI devices to sysfs. New files are: /sys/bus/pci/devices/.../label which contains the firmware name for the device in question, and /sys/bus/pci/devices/.../index which contains the firmware device type instance for the given device. Signed-off-by: Jordan Hargrave Signed-off-by: Narendra K Signed-off-by: Jesse Barnes --- Documentation/ABI/testing/sysfs-bus-pci | 27 ++++++ drivers/firmware/dmi_scan.c | 25 ++++++ drivers/pci/Makefile | 3 + drivers/pci/pci-label.c | 143 ++++++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 5 ++ drivers/pci/pci.h | 9 ++ include/linux/dmi.h | 9 ++ 7 files changed, 221 insertions(+) create mode 100644 drivers/pci/pci-label.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 25be3250f7d6..f979d825d112 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -139,3 +139,30 @@ Contact: linux-pci@vger.kernel.org Description: This symbolic link points to the PCI hotplug controller driver module that manages the hotplug slot. + +What: /sys/bus/pci/devices/.../label +Date: July 2010 +Contact: Narendra K , linux-bugs@dell.com +Description: + Reading this attribute will provide the firmware + given name(SMBIOS type 41 string) of the PCI device. + The attribute will be created only if the firmware + has given a name to the PCI device. +Users: + Userspace applications interested in knowing the + firmware assigned name of the PCI device. + +What: /sys/bus/pci/devices/.../index +Date: July 2010 +Contact: Narendra K , linux-bugs@dell.com +Description: + Reading this attribute will provide the firmware + given instance(SMBIOS type 41 device type instance) + of the PCI device. The attribute will be created + only if the firmware has given a device type instance + to the PCI device. +Users: + Userspace applications interested in knowing the + firmware assigned device type instance of the PCI + device that can help in understanding the firmware + intended order of the PCI device. 
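Both attributes are plain text files, so they can be consumed with ordinary file I/O. The following is a minimal userspace sketch, for illustration only and not part of this patch: the device address 0000:02:00.0 is hypothetical, and the attributes exist only when the firmware provides an SMBIOS type 41 record for the device.

#include <stdio.h>

/* Print one sysfs attribute if it exists; a missing file simply
 * means the firmware supplied no type 41 record for this device. */
static void print_attr(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* buf keeps its newline */
	fclose(f);
}

int main(void)
{
	print_attr("/sys/bus/pci/devices/0000:02:00.0/label");
	print_attr("/sys/bus/pci/devices/0000:02:00.0/index");
	return 0;
}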
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index d46467271349..b3d22d659990 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -277,6 +277,29 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm) list_add_tail(&dev->list, &dmi_devices); } +static void __init dmi_save_dev_onboard(int instance, int segment, int bus, + int devfn, const char *name) +{ + struct dmi_dev_onboard *onboard_dev; + + onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); + if (!onboard_dev) { + printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n"); + return; + } + onboard_dev->instance = instance; + onboard_dev->segment = segment; + onboard_dev->bus = bus; + onboard_dev->devfn = devfn; + + strcpy((char *)&onboard_dev[1], name); + onboard_dev->dev.type = DMI_DEV_TYPE_DEV_ONBOARD; + onboard_dev->dev.name = (char *)&onboard_dev[1]; + onboard_dev->dev.device_data = onboard_dev; + + list_add(&onboard_dev->dev.list, &dmi_devices); +} + static void __init dmi_save_extended_devices(const struct dmi_header *dm) { const u8 *d = (u8*) dm + 5; @@ -285,6 +308,8 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm) if ((*d & 0x80) == 0) return; + dmi_save_dev_onboard(*(d+1), *(u16 *)(d+2), *(d+4), *(d+5), + dmi_string_nosave(dm, *(d-1))); dmi_save_one_device(*d & 0x7f, dmi_string_nosave(dm, *(d - 1))); } diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 0b51857fbaf7..dc1aa0922868 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -55,6 +55,9 @@ obj-$(CONFIG_MICROBLAZE) += setup-bus.o # obj-$(CONFIG_ACPI) += pci-acpi.o +# SMBIOS provided firmware instance and labels +obj-$(CONFIG_DMI) += pci-label.o + # Cardbus & CompactPCI use setup-bus obj-$(CONFIG_HOTPLUG) += setup-bus.o diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c new file mode 100644 index 000000000000..111500e86f94 --- /dev/null +++ b/drivers/pci/pci-label.c @@ -0,0 +1,143 @@ +/* + * Purpose: Export the firmware instance and label associated with + * a pci device to sysfs + * Copyright (C) 2010 Dell Inc. + * by Narendra K , + * Jordan Hargrave + * + * SMBIOS defines type 41 for onboard pci devices. This code retrieves + * the instance number and string from the type 41 record and exports + * it to sysfs. + * + * Please see http://linux.dell.com/wiki/index.php/Oss/libnetdevname for more + * information. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "pci.h" + +enum smbios_attr_enum { + SMBIOS_ATTR_NONE = 0, + SMBIOS_ATTR_LABEL_SHOW, + SMBIOS_ATTR_INSTANCE_SHOW, +}; + +static mode_t +find_smbios_instance_string(struct pci_dev *pdev, char *buf, + enum smbios_attr_enum attribute) +{ + const struct dmi_device *dmi; + struct dmi_dev_onboard *donboard; + int bus; + int devfn; + + bus = pdev->bus->number; + devfn = pdev->devfn; + + dmi = NULL; + while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD, + NULL, dmi)) != NULL) { + donboard = dmi->device_data; + if (donboard && donboard->bus == bus && + donboard->devfn == devfn) { + if (buf) { + if (attribute == SMBIOS_ATTR_INSTANCE_SHOW) + return scnprintf(buf, PAGE_SIZE, + "%d\n", + donboard->instance); + else if (attribute == SMBIOS_ATTR_LABEL_SHOW) + return scnprintf(buf, PAGE_SIZE, + "%s\n", + dmi->name); + } + return strlen(dmi->name); + } + } + return 0; +} + +static mode_t +smbios_instance_string_exist(struct kobject *kobj, struct attribute *attr, + int n) +{ + struct device *dev; + struct pci_dev *pdev; + + dev = container_of(kobj, struct device, kobj); + pdev = to_pci_dev(dev); + + return find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE) ? + S_IRUGO : 0; +} + +static ssize_t +smbioslabel_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev; + pdev = to_pci_dev(dev); + + return find_smbios_instance_string(pdev, buf, + SMBIOS_ATTR_LABEL_SHOW); +} + +static ssize_t +smbiosinstance_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev; + pdev = to_pci_dev(dev); + + return find_smbios_instance_string(pdev, buf, + SMBIOS_ATTR_INSTANCE_SHOW); +} + +static struct device_attribute smbios_attr_label = { + .attr = {.name = "label", .mode = 0444, .owner = THIS_MODULE}, + .show = smbioslabel_show, +}; + +static struct device_attribute smbios_attr_instance = { + .attr = {.name = "index", .mode = 0444, .owner = THIS_MODULE}, + .show = smbiosinstance_show, +}; + +static struct attribute *smbios_attributes[] = { + &smbios_attr_label.attr, + &smbios_attr_instance.attr, + NULL, +}; + +static struct attribute_group smbios_attr_group = { + .attrs = smbios_attributes, + .is_visible = smbios_instance_string_exist, +}; + +static int +pci_create_smbiosname_file(struct pci_dev *pdev) +{ + if (!sysfs_create_group(&pdev->dev.kobj, &smbios_attr_group)) + return 0; + return -ENODEV; +} + +static void +pci_remove_smbiosname_file(struct pci_dev *pdev) +{ + sysfs_remove_group(&pdev->dev.kobj, &smbios_attr_group); +} + +void pci_create_firmware_label_files(struct pci_dev *pdev) +{ + if (!pci_create_smbiosname_file(pdev)) + ; +} + +void pci_remove_firmware_label_files(struct pci_dev *pdev) +{ + pci_remove_smbiosname_file(pdev); +} diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f7692dc531e4..b5a7d9bfcb24 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1165,6 +1165,8 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev) if (retval) goto err_vga_file; + pci_create_firmware_label_files(pdev); + return 0; err_vga_file: @@ -1232,6 +1234,9 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); kfree(pdev->rom_attr); } + + pci_remove_firmware_label_files(pdev); + } static int __init pci_sysfs_init(void) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f8077b3c8c8c..d930338e0922 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -11,6 
+11,15 @@ extern int pci_uevent(struct device *dev, struct kobj_uevent_env *env); extern int pci_create_sysfs_dev_files(struct pci_dev *pdev); extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev); +#ifndef CONFIG_DMI +static inline void pci_create_firmware_label_files(struct pci_dev *pdev) +{ return; } +static inline void pci_remove_firmware_label_files(struct pci_dev *pdev) +{ return; } +#else +extern void pci_create_firmware_label_files(struct pci_dev *pdev); +extern void pci_remove_firmware_label_files(struct pci_dev *pdev); +#endif extern void pci_cleanup_rom(struct pci_dev *dev); #ifdef HAVE_PCI_MMAP extern int pci_mmap_fits(struct pci_dev *pdev, int resno, diff --git a/include/linux/dmi.h b/include/linux/dmi.h index a8a3e1ac281d..90e087f8d951 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -20,6 +20,7 @@ enum dmi_device_type { DMI_DEV_TYPE_SAS, DMI_DEV_TYPE_IPMI = -1, DMI_DEV_TYPE_OEM_STRING = -2, + DMI_DEV_TYPE_DEV_ONBOARD = -3, }; struct dmi_header { @@ -37,6 +38,14 @@ struct dmi_device { #ifdef CONFIG_DMI +struct dmi_dev_onboard { + struct dmi_device dev; + int instance; + int segment; + int bus; + int devfn; +}; + extern int dmi_check_system(const struct dmi_system_id *list); const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); -- cgit v1.2.3 From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 23 Jul 2010 14:56:28 +0100 Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove unsafe and unnecessary hardware access" changed read_msi_msg_desc() to return the last MSI message written instead of reading it from the device, since it may be called while the device is in a reduced power state. However, the pSeries platform code really does need to read messages from the device, since they are initially written by firmware. 
Therefore: - Restore the previous behaviour of read_msi_msg_desc() - Add new functions get_cached_msi_msg{,_desc}() which return the last MSI message written - Use the new functions where appropriate Acked-by: Michael Ellerman Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- arch/ia64/kernel/msi_ia64.c | 2 +- arch/ia64/sn/kernel/msi_sn.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- drivers/pci/msi.c | 47 +++++++++++++++++++++++++++++++++++++----- include/linux/msi.h | 2 ++ 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 6c8922856049..4a746ea838ff 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -25,7 +25,7 @@ static int ia64_set_msi_irq_affinity(unsigned int irq, if (irq_prepare_move(irq, cpu)) return -1; - read_msi_msg(irq, &msg); + get_cached_msi_msg(irq, &msg); addr = msg.address_lo; addr &= MSI_ADDR_DEST_ID_MASK; diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c index ebfdd6a9ae1a..0c72dd463831 100644 --- a/arch/ia64/sn/kernel/msi_sn.c +++ b/arch/ia64/sn/kernel/msi_sn.c @@ -175,7 +175,7 @@ static int sn_set_msi_irq_affinity(unsigned int irq, * Release XIO resources for the old MSI PCI address */ - read_msi_msg(irq, &msg); + get_cached_msi_msg(irq, &msg); sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo; pdev = sn_pdev->pdi_linux_pcidev; provider = SN_PCIDEV_BUSPROVIDER(pdev); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26d..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 4c14f31f2b4d..69b7be33b3a2 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -197,9 +197,46 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) { struct msi_desc *entry = get_irq_desc_msi(desc); - /* We do not touch the hardware (which may not even be - * accessible at the moment) but return the last message - * written. 
Assert that this is valid, assuming that + BUG_ON(entry->dev->current_state != PCI_D0); + + if (entry->msi_attrib.is_msix) { + void __iomem *base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA); + } else { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u16 data; + + pci_read_config_dword(dev, msi_lower_address_reg(pos), + &msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_read_config_dword(dev, msi_upper_address_reg(pos), + &msg->address_hi); + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } else { + msg->address_hi = 0; + pci_read_config_word(dev, msi_data_reg(pos, 0), &data); + } + msg->data = data; + } +} + +void read_msi_msg(unsigned int irq, struct msi_msg *msg) +{ + struct irq_desc *desc = irq_to_desc(irq); + + read_msi_msg_desc(desc, msg); +} + +void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) +{ + struct msi_desc *entry = get_irq_desc_msi(desc); + + /* Assert that the cache is valid, assuming that * valid messages are not all-zeroes. */ BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo | entry->msg.data)); @@ -207,11 +244,11 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) *msg = entry->msg; } -void read_msi_msg(unsigned int irq, struct msi_msg *msg) +void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { struct irq_desc *desc = irq_to_desc(irq); - read_msi_msg_desc(desc, msg); + get_cached_msi_msg_desc(desc, msg); } void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) diff --git a/include/linux/msi.h b/include/linux/msi.h index 6991ab5b24d1..91b05c171854 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -14,8 +14,10 @@ struct irq_desc; extern void mask_msi_irq(unsigned int irq); extern void unmask_msi_irq(unsigned int irq); extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); +extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); +extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { -- cgit v1.2.3 From f11ac8db5d07b6e99d41ff4aa39d878ee5cef1c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Jun 2010 16:35:53 -0400 Subject: NFSv4: Ensure that we track the NFSv4 lock state in read/write requests. 
This patch fixes bugzilla entry 14501: https://bugzilla.kernel.org/show_bug.cgi?id=14501 Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 29 +++++++++++++++----- fs/nfs/inode.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--- fs/nfs/nfs4xdr.c | 8 +++--- fs/nfs/pagelist.c | 8 +++++- fs/nfs/read.c | 1 + fs/nfs/write.c | 5 +++- include/linux/nfs_fs.h | 13 +++++++-- include/linux/nfs_page.h | 1 + include/linux/nfs_xdr.h | 2 ++ 9 files changed, 118 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad4cd31d6050..064a80961677 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -69,6 +69,7 @@ struct nfs_direct_req { /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ @@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->rewrite_list); dreq->iocb = NULL; dreq->ctx = NULL; + dreq->l_ctx = NULL; spin_lock_init(&dreq->lock); atomic_set(&dreq->io_count, 0); dreq->count = 0; @@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); @@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; + if (dreq == NULL) + goto out; dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } @@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.offset = 0; data->args.count = 0; data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, size_t count) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; size_t wsize = 
NFS_SERVER(inode)->wsize; @@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq = nfs_direct_req_alloc(); if (!dreq) - return -ENOMEM; + goto out; nfs_alloc_commit_data(dreq); if (dreq->commit_data == NULL || count < wsize) @@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 099b3518feea..ec7a8f96a2c2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -530,6 +530,68 @@ out: return err; } +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context @@ -566,11 +628,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; - ctx->lockowner = current->files; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; - atomic_set(&ctx->count, 1); + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; } return ctx; } @@ -578,7 +640,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + atomic_inc(&ctx->lock_context.count); return ctx; } @@ -586,7 +648,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; 
list_del(&ctx->list); spin_unlock(&inode->i_lock); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1f7781d636ae..873b62f209ea 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1324,14 +1324,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) { nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner); xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1344,7 +1344,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1523,7 +1523,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a3654e57b589..919490232e17 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); + req->wb_lock_context = nfs_get_lock_context(ctx); kref_init(&req->wb_kref); return req; } @@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } if (ctx != NULL) { put_nfs_open_context(ctx); req->wb_context = NULL; @@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, { if (req->wb_context->cred != prev->wb_context->cred) return 0; - if (req->wb_context->lockowner != prev->wb_context->lockowner) + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) return 0; if (req->wb_context->state != prev->wb_context->state) return 0; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5a33a92e8168..87adc2744246 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->res.fattr = &data->fattr; data->res.count = count; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 03df22822c4c..5eccea127cac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -689,7 +689,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = 
req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid; nfs_release_request(req); if (!do_flush) return 0; @@ -813,6 +815,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 77c2ae53431c..a9d802615082 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; +struct nfs_lock_context { + atomic_t count; + struct list_head list; + struct nfs_open_context *open_context; + fl_owner_t lockowner; + pid_t pid; +}; + struct nfs4_state; struct nfs_open_context { - atomic_t count; + struct nfs_lock_context lock_context; struct path path; struct rpc_cred *cred; struct nfs4_state *state; - fl_owner_t lockowner; fmode_t mode; unsigned long flags; @@ -353,6 +360,8 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); +extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); +extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 3c60685d972b..f8b60e7f4c44 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ struct nfs_open_context *wb_context; /* File state context info */ + struct nfs_lock_context *wb_lock_context; /* lock context info */ atomic_t wb_complete; /* i/os we're waiting for */ pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a319cb926abf..87202c7026e3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -334,6 +334,7 @@ struct nfs4_delegreturnres { struct nfs_readargs { struct nfs_fh * fh; struct nfs_open_context *context; + struct nfs_lock_context *lock_context; __u64 offset; __u32 count; unsigned int pgbase; @@ -354,6 +355,7 @@ struct nfs_readres { struct nfs_writeargs { struct nfs_fh * fh; struct nfs_open_context *context; + struct nfs_lock_context *lock_context; __u64 offset; __u32 count; enum nfs3_stable_how stable; -- cgit v1.2.3 From d3c7b7ccc199ee564177ee914c04771d6bc00295 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Jul 2010 12:49:01 -0400 Subject: NFSv4: Add support for the RELEASE_LOCKOWNER operation This is needed by NFSv4.0 servers in order to keep the number of locking stateids at a manageable level. 
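The patch sends RELEASE_LOCKOWNER as a fire-and-forget asynchronous RPC: the argument buffer doubles as the call data and is freed from the ->rpc_release callback once the call completes, so the caller never blocks on the reply. A distilled sketch of that pattern follows; kernel context is assumed, the nfs4_demo_* names are hypothetical, and error handling is omitted.

#include <linux/slab.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>

/* Runs exactly once, after the RPC has completed or failed,
 * and releases the caller-allocated argument buffer. */
static void nfs4_demo_release(void *calldata)
{
	kfree(calldata);
}

static const struct rpc_call_ops nfs4_demo_call_ops = {
	.rpc_release = nfs4_demo_release,
};

/* Send @msg on @clnt without waiting for the reply; @args was
 * allocated with kmalloc() and is owned by the RPC from here on. */
static void nfs4_demo_call_async(struct rpc_clnt *clnt,
				 struct rpc_message *msg, void *args)
{
	msg->rpc_argp = args;
	rpc_call_async(clnt, msg, 0, &nfs4_demo_call_ops, args);
}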
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 28 +++++++++++++++++++++++++ fs/nfs/nfs4state.c | 2 ++ fs/nfs/nfs4xdr.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 4 ++++ 6 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cee871471e8d..deaf37f5a7a9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -236,6 +236,7 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index de9ff1505a24..5d3e8a2db99f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4414,6 +4414,34 @@ out: return err; } +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 13e17e32e3e4..13a4f27e7271 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -701,6 +701,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); nfs4_free_lock_state(lsp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 49df05afdc64..15185c2abd11 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -220,6 +220,11 @@ static int nfs4_stat_to_errno(int); 4) #define decode_locku_maxsz (op_decode_hdr_maxsz + \ decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) #define encode_access_maxsz (op_encode_hdr_maxsz + 1) #define decode_access_maxsz (op_decode_hdr_maxsz + 2) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ @@ -474,6 +479,12 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ 
-1116,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar hdr->replen += decode_locku_maxsz; } +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { int len = name->len; @@ -2056,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_ return 0; } +static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); + return 0; +} + /* * Encode a READLINK request */ @@ -3981,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) return status; } +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + static int decode_lookup(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LOOKUP); @@ -5267,6 +5308,19 @@ out: return status; } +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_release_lockowner(&xdr); + return status; +} + /* * Decode READLINK response */ @@ -5874,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9b8299af3741..07e40c625972 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -523,6 +523,7 @@ enum { NFSPROC4_CLNT_GETACL, NFSPROC4_CLNT_SETACL, NFSPROC4_CLNT_FS_LOCATIONS, + NFSPROC4_CLNT_RELEASE_LOCKOWNER, /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 87202c7026e3..fc461926c412 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -315,6 +315,10 @@ struct nfs_lockt_res { struct nfs4_sequence_res seq_res; }; +struct nfs_release_lockowner_args { + struct nfs_lowner lock_owner; +}; + struct nfs4_delegreturnargs { const struct nfs_fh *fhandle; const nfs4_stateid *stateid; -- cgit v1.2.3 From 22ae782f86b726f9cea752c0f269ff6dcdf2f6e1 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 29 Jul 2010 11:49:01 -0600 Subject: of/address: Clean up function declarations This patch moves the declaration of of_get_address(), of_get_pci_address(), and of_pci_address_to_resource() out of arch code and into the common linux/of_address header file. This patch also fixes some of the asm/prom.h ordering issues. 
It still includes some header files that ideally it shouldn't, but at least the ordering is consistent now so that of_* overrides work. Signed-off-by: Grant Likely --- arch/microblaze/include/asm/prom.h | 33 +++++++-------------- arch/powerpc/include/asm/prom.h | 49 +++++++------------------------ arch/powerpc/kernel/legacy_serial.c | 1 + arch/powerpc/kernel/pci-common.c | 1 + arch/powerpc/platforms/52xx/lite5200.c | 1 + arch/powerpc/platforms/amigaone/setup.c | 3 +- arch/powerpc/platforms/iseries/mf.c | 1 + arch/powerpc/platforms/powermac/feature.c | 2 ++ arch/powerpc/sysdev/bestcomm/sram.c | 1 + arch/powerpc/sysdev/fsl_gtm.c | 1 + drivers/char/bsr.c | 1 + drivers/net/fsl_pq_mdio.c | 1 + drivers/net/xilinx_emaclite.c | 2 +- drivers/serial/uartlite.c | 1 + drivers/spi/mpc512x_psc_spi.c | 1 + drivers/spi/mpc52xx_psc_spi.c | 1 + drivers/spi/xilinx_spi_of.c | 1 + drivers/usb/gadget/fsl_qe_udc.c | 1 + drivers/video/controlfb.c | 2 ++ drivers/video/offb.c | 3 +- include/linux/of_address.h | 32 ++++++++++++++++++++ 21 files changed, 76 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index cb9c3dd9a23b..101fa098f62a 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -20,11 +20,6 @@ #ifndef __ASSEMBLY__ #include -#include -#include -#include -#include -#include #include #include @@ -52,25 +47,9 @@ extern void pci_create_OF_bus_map(void); * OF address retreival & translation */ -/* Extract an address from a device, returns the region size and - * the address space flags too. The PCI version uses a BAR number - * instead of an absolute index - */ -extern const u32 *of_get_address(struct device_node *dev, int index, - u64 *size, unsigned int *flags); -extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no, - u64 *size, unsigned int *flags); - -extern int of_pci_address_to_resource(struct device_node *dev, int bar, - struct resource *r); - #ifdef CONFIG_PCI extern unsigned long pci_address_to_pio(phys_addr_t address); -#else -static inline unsigned long pci_address_to_pio(phys_addr_t address) -{ - return (unsigned long)-1; -} +#define pci_address_to_pio pci_address_to_pio #endif /* CONFIG_PCI */ /* Parse the ibm,dma-window property of an OF node into the busno, phys and @@ -99,8 +78,18 @@ extern const void *of_get_mac_address(struct device_node *np); * resolving using the OF tree walking. */ struct pci_dev; +struct of_irq; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ + +/* These includes are put at the bottom because they may contain things * that are overridden by this file. Ideally they shouldn't be included * by this file, but there are a bunch of .c files that currently depend * on it. Eventually they will be cleaned up. */ +#include +#include +#include + #endif /* _ASM_MICROBLAZE_PROM_H */ diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 55bccc0a21c3..ae26f2efd089 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -17,11 +17,6 @@ * 2 of the License, or (at your option) any later version. 
*/ #include -#include -#include -#include -#include -#include #include #include @@ -49,41 +44,9 @@ extern void pci_create_OF_bus_map(void); extern u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr); -/* Extract an address from a device, returns the region size and - * the address space flags too. The PCI version uses a BAR number - * instead of an absolute index - */ -extern const u32 *of_get_address(struct device_node *dev, int index, - u64 *size, unsigned int *flags); -#ifdef CONFIG_PCI -extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no, - u64 *size, unsigned int *flags); -#else -static inline const u32 *of_get_pci_address(struct device_node *dev, - int bar_no, u64 *size, unsigned int *flags) -{ - return NULL; -} -#endif /* CONFIG_PCI */ - -#ifdef CONFIG_PCI -extern int of_pci_address_to_resource(struct device_node *dev, int bar, - struct resource *r); -#else -static inline int of_pci_address_to_resource(struct device_node *dev, int bar, - struct resource *r) -{ - return -ENOSYS; -} -#endif /* CONFIG_PCI */ - #ifdef CONFIG_PCI extern unsigned long pci_address_to_pio(phys_addr_t address); -#else -static inline unsigned long pci_address_to_pio(phys_addr_t address) -{ - return (unsigned long)-1; -} +#define pci_address_to_pio pci_address_to_pio #endif /* CONFIG_PCI */ /* Parse the ibm,dma-window property of an OF node into the busno, phys and @@ -122,9 +85,19 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; } * resolving using the OF tree walking. */ struct pci_dev; +struct of_irq; extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); extern void of_instantiate_rtc(void); +/* These includes are put at the bottom because they may contain things + * that are overridden by this file. Ideally they shouldn't be included + * by this file, but there are a bunch of .c files that currently depend + * on it. Eventually they will be cleaned up. 
*/ +#include +#include +#include +#include + #endif /* __KERNEL__ */ #endif /* _POWERPC_PROM_H */ diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 035ada5443ee..c1fd0f9658fd 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 5b38f6ae2b29..9021c4ad4bbd 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/lite5200.c b/arch/powerpc/platforms/52xx/lite5200.c index 6d584f4e3c9a..de55bc0584b5 100644 --- a/arch/powerpc/platforms/52xx/lite5200.c +++ b/arch/powerpc/platforms/52xx/lite5200.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index fb4eb0df054c..03aabc0e16ac 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -13,12 +13,13 @@ */ #include +#include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c index d2c1d497846e..33e5fc7334fc 100644 --- a/arch/powerpc/platforms/iseries/mf.c +++ b/arch/powerpc/platforms/iseries/mf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 9e1b9fd75206..75eec031e7a2 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/bestcomm/sram.c b/arch/powerpc/sysdev/bestcomm/sram.c index 5d74ef7a651f..1225012a681a 100644 --- a/arch/powerpc/sysdev/bestcomm/sram.c +++ b/arch/powerpc/sysdev/bestcomm/sram.c @@ -11,6 +11,7 @@ * kind, whether express or implied. 
*/ +#include #include #include #include diff --git a/arch/powerpc/sysdev/fsl_gtm.c b/arch/powerpc/sysdev/fsl_gtm.c index eca4545dd52e..7dd2885321ad 100644 --- a/arch/powerpc/sysdev/fsl_gtm.c +++ b/arch/powerpc/sysdev/fsl_gtm.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c index 89d871ef8c2f..91917133ae0a 100644 --- a/drivers/char/bsr.c +++ b/drivers/char/bsr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/fsl_pq_mdio.c b/drivers/net/fsl_pq_mdio.c index b4c41d72c423..f53f850b6418 100644 --- a/drivers/net/fsl_pq_mdio.c +++ b/drivers/net/fsl_pq_mdio.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/xilinx_emaclite.c b/drivers/net/xilinx_emaclite.c index d04c5b262050..b2c2f391b29d 100644 --- a/drivers/net/xilinx_emaclite.c +++ b/drivers/net/xilinx_emaclite.c @@ -20,7 +20,7 @@ #include #include #include - +#include #include #include #include diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c index 8acccd564378..caf085d3a76a 100644 --- a/drivers/serial/uartlite.c +++ b/drivers/serial/uartlite.c @@ -21,6 +21,7 @@ #include #if defined(CONFIG_OF) && (defined(CONFIG_PPC32) || defined(CONFIG_MICROBLAZE)) #include +#include #include #include diff --git a/drivers/spi/mpc512x_psc_spi.c b/drivers/spi/mpc512x_psc_spi.c index 1bb4315f5f84..10baac3f8ea5 100644 --- a/drivers/spi/mpc512x_psc_spi.c +++ b/drivers/spi/mpc512x_psc_spi.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c index bd81ff90cfb3..66d170147dcc 100644 --- a/drivers/spi/mpc52xx_psc_spi.c +++ b/drivers/spi/mpc52xx_psc_spi.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c index 87cda0956a83..f53d3f6b9f61 100644 --- a/drivers/spi/xilinx_spi_of.c +++ b/drivers/spi/xilinx_spi_of.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c index 82506ca297d5..9648b75f0283 100644 --- a/drivers/usb/gadget/fsl_qe_udc.c +++ b/drivers/usb/gadget/fsl_qe_udc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/controlfb.c b/drivers/video/controlfb.c index 49fcbe8f18ac..c225dcce89e7 100644 --- a/drivers/video/controlfb.c +++ b/drivers/video/controlfb.c @@ -40,6 +40,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/drivers/video/offb.c b/drivers/video/offb.c index 46dda7d8aaee..cb163a5397be 100644 --- a/drivers/video/offb.c +++ b/drivers/video/offb.c @@ -19,13 +19,14 @@ #include #include #include +#include +#include #include #include #include #include #include #include -#include #ifdef CONFIG_PPC64 #include diff --git a/include/linux/of_address.h b/include/linux/of_address.h index cc567df9a00d..8aea06f0564c 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -8,5 +8,37 @@ extern int of_address_to_resource(struct device_node *dev, int index, struct resource *r); extern void __iomem *of_iomap(struct device_node *device, int index); +/* Extract an address from a device, returns the region size and + * the address space flags too. 
The PCI version uses a BAR number + * instead of an absolute index + */ +extern const u32 *of_get_address(struct device_node *dev, int index, + u64 *size, unsigned int *flags); + +#ifndef pci_address_to_pio +static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } +#define pci_address_to_pio pci_address_to_pio +#endif + +#ifdef CONFIG_PCI +extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no, + u64 *size, unsigned int *flags); +extern int of_pci_address_to_resource(struct device_node *dev, int bar, + struct resource *r); +#else /* CONFIG_PCI */ +static inline int of_pci_address_to_resource(struct device_node *dev, int bar, + struct resource *r) +{ + return -ENOSYS; +} + +static inline const u32 *of_get_pci_address(struct device_node *dev, + int bar_no, u64 *size, unsigned int *flags) +{ + return NULL; +} +#endif /* CONFIG_PCI */ + + #endif /* __OF_ADDRESS_H */ -- cgit v1.2.3 From 6ee0578b4daaea01c96b172c6aacca43fd9807a6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Jul 2010 14:57:37 -0700 Subject: workqueue: mark init_workqueues() as early_initcall() Mark init_workqueues() as early_initcall() and thus it will be initialized before smp bringup. init_workqueues() registers for the hotcpu notifier and thus it should cope with the processors that are brought online after the workqueues are initialized. x86 smp bringup code uses workqueues and uses a workaround for the cold boot process (as the workqueues are initialized post smp_init()). Marking init_workqueues() as early_initcall() will pave the way for cleaning up this code. Signed-off-by: Suresh Siddha Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Andrew Morton --- include/linux/workqueue.h | 1 - init/main.c | 2 -- kernel/workqueue.c | 4 +++- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5f76001c4e6d..51dc9a727e5e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -327,7 +327,6 @@ extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, extern int schedule_on_each_cpu(work_func_t func); extern int keventd_up(void); -extern void init_workqueues(void); int execute_in_process_context(work_func_t fn, struct execute_work *); extern int flush_work(struct work_struct *work); diff --git a/init/main.c b/init/main.c index 3bdb152f412f..5f2ec2cdd900 100644 --- a/init/main.c +++ b/init/main.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -786,7 +785,6 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { - init_workqueues(); cpuset_init_smp(); usermodehelper_init(); init_tmpfs(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1105c474073a..e2eb351d9152 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3507,7 +3507,7 @@ out_unlock: } #endif /* CONFIG_FREEZER */ -void __init init_workqueues(void) +static int __init init_workqueues(void) { unsigned int cpu; int i; @@ -3559,4 +3559,6 @@ void __init init_workqueues(void) system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); -- cgit v1.2.3 From 0814a979a64a5ae61c7567496d090e204ecabd2b Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Fri, 23 Jul 2010 04:00:36 +0000 Subject: powerpc/5121: move fsl-diu-fb.h to include/linux Some DIU structures will be used in platform code in 
subsequent MPC5121 DIU patch, so we move this header to be able to include it elsewhere. Signed-off-by: Anatolij Gustschin Acked-by: Timur Tabi Signed-off-by: Grant Likely --- drivers/video/fsl-diu-fb.c | 2 +- drivers/video/fsl-diu-fb.h | 223 --------------------------------------------- include/linux/fsl-diu-fb.h | 223 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 224 insertions(+), 224 deletions(-) delete mode 100644 drivers/video/fsl-diu-fb.h create mode 100644 include/linux/fsl-diu-fb.h (limited to 'include/linux') diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c index 9b8c99111221..48905d5f4e8d 100644 --- a/drivers/video/fsl-diu-fb.c +++ b/drivers/video/fsl-diu-fb.c @@ -34,7 +34,7 @@ #include #include -#include "fsl-diu-fb.h" +#include /* * These parameters give default parameters diff --git a/drivers/video/fsl-diu-fb.h b/drivers/video/fsl-diu-fb.h deleted file mode 100644 index fc295d7ea463..000000000000 --- a/drivers/video/fsl-diu-fb.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright 2008 Freescale Semiconductor, Inc. All Rights Reserved. - * - * Freescale DIU Frame Buffer device driver - * - * Authors: Hongjun Chen - * Paul Widmer - * Srikanth Srinivasan - * York Sun - * - * Based on imxfb.c Copyright (C) 2004 S.Hauer, Pengutronix - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - */ - -#ifndef __FSL_DIU_FB_H__ -#define __FSL_DIU_FB_H__ - -/* Arbitrary threshold to determine the allocation method - * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory() - */ -#define MEM_ALLOC_THRESHOLD (1024*768*4+32) -/* Minimum value that the pixel clock can be set to in pico seconds - * This is determined by platform clock/3 where the minimum platform - * clock is 533MHz. This gives 5629 pico seconds. 
- */ -#define MIN_PIX_CLK 5629 -#define MAX_PIX_CLK 96096 - -#include - -struct mfb_alpha { - int enable; - int alpha; -}; - -struct mfb_chroma_key { - int enable; - __u8 red_max; - __u8 green_max; - __u8 blue_max; - __u8 red_min; - __u8 green_min; - __u8 blue_min; -}; - -struct aoi_display_offset { - int x_aoi_d; - int y_aoi_d; -}; - -#define MFB_SET_CHROMA_KEY _IOW('M', 1, struct mfb_chroma_key) -#define MFB_WAIT_FOR_VSYNC _IOW('F', 0x20, u_int32_t) -#define MFB_SET_BRIGHTNESS _IOW('M', 3, __u8) - -#define MFB_SET_ALPHA 0x80014d00 -#define MFB_GET_ALPHA 0x40014d00 -#define MFB_SET_AOID 0x80084d04 -#define MFB_GET_AOID 0x40084d04 -#define MFB_SET_PIXFMT 0x80014d08 -#define MFB_GET_PIXFMT 0x40014d08 - -#define FBIOGET_GWINFO 0x46E0 -#define FBIOPUT_GWINFO 0x46E1 - -#ifdef __KERNEL__ -#include - -/* - * These are the fields of area descriptor(in DDR memory) for every plane - */ -struct diu_ad { - /* Word 0(32-bit) in DDR memory */ -/* __u16 comp; */ -/* __u16 pixel_s:2; */ -/* __u16 pallete:1; */ -/* __u16 red_c:2; */ -/* __u16 green_c:2; */ -/* __u16 blue_c:2; */ -/* __u16 alpha_c:3; */ -/* __u16 byte_f:1; */ -/* __u16 res0:3; */ - - __be32 pix_fmt; /* hard coding pixel format */ - - /* Word 1(32-bit) in DDR memory */ - __le32 addr; - - /* Word 2(32-bit) in DDR memory */ -/* __u32 delta_xs:11; */ -/* __u32 res1:1; */ -/* __u32 delta_ys:11; */ -/* __u32 res2:1; */ -/* __u32 g_alpha:8; */ - __le32 src_size_g_alpha; - - /* Word 3(32-bit) in DDR memory */ -/* __u32 delta_xi:11; */ -/* __u32 res3:5; */ -/* __u32 delta_yi:11; */ -/* __u32 res4:3; */ -/* __u32 flip:2; */ - __le32 aoi_size; - - /* Word 4(32-bit) in DDR memory */ - /*__u32 offset_xi:11; - __u32 res5:5; - __u32 offset_yi:11; - __u32 res6:5; - */ - __le32 offset_xyi; - - /* Word 5(32-bit) in DDR memory */ - /*__u32 offset_xd:11; - __u32 res7:5; - __u32 offset_yd:11; - __u32 res8:5; */ - __le32 offset_xyd; - - - /* Word 6(32-bit) in DDR memory */ - __u8 ckmax_r; - __u8 ckmax_g; - __u8 ckmax_b; - __u8 res9; - - /* Word 7(32-bit) in DDR memory */ - __u8 ckmin_r; - __u8 ckmin_g; - __u8 ckmin_b; - __u8 res10; -/* __u32 res10:8; */ - - /* Word 8(32-bit) in DDR memory */ - __le32 next_ad; - - /* Word 9(32-bit) in DDR memory, just for 64-bit aligned */ - __u32 paddr; -} __attribute__ ((packed)); - -/* DIU register map */ -struct diu { - __be32 desc[3]; - __be32 gamma; - __be32 pallete; - __be32 cursor; - __be32 curs_pos; - __be32 diu_mode; - __be32 bgnd; - __be32 bgnd_wb; - __be32 disp_size; - __be32 wb_size; - __be32 wb_mem_addr; - __be32 hsyn_para; - __be32 vsyn_para; - __be32 syn_pol; - __be32 thresholds; - __be32 int_status; - __be32 int_mask; - __be32 colorbar[8]; - __be32 filling; - __be32 plut; -} __attribute__ ((packed)); - -struct diu_hw { - struct diu *diu_reg; - spinlock_t reg_lock; - - __u32 mode; /* DIU operation mode */ -}; - -struct diu_addr { - __u8 __iomem *vaddr; /* Virtual address */ - dma_addr_t paddr; /* Physical address */ - __u32 offset; -}; - -struct diu_pool { - struct diu_addr ad; - struct diu_addr gamma; - struct diu_addr pallete; - struct diu_addr cursor; -}; - -#define FSL_DIU_BASE_OFFSET 0x2C000 /* Offset of DIU */ -#define INT_LCDC 64 /* DIU interrupt number */ - -#define FSL_AOI_NUM 6 /* 5 AOIs and one dummy AOI */ - /* 1 for plane 0, 2 for plane 1&2 each */ - -/* Minimum X and Y resolutions */ -#define MIN_XRES 64 -#define MIN_YRES 64 - -/* HW cursor parameters */ -#define MAX_CURS 32 - -/* Modes of operation of DIU */ -#define MFB_MODE0 0 /* DIU off */ -#define MFB_MODE1 1 /* All three planes output to 
display */ -#define MFB_MODE2 2 /* Plane 1 to display, planes 2+3 written back*/ -#define MFB_MODE3 3 /* All three planes written back to memory */ -#define MFB_MODE4 4 /* Color bar generation */ - -/* INT_STATUS/INT_MASK field descriptions */ -#define INT_VSYNC 0x01 /* Vsync interrupt */ -#define INT_VSYNC_WB 0x02 /* Vsync interrupt for write back operation */ -#define INT_UNDRUN 0x04 /* Under run exception interrupt */ -#define INT_PARERR 0x08 /* Display parameters error interrupt */ -#define INT_LS_BF_VS 0x10 /* Lines before vsync. interrupt */ - -/* Panels'operation modes */ -#define MFB_TYPE_OUTPUT 0 /* Panel output to display */ -#define MFB_TYPE_OFF 1 /* Panel off */ -#define MFB_TYPE_WB 2 /* Panel written back to memory */ -#define MFB_TYPE_TEST 3 /* Panel generate color bar */ - -#endif /* __KERNEL__ */ -#endif /* __FSL_DIU_FB_H__ */ diff --git a/include/linux/fsl-diu-fb.h b/include/linux/fsl-diu-fb.h new file mode 100644 index 000000000000..fc295d7ea463 --- /dev/null +++ b/include/linux/fsl-diu-fb.h @@ -0,0 +1,223 @@ +/* + * Copyright 2008 Freescale Semiconductor, Inc. All Rights Reserved. + * + * Freescale DIU Frame Buffer device driver + * + * Authors: Hongjun Chen + * Paul Widmer + * Srikanth Srinivasan + * York Sun + * + * Based on imxfb.c Copyright (C) 2004 S.Hauer, Pengutronix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + */ + +#ifndef __FSL_DIU_FB_H__ +#define __FSL_DIU_FB_H__ + +/* Arbitrary threshold to determine the allocation method + * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory() + */ +#define MEM_ALLOC_THRESHOLD (1024*768*4+32) +/* Minimum value that the pixel clock can be set to in pico seconds + * This is determined by platform clock/3 where the minimum platform + * clock is 533MHz. This gives 5629 pico seconds. 
+ */ +#define MIN_PIX_CLK 5629 +#define MAX_PIX_CLK 96096 + +#include + +struct mfb_alpha { + int enable; + int alpha; +}; + +struct mfb_chroma_key { + int enable; + __u8 red_max; + __u8 green_max; + __u8 blue_max; + __u8 red_min; + __u8 green_min; + __u8 blue_min; +}; + +struct aoi_display_offset { + int x_aoi_d; + int y_aoi_d; +}; + +#define MFB_SET_CHROMA_KEY _IOW('M', 1, struct mfb_chroma_key) +#define MFB_WAIT_FOR_VSYNC _IOW('F', 0x20, u_int32_t) +#define MFB_SET_BRIGHTNESS _IOW('M', 3, __u8) + +#define MFB_SET_ALPHA 0x80014d00 +#define MFB_GET_ALPHA 0x40014d00 +#define MFB_SET_AOID 0x80084d04 +#define MFB_GET_AOID 0x40084d04 +#define MFB_SET_PIXFMT 0x80014d08 +#define MFB_GET_PIXFMT 0x40014d08 + +#define FBIOGET_GWINFO 0x46E0 +#define FBIOPUT_GWINFO 0x46E1 + +#ifdef __KERNEL__ +#include + +/* + * These are the fields of area descriptor(in DDR memory) for every plane + */ +struct diu_ad { + /* Word 0(32-bit) in DDR memory */ +/* __u16 comp; */ +/* __u16 pixel_s:2; */ +/* __u16 pallete:1; */ +/* __u16 red_c:2; */ +/* __u16 green_c:2; */ +/* __u16 blue_c:2; */ +/* __u16 alpha_c:3; */ +/* __u16 byte_f:1; */ +/* __u16 res0:3; */ + + __be32 pix_fmt; /* hard coding pixel format */ + + /* Word 1(32-bit) in DDR memory */ + __le32 addr; + + /* Word 2(32-bit) in DDR memory */ +/* __u32 delta_xs:11; */ +/* __u32 res1:1; */ +/* __u32 delta_ys:11; */ +/* __u32 res2:1; */ +/* __u32 g_alpha:8; */ + __le32 src_size_g_alpha; + + /* Word 3(32-bit) in DDR memory */ +/* __u32 delta_xi:11; */ +/* __u32 res3:5; */ +/* __u32 delta_yi:11; */ +/* __u32 res4:3; */ +/* __u32 flip:2; */ + __le32 aoi_size; + + /* Word 4(32-bit) in DDR memory */ + /*__u32 offset_xi:11; + __u32 res5:5; + __u32 offset_yi:11; + __u32 res6:5; + */ + __le32 offset_xyi; + + /* Word 5(32-bit) in DDR memory */ + /*__u32 offset_xd:11; + __u32 res7:5; + __u32 offset_yd:11; + __u32 res8:5; */ + __le32 offset_xyd; + + + /* Word 6(32-bit) in DDR memory */ + __u8 ckmax_r; + __u8 ckmax_g; + __u8 ckmax_b; + __u8 res9; + + /* Word 7(32-bit) in DDR memory */ + __u8 ckmin_r; + __u8 ckmin_g; + __u8 ckmin_b; + __u8 res10; +/* __u32 res10:8; */ + + /* Word 8(32-bit) in DDR memory */ + __le32 next_ad; + + /* Word 9(32-bit) in DDR memory, just for 64-bit aligned */ + __u32 paddr; +} __attribute__ ((packed)); + +/* DIU register map */ +struct diu { + __be32 desc[3]; + __be32 gamma; + __be32 pallete; + __be32 cursor; + __be32 curs_pos; + __be32 diu_mode; + __be32 bgnd; + __be32 bgnd_wb; + __be32 disp_size; + __be32 wb_size; + __be32 wb_mem_addr; + __be32 hsyn_para; + __be32 vsyn_para; + __be32 syn_pol; + __be32 thresholds; + __be32 int_status; + __be32 int_mask; + __be32 colorbar[8]; + __be32 filling; + __be32 plut; +} __attribute__ ((packed)); + +struct diu_hw { + struct diu *diu_reg; + spinlock_t reg_lock; + + __u32 mode; /* DIU operation mode */ +}; + +struct diu_addr { + __u8 __iomem *vaddr; /* Virtual address */ + dma_addr_t paddr; /* Physical address */ + __u32 offset; +}; + +struct diu_pool { + struct diu_addr ad; + struct diu_addr gamma; + struct diu_addr pallete; + struct diu_addr cursor; +}; + +#define FSL_DIU_BASE_OFFSET 0x2C000 /* Offset of DIU */ +#define INT_LCDC 64 /* DIU interrupt number */ + +#define FSL_AOI_NUM 6 /* 5 AOIs and one dummy AOI */ + /* 1 for plane 0, 2 for plane 1&2 each */ + +/* Minimum X and Y resolutions */ +#define MIN_XRES 64 +#define MIN_YRES 64 + +/* HW cursor parameters */ +#define MAX_CURS 32 + +/* Modes of operation of DIU */ +#define MFB_MODE0 0 /* DIU off */ +#define MFB_MODE1 1 /* All three planes output to 
display */ +#define MFB_MODE2 2 /* Plane 1 to display, planes 2+3 written back*/ +#define MFB_MODE3 3 /* All three planes written back to memory */ +#define MFB_MODE4 4 /* Color bar generation */ + +/* INT_STATUS/INT_MASK field descriptions */ +#define INT_VSYNC 0x01 /* Vsync interrupt */ +#define INT_VSYNC_WB 0x02 /* Vsync interrupt for write back operation */ +#define INT_UNDRUN 0x04 /* Under run exception interrupt */ +#define INT_PARERR 0x08 /* Display parameters error interrupt */ +#define INT_LS_BF_VS 0x10 /* Lines before vsync. interrupt */ + +/* Panels'operation modes */ +#define MFB_TYPE_OUTPUT 0 /* Panel output to display */ +#define MFB_TYPE_OFF 1 /* Panel off */ +#define MFB_TYPE_WB 2 /* Panel written back to memory */ +#define MFB_TYPE_TEST 3 /* Panel generate color bar */ + +#endif /* __KERNEL__ */ +#endif /* __FSL_DIU_FB_H__ */ -- cgit v1.2.3 From 08354809d6c73eb73973e132502a0a4e53250971 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Fri, 25 Jun 2010 18:21:19 +0900 Subject: ahci_platform: Provide for vendor specific init Some AHCI implementations may use Vendor Specific HBA[A0h, FFh] and/or Port[70h, 7Fh] registers to 'prepare' for initialization. For that, the platform needs memory mapped address of AHCI registers. This patch adds the 'mmio' argument and reorders the call to platform init function. Signed-off-by: Jassi Brar Signed-off-by: Jeff Garzik --- drivers/ata/ahci_platform.c | 25 +++++++++++++++---------- include/linux/ahci_platform.h | 4 +++- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 5e11b160f247..68ef6b563b7d 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -54,19 +54,13 @@ static int __init ahci_probe(struct platform_device *pdev) return -EINVAL; } - if (pdata && pdata->init) { - rc = pdata->init(dev); - if (rc) - return rc; - } - if (pdata && pdata->ata_port_info) pi = *pdata->ata_port_info; hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL); if (!hpriv) { - rc = -ENOMEM; - goto err0; + dev_err(dev, "can't alloc ahci_host_priv\n"); + return -ENOMEM; } hpriv->flags |= (unsigned long)pi.private_data; @@ -74,8 +68,19 @@ static int __init ahci_probe(struct platform_device *pdev) hpriv->mmio = devm_ioremap(dev, mem->start, resource_size(mem)); if (!hpriv->mmio) { dev_err(dev, "can't map %pR\n", mem); - rc = -ENOMEM; - goto err0; + return -ENOMEM; + } + + /* + * Some platforms might need to prepare for mmio region access, + * which could be done in the following init call. So, the mmio + * region shouldn't be accessed before init (if provided) has + * returned successfully. 
+ */ + if (pdata && pdata->init) { + rc = pdata->init(dev, hpriv->mmio); + if (rc) + return rc; } ahci_save_initial_config(dev, hpriv, diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index f7dd576dd5a4..be3d9a77d6ed 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -15,11 +15,13 @@ #ifndef _AHCI_PLATFORM_H #define _AHCI_PLATFORM_H +#include + struct device; struct ata_port_info; struct ahci_platform_data { - int (*init)(struct device *dev); + int (*init)(struct device *dev, void __iomem *addr); void (*exit)(struct device *dev); const struct ata_port_info *ata_port_info; unsigned int force_port_map; -- cgit v1.2.3 From 5b6ae5ba0c45c4d04721537308728688414c9e6b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jul 2010 11:42:42 +0200 Subject: libata: more PCI IDs for jmicron controllers Add support for JMB364 and 369. Patch-originally-from: Aries Lee Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/pci/quirks.c | 6 ++++++ include/linux/pci_ids.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 477345d41641..a0c20d9e8396 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1459,6 +1459,7 @@ static void quirk_jmicron_ata(struct pci_dev *pdev) switch (pdev->device) { case PCI_DEVICE_ID_JMICRON_JMB360: /* SATA single port */ case PCI_DEVICE_ID_JMICRON_JMB362: /* SATA dual ports */ + case PCI_DEVICE_ID_JMICRON_JMB364: /* SATA dual ports */ /* The controller should be in single function ahci mode */ conf1 |= 0x0002A100; /* Set 8, 13, 15, 17 */ break; @@ -1470,6 +1471,7 @@ static void quirk_jmicron_ata(struct pci_dev *pdev) /* Fall through */ case PCI_DEVICE_ID_JMICRON_JMB361: case PCI_DEVICE_ID_JMICRON_JMB363: + case PCI_DEVICE_ID_JMICRON_JMB369: /* Enable dual function mode, AHCI on fn 0, IDE fn1 */ /* Set the class codes correctly and then direct IDE 0 */ conf1 |= 0x00C2A1B3; /* Set 0, 1, 4, 5, 7, 8, 13, 15, 17, 22, 23 */ @@ -1496,16 +1498,20 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB360, qui DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, quirk_jmicron_ata); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB362, quirk_jmicron_ata); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, quirk_jmicron_ata); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB364, quirk_jmicron_ata); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, quirk_jmicron_ata); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, quirk_jmicron_ata); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, quirk_jmicron_ata); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB369, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB360, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB362, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, quirk_jmicron_ata); +DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB364, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, quirk_jmicron_ata); 
DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, quirk_jmicron_ata); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, quirk_jmicron_ata); +DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB369, quirk_jmicron_ata); #endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3bedcc149c84..eb200e6beb65 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2324,9 +2324,11 @@ #define PCI_DEVICE_ID_JMICRON_JMB361 0x2361 #define PCI_DEVICE_ID_JMICRON_JMB362 0x2362 #define PCI_DEVICE_ID_JMICRON_JMB363 0x2363 +#define PCI_DEVICE_ID_JMICRON_JMB364 0x2364 #define PCI_DEVICE_ID_JMICRON_JMB365 0x2365 #define PCI_DEVICE_ID_JMICRON_JMB366 0x2366 #define PCI_DEVICE_ID_JMICRON_JMB368 0x2368 +#define PCI_DEVICE_ID_JMICRON_JMB369 0x2369 #define PCI_DEVICE_ID_JMICRON_JMB38X_SD 0x2381 #define PCI_DEVICE_ID_JMICRON_JMB38X_MMC 0x2382 #define PCI_DEVICE_ID_JMICRON_JMB38X_MS 0x2383 -- cgit v1.2.3 From a51dca9cd3bb4ec5a05bfb6feabf024a5c808a37 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Aug 2010 08:43:25 -0400 Subject: jbd2: Use atomic variables to avoid taking t_handle_lock in jbd2_journal_stop By using an atomic_t for t_updates and t_outstanding_credits, this should allow us to avoid taking the transaction's t_handle_lock in jbd2_journal_stop(). Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 13 +++++----- fs/jbd2/transaction.c | 69 +++++++++++++++++++++++++++++---------------------- include/linux/jbd2.h | 8 +++--- 4 files changed, 51 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 076d1cc44f95..f8cdc02520f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af056810acb6..fbd2c564e916 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -417,12 +417,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { + while (atomic_read(&commit_transaction->t_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { + if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); @@ -433,7 +433,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } spin_unlock(&commit_transaction->t_handle_lock); - J_ASSERT (commit_transaction->t_outstanding_credits <= + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); /* @@ -527,11 +527,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks = +
atomic_read(&commit_transaction->t_outstanding_credits); stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); + atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; descriptor = NULL; @@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ - commit_transaction->t_outstanding_credits--; + atomic_dec(&commit_transaction->t_outstanding_credits); /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 001e95fb0fe1..9c64c7ec48d4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -55,6 +55,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -177,7 +179,7 @@ repeat_locked: * checkpoint to free some more log space. */ spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; + needed = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (needed > journal->j_max_transaction_buffers) { /* @@ -240,11 +242,12 @@ repeat_locked: } handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; + atomic_add(nblocks, &transaction->t_outstanding_credits); + atomic_inc(&transaction->t_updates); transaction->t_handle_count++; jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); @@ -369,7 +372,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " @@ -384,7 +387,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -426,15 +429,14 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) * First unlink the handle from its current transaction, and start the * commit on that. 
*/ - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); spin_unlock(&transaction->t_handle_lock); @@ -481,7 +483,7 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { + if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); break; } @@ -1258,7 +1260,8 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int err; + int err, wait_for_commit = 0; + tid_t tid; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1266,7 +1269,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; else { - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); err = 0; } @@ -1334,14 +1337,8 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit @@ -1350,15 +1347,13 @@ int jbd2_journal_stop(handle_t *handle) * transaction is too old now. */ if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ - tid_t tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ @@ -1369,11 +1364,25 @@ int jbd2_journal_stop(handle_t *handle) * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = jbd2_log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); + wait_for_commit = 1; } + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start commiting on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. 
+ */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5a72bc75b273..a72ce21de0e1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -601,13 +601,13 @@ struct transaction_s * Number of outstanding updates running on this transaction * [t_handle_lock] */ - int t_updates; + atomic_t t_updates; /* * Number of buffers reserved for use by all handles in this transaction * handle but not yet modified. [t_handle_lock] */ - int t_outstanding_credits; + atomic_t t_outstanding_credits; /* * Forward and backward links for the circular list of all transactions @@ -1258,8 +1258,8 @@ static inline int jbd_space_needed(journal_t *journal) { int nblocks = journal->j_max_transaction_buffers; if (journal->j_committing_transaction) - nblocks += journal->j_committing_transaction-> - t_outstanding_credits; + nblocks += atomic_read(&journal->j_committing_transaction-> + t_outstanding_credits); return nblocks; } -- cgit v1.2.3 From 078ff546a806b2c2ab74c25c8edd4c6d4680656a Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Wed, 17 Mar 2010 20:36:51 +0200 Subject: OMAP: DSS2: OMAPFB: Add support for switching memory regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separate the memory region from the framebuffer device a little bit. It's now possible to select the memory region used by the framebuffer device using the new mem_idx parameter of omapfb_plane_info. If the mem_idx is specified it will be interpreted as an index into the memory regions array; if it's not specified the framebuffer's index is used instead. So by default each framebuffer keeps using its own memory region, which preserves backwards compatibility. This allows cloning the same memory region to several overlays and yet each overlay can be controlled independently since they can be associated with separate framebuffer devices.
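
A minimal sketch of the region selection rule described above, using the OMAPFB_MEM_IDX_* constants and the get_mem_region() logic this patch introduces (decode_mem_idx() is a hypothetical helper for illustration, not part of the diff):

	/*
	 * mem_idx as introduced by this patch: bit 7 says "an explicit
	 * region was requested", bits 0-6 carry the region index.  When
	 * the flag is clear the framebuffer falls back to the region
	 * with its own id, preserving the old 1:1 fb-to-region mapping.
	 */
	#define OMAPFB_MEM_IDX_ENABLED	0x80
	#define OMAPFB_MEM_IDX_MASK	0x7f

	/* Returns the region index to use, or -1 if it is out of range. */
	static int decode_mem_idx(unsigned char mem_idx, int fb_id,
				  int num_regions)
	{
		int idx;

		if (mem_idx & OMAPFB_MEM_IDX_ENABLED)
			idx = mem_idx & OMAPFB_MEM_IDX_MASK;
		else
			idx = fb_id;	/* default: the fb's own region */

		return idx < num_regions ? idx : -1;
	}

With the flag clear, decode_mem_idx(0, id, n) degenerates to the old one-region-per-framebuffer behaviour, which is why existing userspace keeps working unchanged.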
Signed-off-by: Ville Syrjälä Signed-off-by: Tomi Valkeinen --- drivers/video/omap2/omapfb/omapfb-ioctl.c | 125 ++++++++++++++++++++++++------ drivers/video/omap2/omapfb/omapfb-main.c | 71 ++++++++++------- drivers/video/omap2/omapfb/omapfb-sysfs.c | 37 ++++++--- drivers/video/omap2/omapfb/omapfb.h | 9 ++- include/linux/omapfb.h | 5 +- 5 files changed, 183 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/omap2/omapfb/omapfb-ioctl.c b/drivers/video/omap2/omapfb/omapfb-ioctl.c index 9c7361871d78..6635bd75affa 100644 --- a/drivers/video/omap2/omapfb/omapfb-ioctl.c +++ b/drivers/video/omap2/omapfb/omapfb-ioctl.c @@ -34,12 +34,37 @@ #include "omapfb.h" +static u8 get_mem_idx(struct omapfb_info *ofbi) +{ + if (ofbi->id == ofbi->region->id) + return 0; + + return OMAPFB_MEM_IDX_ENABLED | ofbi->region->id; +} + +static struct omapfb2_mem_region *get_mem_region(struct omapfb_info *ofbi, + u8 mem_idx) +{ + struct omapfb2_device *fbdev = ofbi->fbdev; + + if (mem_idx & OMAPFB_MEM_IDX_ENABLED) + mem_idx &= OMAPFB_MEM_IDX_MASK; + else + mem_idx = ofbi->id; + + if (mem_idx >= fbdev->num_fbs) + return NULL; + + return &fbdev->regions[mem_idx]; +} + static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi) { struct omapfb_info *ofbi = FB2OFB(fbi); struct omapfb2_device *fbdev = ofbi->fbdev; struct omap_overlay *ovl; - struct omap_overlay_info info; + struct omap_overlay_info old_info; + struct omapfb2_mem_region *old_rg, *new_rg; int r = 0; DBG("omapfb_setup_plane\n"); @@ -52,7 +77,14 @@ static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi) /* XXX uses only the first overlay */ ovl = ofbi->overlays[0]; - if (pi->enabled && !ofbi->region.size) { + old_rg = ofbi->region; + new_rg = get_mem_region(ofbi, pi->mem_idx); + if (!new_rg) { + r = -EINVAL; + goto out; + } + + if (pi->enabled && !new_rg->size) { /* * This plane's memory was freed, can't enable it * until it's reallocated. 
@@ -61,27 +93,60 @@ static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi) goto out; } - ovl->get_overlay_info(ovl, &info); + ovl->get_overlay_info(ovl, &old_info); - info.pos_x = pi->pos_x; - info.pos_y = pi->pos_y; - info.out_width = pi->out_width; - info.out_height = pi->out_height; - info.enabled = pi->enabled; + if (old_rg != new_rg) { + ofbi->region = new_rg; + set_fb_fix(fbi); + } - r = ovl->set_overlay_info(ovl, &info); - if (r) - goto out; + if (pi->enabled) { + struct omap_overlay_info info; + + r = omapfb_setup_overlay(fbi, ovl, pi->pos_x, pi->pos_y, + pi->out_width, pi->out_height); + if (r) + goto undo; - if (ovl->manager) { - r = ovl->manager->apply(ovl->manager); + ovl->get_overlay_info(ovl, &info); + + if (!info.enabled) { + info.enabled = pi->enabled; + r = ovl->set_overlay_info(ovl, &info); + if (r) + goto undo; + } + } else { + struct omap_overlay_info info; + + ovl->get_overlay_info(ovl, &info); + + info.enabled = pi->enabled; + info.pos_x = pi->pos_x; + info.pos_y = pi->pos_y; + info.out_width = pi->out_width; + info.out_height = pi->out_height; + + r = ovl->set_overlay_info(ovl, &info); if (r) - goto out; + goto undo; } -out: - if (r) - dev_err(fbdev->dev, "setup_plane failed\n"); + if (ovl->manager) + ovl->manager->apply(ovl->manager); + + return 0; + + undo: + if (old_rg != new_rg) { + ofbi->region = old_rg; + set_fb_fix(fbi); + } + + ovl->set_overlay_info(ovl, &old_info); + out: + dev_err(fbdev->dev, "setup_plane failed\n"); + return r; } @@ -92,8 +157,8 @@ static int omapfb_query_plane(struct fb_info *fbi, struct omapfb_plane_info *pi) if (ofbi->num_overlays != 1) { memset(pi, 0, sizeof(*pi)); } else { - struct omap_overlay_info *ovli; struct omap_overlay *ovl; + struct omap_overlay_info *ovli; ovl = ofbi->overlays[0]; ovli = &ovl->info; @@ -103,6 +168,7 @@ static int omapfb_query_plane(struct fb_info *fbi, struct omapfb_plane_info *pi) pi->enabled = ovli->enabled; pi->channel_out = 0; /* xxx */ pi->mirror = 0; + pi->mem_idx = get_mem_idx(ofbi); pi->out_width = ovli->out_width; pi->out_height = ovli->out_height; } @@ -123,11 +189,24 @@ static int omapfb_setup_mem(struct fb_info *fbi, struct omapfb_mem_info *mi) size = PAGE_ALIGN(mi->size); - rg = &ofbi->region; + rg = ofbi->region; - for (i = 0; i < ofbi->num_overlays; i++) { - if (ofbi->overlays[i]->info.enabled) - return -EBUSY; + if (atomic_read(&rg->map_count)) + return -EBUSY; + + for (i = 0; i < fbdev->num_fbs; i++) { + struct omapfb_info *ofbi2 = FB2OFB(fbdev->fbs[i]); + int j; + + if (ofbi2->region != rg) + continue; + + for (j = 0; j < ofbi2->num_overlays; j++) { + if (ofbi2->overlays[j]->info.enabled) { + r = -EBUSY; + return r; + } + } } if (rg->size != size || rg->type != mi->type) { @@ -146,7 +225,7 @@ static int omapfb_query_mem(struct fb_info *fbi, struct omapfb_mem_info *mi) struct omapfb_info *ofbi = FB2OFB(fbi); struct omapfb2_mem_region *rg; - rg = &ofbi->region; + rg = ofbi->region; memset(mi, 0, sizeof(*mi)); mi->size = rg->size; diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index ccccf3d71a39..4a0588022b33 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c @@ -157,7 +157,7 @@ static void fill_fb(struct fb_info *fbi) static unsigned omapfb_get_vrfb_offset(const struct omapfb_info *ofbi, int rot) { - const struct vrfb *vrfb = &ofbi->region.vrfb; + const struct vrfb *vrfb = &ofbi->region->vrfb; unsigned offset; switch (rot) { @@ -185,27 +185,27 @@ static unsigned 
omapfb_get_vrfb_offset(const struct omapfb_info *ofbi, int rot) static u32 omapfb_get_region_rot_paddr(const struct omapfb_info *ofbi, int rot) { if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB) { - return ofbi->region.vrfb.paddr[rot] + return ofbi->region->vrfb.paddr[rot] + omapfb_get_vrfb_offset(ofbi, rot); } else { - return ofbi->region.paddr; + return ofbi->region->paddr; } } static u32 omapfb_get_region_paddr(const struct omapfb_info *ofbi) { if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB) - return ofbi->region.vrfb.paddr[0]; + return ofbi->region->vrfb.paddr[0]; else - return ofbi->region.paddr; + return ofbi->region->paddr; } static void __iomem *omapfb_get_region_vaddr(const struct omapfb_info *ofbi) { if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB) - return ofbi->region.vrfb.vaddr[0]; + return ofbi->region->vrfb.vaddr[0]; else - return ofbi->region.vaddr; + return ofbi->region->vaddr; } static struct omapfb_colormode omapfb_colormodes[] = { @@ -450,7 +450,7 @@ static int check_vrfb_fb_size(unsigned long region_size, static int check_fb_size(const struct omapfb_info *ofbi, struct fb_var_screeninfo *var) { - unsigned long max_frame_size = ofbi->region.size; + unsigned long max_frame_size = ofbi->region->size; int bytespp = var->bits_per_pixel >> 3; unsigned long line_size = var->xres_virtual * bytespp; @@ -497,7 +497,7 @@ static int check_fb_size(const struct omapfb_info *ofbi, static int setup_vrfb_rotation(struct fb_info *fbi) { struct omapfb_info *ofbi = FB2OFB(fbi); - struct omapfb2_mem_region *rg = &ofbi->region; + struct omapfb2_mem_region *rg = ofbi->region; struct vrfb *vrfb = &rg->vrfb; struct fb_var_screeninfo *var = &fbi->var; struct fb_fix_screeninfo *fix = &fbi->fix; @@ -558,9 +558,9 @@ static int setup_vrfb_rotation(struct fb_info *fbi) return r; /* used by open/write in fbmem.c */ - fbi->screen_base = ofbi->region.vrfb.vaddr[0]; + fbi->screen_base = ofbi->region->vrfb.vaddr[0]; - fix->smem_start = ofbi->region.vrfb.paddr[0]; + fix->smem_start = ofbi->region->vrfb.paddr[0]; switch (var->nonstd) { case OMAPFB_COLOR_YUV422: @@ -599,7 +599,7 @@ void set_fb_fix(struct fb_info *fbi) struct fb_fix_screeninfo *fix = &fbi->fix; struct fb_var_screeninfo *var = &fbi->var; struct omapfb_info *ofbi = FB2OFB(fbi); - struct omapfb2_mem_region *rg = &ofbi->region; + struct omapfb2_mem_region *rg = ofbi->region; DBG("set_fb_fix\n"); @@ -688,7 +688,7 @@ int check_fb_var(struct fb_info *fbi, struct fb_var_screeninfo *var) return -EINVAL; /* When no memory is allocated ignore the size check */ - if (ofbi->region.size != 0 && check_fb_size(ofbi, var)) + if (ofbi->region->size != 0 && check_fb_size(ofbi, var)) return -EINVAL; if (var->xres + var->xoffset > var->xres_virtual) @@ -856,7 +856,7 @@ static void omapfb_calc_addr(const struct omapfb_info *ofbi, } /* setup overlay according to the fb */ -static int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl, +int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl, u16 posx, u16 posy, u16 outw, u16 outh) { int r = 0; @@ -892,7 +892,7 @@ static int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl, yres = var->yres; } - if (ofbi->region.size) + if (ofbi->region->size) omapfb_calc_addr(ofbi, var, fix, rotation, &data_start_p, &data_start_v); @@ -971,7 +971,7 @@ int omapfb_apply_changes(struct fb_info *fbi, int init) DBG("apply_changes, fb %d, ovl %d\n", ofbi->id, ovl->id); - if (ofbi->region.size == 0) { + if (ofbi->region->size == 0) { /* the fb is not available. 
disable the overlay */ omapfb_overlay_enable(ovl, 0); if (!init && ovl->manager) @@ -1071,16 +1071,16 @@ static int omapfb_pan_display(struct fb_var_screeninfo *var, static void mmap_user_open(struct vm_area_struct *vma) { - struct omapfb_info *ofbi = (struct omapfb_info *)vma->vm_private_data; + struct omapfb2_mem_region *rg = vma->vm_private_data; - atomic_inc(&ofbi->map_count); + atomic_inc(&rg->map_count); } static void mmap_user_close(struct vm_area_struct *vma) { - struct omapfb_info *ofbi = (struct omapfb_info *)vma->vm_private_data; + struct omapfb2_mem_region *rg = vma->vm_private_data; - atomic_dec(&ofbi->map_count); + atomic_dec(&rg->map_count); } static struct vm_operations_struct mmap_user_ops = { @@ -1092,6 +1092,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) { struct omapfb_info *ofbi = FB2OFB(fbi); struct fb_fix_screeninfo *fix = &fbi->fix; + struct omapfb2_mem_region *rg; unsigned long off; unsigned long start; u32 len; @@ -1102,6 +1103,8 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) return -EINVAL; off = vma->vm_pgoff << PAGE_SHIFT; + rg = ofbi->region; + start = omapfb_get_region_paddr(ofbi); len = fix->smem_len; if (off >= len) @@ -1117,12 +1120,12 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) vma->vm_flags |= VM_IO | VM_RESERVED; vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &mmap_user_ops; - vma->vm_private_data = ofbi; + vma->vm_private_data = rg; if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; /* vm_ops.open won't be called for mmap itself. */ - atomic_inc(&ofbi->map_count); + atomic_inc(&rg->map_count); return 0; } @@ -1312,7 +1315,9 @@ static void omapfb_free_fbmem(struct fb_info *fbi) struct omapfb2_device *fbdev = ofbi->fbdev; struct omapfb2_mem_region *rg; - rg = &ofbi->region; + rg = ofbi->region; + + WARN_ON(atomic_read(&rg->map_count)); if (rg->paddr) if (omap_vram_free(rg->paddr, rg->size)) @@ -1367,8 +1372,15 @@ static int omapfb_alloc_fbmem(struct fb_info *fbi, unsigned long size, void __iomem *vaddr; int r; - rg = &ofbi->region; - memset(rg, 0, sizeof(*rg)); + rg = ofbi->region; + + rg->paddr = 0; + rg->vaddr = NULL; + memset(&rg->vrfb, 0, sizeof rg->vrfb); + rg->size = 0; + rg->type = 0; + rg->alloc = false; + rg->map = false; size = PAGE_ALIGN(size); @@ -1621,7 +1633,7 @@ static int omapfb_allocate_all_fbs(struct omapfb2_device *fbdev) for (i = 0; i < fbdev->num_fbs; i++) { struct omapfb_info *ofbi = FB2OFB(fbdev->fbs[i]); struct omapfb2_mem_region *rg; - rg = &ofbi->region; + rg = ofbi->region; DBG("region%d phys %08x virt %p size=%lu\n", i, @@ -1638,7 +1650,7 @@ int omapfb_realloc_fbmem(struct fb_info *fbi, unsigned long size, int type) struct omapfb_info *ofbi = FB2OFB(fbi); struct omapfb2_device *fbdev = ofbi->fbdev; struct omap_dss_device *display = fb2display(fbi); - struct omapfb2_mem_region *rg = &ofbi->region; + struct omapfb2_mem_region *rg = ofbi->region; unsigned long old_size = rg->size; unsigned long old_paddr = rg->paddr; int old_type = rg->type; @@ -1721,7 +1733,7 @@ static int omapfb_fb_init(struct omapfb2_device *fbdev, struct fb_info *fbi) fbi->flags = FBINFO_FLAG_DEFAULT; fbi->pseudo_palette = fbdev->pseudo_palette; - if (ofbi->region.size == 0) { + if (ofbi->region->size == 0) { clear_fb_info(fbi); return 0; } @@ -1883,6 +1895,9 @@ static int omapfb_create_framebuffers(struct omapfb2_device *fbdev) ofbi->fbdev = fbdev; ofbi->id = i; + 
ofbi->region = &fbdev->regions[i]; + ofbi->region->id = i; + /* assign these early, so that fb alloc can use them */ ofbi->rotation_type = def_vrfb ? OMAP_DSS_ROT_VRFB : OMAP_DSS_ROT_DMA; diff --git a/drivers/video/omap2/omapfb/omapfb-sysfs.c b/drivers/video/omap2/omapfb/omapfb-sysfs.c index 5179219128bd..dea1aa46a7db 100644 --- a/drivers/video/omap2/omapfb/omapfb-sysfs.c +++ b/drivers/video/omap2/omapfb/omapfb-sysfs.c @@ -64,7 +64,7 @@ static ssize_t store_rotate_type(struct device *dev, if (rot_type == ofbi->rotation_type) goto out; - if (ofbi->region.size) { + if (ofbi->region->size) { r = -EBUSY; goto out; } @@ -408,7 +408,7 @@ static ssize_t show_size(struct device *dev, struct fb_info *fbi = dev_get_drvdata(dev); struct omapfb_info *ofbi = FB2OFB(fbi); - return snprintf(buf, PAGE_SIZE, "%lu\n", ofbi->region.size); + return snprintf(buf, PAGE_SIZE, "%lu\n", ofbi->region->size); } static ssize_t store_size(struct device *dev, struct device_attribute *attr, @@ -416,6 +416,8 @@ static ssize_t store_size(struct device *dev, struct device_attribute *attr, { struct fb_info *fbi = dev_get_drvdata(dev); struct omapfb_info *ofbi = FB2OFB(fbi); + struct omapfb2_device *fbdev = ofbi->fbdev; + struct omapfb2_mem_region *rg; unsigned long size; int r; int i; @@ -425,15 +427,30 @@ static ssize_t store_size(struct device *dev, struct device_attribute *attr, if (!lock_fb_info(fbi)) return -ENODEV; - for (i = 0; i < ofbi->num_overlays; i++) { - if (ofbi->overlays[i]->info.enabled) { - r = -EBUSY; - goto out; + rg = ofbi->region; + + if (atomic_read(&rg->map_count)) { + r = -EBUSY; + goto out; + } + + for (i = 0; i < fbdev->num_fbs; i++) { + struct omapfb_info *ofbi2 = FB2OFB(fbdev->fbs[i]); + int j; + + if (ofbi2->region != rg) + continue; + + for (j = 0; j < ofbi2->num_overlays; j++) { + if (ofbi2->overlays[j]->info.enabled) { + r = -EBUSY; + goto out; + } } } - if (size != ofbi->region.size) { - r = omapfb_realloc_fbmem(fbi, size, ofbi->region.type); + if (size != ofbi->region->size) { + r = omapfb_realloc_fbmem(fbi, size, ofbi->region->type); if (r) { dev_err(dev, "realloc fbmem failed\n"); goto out; @@ -453,7 +470,7 @@ static ssize_t show_phys(struct device *dev, struct fb_info *fbi = dev_get_drvdata(dev); struct omapfb_info *ofbi = FB2OFB(fbi); - return snprintf(buf, PAGE_SIZE, "%0x\n", ofbi->region.paddr); + return snprintf(buf, PAGE_SIZE, "%0x\n", ofbi->region->paddr); } static ssize_t show_virt(struct device *dev, @@ -462,7 +479,7 @@ static ssize_t show_virt(struct device *dev, struct fb_info *fbi = dev_get_drvdata(dev); struct omapfb_info *ofbi = FB2OFB(fbi); - return snprintf(buf, PAGE_SIZE, "%p\n", ofbi->region.vaddr); + return snprintf(buf, PAGE_SIZE, "%p\n", ofbi->region->vaddr); } static struct device_attribute omapfb_attrs[] = { diff --git a/drivers/video/omap2/omapfb/omapfb.h b/drivers/video/omap2/omapfb/omapfb.h index c9866be0460a..02f1ba9b228c 100644 --- a/drivers/video/omap2/omapfb/omapfb.h +++ b/drivers/video/omap2/omapfb/omapfb.h @@ -44,6 +44,7 @@ extern unsigned int omapfb_debug; #define OMAPFB_MAX_OVL_PER_FB 3 struct omapfb2_mem_region { + int id; u32 paddr; void __iomem *vaddr; struct vrfb vrfb; @@ -51,13 +52,13 @@ struct omapfb2_mem_region { u8 type; /* OMAPFB_PLANE_MEM_* */ bool alloc; /* allocated by the driver */ bool map; /* kernel mapped by the driver */ + atomic_t map_count; }; /* appended to fb_info */ struct omapfb_info { int id; - struct omapfb2_mem_region region; - atomic_t map_count; + struct omapfb2_mem_region *region; int num_overlays; struct omap_overlay 
*overlays[OMAPFB_MAX_OVL_PER_FB]; struct omapfb2_device *fbdev; @@ -76,6 +77,7 @@ struct omapfb2_device { unsigned num_fbs; struct fb_info *fbs[10]; + struct omapfb2_mem_region regions[10]; unsigned num_displays; struct omap_dss_device *displays[10]; @@ -117,6 +119,9 @@ int omapfb_update_window(struct fb_info *fbi, int dss_mode_to_fb_mode(enum omap_color_mode dssmode, struct fb_var_screeninfo *var); +int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl, + u16 posx, u16 posy, u16 outw, u16 outh); + /* find the display connected to this fb, if any */ static inline struct omap_dss_device *fb2display(struct fb_info *fbi) { diff --git a/include/linux/omapfb.h b/include/linux/omapfb.h index 9bdd91486b49..0ecf7311c1ae 100644 --- a/include/linux/omapfb.h +++ b/include/linux/omapfb.h @@ -85,6 +85,9 @@ #define OMAPFB_MEMTYPE_SRAM 1 #define OMAPFB_MEMTYPE_MAX 1 +#define OMAPFB_MEM_IDX_ENABLED 0x80 +#define OMAPFB_MEM_IDX_MASK 0x7f + enum omapfb_color_format { OMAPFB_COLOR_RGB565 = 0, OMAPFB_COLOR_YUV422, @@ -136,7 +139,7 @@ struct omapfb_plane_info { __u8 enabled; __u8 channel_out; __u8 mirror; - __u8 reserved1; + __u8 mem_idx; __u32 out_width; __u32 out_height; __u32 reserved2[12]; -- cgit v1.2.3 From a931da6ac9331a6c80dd91c199105806f2336188 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Aug 2010 21:35:12 -0400 Subject: jbd2: Change j_state_lock to be a rwlock_t Lockstat reports have shown that j_state_lock is a major source of lock contention, especially on systems with more than 4 CPU cores. So change it to be a read/write spinlock. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 +-- fs/ext4/super.c | 4 +-- fs/jbd2/checkpoint.c | 16 ++++----- fs/jbd2/commit.c | 26 +++++++------- fs/jbd2/journal.c | 94 +++++++++++++++++++++++++-------------------------- fs/jbd2/transaction.c | 74 +++++++++++++++++++++------------------- fs/ocfs2/journal.c | 4 +-- include/linux/jbd2.h | 2 +- 8 files changed, 114 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 533b607f9cb5..ab2247d642c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5066,7 +5066,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else @@ -5075,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) tid = transaction->t_tid; else tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3fd65eb66ccd..81cb3fc1218e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3232,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else @@ -3241,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, diff --git 
a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index f8cdc02520f9..1c23a0f4e8a3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh) void __jbd2_log_wait_for_space(journal_t *journal) { int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ nblocks = jbd_space_needed(journal); while (__jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); /* @@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); space_left = __jbd2_log_space_left(journal); @@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) if (journal->j_committing_transaction) tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) == 0) { @@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } @@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * next transaction ID we will write, and where it will * start. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { @@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* If the oldest pinned transaction is at the tail of the log already then there's not much we can do right now. 
*/ if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 1; } @@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * If there is an external journal, we need to make sure that diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fbd2c564e916..67bb0a2f35e5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -152,9 +152,9 @@ static int journal_submit_commit_record(journal_t *journal, printk(KERN_WARNING "JBD2: Disabling barriers on %s, " "not supported by device\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* And try again, without the barrier */ lock_buffer(bh); @@ -182,9 +182,9 @@ retry: printk(KERN_WARNING "JBD2: %s: disabling barries on %s - not supported " "by device\n", __func__, journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); lock_buffer(bh); clear_buffer_dirty(bh); @@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; /* @@ -424,9 +424,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) TASK_UNINTERRUPTIBLE); if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); @@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); @@ -519,9 +519,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); stats.run.rs_logging = jiffies; @@ -978,7 +978,7 @@ restart_loop: * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... 
*/ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction @@ -986,7 +986,7 @@ restart_loop: */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); goto restart_loop; } @@ -1038,7 +1038,7 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a79d3345b55a..e7bf0fd9cec7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,7 +142,7 @@ static int kjournald2(void *arg) /* * And now, wait forever for commit wakeup events. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) @@ -153,10 +153,10 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); goto loop; } @@ -168,9 +168,9 @@ loop: * be already stopped. */ jbd_debug(1, "Now suspending kjournald2\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); refrigerator(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, @@ -190,9 +190,9 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } @@ -210,7 +210,7 @@ loop: goto loop; end_loop: - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); @@ -233,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal) static void journal_kill_thread(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -452,7 +452,7 @@ int __jbd2_log_space_left(journal_t *journal) { int left = journal->j_free; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ /* * Be pessimistic here about the number of those free blocks which @@ -497,9 +497,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -518,7 +518,7 @@ int 
jbd2_journal_force_commit_nested(journal_t *journal) transaction_t *transaction = NULL; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); @@ -526,12 +526,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return 0; /* Nothing to retry */ } tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); return 1; } @@ -545,7 +545,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; @@ -564,7 +564,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -576,26 +576,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG - spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } - spin_unlock(&journal->j_state_lock); #endif - spin_lock(&journal->j_state_lock); while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { printk(KERN_EMERG "journal commit I/O error\n"); @@ -612,7 +610,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; @@ -620,7 +618,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } @@ -840,7 +838,7 @@ static journal_t * journal_init_common (void) mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); + rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; @@ -1106,14 +1104,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = 
cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); @@ -1134,12 +1132,12 @@ out: * any future commit will have to be careful to update the * superblock again to re-record the true start of the log. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (sb->s_start) journal->j_flags &= ~JBD2_FLUSHED; else journal->j_flags |= JBD2_FLUSHED; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -1551,7 +1549,7 @@ int jbd2_journal_flush(journal_t *journal) transaction_t *transaction = NULL; unsigned long old_tail; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { @@ -1564,10 +1562,10 @@ int jbd2_journal_flush(journal_t *journal) if (transaction) { tid_t tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ @@ -1591,12 +1589,12 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); old_tail = journal->j_tail; journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_tail = old_tail; J_ASSERT(!journal->j_running_transaction); @@ -1604,7 +1602,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } @@ -1668,12 +1666,12 @@ void __jbd2_journal_abort_hard(journal_t *journal) printk(KERN_ERR "Aborting journal on device %s.\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* Soft abort: record the abort error status in the journal superblock, @@ -1758,12 +1756,12 @@ int jbd2_journal_errno(journal_t *journal) { int err; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return err; } @@ -1778,12 +1776,12 @@ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return err; } @@ -1796,10 +1794,10 @@ int jbd2_journal_clear_err(journal_t *journal) */ void jbd2_journal_ack_err(journal_t *journal) { - 
spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9c64c7ec48d4..663065142b42 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -124,36 +124,38 @@ alloc_transaction: jbd_debug(3, "New handle %p going live.\n", handle); -repeat: - /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ - spin_lock(&journal->j_state_lock); -repeat_locked: +repeat: + read_lock(&journal->j_state_lock); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); kfree(new_transaction); return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); + if (!new_transaction) goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; } - jbd2_get_transaction(journal, new_transaction); - new_transaction = NULL; + write_unlock(&journal->j_state_lock); + goto repeat; } transaction = journal->j_running_transaction; @@ -167,7 +169,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -194,7 +196,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -228,8 +230,12 @@ repeat_locked: if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); spin_unlock(&transaction->t_handle_lock); - __jbd2_log_wait_for_space(journal); - goto repeat_locked; + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; } /* OK, account for the buffers that this operation expects to @@ -250,7 +256,7 @@ repeat_locked: atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); kfree(new_transaction); @@ -362,7 +368,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) result = 1; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! 
*/ if (handle->h_transaction->t_state != T_RUNNING) { @@ -394,7 +400,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) unlock: spin_unlock(&transaction->t_handle_lock); error_out: - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); out: return result; } @@ -432,7 +438,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); atomic_sub(handle->h_buffer_credits, &transaction->t_outstanding_credits); @@ -442,7 +448,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) jbd_debug(2, "restarting handle %p\n", handle); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; @@ -472,7 +478,7 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -490,12 +496,12 @@ void jbd2_journal_lock_updates(journal_t *journal) prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but @@ -519,9 +525,9 @@ void jbd2_journal_unlock_updates (journal_t *journal) J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } @@ -1314,9 +1320,9 @@ int jbd2_journal_stop(handle_t *handle) journal->j_last_sync_writer = pid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); @@ -1748,7 +1754,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1801,7 +1807,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* There is no currently-running transaction. 
So the @@ -1815,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* The orphan record's transaction has @@ -1839,7 +1845,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -1858,7 +1864,7 @@ zap_buffer: zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2165,9 +2171,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf16418..9c1b92ebeb94 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) if (osb->osb_commit_interval) commit_interval = osb->osb_commit_interval; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a72ce21de0e1..15d5743ccfbb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -764,7 +764,7 @@ struct journal_s /* * Protect the various scalars in the journal */ - spinlock_t j_state_lock; + rwlock_t j_state_lock; /* * Number of processes waiting to create a barrier lock [j_state_lock] -- cgit v1.2.3 From 8dd420466c7bfc459fa04680bd5690bfc41a4553 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Aug 2010 21:38:29 -0400 Subject: jbd2: Remove t_handle_lock from start_this_handle() This removes the last exclusive lock from start_this_handle(), so that multiple transactions can now be started at the same time on large SMP systems.
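The core of the change is that credit accounting moves from a lock-guarded read-modify-write to a single atomic_add_return() that is backed out with atomic_sub() if the reservation overshoots. A minimal userspace C11 sketch of that reserve-or-undo pattern (the names and the limit are illustrative, not the kernel's):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Illustrative stand-ins for the transaction's credit pool. */
    static atomic_int outstanding_credits;
    static const int max_transaction_buffers = 128;

    /*
     * Optimistically reserve nblocks of log credits without holding a
     * lock.  If the reservation overshoots the limit, back it out and
     * report failure so the caller can wait for a commit and retry.
     */
    static int reserve_credits(int nblocks)
    {
            int needed = atomic_fetch_add(&outstanding_credits, nblocks) + nblocks;

            if (needed > max_transaction_buffers) {
                    atomic_fetch_sub(&outstanding_credits, nblocks); /* undo */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            if (reserve_credits(32) == 0)
                    printf("reserved, outstanding = %d\n",
                           atomic_load(&outstanding_credits));
            return 0;
    }

Concurrent reservers may transiently over-reserve and retry, but no reader of the credit count can ever see a torn or stale value, which is what lets the handle lock go away.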
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 ++- fs/jbd2/transaction.c | 33 ++++++++++++++++++++++----------- include/linux/jbd2.h | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 67bb0a2f35e5..f52e5e8049f1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1004,7 +1004,8 @@ restart_loop: * File the transaction statistics */ stats.ts_tid = commit_transaction->t_tid; - stats.run.rs_handle_count = commit_transaction->t_handle_count; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 663065142b42..0752bcda535f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -57,6 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -180,8 +181,8 @@ repeat: * buffers requested by this operation, we need to stall pending a log * checkpoint to free some more log space. */ - spin_lock(&transaction->t_handle_lock); - needed = atomic_read(&transaction->t_outstanding_credits) + nblocks; + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* @@ -192,7 +193,7 @@ repeat: DEFINE_WAIT(wait); jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); @@ -229,7 +230,7 @@ repeat: */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); read_unlock(&journal->j_state_lock); write_lock(&journal->j_state_lock); if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) @@ -239,23 +240,33 @@ repeat: } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - if (time_after(transaction->t_start, ts)) { + * use and add the handle to the running transaction. + * + * In order for t_max_wait to be reliable, it must be + * protected by a lock. But doing so will mean that + * start_this_handle() can not be run in parallel on SMP + * systems, which limits our scalability. So we only enable + * it when debugging is enabled. We may want to use a + * separate flag, eventually, so we can enable this + * independently of debugging. 
+ */ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); if (ts > transaction->t_max_wait) transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); } - +#endif handle->h_transaction = transaction; - atomic_add(nblocks, &transaction->t_outstanding_credits); atomic_inc(&transaction->t_updates); - transaction->t_handle_count++; + atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", handle, nblocks, atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 15d5743ccfbb..01743b5446ff 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -629,7 +629,7 @@ struct transaction_s /* * How many handles used this transaction? [t_handle_lock] */ - int t_handle_count; + atomic_t t_handle_count; /* * This transaction is being forced and some process is -- cgit v1.2.3 From f1f88fc7e818c6678c6799a2edb8f1aeccc124aa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:07 -0400 Subject: SUNRPC: The function rpc_restart_call() should return success/failure Both rpc_restart_call_prepare() and rpc_restart_call() test for the RPC_TASK_KILLED flag, and fail to restart the RPC call if that flag is set. This patch allows callers to know whether or not the restart was successful, so that they can perform cleanups etc in case of failure. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 4 ++-- net/sunrpc/clnt.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 8ed9642a5a76..debe75532199 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -148,8 +148,8 @@ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags); struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags); -void rpc_restart_call_prepare(struct rpc_task *); -void rpc_restart_call(struct rpc_task *); +int rpc_restart_call_prepare(struct rpc_task *); +int rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); size_t rpc_max_payload(struct rpc_clnt *); void rpc_force_rebind(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 756fc324db9e..234c40c15f69 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -756,12 +756,13 @@ EXPORT_SYMBOL_GPL(rpc_force_rebind); * Restart an (async) RPC call from the call_prepare state. * Usually called from within the exit handler. */ -void +int rpc_restart_call_prepare(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) - return; + return 0; task->tk_action = rpc_prepare_task; + return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); @@ -769,13 +770,13 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); * Restart an (async) RPC call. Usually called from within the * exit handler. 
*/ -void +int rpc_restart_call(struct rpc_task *task) { if (RPC_ASSASSINATED(task)) - return; - + return 0; task->tk_action = call_start; + return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call); -- cgit v1.2.3 From e1b004c3ef9c59db5f013528628b51c8653155ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Aug 2010 10:53:00 +0200 Subject: Revert "timer: Added usleep[_range] timer" This reverts commit 22b8f15c2f7130bb0386f548428df2ffd4e81903 to merge an advanced version. Signed-off-by: Thomas Gleixner --- include/linux/delay.h | 6 ------ kernel/timer.c | 22 ---------------------- 2 files changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 0e303d1aacd8..fd832c6d419e 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -45,12 +45,6 @@ extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); - -static inline void usleep(unsigned long usecs) -{ - usleep_range(usecs, usecs); -} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/timer.c b/kernel/timer.c index f110f241ab66..ce98685cd1cb 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1755,25 +1755,3 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); - -static int __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - unsigned long delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = max - min; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - -/** - * usleep_range - Drop in replacement for udelay where wakeup is flexible - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - */ -void usleep_range(unsigned long min, unsigned long max) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); -} -EXPORT_SYMBOL(usleep_range); -- cgit v1.2.3 From 5e7f5a178bba45c5aca3448fddecabd4e28f1f6b Mon Sep 17 00:00:00 2001 From: Patrick Pannuto Date: Mon, 2 Aug 2010 15:01:04 -0700 Subject: timer: Added usleep_range timer

usleep_range is a finer-precision implementation of msleep and is designed to be a drop-in replacement for udelay where a precise sleep / busy-wait is unnecessary.

Since an easy interface to hrtimers could lead to an undesired proliferation of interrupts, we provide only a "range" API, forcing the caller to think about an acceptable tolerance on both ends and hopefully avoiding introducing another interrupt.

INTRO

As discussed here ( http://lkml.org/lkml/2007/8/3/250 ), msleep(1) is not precise enough for many drivers (yes, sleep precision is an unfair notion, but consistently sleeping for ~an order of magnitude greater than requested is worth fixing). This patch adds a usleep API so that udelay does not have to be used. Obviously not every udelay can be replaced (those in atomic contexts or being used for simple bitbanging come to mind), but there are many, many examples of

    mydriver_write(...)
    /* Wait for hardware to latch */
    udelay(100)

in various drivers where a busy-wait loop is neither beneficial nor necessary, but msleep simply does not provide enough precision and people are using a busy-wait loop instead.

CONCERNS FROM THE RFC

Why is udelay a problem / necessary?

Most callers of udelay are in device/driver initialization code, which is serial...
As I see it, there is only benefit to sleeping over a delay; the notion of "refactoring" areas that use udelay was presented, but I see usleep as the refactoring. Consider i2c: if the bus is busy, you need to wait a bit (say 100us) before trying again. Your current options are:

    * udelay(100)
    * msleep(1) <-- As noted above, actually as high as ~20ms on some platforms, so not really an option
    * Manually set up an hrtimer to try again in 100us (which is what usleep does anyway...)

People choose the udelay route because it is EASY; we need to provide a better easy route. Device / driver / boot code is *currently* serial, but every few months someone makes noise about parallelizing boot, and IMHO, a little forward-thinking now is one less thing to worry about if/when that ever happens.

udelays could be preempted

Sure, but if udelay plans on looping 1000 times, and it gets preempted on loop 200, whenever it's scheduled again, it is going to do the next 800 loops.

Is the interruptible case needed?

Probably not, but I see usleep as a very logical parallel to msleep, so it made sense to include the "full" API. Processors are getting faster (albeit not as quickly as they are becoming more parallel), so if someone wanted to be interruptible for a few usecs, why not let them? If this is a contentious point, I'm happy to remove it.

OTHER THOUGHTS

I believe there is also value in exposing the usleep_range option; it gives the scheduler a lot more flexibility and allows the programmer to express his intent much more clearly; it's something I would hope future driver writers will take advantage of. To get the results in the NUMBERS section below, I literally s/udelay/usleep'd the kernel tree; I had to go in and undo the changes to the USB drivers, but everything else booted successfully; I find that extremely telling in and of itself -- many people are using a delay API where a sleep will suit them just fine.

SOME ATTEMPTS AT NUMBERS

It turns out that calculating quantifiable benefit on this is challenging, so instead I will simply present the current state of things, and I hope this to be sufficient. How many udelay calls are there in 2.6.35-rc5?

    udelay(ARG) >= | COUNT
    1000           | 319
    500            | 414
    100            | 1146
    20             | 1832

I am working on Android, so that is my focus for this. The following table comes from a modified usleep that simply printk's the amount of time requested to sleep; these tests were run on a kernel where every udelay >= 20 was converted to usleep.

    "boot" is power-on to lock screen
    "power collapse" is when the power button is pushed and the device suspends
    "resume" is when the power button is pushed and the lock screen is displayed (no touchscreen events or anything, just turning on the display)
    "use device" is from the unlock swipe to clicking around a bit; there is no sd card in this phone, so it fails loading music, video, camera

    ACTION         | TOTAL NUMBER OF USLEEP CALLS | NET TIME (us)
    boot           | 22                           | 1250
    power-collapse | 9                            | 1200
    resume         | 5                            | 500
    use device     | 59                           | 7700

The most interesting category to me is the "use device" field; 7700us of busy-wait time that could be put towards better responsiveness, or at the least less power usage.
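To make the intended conversion concrete, here is a sketch of the latch pattern quoted above after the switch; this is kernel-context code, and mydriver_write() plus the 100us latch time are hypothetical, not from any real driver:

    #include <linux/delay.h>

    extern void mydriver_write(unsigned int val);   /* hypothetical hardware poke */

    static void mydriver_latch(unsigned int val)
    {
            mydriver_write(val);
            /*
             * Wait for hardware to latch.  The old pattern busy-waited:
             *         udelay(100);
             * usleep_range() sleeps instead, and the explicit slack
             * (here 100-200us) lets the scheduler coalesce the wakeup
             * with other pending timers instead of forcing a fresh
             * hrtimer interrupt.
             */
            usleep_range(100, 200);
    }

The caller, not the timer core, decides how much slack is tolerable, which is the whole point of exposing only the range form.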
Signed-off-by: Patrick Pannuto Cc: apw@canonical.com Cc: corbet@lwn.net Cc: arjan@linux.intel.com Cc: Randy Dunlap Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/delay.h | 1 + kernel/timer.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index fd832c6d419e..a6ecb34cf547 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -45,6 +45,7 @@ extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); +void usleep_range(unsigned long min, unsigned long max); static inline void ssleep(unsigned int seconds) { diff --git a/kernel/timer.c b/kernel/timer.c index ce98685cd1cb..723a62e86dcb 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); -- cgit v1.2.3 From 5d8d9a4d9ff74c55901642b4e2ac5124830ddafe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:07 -0400 Subject: NFS: Ensure the AUTH_UNIX credcache is allocated dynamically Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 7 ++++--- net/sunrpc/auth.c | 19 ++++++++++++++++--- net/sunrpc/auth_generic.c | 12 +++--------- net/sunrpc/auth_unix.c | 15 +++++++-------- net/sunrpc/sunrpc_syms.c | 15 ++++++++++----- 5 files changed, 40 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 87d7ec0bf779..784e78c73ec5 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -125,11 +125,12 @@ struct rpc_credops { extern const struct rpc_authops authunix_ops; extern const struct rpc_authops authnull_ops; -void __init rpc_init_authunix(void); -void __init rpc_init_generic_auth(void); -void __init rpcauth_init_module(void); +int __init rpc_init_authunix(void); +int __init rpc_init_generic_auth(void); +int __init rpcauth_init_module(void); void __exit rpcauth_remove_module(void); void __exit rpc_destroy_generic_auth(void); +void rpc_destroy_authunix(void); struct rpc_cred * rpc_lookup_cred(void); struct rpc_cred * rpc_lookup_machine_cred(void); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 73affb8624fa..db135543d21e 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -587,14 +587,27 @@ static struct shrinker rpc_cred_shrinker = { .seeks = DEFAULT_SEEKS, }; -void __init rpcauth_init_module(void) +int __init rpcauth_init_module(void) { - rpc_init_authunix(); - rpc_init_generic_auth(); + int err; + + err = rpc_init_authunix(); + if (err < 0) + goto out1; + err = rpc_init_generic_auth(); + if (err < 0) + goto out2; register_shrinker(&rpc_cred_shrinker); + return 0; +out2: + rpc_destroy_authunix(); +out1: + return err; } void __exit rpcauth_remove_module(void) { + 
rpc_destroy_authunix(); + rpc_destroy_generic_auth(); unregister_shrinker(&rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 8f623b0f03dd..8bae33b36cc6 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -27,7 +27,6 @@ struct generic_cred { }; static struct rpc_auth generic_auth; -static struct rpc_cred_cache generic_cred_cache; static const struct rpc_credops generic_credops; /* @@ -159,20 +158,16 @@ out_nomatch: return 0; } -void __init rpc_init_generic_auth(void) +int __init rpc_init_generic_auth(void) { - spin_lock_init(&generic_cred_cache.lock); + return rpcauth_init_credcache(&generic_auth); } void __exit rpc_destroy_generic_auth(void) { - rpcauth_clear_credcache(&generic_cred_cache); + rpcauth_destroy_credcache(&generic_auth); } -static struct rpc_cred_cache generic_cred_cache = { - {{ NULL, },}, -}; - static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, .au_name = "Generic", @@ -183,7 +178,6 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, .au_count = ATOMIC_INIT(0), - .au_credcache = &generic_cred_cache, }; static const struct rpc_credops generic_credops = { diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index aac2f8b4ee21..d5e37dbf207b 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -29,7 +29,6 @@ struct unx_cred { #endif static struct rpc_auth unix_auth; -static struct rpc_cred_cache unix_cred_cache; static const struct rpc_credops unix_credops; static struct rpc_auth * @@ -203,9 +202,14 @@ unx_validate(struct rpc_task *task, __be32 *p) return p; } -void __init rpc_init_authunix(void) +int __init rpc_init_authunix(void) { - spin_lock_init(&unix_cred_cache.lock); + return rpcauth_init_credcache(&unix_auth); +} + +void rpc_destroy_authunix(void) +{ + rpcauth_destroy_credcache(&unix_auth); } const struct rpc_authops authunix_ops = { @@ -218,10 +222,6 @@ const struct rpc_authops authunix_ops = { .crcreate = unx_create_cred, }; -static -struct rpc_cred_cache unix_cred_cache = { -}; - static struct rpc_auth unix_auth = { .au_cslack = UNX_WRITESLACK, @@ -229,7 +229,6 @@ struct rpc_auth unix_auth = { .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), - .au_credcache = &unix_cred_cache, }; static diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f438347d817b..34b58f9e704a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -33,10 +33,11 @@ init_sunrpc(void) if (err) goto out; err = rpc_init_mempool(); - if (err) { - unregister_rpc_pipefs(); - goto out; - } + if (err) + goto out2; + err = rpcauth_init_module(); + if (err) + goto out3; #ifdef RPC_DEBUG rpc_register_sysctl(); #endif @@ -47,7 +48,11 @@ init_sunrpc(void) cache_register(&unix_gid_cache); svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ - rpcauth_init_module(); + return 0; +out3: + rpc_destroy_mempool(); +out2: + unregister_rpc_pipefs(); out: return err; } -- cgit v1.2.3 From 988664a0f6bbfc356e6ce55f7a87b8594050012f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:07 -0400 Subject: SUNRPC: Store the hashtable size in struct rpc_cred_cache Cleanup in preparation for allowing the user to determine the maximum hash table size. 
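The point of carrying the size in the structure is that both the allocation loop and bucket selection derive from one hashbits field instead of a compile-time constant. A minimal userspace restatement of that relationship (the multiplicative hash here is just a stand-in for the kernel's hash_long()):

    #include <stdio.h>

    struct cred_cache {
            unsigned int hashbits;  /* table holds 1U << hashbits buckets */
    };

    static unsigned int bucket_of(const struct cred_cache *c, unsigned long uid)
    {
            unsigned int hashsize = 1U << c->hashbits;

            /* Cheap multiplicative hash standing in for hash_long(). */
            return (unsigned int)(uid * 2654435761UL) & (hashsize - 1);
    }

    int main(void)
    {
            struct cred_cache cache = { .hashbits = 4 };  /* 16 buckets */

            printf("uid 1000 -> bucket %u\n", bucket_of(&cache, 1000));
            return 0;
    }
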
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 1 + net/sunrpc/auth.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 784e78c73ec5..84d64b6926a9 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -65,6 +65,7 @@ struct rpc_cred { #define RPC_CREDCACHE_NR (1 << RPC_CREDCACHE_HASHBITS) struct rpc_cred_cache { struct hlist_head hashtable[RPC_CREDCACHE_NR]; + unsigned int hashbits; spinlock_t lock; }; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index db135543d21e..eef76a1f1dd6 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -145,12 +145,15 @@ int rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; + unsigned int hashsize; int i; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; - for (i = 0; i < RPC_CREDCACHE_NR; i++) + new->hashbits = RPC_CREDCACHE_HASHBITS; + hashsize = 1U << new->hashbits; + for (i = 0; i < hashsize; i++) INIT_HLIST_HEAD(&new->hashtable[i]); spin_lock_init(&new->lock); auth->au_credcache = new; @@ -183,11 +186,12 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; + unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); - for (i = 0; i < RPC_CREDCACHE_NR; i++) { + for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); @@ -297,7 +301,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, *entry, *new; unsigned int nr; - nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS); + nr = hash_long(acred->uid, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { -- cgit v1.2.3 From 241269bd0b580faae71575443d9ab38df7469126 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:08 -0400 Subject: SUNRPC: Make the credential cache hashtable size configurable This patch allows the user to configure the credential cache hashtable size using a new module parameter: auth_hashtable_size When set, this parameter will be rounded up to the nearest power of two, with a maximum allowed value of 1024 elements. 
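The sizing policy is easy to restate outside the kernel. A sketch, assuming the semantics described above (round the requested entry count up to the next power of two, reject anything above 1024 entries or below the 4-entry minimum); this restates the policy in userspace and is not the kernel's parameter handler:

    #include <stdio.h>

    #define MAX_HASHTABLE_BITS 10   /* 1024 entries, as in the patch */

    /*
     * Map a requested entry count to a hashbits value: round up to the
     * nearest power of two and range-check the result.  Returns the
     * bit count, or -1 for an invalid size.
     */
    static int size_to_hashbits(unsigned long num)
    {
            unsigned int nbits = 2;               /* minimum: 4 buckets */

            while ((1UL << nbits) < num && nbits <= MAX_HASHTABLE_BITS)
                    nbits++;
            if (nbits > MAX_HASHTABLE_BITS)
                    return -1;
            return (int)nbits;
    }

    int main(void)
    {
            printf("100  -> %d bits\n", size_to_hashbits(100));   /* 7  */
            printf("1024 -> %d bits\n", size_to_hashbits(1024));  /* 10 */
            printf("2048 -> %d bits\n", size_to_hashbits(2048));  /* -1 */
            return 0;
    }
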
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 9 +------ net/sunrpc/auth.c | 60 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 84d64b6926a9..d2737625a247 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -61,14 +61,7 @@ struct rpc_cred { /* * Client authentication handle */ -#define RPC_CREDCACHE_HASHBITS 4 -#define RPC_CREDCACHE_NR (1 << RPC_CREDCACHE_HASHBITS) -struct rpc_cred_cache { - struct hlist_head hashtable[RPC_CREDCACHE_NR]; - unsigned int hashbits; - spinlock_t lock; -}; - +struct rpc_cred_cache; struct rpc_authops; struct rpc_auth { unsigned int au_cslack; /* call cred size estimate */ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index eef76a1f1dd6..d80f01725fc2 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -19,6 +19,15 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif +#define RPC_CREDCACHE_DEFAULT_HASHBITS (4) +struct rpc_cred_cache { + struct hlist_head *hashtable; + unsigned int hashbits; + spinlock_t lock; +}; + +static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; + static DEFINE_SPINLOCK(rpc_authflavor_lock); static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { &authnull_ops, /* AUTH_NULL */ @@ -29,6 +38,42 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; +#define MAX_HASHTABLE_BITS (10) +static int param_set_hashtbl_sz(const char *val, struct kernel_param *kp) +{ + unsigned long num; + unsigned int nbits; + int ret; + + if (!val) + goto out_inval; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL) + goto out_inval; + nbits = fls(num); + if (num > (1U << nbits)) + nbits++; + if (nbits > MAX_HASHTABLE_BITS || nbits < 2) + goto out_inval; + *(unsigned int *)kp->arg = nbits; + return 0; +out_inval: + return -EINVAL; +} + +static int param_get_hashtbl_sz(char *buffer, struct kernel_param *kp) +{ + unsigned int nbits; + + nbits = *(unsigned int *)kp->arg; + return sprintf(buffer, "%u", 1U << nbits); +} + +#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); + +module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); +MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); + static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor >= RPC_AUTH_MAXFLAVOR) @@ -146,18 +191,22 @@ rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; unsigned int hashsize; - int i; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) - return -ENOMEM; - new->hashbits = RPC_CREDCACHE_HASHBITS; + goto out_nocache; + new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; - for (i = 0; i < hashsize; i++) - INIT_HLIST_HEAD(&new->hashtable[i]); + new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); + if (!new->hashtable) + goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; +out_nohashtbl: + kfree(new); +out_nocache: + return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); @@ -220,6 +269,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth) if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); + kfree(cache->hashtable); kfree(cache); } } -- cgit v1.2.3 From d9b6cd94601e1d17273f93a326a135fbf487a918 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:08 -0400 Subject: 
SUNRPC: Ensure that rpc_exit() always wakes up a sleeping task Make rpc_exit() non-inline, and ensure that it always wakes up a task that has been queued. Kill off the now unused rpc_wake_up_task(). Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 7 +------ net/sunrpc/sched.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 7be4f3a6d246..88513fd8e208 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -213,6 +213,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, const struct rpc_call_ops *ops); void rpc_put_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); +void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); void rpc_killall_tasks(struct rpc_clnt *); void rpc_execute(struct rpc_task *); @@ -241,12 +242,6 @@ void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; void rpc_prepare_task(struct rpc_task *task); -static inline void rpc_exit(struct rpc_task *task, int status) -{ - task->tk_status = status; - task->tk_action = rpc_exit_task; -} - static inline int rpc_wait_for_completion_task(struct rpc_task *task) { return __rpc_wait_for_completion_task(task, NULL); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 4a843b883b89..37452762af70 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -405,14 +405,6 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); -/* - * Wake up the specified task - */ -static void rpc_wake_up_task(struct rpc_task *task) -{ - rpc_wake_up_queued_task(task->tk_waitqueue, task); -} - /* * Wake up the next task on a priority queue. */ @@ -600,7 +592,15 @@ void rpc_exit_task(struct rpc_task *task) } } } -EXPORT_SYMBOL_GPL(rpc_exit_task); + +void rpc_exit(struct rpc_task *task, int status) +{ + task->tk_status = status; + task->tk_action = rpc_exit_task; + if (RPC_IS_QUEUED(task)) + rpc_wake_up_queued_task(task->tk_waitqueue, task); +} +EXPORT_SYMBOL_GPL(rpc_exit); void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) { @@ -690,7 +690,6 @@ static void __rpc_execute(struct rpc_task *task) dprintk("RPC: %5u got signal\n", task->tk_pid); task->tk_flags |= RPC_TASK_KILLED; rpc_exit(task, -ERESTARTSYS); - rpc_wake_up_task(task); } rpc_set_running(task); dprintk("RPC: %5u sync task resuming\n", task->tk_pid); @@ -950,7 +949,6 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) if (!(rovr->tk_flags & RPC_TASK_KILLED)) { rovr->tk_flags |= RPC_TASK_KILLED; rpc_exit(rovr, -EIO); - rpc_wake_up_task(rovr); } } spin_unlock(&clnt->cl_lock); -- cgit v1.2.3 From 58f9612c6ea858f532021a0ce42ec53cb0a493b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:08 -0400 Subject: SUNRPC: Move remaining RPC client related task initialisation into clnt.c Now that rpc_run_task() is the sole entry point for RPC calls, we can move the remaining rpc_client-related initialisation of struct rpc_task from sched.c into clnt.c. Also move rpc_killall_tasks() into the same file, since that too is relative to the rpc_clnt. 
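The structural idea, all client-related setup funneled through one helper that takes the client's lock, can be sketched with toy types in userspace C (these are illustrative stand-ins, not the SUNRPC structs):

    #include <pthread.h>
    #include <stdio.h>

    struct clnt {
            pthread_mutex_t lock;
            struct task *tasks;          /* list of this client's tasks */
    };

    struct task {
            struct task *next;
            struct clnt *client;
    };

    /*
     * Mirror of the rpc_task_set_client() idea: one helper owns the
     * "attach task to client" step, so list insertion always happens
     * under the client's lock no matter which entry point created
     * the task.
     */
    static void task_set_client(struct task *t, struct clnt *c)
    {
            t->client = c;
            pthread_mutex_lock(&c->lock);
            t->next = c->tasks;
            c->tasks = t;
            pthread_mutex_unlock(&c->lock);
    }

    int main(void)
    {
            struct clnt c = { .lock = PTHREAD_MUTEX_INITIALIZER };
            struct task t = { 0 };

            task_set_client(&t, &c);
            printf("task attached: %d\n", c.tasks == &t);
            return 0;
    }
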
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 84 +++++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/sched.c | 77 +++-------------------------------------- 3 files changed, 89 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index debe75532199..569dc722a600 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -131,6 +131,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); +void rpc_task_release_client(struct rpc_task *); int rpcb_register(u32, u32, int, unsigned short); int rpcb_v4_register(const u32 program, const u32 version, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 234c40c15f69..3647c81fd689 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -413,6 +413,35 @@ out_no_clnt: } EXPORT_SYMBOL_GPL(rpc_clone_client); +/* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? + */ +void rpc_killall_tasks(struct rpc_clnt *clnt) +{ + struct rpc_task *rovr; + + + if (list_empty(&clnt->cl_tasks)) + return; + dprintk("RPC: killing all tasks for client %p\n", clnt); + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(rovr)) + continue; + if (!(rovr->tk_flags & RPC_TASK_KILLED)) { + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); + } + } + spin_unlock(&clnt->cl_lock); +} +EXPORT_SYMBOL_GPL(rpc_killall_tasks); + /* * Properly shut down an RPC client, terminating all outstanding * requests. 
@@ -538,6 +567,49 @@ out: } EXPORT_SYMBOL_GPL(rpc_bind_new_program); +void rpc_task_release_client(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (clnt != NULL) { + /* Remove from client task list */ + spin_lock(&clnt->cl_lock); + list_del(&task->tk_task); + spin_unlock(&clnt->cl_lock); + task->tk_client = NULL; + + rpc_release_client(clnt); + } +} + +static +void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + if (clnt != NULL) { + rpc_task_release_client(task); + task->tk_client = clnt; + kref_get(&clnt->cl_kref); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + /* Add to the client's list of all tasks */ + spin_lock(&clnt->cl_lock); + list_add_tail(&task->tk_task, &clnt->cl_tasks); + spin_unlock(&clnt->cl_lock); + } +} + +static void +rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) +{ + if (msg != NULL) { + task->tk_msg.rpc_proc = msg->rpc_proc; + task->tk_msg.rpc_argp = msg->rpc_argp; + task->tk_msg.rpc_resp = msg->rpc_resp; + /* Bind the user cred */ + rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags); + } +} + /* * Default callback for async RPC calls */ @@ -562,6 +634,18 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) if (IS_ERR(task)) goto out; + rpc_task_set_client(task, task_setup_data->rpc_client); + rpc_task_set_rpc_message(task, task_setup_data->rpc_message); + + if (task->tk_status != 0) { + int ret = task->tk_status; + rpc_put_task(task); + return ERR_PTR(ret); + } + + if (task->tk_action == NULL) + rpc_call_start(task); + atomic_inc(&task->tk_count); rpc_execute(task); out: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 37452762af70..a42296db2ecd 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -246,17 +246,8 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task) static void rpc_set_active(struct rpc_task *task) { - struct rpc_clnt *clnt; - if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0) - return; rpc_task_set_debuginfo(task); - /* Add to global list of all tasks */ - clnt = task->tk_client; - if (clnt != NULL) { - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); - } + set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); } /* @@ -319,11 +310,6 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", task->tk_pid, rpc_qname(q), jiffies); - if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) { - printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n"); - return; - } - __rpc_add_wait_queue(q, task); BUG_ON(task->tk_callback != NULL); @@ -334,8 +320,8 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action) { - /* Mark the task as being activated if so needed */ - rpc_set_active(task); + /* We shouldn't ever put an inactive task to sleep */ + BUG_ON(!RPC_IS_ACTIVATED(task)); /* * Protect the queue operations. 
@@ -807,26 +793,9 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize workqueue for async tasks */ task->tk_workqueue = task_setup_data->workqueue; - task->tk_client = task_setup_data->rpc_client; - if (task->tk_client != NULL) { - kref_get(&task->tk_client->cl_kref); - if (task->tk_client->cl_softrtry) - task->tk_flags |= RPC_TASK_SOFT; - } - if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; - if (task_setup_data->rpc_message != NULL) { - task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc; - task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp; - task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp; - /* Bind the user cred */ - rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags); - if (task->tk_action == NULL) - rpc_call_start(task); - } - /* starting timestamp */ task->tk_start = ktime_get(); @@ -896,10 +865,7 @@ void rpc_put_task(struct rpc_task *task) xprt_release(task); if (task->tk_msg.rpc_cred) rpcauth_unbindcred(task); - if (task->tk_client) { - rpc_release_client(task->tk_client); - task->tk_client = NULL; - } + rpc_task_release_client(task); if (task->tk_workqueue != NULL) { INIT_WORK(&task->u.tk_work, rpc_async_release); queue_work(task->tk_workqueue, &task->u.tk_work); @@ -912,13 +878,6 @@ static void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %5u release task\n", task->tk_pid); - if (!list_empty(&task->tk_task)) { - struct rpc_clnt *clnt = task->tk_client; - /* Remove from client task list */ - spin_lock(&clnt->cl_lock); - list_del(&task->tk_task); - spin_unlock(&clnt->cl_lock); - } BUG_ON (RPC_IS_QUEUED(task)); /* Wake up anyone who is waiting for task completion */ @@ -927,34 +886,6 @@ static void rpc_release_task(struct rpc_task *task) rpc_put_task(task); } -/* - * Kill all tasks for the given client. - * XXX: kill their descendants as well? - */ -void rpc_killall_tasks(struct rpc_clnt *clnt) -{ - struct rpc_task *rovr; - - - if (list_empty(&clnt->cl_tasks)) - return; - dprintk("RPC: killing all tasks for client %p\n", clnt); - /* - * Spin lock all_tasks to prevent changes... - */ - spin_lock(&clnt->cl_lock); - list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { - if (! RPC_IS_ACTIVATED(rovr)) - continue; - if (!(rovr->tk_flags & RPC_TASK_KILLED)) { - rovr->tk_flags |= RPC_TASK_KILLED; - rpc_exit(rovr, -EIO); - } - } - spin_unlock(&clnt->cl_lock); -} -EXPORT_SYMBOL_GPL(rpc_killall_tasks); - int rpciod_up(void) { return try_module_get(THIS_MODULE) ? 
0 : -EINVAL; -- cgit v1.2.3 From 8572b8e2e3c5f3d990122348c4d2c64dad338611 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:08 -0400 Subject: SUNRPC: Clean up of rpc_bindcred() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 6 +++--- net/sunrpc/auth.c | 37 +++++++++++++++++-------------------- net/sunrpc/auth_generic.c | 11 +++-------- net/sunrpc/clnt.c | 2 +- 4 files changed, 24 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index d2737625a247..90e4c3827ac0 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -106,7 +106,7 @@ struct rpc_credops { void (*crdestroy)(struct rpc_cred *); int (*crmatch)(struct auth_cred *, struct rpc_cred *, int); - void (*crbind)(struct rpc_task *, struct rpc_cred *, int); + struct rpc_cred * (*crbind)(struct rpc_task *, struct rpc_cred *, int); __be32 * (*crmarshal)(struct rpc_task *, __be32 *); int (*crrefresh)(struct rpc_task *); __be32 * (*crvalidate)(struct rpc_task *, __be32 *); @@ -135,8 +135,8 @@ void rpcauth_release(struct rpc_auth *); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); -void rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); -void rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); +int rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); +struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); void put_rpccred(struct rpc_cred *); void rpcauth_unbindcred(struct rpc_task *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index d80f01725fc2..d8968faf5ccf 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -444,16 +444,16 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -void +struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) { - task->tk_msg.rpc_cred = get_rpccred(cred); dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + return get_rpccred(cred); } EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); -static void +static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; @@ -461,45 +461,42 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) .uid = 0, .gid = 0, }; - struct rpc_cred *ret; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); - ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } -static void +static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; - struct rpc_cred *ret; dprintk("RPC: %5u looking up %s cred\n", task->tk_pid, auth->au_ops->au_name); - ret = rpcauth_lookupcred(auth, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return rpcauth_lookupcred(auth, lookupflags); } -void +int rpcauth_bindcred(struct rpc_task *task, struct 
rpc_cred *cred, int flags) { + struct rpc_cred *new; int lookupflags = 0; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW; if (cred != NULL) - cred->cr_ops->crbind(task, cred, lookupflags); + new = cred->cr_ops->crbind(task, cred, lookupflags); else if (flags & RPC_TASK_ROOTCREDS) - rpcauth_bind_root_cred(task, lookupflags); + new = rpcauth_bind_root_cred(task, lookupflags); else - rpcauth_bind_new_cred(task, lookupflags); + new = rpcauth_bind_new_cred(task, lookupflags); + if (IS_ERR(new)) + return PTR_ERR(new); + if (task->tk_msg.rpc_cred != NULL) + put_rpccred(task->tk_msg.rpc_cred); + task->tk_msg.rpc_cred = new; + return 0; } void diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 8bae33b36cc6..43162bb3b78f 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -54,18 +54,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); -static void -generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) +static struct rpc_cred *generic_bind_cred(struct rpc_task *task, + struct rpc_cred *cred, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - struct rpc_cred *ret; - ret = auth->au_ops->lookup_cred(auth, acred, lookupflags); - if (!IS_ERR(ret)) - task->tk_msg.rpc_cred = ret; - else - task->tk_status = PTR_ERR(ret); + return auth->au_ops->lookup_cred(auth, acred, lookupflags); } /* diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3647c81fd689..f34b5e3823c0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -606,7 +606,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; /* Bind the user cred */ - rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags); + task->tk_status = rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags); } } -- cgit v1.2.3 From a17c2153d2e271b0cbacae9bed83b0eaa41db7e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 31 Jul 2010 14:29:08 -0400 Subject: SUNRPC: Move the bound cred to struct rpc_rqst This will allow us to save the original generic cred in rpc_message, so that if we migrate from one server to another, we can generate a new bound cred without having to punt back to the NFS layer. 
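A toy model of the split this creates: the long-lived generic cred stays with the message while each request carries its own bound cred, so migrating to a new server just means rebinding. Types and fields below are illustrative, not the SUNRPC ones:

    #include <stdio.h>

    struct generic_cred { unsigned int uid, gid; };          /* survives migration */
    struct bound_cred   { unsigned int uid, gid; int srv; }; /* per-server flavour */

    struct rpc_msg_model  { const struct generic_cred *cred; };
    struct rpc_rqst_model { struct bound_cred cred; };       /* cf. rq_cred */

    /* Rebind the request's cred from the generic one for a given server. */
    static void bind_cred(struct rpc_rqst_model *req,
                          const struct rpc_msg_model *msg, int server)
    {
            req->cred.uid = msg->cred->uid;
            req->cred.gid = msg->cred->gid;
            req->cred.srv = server;
    }

    int main(void)
    {
            struct generic_cred gc = { 1000, 1000 };
            struct rpc_msg_model msg = { &gc };
            struct rpc_rqst_model req;

            bind_cred(&req, &msg, 1);     /* original server   */
            bind_cred(&req, &msg, 2);     /* after a migration */
            printf("bound to server %d\n", req.cred.srv);
            return 0;
    }
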
Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 7 ++-- fs/nfs/nfs3xdr.c | 8 ++-- fs/nfs/nfs4xdr.c | 2 +- include/linux/sunrpc/auth.h | 2 - include/linux/sunrpc/xprt.h | 1 + net/sunrpc/auth.c | 43 ++++++++++---------- net/sunrpc/auth_gss/auth_gss.c | 22 +++++----- net/sunrpc/auth_null.c | 2 +- net/sunrpc/auth_unix.c | 6 +-- net/sunrpc/clnt.c | 91 +++++++++++++++++++++--------------------- net/sunrpc/sched.c | 2 +- net/sunrpc/xprt.c | 2 + 12 files changed, 92 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 81cf14257916..db8846a0e82e 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs static int nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 offset = (u32)args->offset; u32 count = args->count; @@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg static int nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) { - struct rpc_task *task = req->rq_task; - struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) static int nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 75dcfc7da365..9769704f8ce6 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg static int nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) static int nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -675,7 +675,7 @@ static int nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_getaclargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); @@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) static int nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 257c1811feb4..08ef91291132 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -758,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { 
__be32 *p; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same is the one sent, diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 90e4c3827ac0..5bbc447175dc 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -135,10 +135,8 @@ void rpcauth_release(struct rpc_auth *); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); -int rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int); struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); void put_rpccred(struct rpc_cred *); -void rpcauth_unbindcred(struct rpc_task *); __be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); __be32 * rpcauth_checkverf(struct rpc_task *, __be32 *); int rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *data, void *obj); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index b51470302399..ff5a77b28c50 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -64,6 +64,7 @@ struct rpc_rqst { * This is the private part */ struct rpc_task * rq_task; /* RPC task data */ + struct rpc_cred * rq_cred; /* Bound cred */ __be32 rq_xid; /* request XID */ int rq_cong; /* has incremented xprt->cong */ u32 rq_seqno; /* gss seq no. used on req. */ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index d8968faf5ccf..95721426296d 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -477,9 +477,10 @@ rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) return rpcauth_lookupcred(auth, lookupflags); } -int +static int rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) { + struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *new; int lookupflags = 0; @@ -493,9 +494,9 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); - if (task->tk_msg.rpc_cred != NULL) - put_rpccred(task->tk_msg.rpc_cred); - task->tk_msg.rpc_cred = new; + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); + req->rq_cred = new; return 0; } @@ -535,22 +536,10 @@ out_nodestroy: } EXPORT_SYMBOL_GPL(put_rpccred); -void -rpcauth_unbindcred(struct rpc_task *task) -{ - struct rpc_cred *cred = task->tk_msg.rpc_cred; - - dprintk("RPC: %5u releasing %s cred %p\n", - task->tk_pid, cred->cr_auth->au_ops->au_name, cred); - - put_rpccred(cred); - task->tk_msg.rpc_cred = NULL; -} - __be32 * rpcauth_marshcred(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u marshaling %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -561,7 +550,7 @@ rpcauth_marshcred(struct rpc_task *task, __be32 *p) __be32 * rpcauth_checkverf(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u validating %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -573,7 +562,7 @@ int rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *data, void *obj) { - struct rpc_cred 
*cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u using %s cred %p to wrap rpc data\n", task->tk_pid, cred->cr_ops->cr_name, cred); @@ -587,7 +576,7 @@ int rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *data, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n", task->tk_pid, cred->cr_ops->cr_name, cred); @@ -601,13 +590,21 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, int rpcauth_refreshcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; int err; + cred = task->tk_rqstp->rq_cred; + if (cred == NULL) { + err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); + if (err < 0) + goto out; + cred = task->tk_rqstp->rq_cred; + }; dprintk("RPC: %5u refreshing %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); err = cred->cr_ops->crrefresh(task); +out: if (err < 0) task->tk_status = err; return err; @@ -616,7 +613,7 @@ rpcauth_refreshcred(struct rpc_task *task) void rpcauth_invalcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; dprintk("RPC: %5u invalidating %s cred %p\n", task->tk_pid, cred->cr_auth->au_ops->au_name, cred); @@ -627,7 +624,7 @@ rpcauth_invalcred(struct rpc_task *task) int rpcauth_uptodatecred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 8da2a0e68574..096e1260bc67 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -373,7 +373,7 @@ gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss static void gss_upcall_callback(struct rpc_task *task) { - struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, + struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct inode *inode = &gss_msg->inode->vfs_inode; @@ -502,7 +502,7 @@ static void warn_gssd(void) static inline int gss_refresh_upcall(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, @@ -1064,12 +1064,12 @@ out: static __be32 * gss_marshal(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *cred_len; - struct rpc_rqst *req = task->tk_rqstp; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; @@ -1119,7 +1119,7 @@ out_put_ctx: static int gss_renew_cred(struct rpc_task *task) { - struct rpc_cred *oldcred = task->tk_msg.rpc_cred; + struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); @@ -1133,7 +1133,7 @@ static int gss_renew_cred(struct rpc_task *task) new = gss_lookup_cred(auth, &acred, 
RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); - task->tk_msg.rpc_cred = new; + task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } @@ -1161,7 +1161,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred) static int gss_refresh(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) @@ -1172,7 +1172,7 @@ gss_refresh(struct rpc_task *task) ret = gss_renew_cred(task); if (ret < 0) goto out; - cred = task->tk_msg.rpc_cred; + cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) @@ -1191,7 +1191,7 @@ gss_refresh_null(struct rpc_task *task) static __be32 * gss_validate(struct rpc_task *task, __be32 *p) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 seq; struct kvec iov; @@ -1400,7 +1400,7 @@ static int gss_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *p, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); @@ -1503,7 +1503,7 @@ static int gss_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, __be32 *p, void *obj) { - struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 1db618f56ecb..a5c36c01707b 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -75,7 +75,7 @@ nul_marshal(struct rpc_task *task, __be32 *p) static int nul_refresh(struct rpc_task *task) { - set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags); + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index d5e37dbf207b..4cb70dc6e7ad 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -140,7 +140,7 @@ static __be32 * unx_marshal(struct rpc_task *task, __be32 *p) { struct rpc_clnt *clnt = task->tk_client; - struct unx_cred *cred = container_of(task->tk_msg.rpc_cred, struct unx_cred, uc_base); + struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); __be32 *base, *hold; int i; @@ -173,7 +173,7 @@ unx_marshal(struct rpc_task *task, __be32 *p) static int unx_refresh(struct rpc_task *task) { - set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags); + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } @@ -196,7 +196,7 @@ unx_validate(struct rpc_task *task, __be32 *p) printk("RPC: giant verf size: %u\n", size); return NULL; } - task->tk_msg.rpc_cred->cr_auth->au_rslack = (size >> 2) + 2; + task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2; p += (size >> 2); return p; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f34b5e3823c0..2388d83b68ff 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -605,8 +605,8 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; - /* Bind the user cred */ - task->tk_status = rpcauth_bindcred(task, 
msg->rpc_cred, task->tk_flags); + if (msg->rpc_cred != NULL) + task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred); } } @@ -909,11 +909,6 @@ call_reserve(struct rpc_task *task) { dprint_status(task); - if (!rpcauth_uptodatecred(task)) { - task->tk_action = call_refresh; - return; - } - task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); @@ -977,7 +972,7 @@ call_reserveresult(struct rpc_task *task) static void call_allocate(struct rpc_task *task) { - unsigned int slack = task->tk_msg.rpc_cred->cr_auth->au_cslack; + unsigned int slack = task->tk_client->cl_auth->au_cslack; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = task->tk_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; @@ -985,7 +980,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_refresh; if (req->rq_buffer) return; @@ -1022,6 +1017,47 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } +/* + * 2a. Bind and/or refresh the credentials + */ +static void +call_refresh(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_action = call_refreshresult; + task->tk_status = 0; + task->tk_client->cl_stats->rpcauthrefresh++; + rpcauth_refreshcred(task); +} + +/* + * 2b. Process the results of a credential refresh + */ +static void +call_refreshresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_bind; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; + switch (status) { + case -EACCES: + rpc_exit(task, -EACCES); + return; + case -ENOMEM: + rpc_exit(task, -ENOMEM); + return; + case -ETIMEDOUT: + rpc_delay(task, 3*HZ); + } + task->tk_action = call_refresh; +} + static inline int rpc_task_need_encode(struct rpc_task *task) { @@ -1557,43 +1593,6 @@ out_retry: } } -/* - * 8. Refresh the credentials if rejected by the server - */ -static void -call_refresh(struct rpc_task *task) -{ - dprint_status(task); - - task->tk_action = call_refreshresult; - task->tk_status = 0; - task->tk_client->cl_stats->rpcauthrefresh++; - rpcauth_refreshcred(task); -} - -/* - * 8a. 
Process the results of a credential refresh - */ -static void -call_refreshresult(struct rpc_task *task) -{ - int status = task->tk_status; - - dprint_status(task); - - task->tk_status = 0; - task->tk_action = call_reserve; - if (status >= 0 && rpcauth_uptodatecred(task)) - return; - if (status == -EACCES) { - rpc_exit(task, -EACCES); - return; - } - task->tk_action = call_refresh; - if (status != -ETIMEDOUT) - rpc_delay(task, 3*HZ); -} - static __be32 * rpc_encode_header(struct rpc_task *task) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index a42296db2ecd..f6db6131fb2e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -864,7 +864,7 @@ void rpc_put_task(struct rpc_task *task) if (task->tk_rqstp) xprt_release(task); if (task->tk_msg.rpc_cred) - rpcauth_unbindcred(task); + put_rpccred(task->tk_msg.rpc_cred); rpc_task_release_client(task); if (task->tk_workqueue != NULL) { INIT_WORK(&task->u.tk_work, rpc_async_release); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index dcd0132396ba..70297836a191 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1032,6 +1032,8 @@ void xprt_release(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(req->rq_buffer); + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); -- cgit v1.2.3 From e3b5e0d552b34d65e15b20610273b200555eea53 Mon Sep 17 00:00:00 2001 From: Ilya Yanok Date: Thu, 8 Jul 2010 10:10:38 +0000 Subject: powerpc/fsl_pci: add quirk for mpc8308 pcie bridge This patch adds the quirk for the PCIe controller found on the Freescale MPC8308. The quirk is the same as for other MPC83xx processors. Signed-off-by: Ilya Yanok Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/fsl_pci.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index a14760fe513a..7e900ec988f6 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -412,6 +412,7 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080, quirk_fsl_pcie_header); #endif /* CONFIG_FSL_SOC_BOOKE || CONFIG_PPC_86xx */ #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x) +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8308, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8315E, quirk_fsl_pcie_header); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3bedcc149c84..79bb11f35c4b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2264,6 +2264,7 @@ #define PCI_DEVICE_ID_TDI_EHCI 0x0101 #define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_DEVICE_ID_MPC8308 0xc006 #define PCI_DEVICE_ID_MPC8315E 0x00b4 #define PCI_DEVICE_ID_MPC8315 0x00b5 #define PCI_DEVICE_ID_MPC8314E 0x00b6 -- cgit v1.2.3 From 0c42bd0e425e9c8ddb7019fc446f7d915e36c5f6 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 30 Jul 2010 16:23:03 +0800 Subject: dmaengine: Driver for Topcliff PCH DMA controller Topcliff PCH is the platform controller hub that is going to be used in Intel's upcoming general embedded platforms. This adds the driver for the Topcliff PCH DMA controller. The DMA channels are strictly for device to host or host to device transfers and cannot be used for generic memcpy.
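For context, a peripheral driver would be expected to consume these channels through the generic dmaengine slave API, handing the controller a struct pch_dma_slave through chan->private. The following is a hypothetical usage sketch, not part of this patch: the filter function, the example_start_rx() helper and the RX FIFO bus address are invented for illustration, and a real filter would also check that it grabbed a channel of the right direction (in this controller, even channel indices are TX and odd ones are RX):

	#include <linux/dmaengine.h>
	#include <linux/scatterlist.h>
	#include <linux/pch_dma.h>

	/* The slave parameters must outlive the channel; a real driver
	 * would embed them in its device state. The rx_reg value is a
	 * made-up peripheral RX FIFO bus address. */
	static struct pch_dma_slave example_slave = {
		.rx_reg	= 0x1000,
		.width	= PCH_DMA_WIDTH_4_BYTES,
	};

	static bool example_filter(struct dma_chan *chan, void *param)
	{
		chan->private = param;	/* pd_prep_slave_sg() reads this */
		return true;
	}

	static int example_start_rx(struct scatterlist *sgl, unsigned int nents)
	{
		struct dma_async_tx_descriptor *desc;
		struct dma_chan *chan;
		dma_cap_mask_t mask;

		dma_cap_zero(mask);
		dma_cap_set(DMA_SLAVE, mask);
		chan = dma_request_channel(mask, example_filter, &example_slave);
		if (!chan)
			return -ENODEV;

		desc = chan->device->device_prep_slave_sg(chan, sgl, nents,
							  DMA_FROM_DEVICE,
							  DMA_PREP_INTERRUPT);
		if (!desc) {
			dma_release_channel(chan);
			return -ENOMEM;
		}

		desc->tx_submit(desc);				/* queue the transfer */
		chan->device->device_issue_pending(chan);	/* start the engine */
		return 0;
	}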
Signed-off-by: Yong Wang [kill GFP_ATOMIC, kill __raw_{read|write}l, locking fixlet] Signed-off-by: Dan Williams --- drivers/dma/Kconfig | 7 + drivers/dma/Makefile | 1 + drivers/dma/pch_dma.c | 957 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pch_dma.h | 37 ++ 4 files changed, 1002 insertions(+) create mode 100644 drivers/dma/pch_dma.c create mode 100644 include/linux/pch_dma.h (limited to 'include/linux') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 6a3e5bfa6742..fed57634b6c1 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -188,6 +188,13 @@ config PL330_DMA You need to provide platform specific settings via platform_data for a dma-pl330 device. +config PCH_DMA + tristate "Topcliff PCH DMA support" + depends on PCI && X86 + select DMA_ENGINE + help + Enable support for the Topcliff PCH DMA engine. + config DMA_ENGINE bool diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 27b6c69ae639..72bd70384d8a 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/ obj-$(CONFIG_TIMB_DMA) += timb_dma.o obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o obj-$(CONFIG_PL330_DMA) += pl330.o +obj-$(CONFIG_PCH_DMA) += pch_dma.o diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c new file mode 100644 index 000000000000..37139ca496ec --- /dev/null +++ b/drivers/dma/pch_dma.c @@ -0,0 +1,957 @@ +/* + * Topcliff PCH DMA controller driver + * Copyright (c) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "pch-dma" + +#define DMA_CTL0_DISABLE 0x0 +#define DMA_CTL0_SG 0x1 +#define DMA_CTL0_ONESHOT 0x2 +#define DMA_CTL0_MODE_MASK_BITS 0x3 +#define DMA_CTL0_DIR_SHIFT_BITS 2 +#define DMA_CTL0_BITS_PER_CH 4 + +#define DMA_CTL2_START_SHIFT_BITS 8 +#define DMA_CTL2_IRQ_ENABLE_MASK ((1UL << DMA_CTL2_START_SHIFT_BITS) - 1) + +#define DMA_STATUS_IDLE 0x0 +#define DMA_STATUS_DESC_READ 0x1 +#define DMA_STATUS_WAIT 0x2 +#define DMA_STATUS_ACCESS 0x3 +#define DMA_STATUS_BITS_PER_CH 2 +#define DMA_STATUS_MASK_BITS 0x3 +#define DMA_STATUS_SHIFT_BITS 16 +#define DMA_STATUS_IRQ(x) (0x1 << (x)) +#define DMA_STATUS_ERR(x) (0x1 << ((x) + 8)) + +#define DMA_DESC_WIDTH_SHIFT_BITS 12 +#define DMA_DESC_WIDTH_1_BYTE (0x3 << DMA_DESC_WIDTH_SHIFT_BITS) +#define DMA_DESC_WIDTH_2_BYTES (0x2 << DMA_DESC_WIDTH_SHIFT_BITS) +#define DMA_DESC_WIDTH_4_BYTES (0x0 << DMA_DESC_WIDTH_SHIFT_BITS) +#define DMA_DESC_MAX_COUNT_1_BYTE 0x3FF +#define DMA_DESC_MAX_COUNT_2_BYTES 0x3FF +#define DMA_DESC_MAX_COUNT_4_BYTES 0x7FF +#define DMA_DESC_END_WITHOUT_IRQ 0x0 +#define DMA_DESC_END_WITH_IRQ 0x1 +#define DMA_DESC_FOLLOW_WITHOUT_IRQ 0x2 +#define DMA_DESC_FOLLOW_WITH_IRQ 0x3 + +#define MAX_CHAN_NR 8 + +static unsigned int init_nr_desc_per_channel = 64; +module_param(init_nr_desc_per_channel, uint, 0644); +MODULE_PARM_DESC(init_nr_desc_per_channel, + "initial descriptors per channel (default: 64)"); + +struct pch_dma_desc_regs { + u32 dev_addr; + u32 mem_addr; + u32 size; + u32 next; +}; + +struct pch_dma_regs { + u32 dma_ctl0; + u32 dma_ctl1; + u32 dma_ctl2; + u32 reserved1; + u32 dma_sts0; + u32 dma_sts1; + u32 reserved2; + u32 reserved3; + struct pch_dma_desc_regs desc[0]; +}; + +struct pch_dma_desc { + struct pch_dma_desc_regs regs; + struct dma_async_tx_descriptor txd; + struct list_head desc_node; + struct list_head tx_list; +}; + +struct pch_dma_chan { + struct dma_chan chan; + void __iomem *membase; + enum dma_data_direction dir; + struct tasklet_struct tasklet; + unsigned long err_status; + + spinlock_t lock; + + dma_cookie_t completed_cookie; + struct list_head active_list; + struct list_head queue; + struct list_head free_list; + unsigned int descs_allocated; +}; + +#define PDC_DEV_ADDR 0x00 +#define PDC_MEM_ADDR 0x04 +#define PDC_SIZE 0x08 +#define PDC_NEXT 0x0C + +#define channel_readl(pdc, name) \ + readl((pdc)->membase + PDC_##name) +#define channel_writel(pdc, name, val) \ + writel((val), (pdc)->membase + PDC_##name) + +struct pch_dma { + struct dma_device dma; + void __iomem *membase; + struct pci_pool *pool; + struct pch_dma_regs regs; + struct pch_dma_desc_regs ch_regs[MAX_CHAN_NR]; + struct pch_dma_chan channels[0]; +}; + +#define PCH_DMA_CTL0 0x00 +#define PCH_DMA_CTL1 0x04 +#define PCH_DMA_CTL2 0x08 +#define PCH_DMA_STS0 0x10 +#define PCH_DMA_STS1 0x14 + +#define dma_readl(pd, name) \ + __raw_readl((pd)->membase + PCH_DMA_##name) +#define dma_writel(pd, name, val) \ + __raw_writel((val), (pd)->membase + PCH_DMA_##name) + +static inline struct pch_dma_desc *to_pd_desc(struct dma_async_tx_descriptor *txd) +{ + return container_of(txd, struct pch_dma_desc, txd); +} + +static inline struct pch_dma_chan *to_pd_chan(struct dma_chan *chan) +{ + return container_of(chan, struct pch_dma_chan, chan); +} + +static inline struct pch_dma *to_pd(struct dma_device *ddev) +{ + return container_of(ddev, struct pch_dma, dma); +} + +static inline struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} + +static 
inline struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + +static inline struct pch_dma_desc *pdc_first_active(struct pch_dma_chan *pd_chan) +{ + return list_first_entry(&pd_chan->active_list, + struct pch_dma_desc, desc_node); +} + +static inline struct pch_dma_desc *pdc_first_queued(struct pch_dma_chan *pd_chan) +{ + return list_first_entry(&pd_chan->queue, + struct pch_dma_desc, desc_node); +} + +static void pdc_enable_irq(struct dma_chan *chan, int enable) +{ + struct pch_dma *pd = to_pd(chan->device); + u32 val; + + val = dma_readl(pd, CTL2); + + if (enable) + val |= 0x1 << chan->chan_id; + else + val &= ~(0x1 << chan->chan_id); + + dma_writel(pd, CTL2, val); + + dev_dbg(chan2dev(chan), "pdc_enable_irq: chan %d -> %x\n", + chan->chan_id, val); +} + +static void pdc_set_dir(struct dma_chan *chan) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + struct pch_dma *pd = to_pd(chan->device); + u32 val; + + val = dma_readl(pd, CTL0); + + if (pd_chan->dir == DMA_TO_DEVICE) + val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id + + DMA_CTL0_DIR_SHIFT_BITS); + else + val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id + + DMA_CTL0_DIR_SHIFT_BITS)); + + dma_writel(pd, CTL0, val); + + dev_dbg(chan2dev(chan), "pdc_set_dir: chan %d -> %x\n", + chan->chan_id, val); +} + +static void pdc_set_mode(struct dma_chan *chan, u32 mode) +{ + struct pch_dma *pd = to_pd(chan->device); + u32 val; + + val = dma_readl(pd, CTL0); + + val &= ~(DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * chan->chan_id)); + val |= mode << (DMA_CTL0_BITS_PER_CH * chan->chan_id); + + dma_writel(pd, CTL0, val); + + dev_dbg(chan2dev(chan), "pdc_set_mode: chan %d -> %x\n", + chan->chan_id, val); +} + +static u32 pdc_get_status(struct pch_dma_chan *pd_chan) +{ + struct pch_dma *pd = to_pd(pd_chan->chan.device); + u32 val; + + val = dma_readl(pd, STS0); + return DMA_STATUS_MASK_BITS & (val >> (DMA_STATUS_SHIFT_BITS + + DMA_STATUS_BITS_PER_CH * pd_chan->chan.chan_id)); +} + +static bool pdc_is_idle(struct pch_dma_chan *pd_chan) +{ + if (pdc_get_status(pd_chan) == DMA_STATUS_IDLE) + return true; + else + return false; +} + +static void pdc_dostart(struct pch_dma_chan *pd_chan, struct pch_dma_desc* desc) +{ + struct pch_dma *pd = to_pd(pd_chan->chan.device); + u32 val; + + if (!pdc_is_idle(pd_chan)) { + dev_err(chan2dev(&pd_chan->chan), + "BUG: Attempt to start non-idle channel\n"); + return; + } + + channel_writel(pd_chan, DEV_ADDR, desc->regs.dev_addr); + channel_writel(pd_chan, MEM_ADDR, desc->regs.mem_addr); + channel_writel(pd_chan, SIZE, desc->regs.size); + channel_writel(pd_chan, NEXT, desc->regs.next); + + dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> dev_addr: %x\n", + pd_chan->chan.chan_id, desc->regs.dev_addr); + dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> mem_addr: %x\n", + pd_chan->chan.chan_id, desc->regs.mem_addr); + dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> size: %x\n", + pd_chan->chan.chan_id, desc->regs.size); + dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> next: %x\n", + pd_chan->chan.chan_id, desc->regs.next); + + if (list_empty(&desc->tx_list)) + pdc_set_mode(&pd_chan->chan, DMA_CTL0_ONESHOT); + else + pdc_set_mode(&pd_chan->chan, DMA_CTL0_SG); + + val = dma_readl(pd, CTL2); + val |= 1 << (DMA_CTL2_START_SHIFT_BITS + pd_chan->chan.chan_id); + dma_writel(pd, CTL2, val); +} + +static void pdc_chain_complete(struct pch_dma_chan *pd_chan, + struct pch_dma_desc *desc) +{ + struct dma_async_tx_descriptor *txd = &desc->txd; + dma_async_tx_callback callback = 
txd->callback; + void *param = txd->callback_param; + + list_splice_init(&desc->tx_list, &pd_chan->free_list); + list_move(&desc->desc_node, &pd_chan->free_list); + + if (callback) + callback(param); +} + +static void pdc_complete_all(struct pch_dma_chan *pd_chan) +{ + struct pch_dma_desc *desc, *_d; + LIST_HEAD(list); + + BUG_ON(!pdc_is_idle(pd_chan)); + + if (!list_empty(&pd_chan->queue)) + pdc_dostart(pd_chan, pdc_first_queued(pd_chan)); + + list_splice_init(&pd_chan->active_list, &list); + list_splice_init(&pd_chan->queue, &pd_chan->active_list); + + list_for_each_entry_safe(desc, _d, &list, desc_node) + pdc_chain_complete(pd_chan, desc); +} + +static void pdc_handle_error(struct pch_dma_chan *pd_chan) +{ + struct pch_dma_desc *bad_desc; + + bad_desc = pdc_first_active(pd_chan); + list_del(&bad_desc->desc_node); + + list_splice_init(&pd_chan->queue, pd_chan->active_list.prev); + + if (!list_empty(&pd_chan->active_list)) + pdc_dostart(pd_chan, pdc_first_active(pd_chan)); + + dev_crit(chan2dev(&pd_chan->chan), "Bad descriptor submitted\n"); + dev_crit(chan2dev(&pd_chan->chan), "descriptor cookie: %d\n", + bad_desc->txd.cookie); + + pdc_chain_complete(pd_chan, bad_desc); +} + +static void pdc_advance_work(struct pch_dma_chan *pd_chan) +{ + if (list_empty(&pd_chan->active_list) || + list_is_singular(&pd_chan->active_list)) { + pdc_complete_all(pd_chan); + } else { + pdc_chain_complete(pd_chan, pdc_first_active(pd_chan)); + pdc_dostart(pd_chan, pdc_first_active(pd_chan)); + } +} + +static dma_cookie_t pdc_assign_cookie(struct pch_dma_chan *pd_chan, + struct pch_dma_desc *desc) +{ + dma_cookie_t cookie = pd_chan->chan.cookie; + + if (++cookie < 0) + cookie = 1; + + pd_chan->chan.cookie = cookie; + desc->txd.cookie = cookie; + + return cookie; +} + +static dma_cookie_t pd_tx_submit(struct dma_async_tx_descriptor *txd) +{ + struct pch_dma_desc *desc = to_pd_desc(txd); + struct pch_dma_chan *pd_chan = to_pd_chan(txd->chan); + dma_cookie_t cookie; + + spin_lock_bh(&pd_chan->lock); + cookie = pdc_assign_cookie(pd_chan, desc); + + if (list_empty(&pd_chan->active_list)) { + list_add_tail(&desc->desc_node, &pd_chan->active_list); + pdc_dostart(pd_chan, desc); + } else { + list_add_tail(&desc->desc_node, &pd_chan->queue); + } + + spin_unlock_bh(&pd_chan->lock); + return 0; +} + +static struct pch_dma_desc *pdc_alloc_desc(struct dma_chan *chan, gfp_t flags) +{ + struct pch_dma_desc *desc = NULL; + struct pch_dma *pd = to_pd(chan->device); + dma_addr_t addr; + + desc = pci_pool_alloc(pd->pool, flags, &addr); + if (desc) { + memset(desc, 0, sizeof(struct pch_dma_desc)); + INIT_LIST_HEAD(&desc->tx_list); + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = pd_tx_submit; + desc->txd.flags = DMA_CTRL_ACK; + desc->txd.phys = addr; + } + + return desc; +} + +static struct pch_dma_desc *pdc_desc_get(struct pch_dma_chan *pd_chan) +{ + struct pch_dma_desc *desc, *_d; + struct pch_dma_desc *ret = NULL; + int i = 0; + + spin_lock_bh(&pd_chan->lock); + list_for_each_entry_safe(desc, _d, &pd_chan->free_list, desc_node) { + i++; + if (async_tx_test_ack(&desc->txd)) { + list_del(&desc->desc_node); + ret = desc; + break; + } + dev_dbg(chan2dev(&pd_chan->chan), "desc %p not ACKed\n", desc); + } + spin_unlock_bh(&pd_chan->lock); + dev_dbg(chan2dev(&pd_chan->chan), "scanned %d descriptors\n", i); + + if (!ret) { + ret = pdc_alloc_desc(&pd_chan->chan, GFP_NOIO); + if (ret) { + spin_lock_bh(&pd_chan->lock); + pd_chan->descs_allocated++; + spin_unlock_bh(&pd_chan->lock); + } else { 
dev_err(chan2dev(&pd_chan->chan), + "failed to alloc desc\n"); + } + } + + return ret; +} + +static void pdc_desc_put(struct pch_dma_chan *pd_chan, + struct pch_dma_desc *desc) +{ + if (desc) { + spin_lock_bh(&pd_chan->lock); + list_splice_init(&desc->tx_list, &pd_chan->free_list); + list_add(&desc->desc_node, &pd_chan->free_list); + spin_unlock_bh(&pd_chan->lock); + } +} + +static int pd_alloc_chan_resources(struct dma_chan *chan) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + struct pch_dma_desc *desc; + LIST_HEAD(tmp_list); + int i; + + if (!pdc_is_idle(pd_chan)) { + dev_dbg(chan2dev(chan), "DMA channel not idle ?\n"); + return -EIO; + } + + if (!list_empty(&pd_chan->free_list)) + return pd_chan->descs_allocated; + + for (i = 0; i < init_nr_desc_per_channel; i++) { + desc = pdc_alloc_desc(chan, GFP_KERNEL); + + if (!desc) { + dev_warn(chan2dev(chan), + "Only allocated %d initial descriptors\n", i); + break; + } + + list_add_tail(&desc->desc_node, &tmp_list); + } + + spin_lock_bh(&pd_chan->lock); + list_splice(&tmp_list, &pd_chan->free_list); + pd_chan->descs_allocated = i; + pd_chan->completed_cookie = chan->cookie = 1; + spin_unlock_bh(&pd_chan->lock); + + pdc_enable_irq(chan, 1); + pdc_set_dir(chan); + + return pd_chan->descs_allocated; +} + +static void pd_free_chan_resources(struct dma_chan *chan) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + struct pch_dma *pd = to_pd(chan->device); + struct pch_dma_desc *desc, *_d; + LIST_HEAD(tmp_list); + + BUG_ON(!pdc_is_idle(pd_chan)); + BUG_ON(!list_empty(&pd_chan->active_list)); + BUG_ON(!list_empty(&pd_chan->queue)); + + spin_lock_bh(&pd_chan->lock); + list_splice_init(&pd_chan->free_list, &tmp_list); + pd_chan->descs_allocated = 0; + spin_unlock_bh(&pd_chan->lock); + + list_for_each_entry_safe(desc, _d, &tmp_list, desc_node) + pci_pool_free(pd->pool, desc, desc->txd.phys); + + pdc_enable_irq(chan, 0); +} + +static enum dma_status pd_tx_status(struct dma_chan *chan, dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_completed; + int ret; + + spin_lock_bh(&pd_chan->lock); + last_completed = pd_chan->completed_cookie; + last_used = chan->cookie; + spin_unlock_bh(&pd_chan->lock); + + ret = dma_async_is_complete(cookie, last_completed, last_used); + + dma_set_tx_state(txstate, last_completed, last_used, 0); + + return ret; +} + +static void pd_issue_pending(struct dma_chan *chan) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + + if (pdc_is_idle(pd_chan)) { + spin_lock_bh(&pd_chan->lock); + pdc_advance_work(pd_chan); + spin_unlock_bh(&pd_chan->lock); + } +} + +static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan, + struct scatterlist *sgl, unsigned int sg_len, + enum dma_data_direction direction, unsigned long flags) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + struct pch_dma_slave *pd_slave = chan->private; + struct pch_dma_desc *first = NULL; + struct pch_dma_desc *prev = NULL; + struct pch_dma_desc *desc = NULL; + struct scatterlist *sg; + dma_addr_t reg; + int i; + + if (unlikely(!sg_len)) { + dev_info(chan2dev(chan), "prep_slave_sg: length is zero!\n"); + return NULL; + } + + if (direction == DMA_FROM_DEVICE) + reg = pd_slave->rx_reg; + else if (direction == DMA_TO_DEVICE) + reg = pd_slave->tx_reg; + else + return NULL; + + for_each_sg(sgl, sg, sg_len, i) { + desc = pdc_desc_get(pd_chan); + + if (!desc) + goto err_desc_get; + + desc->regs.dev_addr = reg; + desc->regs.mem_addr = 
sg_phys(sg); + desc->regs.size = sg_dma_len(sg); + desc->regs.next = DMA_DESC_FOLLOW_WITHOUT_IRQ; + + switch (pd_slave->width) { + case PCH_DMA_WIDTH_1_BYTE: + if (desc->regs.size > DMA_DESC_MAX_COUNT_1_BYTE) + goto err_desc_get; + desc->regs.size |= DMA_DESC_WIDTH_1_BYTE; + break; + case PCH_DMA_WIDTH_2_BYTES: + if (desc->regs.size > DMA_DESC_MAX_COUNT_2_BYTES) + goto err_desc_get; + desc->regs.size |= DMA_DESC_WIDTH_2_BYTES; + break; + case PCH_DMA_WIDTH_4_BYTES: + if (desc->regs.size > DMA_DESC_MAX_COUNT_4_BYTES) + goto err_desc_get; + desc->regs.size |= DMA_DESC_WIDTH_4_BYTES; + break; + default: + goto err_desc_get; + } + + + if (!first) { + first = desc; + } else { + prev->regs.next |= desc->txd.phys; + list_add_tail(&desc->desc_node, &first->tx_list); + } + + prev = desc; + } + + if (flags & DMA_PREP_INTERRUPT) + desc->regs.next = DMA_DESC_END_WITH_IRQ; + else + desc->regs.next = DMA_DESC_END_WITHOUT_IRQ; + + first->txd.cookie = -EBUSY; + desc->txd.flags = flags; + + return &first->txd; + +err_desc_get: + dev_err(chan2dev(chan), "failed to get desc or wrong parameters\n"); + pdc_desc_put(pd_chan, first); + return NULL; +} + +static int pd_device_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, + unsigned long arg) +{ + struct pch_dma_chan *pd_chan = to_pd_chan(chan); + struct pch_dma_desc *desc, *_d; + LIST_HEAD(list); + + if (cmd != DMA_TERMINATE_ALL) + return -ENXIO; + + spin_lock_bh(&pd_chan->lock); + + pdc_set_mode(&pd_chan->chan, DMA_CTL0_DISABLE); + + list_splice_init(&pd_chan->active_list, &list); + list_splice_init(&pd_chan->queue, &list); + + list_for_each_entry_safe(desc, _d, &list, desc_node) + pdc_chain_complete(pd_chan, desc); + + spin_unlock_bh(&pd_chan->lock); + + + return 0; +} + +static void pdc_tasklet(unsigned long data) +{ + struct pch_dma_chan *pd_chan = (struct pch_dma_chan *)data; + + if (!pdc_is_idle(pd_chan)) { + dev_err(chan2dev(&pd_chan->chan), + "BUG: handle non-idle channel in tasklet\n"); + return; + } + + spin_lock_bh(&pd_chan->lock); + if (test_and_clear_bit(0, &pd_chan->err_status)) + pdc_handle_error(pd_chan); + else + pdc_advance_work(pd_chan); + spin_unlock_bh(&pd_chan->lock); +} + +static irqreturn_t pd_irq(int irq, void *devid) +{ + struct pch_dma *pd = (struct pch_dma *)devid; + struct pch_dma_chan *pd_chan; + u32 sts0; + int i; + int ret = IRQ_NONE; + + sts0 = dma_readl(pd, STS0); + + dev_dbg(pd->dma.dev, "pd_irq sts0: %x\n", sts0); + + for (i = 0; i < pd->dma.chancnt; i++) { + pd_chan = &pd->channels[i]; + + if (sts0 & DMA_STATUS_IRQ(i)) { + if (sts0 & DMA_STATUS_ERR(i)) + set_bit(0, &pd_chan->err_status); + + tasklet_schedule(&pd_chan->tasklet); + ret = IRQ_HANDLED; + } + + } + + /* clear interrupt bits in status register */ + dma_writel(pd, STS0, sts0); + + return ret; +} + +static void pch_dma_save_regs(struct pch_dma *pd) +{ + struct pch_dma_chan *pd_chan; + struct dma_chan *chan, *_c; + int i = 0; + + pd->regs.dma_ctl0 = dma_readl(pd, CTL0); + pd->regs.dma_ctl1 = dma_readl(pd, CTL1); + pd->regs.dma_ctl2 = dma_readl(pd, CTL2); + + list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) { + pd_chan = to_pd_chan(chan); + + pd->ch_regs[i].dev_addr = channel_readl(pd_chan, DEV_ADDR); + pd->ch_regs[i].mem_addr = channel_readl(pd_chan, MEM_ADDR); + pd->ch_regs[i].size = channel_readl(pd_chan, SIZE); + pd->ch_regs[i].next = channel_readl(pd_chan, NEXT); + + i++; + } +} + +static void pch_dma_restore_regs(struct pch_dma *pd) +{ + struct pch_dma_chan *pd_chan; + struct dma_chan *chan, *_c; + int i = 0; + + dma_writel(pd, CTL0, 
pd->regs.dma_ctl0); + dma_writel(pd, CTL1, pd->regs.dma_ctl1); + dma_writel(pd, CTL2, pd->regs.dma_ctl2); + + list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) { + pd_chan = to_pd_chan(chan); + + channel_writel(pd_chan, DEV_ADDR, pd->ch_regs[i].dev_addr); + channel_writel(pd_chan, MEM_ADDR, pd->ch_regs[i].mem_addr); + channel_writel(pd_chan, SIZE, pd->ch_regs[i].size); + channel_writel(pd_chan, NEXT, pd->ch_regs[i].next); + + i++; + } +} + +static int pch_dma_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct pch_dma *pd = pci_get_drvdata(pdev); + + if (pd) + pch_dma_save_regs(pd); + + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + + return 0; +} + +static int pch_dma_resume(struct pci_dev *pdev) +{ + struct pch_dma *pd = pci_get_drvdata(pdev); + int err; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + + err = pci_enable_device(pdev); + if (err) { + dev_dbg(&pdev->dev, "failed to enable device\n"); + return err; + } + + if (pd) + pch_dma_restore_regs(pd); + + return 0; +} + +static int __devinit pch_dma_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct pch_dma *pd; + struct pch_dma_regs *regs; + unsigned int nr_channels; + int err; + int i; + + nr_channels = id->driver_data; + pd = kzalloc(sizeof(struct pch_dma)+ + sizeof(struct pch_dma_chan) * nr_channels, GFP_KERNEL); + if (!pd) + return -ENOMEM; + + pci_set_drvdata(pdev, pd); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device\n"); + goto err_free_mem; + } + + if (!(pci_resource_flags(pdev, 1) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Cannot find proper base address\n"); + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot obtain PCI resources\n"); + goto err_disable_pdev; + } + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Cannot set proper DMA config\n"); + goto err_free_res; + } + + regs = pd->membase = pci_iomap(pdev, 1, 0); + if (!pd->membase) { + dev_err(&pdev->dev, "Cannot map MMIO registers\n"); + err = -ENOMEM; + goto err_free_res; + } + + pci_set_master(pdev); + + err = request_irq(pdev->irq, pd_irq, IRQF_SHARED, DRV_NAME, pd); + if (err) { + dev_err(&pdev->dev, "Failed to request IRQ\n"); + goto err_iounmap; + } + + pd->pool = pci_pool_create("pch_dma_desc_pool", pdev, + sizeof(struct pch_dma_desc), 4, 0); + if (!pd->pool) { + dev_err(&pdev->dev, "Failed to alloc DMA descriptors\n"); + err = -ENOMEM; + goto err_free_irq; + } + + pd->dma.dev = &pdev->dev; + pd->dma.chancnt = nr_channels; + + INIT_LIST_HEAD(&pd->dma.channels); + + for (i = 0; i < nr_channels; i++) { + struct pch_dma_chan *pd_chan = &pd->channels[i]; + + pd_chan->chan.device = &pd->dma; + pd_chan->chan.cookie = 1; + pd_chan->chan.chan_id = i; + + pd_chan->membase = ®s->desc[i]; + + pd_chan->dir = (i % 2) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + + spin_lock_init(&pd_chan->lock); + + INIT_LIST_HEAD(&pd_chan->active_list); + INIT_LIST_HEAD(&pd_chan->queue); + INIT_LIST_HEAD(&pd_chan->free_list); + + tasklet_init(&pd_chan->tasklet, pdc_tasklet, + (unsigned long)pd_chan); + list_add_tail(&pd_chan->chan.device_node, &pd->dma.channels); + } + + dma_cap_zero(pd->dma.cap_mask); + dma_cap_set(DMA_PRIVATE, pd->dma.cap_mask); + dma_cap_set(DMA_SLAVE, pd->dma.cap_mask); + + pd->dma.device_alloc_chan_resources = pd_alloc_chan_resources; + pd->dma.device_free_chan_resources = pd_free_chan_resources; + pd->dma.device_tx_status = pd_tx_status; + pd->dma.device_issue_pending = pd_issue_pending; + pd->dma.device_prep_slave_sg = pd_prep_slave_sg; + pd->dma.device_control = pd_device_control; + + err = dma_async_device_register(&pd->dma); + if (err) { + dev_err(&pdev->dev, "Failed to register DMA device\n"); + goto err_free_pool; + } + + return 0; + +err_free_pool: + pci_pool_destroy(pd->pool); +err_free_irq: + free_irq(pdev->irq, pd); +err_iounmap: + pci_iounmap(pdev, pd->membase); +err_free_res: + pci_release_regions(pdev); +err_disable_pdev: + pci_disable_device(pdev); +err_free_mem: + return err; +} + +static void __devexit pch_dma_remove(struct pci_dev *pdev) +{ + struct pch_dma *pd = pci_get_drvdata(pdev); + struct pch_dma_chan *pd_chan; + struct dma_chan *chan, *_c; + + if (pd) { + dma_async_device_unregister(&pd->dma); + + list_for_each_entry_safe(chan, _c, &pd->dma.channels, + device_node) { + pd_chan = to_pd_chan(chan); + + tasklet_disable(&pd_chan->tasklet); + tasklet_kill(&pd_chan->tasklet); + } + + pci_pool_destroy(pd->pool); + free_irq(pdev->irq, pd); + pci_iounmap(pdev, pd->membase); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(pd); + } +} + +/* PCI Device ID of DMA device */ +#define PCI_DEVICE_ID_PCH_DMA_8CH 0x8810 +#define PCI_DEVICE_ID_PCH_DMA_4CH 0x8815 + +static const struct pci_device_id pch_dma_id_table[] = { + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PCH_DMA_8CH), 8 }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PCH_DMA_4CH), 4 }, + { 0, }, +}; + +static struct pci_driver pch_dma_driver = { + .name = DRV_NAME, + .id_table = pch_dma_id_table, + .probe = pch_dma_probe, + .remove = __devexit_p(pch_dma_remove), +#ifdef CONFIG_PM + .suspend = pch_dma_suspend, + .resume = pch_dma_resume, +#endif +}; + +static int __init pch_dma_init(void) +{ + return pci_register_driver(&pch_dma_driver); +} + +static void __exit pch_dma_exit(void) +{ + pci_unregister_driver(&pch_dma_driver); +} + +module_init(pch_dma_init); +module_exit(pch_dma_exit); + +MODULE_DESCRIPTION("Topcliff PCH DMA controller driver"); +MODULE_AUTHOR("Yong Wang "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/pch_dma.h b/include/linux/pch_dma.h new file mode 100644 index 000000000000..fdafe529ef8a --- /dev/null +++ b/include/linux/pch_dma.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef PCH_DMA_H +#define PCH_DMA_H + +#include + +enum pch_dma_width { + PCH_DMA_WIDTH_1_BYTE, + PCH_DMA_WIDTH_2_BYTES, + PCH_DMA_WIDTH_4_BYTES, +}; + +struct pch_dma_slave { + struct device *dma_dev; + unsigned int chan_id; + dma_addr_t tx_reg; + dma_addr_t rx_reg; + enum pch_dma_width width; +}; + +#endif -- cgit v1.2.3 From c156d0a5b0c667999e06d0bb52e3d1376faec8bf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 4 Aug 2010 13:37:33 +0200 Subject: DMAENGINE: generic slave channel control v3 This adds an interface to the DMAengine to make it possible to reconfigure a slave channel at runtime. We add a few foreseen config parameters to the passed struct, with a void * pointer for custom per-device or per-platform runtime slave data. Signed-off-by: Linus Walleij Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 5204f018931b..c61d4ca27bcc 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -114,11 +114,17 @@ enum dma_ctrl_flags { * @DMA_TERMINATE_ALL: terminate all ongoing transfers * @DMA_PAUSE: pause ongoing transfers * @DMA_RESUME: resume paused transfer + * @DMA_SLAVE_CONFIG: this command is only implemented by DMA controllers + * that need to runtime reconfigure the slave channels (as opposed to passing + * configuration data in statically from the platform). An additional + * argument of struct dma_slave_config must be passed in with this + * command. */ enum dma_ctrl_cmd { DMA_TERMINATE_ALL, DMA_PAUSE, DMA_RESUME, + DMA_SLAVE_CONFIG, }; /** @@ -199,6 +205,71 @@ struct dma_chan_dev { atomic_t *idr_ref; }; +/** + * enum dma_slave_buswidth - defines bus width of the DMA slave + * device, source or target buses + */ +enum dma_slave_buswidth { + DMA_SLAVE_BUSWIDTH_UNDEFINED = 0, + DMA_SLAVE_BUSWIDTH_1_BYTE = 1, + DMA_SLAVE_BUSWIDTH_2_BYTES = 2, + DMA_SLAVE_BUSWIDTH_4_BYTES = 4, + DMA_SLAVE_BUSWIDTH_8_BYTES = 8, +}; + +/** + * struct dma_slave_config - dma slave channel runtime config + * @direction: whether the data shall go in or out on this slave + * channel, right now. DMA_TO_DEVICE and DMA_FROM_DEVICE are + * legal values, DMA_BIDIRECTIONAL is not acceptable since we + * need to differentiate source and target addresses. + * @src_addr: this is the physical address where DMA slave data + * should be read (RX), if the source is memory this argument is + * ignored. + * @dst_addr: this is the physical address where DMA slave data + * should be written (TX), if the source is memory this argument + * is ignored. + * @src_addr_width: this is the width in bytes of the source (RX) + * register where DMA data shall be read. If the source + * is memory this may be ignored depending on architecture. + * Legal values: 1, 2, 4, 8. + * @dst_addr_width: same as src_addr_width but for destination + * target (TX) mutatis mutandis. + * @src_maxburst: the maximum number of words (note: words, as in + * units of the src_addr_width member, not bytes) that can be sent + * in one burst to the device. Typically something like half the + * FIFO depth on I/O peripherals so you don't overflow it. This + * may or may not be applicable on memory sources.
+ * @dst_maxburst: same as src_maxburst but for destination target + * mutatis mutandis. + * + * This struct is passed in as configuration data to a DMA engine + * in order to set up a certain channel for DMA transport at runtime. + * The DMA device/engine has to provide support for an additional + * command in the channel config interface, DMA_SLAVE_CONFIG + * and this struct will then be passed in as an argument to the + * DMA engine device_control() function. + * + * The rationale for adding configuration information to this struct + * is as follows: if it is likely that most DMA slave controllers in + * the world will support the configuration option, then make it + * generic. If not: if it is fixed so that it can be sent in statically from + * the platform data, then prefer to do that. Else, if it is neither + * fixed at runtime, nor generic enough (such as bus mastership on + * some CPU family and whatnot) then create a custom slave config + * struct and pass that, then make this config a member of that + * struct, if applicable. + */ +struct dma_slave_config { + enum dma_data_direction direction; + dma_addr_t src_addr; + dma_addr_t dst_addr; + enum dma_slave_buswidth src_addr_width; + enum dma_slave_buswidth dst_addr_width; + u32 src_maxburst; + u32 dst_maxburst; +}; + static inline const char *dma_chan_name(struct dma_chan *chan) { return dev_name(&chan->dev->device); -- cgit v1.2.3 From fca3ec01e0b40cab82cac7745e154b01969e6219 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 4 Aug 2010 14:34:24 +0100 Subject: drm,io-mapping: Specify slot to use for atomic mappings This is required should we ever attempt to use an io-mapping where KM_USER0 is verboten, such as inside an IRQ context. Signed-off-by: Chris Wilson Cc: Eric Anholt Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/i915_gem.c | 9 +++++---- drivers/gpu/drm/i915/intel_overlay.c | 5 +++-- drivers/gpu/drm/nouveau/nouveau_bios.c | 8 ++++---- include/linux/io-mapping.h | 16 ++++++++++------ 4 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4efd4fd3b340..2a4ed7ca8b4e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -496,10 +496,10 @@ fast_user_write(struct io_mapping *mapping, char *vaddr_atomic; unsigned long unwritten; - vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); + vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base, KM_USER0); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, user_data, length); - io_mapping_unmap_atomic(vaddr_atomic); + io_mapping_unmap_atomic(vaddr_atomic, KM_USER0); if (unwritten) return -EFAULT; return 0; @@ -3487,7 +3487,8 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, reloc_offset = obj_priv->gtt_offset + reloc->offset; reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, (reloc_offset & - ~(PAGE_SIZE - 1))); + ~(PAGE_SIZE - 1)), + KM_USER0); reloc_entry = (uint32_t __iomem *)(reloc_page + (reloc_offset & (PAGE_SIZE - 1))); reloc_val = target_obj_priv->gtt_offset + reloc->delta; @@ -3498,7 +3499,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj, readl(reloc_entry), reloc_val); #endif writel(reloc_val, reloc_entry); - io_mapping_unmap_atomic(reloc_page); + io_mapping_unmap_atomic(reloc_page, KM_USER0); /* The updated presumed offset for this entry will be * copied back out to the user.
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index f26ec2f27d36..d39aea24eabe 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -185,7 +185,8 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over if (OVERLAY_NONPHYSICAL(overlay->dev)) { regs = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping, - overlay->reg_bo->gtt_offset); + overlay->reg_bo->gtt_offset, + KM_USER0); if (!regs) { DRM_ERROR("failed to map overlay regs in GTT\n"); @@ -200,7 +201,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over static void intel_overlay_unmap_regs_atomic(struct intel_overlay *overlay) { if (OVERLAY_NONPHYSICAL(overlay->dev)) - io_mapping_unmap_atomic(overlay->virt_addr); + io_mapping_unmap_atomic(overlay->virt_addr, KM_USER0); overlay->virt_addr = NULL; diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index b59f348f14fc..7369b5e73649 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -2083,11 +2083,11 @@ peek_fb(struct drm_device *dev, struct io_mapping *fb, uint32_t val = 0; if (off < pci_resource_len(dev->pdev, 1)) { - uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off); + uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off, KM_USER0); val = ioread32(p); - io_mapping_unmap_atomic(p); + io_mapping_unmap_atomic(p, KM_USER0); } return val; @@ -2098,12 +2098,12 @@ poke_fb(struct drm_device *dev, struct io_mapping *fb, uint32_t off, uint32_t val) { if (off < pci_resource_len(dev->pdev, 1)) { - uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off); + uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off, KM_USER0); iowrite32(val, p); wmb(); - io_mapping_unmap_atomic(p); + io_mapping_unmap_atomic(p, KM_USER0); } } diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 25085ddd955f..e0ea40f6c515 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -79,7 +79,9 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void * -io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) +io_mapping_map_atomic_wc(struct io_mapping *mapping, + unsigned long offset, + int slot) { resource_size_t phys_addr; unsigned long pfn; @@ -87,13 +89,13 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; pfn = (unsigned long) (phys_addr >> PAGE_SHIFT); - return iomap_atomic_prot_pfn(pfn, KM_USER0, mapping->prot); + return iomap_atomic_prot_pfn(pfn, slot, mapping->prot); } static inline void -io_mapping_unmap_atomic(void *vaddr) +io_mapping_unmap_atomic(void *vaddr, int slot) { - iounmap_atomic(vaddr, KM_USER0); + iounmap_atomic(vaddr, slot); } static inline void * @@ -133,13 +135,15 @@ io_mapping_free(struct io_mapping *mapping) /* Atomic map/unmap */ static inline void * -io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) +io_mapping_map_atomic_wc(struct io_mapping *mapping, + unsigned long offset, + int slot) { return ((char *) mapping) + offset; } static inline void -io_mapping_unmap_atomic(void *vaddr) +io_mapping_unmap_atomic(void *vaddr, int slot) { } -- cgit v1.2.3 From f9599ce11192f24dbb0f4fdb70121a05edc58342 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 4 Aug 2010 04:43:44 +0000 Subject: sk_buff: introduce pskb_network_may_pull() Signed-off-by: Changli Gao 
Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d20d9e7a9bbd..77eb60d2b496 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1379,6 +1379,11 @@ static inline int skb_network_offset(const struct sk_buff *skb) return skb_network_header(skb) - skb->data; } +static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) +{ + return pskb_may_pull(skb, skb_network_offset(skb) + len); +} + /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the -- cgit v1.2.3 From d7100da026317fcf07411f765fe1cdb044053917 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 4 Aug 2010 07:34:36 +0000 Subject: ppp: make channel_ops const The PPP channel ops structure should be const. Clean up the declarations to use standard C99 designated initializers. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/char/pcmcia/ipwireless/network.c | 2 +- drivers/net/ppp_async.c | 6 +++--- drivers/net/ppp_synctty.c | 6 +++--- drivers/net/pppoe.c | 4 ++-- include/linux/ppp_channel.h | 2 +- net/atm/pppoatm.c | 2 +- net/irda/irnet/irnet_ppp.c | 2 +- net/l2tp/l2tp_ppp.c | 5 ++++- 8 files changed, 16 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/pcmcia/ipwireless/network.c b/drivers/char/pcmcia/ipwireless/network.c index 65920163f53d..9fe538347932 100644 --- a/drivers/char/pcmcia/ipwireless/network.c +++ b/drivers/char/pcmcia/ipwireless/network.c @@ -239,7 +239,7 @@ static int ipwireless_ppp_ioctl(struct ppp_channel *ppp_channel, return err; } -static struct ppp_channel_ops ipwireless_ppp_channel_ops = { +static const struct ppp_channel_ops ipwireless_ppp_channel_ops = { .start_xmit = ipwireless_ppp_start_xmit, .ioctl = ipwireless_ppp_ioctl }; diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 6c2e8fa0ca31..af50a530daee 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -108,9 +108,9 @@ static void ppp_async_process(unsigned long arg); static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound); -static struct ppp_channel_ops async_ops = { - ppp_async_send, - ppp_async_ioctl +static const struct ppp_channel_ops async_ops = { + .start_xmit = ppp_async_send, + .ioctl = ppp_async_ioctl, }; /* diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c index 52938da1e542..4c95ec3fb8d4 100644 --- a/drivers/net/ppp_synctty.c +++ b/drivers/net/ppp_synctty.c @@ -97,9 +97,9 @@ static void ppp_sync_flush_output(struct syncppp *ap); static void ppp_sync_input(struct syncppp *ap, const unsigned char *buf, char *flags, int count); -static struct ppp_channel_ops sync_ops = { - ppp_sync_send, - ppp_sync_ioctl +static const struct ppp_channel_ops sync_ops = { + .start_xmit = ppp_sync_send, + .ioctl = ppp_sync_ioctl, }; /* diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index 344ef330e123..c07de359dc07 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -92,7 +92,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); static const struct proto_ops pppoe_ops; -static struct ppp_channel_ops pppoe_chan_ops; +static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ static int pppoe_net_id __read_mostly; @@ -963,7 +963,7 @@ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) return
__pppoe_xmit(sk, skb); } -static struct ppp_channel_ops pppoe_chan_ops = { +static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, }; diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index bff98ec1bfed..5d87f810a3b7 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -36,7 +36,7 @@ struct ppp_channel_ops { struct ppp_channel { void *private; /* channel private data */ - struct ppp_channel_ops *ops; /* operations for this channel */ + const struct ppp_channel_ops *ops; /* operations for this channel */ int mtu; /* max transmit packet size */ int hdrlen; /* amount of headroom channel needs */ void *ppp; /* opaque to channel */ diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index e49bb6d948a1..e9aced0ec56b 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -260,7 +260,7 @@ static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd, return -ENOTTY; } -static /*const*/ struct ppp_channel_ops pppoatm_ops = { +static const struct ppp_channel_ops pppoatm_ops = { .start_xmit = pppoatm_send, .ioctl = pppoatm_devppp_ioctl, }; diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 800bc53b7f63..dfe7b38dd4af 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -20,7 +20,7 @@ /* Please put other headers in irnet.h - Thanks */ /* Generic PPP callbacks (to call us) */ -static struct ppp_channel_ops irnet_ppp_ops = { +static const struct ppp_channel_ops irnet_ppp_ops = { .start_xmit = ppp_irnet_send, .ioctl = ppp_irnet_ioctl }; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 90d82b3f2889..ff954b3e94b6 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -135,7 +135,10 @@ struct pppol2tp_session { static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); -static struct ppp_channel_ops pppol2tp_chan_ops = { pppol2tp_xmit , NULL }; +static const struct ppp_channel_ops pppol2tp_chan_ops = { + .start_xmit = pppol2tp_xmit, +}; + static const struct proto_ops pppol2tp_ops; /* Helpers to obtain tunnel/session contexts from sockets. -- cgit v1.2.3 From f6a21388bd255773cc80d4423afb4c69d4daa173 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 19 Jun 2010 04:08:29 +0000 Subject: POWER: Add JZ4740 battery driver. Add support for the battery voltage measurement part of the JZ4740 ADC unit. Signed-off-by: Lars-Peter Clausen Acked-by: Anton Vorontsov Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/1416/ Signed-off-by: Ralf Baechle --- drivers/power/Kconfig | 11 + drivers/power/Makefile | 1 + drivers/power/jz4740-battery.c | 445 +++++++++++++++++++++++++++++++++++ include/linux/power/jz4740-battery.h | 24 ++ 4 files changed, 481 insertions(+) create mode 100644 drivers/power/jz4740-battery.c create mode 100644 include/linux/power/jz4740-battery.h (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 8e9ba177d817..1e5506be39b4 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -142,4 +142,15 @@ config CHARGER_PCF50633 help Say Y to include support for NXP PCF50633 Main Battery Charger. +config BATTERY_JZ4740 + tristate "Ingenic JZ4740 battery" + depends on MACH_JZ4740 + depends on MFD_JZ4740_ADC + help + Say Y to enable support for the battery on Ingenic JZ4740 based + boards. + + This driver can be build as a module. If so, the module will be + called jz4740-battery. 
+ endif # POWER_SUPPLY diff --git a/drivers/power/Makefile b/drivers/power/Makefile index 00050809a6c7..cf95009d9bcd 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_BATTERY_DA9030) += da9030_battery.o obj-$(CONFIG_BATTERY_MAX17040) += max17040_battery.o obj-$(CONFIG_BATTERY_Z2) += z2_battery.o obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o +obj-$(CONFIG_BATTERY_JZ4740) += jz4740-battery.o diff --git a/drivers/power/jz4740-battery.c b/drivers/power/jz4740-battery.c new file mode 100644 index 000000000000..20c4b952e9bd --- /dev/null +++ b/drivers/power/jz4740-battery.c @@ -0,0 +1,445 @@ +/* + * Battery measurement code for Ingenic JZ SOC. + * + * Copyright (C) 2009 Jiejing Zhang + * Copyright (C) 2010, Lars-Peter Clausen + * + * based on tosa_battery.c + * + * Copyright (C) 2008 Marek Vasut +* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +struct jz_battery { + struct jz_battery_platform_data *pdata; + struct platform_device *pdev; + + struct resource *mem; + void __iomem *base; + + int irq; + int charge_irq; + + struct mfd_cell *cell; + + int status; + long voltage; + + struct completion read_completion; + + struct power_supply battery; + struct delayed_work work; +}; + +static inline struct jz_battery *psy_to_jz_battery(struct power_supply *psy) +{ + return container_of(psy, struct jz_battery, battery); +} + +static irqreturn_t jz_battery_irq_handler(int irq, void *devid) +{ + struct jz_battery *battery = devid; + + complete(&battery->read_completion); + return IRQ_HANDLED; +} + +static long jz_battery_read_voltage(struct jz_battery *battery) +{ + unsigned long t; + unsigned long val; + long voltage; + + INIT_COMPLETION(battery->read_completion); + + enable_irq(battery->irq); + battery->cell->enable(battery->pdev); + + t = wait_for_completion_interruptible_timeout(&battery->read_completion, + HZ); + + if (t > 0) { + val = readw(battery->base) & 0xfff; + + if (battery->pdata->info.voltage_max_design <= 2500000) + val = (val * 78125UL) >> 7UL; + else + val = ((val * 924375UL) >> 9UL) + 33000; + voltage = (long)val; + } else { + voltage = t ? 
t : -ETIMEDOUT; + } + + battery->cell->disable(battery->pdev); + disable_irq(battery->irq); + + return voltage; +} + +static int jz_battery_get_capacity(struct power_supply *psy) +{ + struct jz_battery *jz_battery = psy_to_jz_battery(psy); + struct power_supply_info *info = &jz_battery->pdata->info; + long voltage; + int ret; + int voltage_span; + + voltage = jz_battery_read_voltage(jz_battery); + + if (voltage < 0) + return voltage; + + voltage_span = info->voltage_max_design - info->voltage_min_design; + ret = ((voltage - info->voltage_min_design) * 100) / voltage_span; + + if (ret > 100) + ret = 100; + else if (ret < 0) + ret = 0; + + return ret; +} + +static int jz_battery_get_property(struct power_supply *psy, + enum power_supply_property psp, union power_supply_propval *val) +{ + struct jz_battery *jz_battery = psy_to_jz_battery(psy); + struct power_supply_info *info = &jz_battery->pdata->info; + long voltage; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = jz_battery->status; + break; + case POWER_SUPPLY_PROP_TECHNOLOGY: + val->intval = jz_battery->pdata->info.technology; + break; + case POWER_SUPPLY_PROP_HEALTH: + voltage = jz_battery_read_voltage(jz_battery); + if (voltage < info->voltage_min_design) + val->intval = POWER_SUPPLY_HEALTH_DEAD; + else + val->intval = POWER_SUPPLY_HEALTH_GOOD; + break; + case POWER_SUPPLY_PROP_CAPACITY: + val->intval = jz_battery_get_capacity(psy); + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + val->intval = jz_battery_read_voltage(jz_battery); + if (val->intval < 0) + return val->intval; + break; + case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: + val->intval = info->voltage_max_design; + break; + case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: + val->intval = info->voltage_min_design; + break; + case POWER_SUPPLY_PROP_PRESENT: + val->intval = 1; + break; + default: + return -EINVAL; + } + return 0; +} + +static void jz_battery_external_power_changed(struct power_supply *psy) +{ + struct jz_battery *jz_battery = psy_to_jz_battery(psy); + + cancel_delayed_work(&jz_battery->work); + schedule_delayed_work(&jz_battery->work, 0); +} + +static irqreturn_t jz_battery_charge_irq(int irq, void *data) +{ + struct jz_battery *jz_battery = data; + + cancel_delayed_work(&jz_battery->work); + schedule_delayed_work(&jz_battery->work, 0); + + return IRQ_HANDLED; +} + +static void jz_battery_update(struct jz_battery *jz_battery) +{ + int status; + long voltage; + bool has_changed = false; + int is_charging; + + if (gpio_is_valid(jz_battery->pdata->gpio_charge)) { + is_charging = gpio_get_value(jz_battery->pdata->gpio_charge); + is_charging ^= jz_battery->pdata->gpio_charge_active_low; + if (is_charging) + status = POWER_SUPPLY_STATUS_CHARGING; + else + status = POWER_SUPPLY_STATUS_NOT_CHARGING; + + if (status != jz_battery->status) { + jz_battery->status = status; + has_changed = true; + } + } + + voltage = jz_battery_read_voltage(jz_battery); + if (abs(voltage - jz_battery->voltage) < 50000) { + jz_battery->voltage = voltage; + has_changed = true; + } + + if (has_changed) + power_supply_changed(&jz_battery->battery); +} + +static enum power_supply_property jz_battery_properties[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN, + POWER_SUPPLY_PROP_PRESENT, +}; + +static void jz_battery_work(struct work_struct *work) +{ + /* Too small interval will increase system workload 
*/ + const int interval = HZ * 30; + struct jz_battery *jz_battery = container_of(work, struct jz_battery, + work.work); + + jz_battery_update(jz_battery); + schedule_delayed_work(&jz_battery->work, interval); +} + +static int __devinit jz_battery_probe(struct platform_device *pdev) +{ + int ret = 0; + struct jz_battery_platform_data *pdata = pdev->dev.parent->platform_data; + struct jz_battery *jz_battery; + struct power_supply *battery; + + jz_battery = kzalloc(sizeof(*jz_battery), GFP_KERNEL); + if (!jz_battery) { + dev_err(&pdev->dev, "Failed to allocate driver structure\n"); + return -ENOMEM; + } + + jz_battery->cell = pdev->dev.platform_data; + + jz_battery->irq = platform_get_irq(pdev, 0); + if (jz_battery->irq < 0) { + ret = jz_battery->irq; + dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret); + goto err_free; + } + + jz_battery->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!jz_battery->mem) { + ret = -ENOENT; + dev_err(&pdev->dev, "Failed to get platform mmio resource\n"); + goto err_free; + } + + jz_battery->mem = request_mem_region(jz_battery->mem->start, + resource_size(jz_battery->mem), pdev->name); + if (!jz_battery->mem) { + ret = -EBUSY; + dev_err(&pdev->dev, "Failed to request mmio memory region\n"); + goto err_free; + } + + jz_battery->base = ioremap_nocache(jz_battery->mem->start, + resource_size(jz_battery->mem)); + if (!jz_battery->base) { + ret = -EBUSY; + dev_err(&pdev->dev, "Failed to ioremap mmio memory\n"); + goto err_release_mem_region; + } + + battery = &jz_battery->battery; + battery->name = pdata->info.name; + battery->type = POWER_SUPPLY_TYPE_BATTERY; + battery->properties = jz_battery_properties; + battery->num_properties = ARRAY_SIZE(jz_battery_properties); + battery->get_property = jz_battery_get_property; + battery->external_power_changed = jz_battery_external_power_changed; + battery->use_for_apm = 1; + + jz_battery->pdata = pdata; + jz_battery->pdev = pdev; + + init_completion(&jz_battery->read_completion); + + INIT_DELAYED_WORK(&jz_battery->work, jz_battery_work); + + ret = request_irq(jz_battery->irq, jz_battery_irq_handler, 0, pdev->name, + jz_battery); + if (ret) { + dev_err(&pdev->dev, "Failed to request irq %d\n", ret); + goto err_iounmap; + } + disable_irq(jz_battery->irq); + + if (gpio_is_valid(pdata->gpio_charge)) { + ret = gpio_request(pdata->gpio_charge, dev_name(&pdev->dev)); + if (ret) { + dev_err(&pdev->dev, "charger state gpio request failed.\n"); + goto err_free_irq; + } + ret = gpio_direction_input(pdata->gpio_charge); + if (ret) { + dev_err(&pdev->dev, "charger state gpio set direction failed.\n"); + goto err_free_gpio; + } + + jz_battery->charge_irq = gpio_to_irq(pdata->gpio_charge); + + if (jz_battery->charge_irq >= 0) { + ret = request_irq(jz_battery->charge_irq, + jz_battery_charge_irq, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + dev_name(&pdev->dev), jz_battery); + if (ret) { + dev_err(&pdev->dev, "Failed to request charge irq: %d\n", ret); + goto err_free_gpio; + } + } + } else { + jz_battery->charge_irq = -1; + } + + if (jz_battery->pdata->info.voltage_max_design <= 2500000) + jz4740_adc_set_config(pdev->dev.parent, JZ_ADC_CONFIG_BAT_MB, + JZ_ADC_CONFIG_BAT_MB); + else + jz4740_adc_set_config(pdev->dev.parent, JZ_ADC_CONFIG_BAT_MB, 0); + + ret = power_supply_register(&pdev->dev, &jz_battery->battery); + if (ret) { + dev_err(&pdev->dev, "power supply battery register failed.\n"); + goto err_free_charge_irq; + } + + platform_set_drvdata(pdev, jz_battery); + schedule_delayed_work(&jz_battery->work, 0); 
+ + return 0; + +err_free_charge_irq: + if (jz_battery->charge_irq >= 0) + free_irq(jz_battery->charge_irq, jz_battery); +err_free_gpio: + if (gpio_is_valid(pdata->gpio_charge)) + gpio_free(jz_battery->pdata->gpio_charge); +err_free_irq: + free_irq(jz_battery->irq, jz_battery); +err_iounmap: + platform_set_drvdata(pdev, NULL); + iounmap(jz_battery->base); +err_release_mem_region: + release_mem_region(jz_battery->mem->start, resource_size(jz_battery->mem)); +err_free: + kfree(jz_battery); + return ret; +} + +static int __devexit jz_battery_remove(struct platform_device *pdev) +{ + struct jz_battery *jz_battery = platform_get_drvdata(pdev); + + cancel_delayed_work_sync(&jz_battery->work); + + if (gpio_is_valid(jz_battery->pdata->gpio_charge)) { + if (jz_battery->charge_irq >= 0) + free_irq(jz_battery->charge_irq, jz_battery); + gpio_free(jz_battery->pdata->gpio_charge); + } + + power_supply_unregister(&jz_battery->battery); + + free_irq(jz_battery->irq, jz_battery); + + iounmap(jz_battery->base); + release_mem_region(jz_battery->mem->start, resource_size(jz_battery->mem)); + + return 0; +} + +#ifdef CONFIG_PM +static int jz_battery_suspend(struct device *dev) +{ + struct jz_battery *jz_battery = dev_get_drvdata(dev); + + cancel_delayed_work_sync(&jz_battery->work); + jz_battery->status = POWER_SUPPLY_STATUS_UNKNOWN; + + return 0; +} + +static int jz_battery_resume(struct device *dev) +{ + struct jz_battery *jz_battery = dev_get_drvdata(dev); + + schedule_delayed_work(&jz_battery->work, 0); + + return 0; +} + +static const struct dev_pm_ops jz_battery_pm_ops = { + .suspend = jz_battery_suspend, + .resume = jz_battery_resume, +}; + +#define JZ_BATTERY_PM_OPS (&jz_battery_pm_ops) +#else +#define JZ_BATTERY_PM_OPS NULL +#endif + +static struct platform_driver jz_battery_driver = { + .probe = jz_battery_probe, + .remove = __devexit_p(jz_battery_remove), + .driver = { + .name = "jz4740-battery", + .owner = THIS_MODULE, + .pm = JZ_BATTERY_PM_OPS, + }, +}; + +static int __init jz_battery_init(void) +{ + return platform_driver_register(&jz_battery_driver); +} +module_init(jz_battery_init); + +static void __exit jz_battery_exit(void) +{ + platform_driver_unregister(&jz_battery_driver); +} +module_exit(jz_battery_exit); + +MODULE_ALIAS("platform:jz4740-battery"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lars-Peter Clausen "); +MODULE_DESCRIPTION("JZ4740 SoC battery driver"); diff --git a/include/linux/power/jz4740-battery.h b/include/linux/power/jz4740-battery.h new file mode 100644 index 000000000000..19c9610c720a --- /dev/null +++ b/include/linux/power/jz4740-battery.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2009, Jiejing Zhang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#ifndef __JZ4740_BATTERY_H +#define __JZ4740_BATTERY_H + +struct jz_battery_platform_data { + struct power_supply_info info; + int gpio_charge; /* GPIO port of Charger state */ + int gpio_charge_active_low; +}; + +#endif -- cgit v1.2.3
From 534af1082329392bc29f6badf815e69ae2ae0f4c Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:20 -0500 Subject: kgdb,kdb: individual register set and get API The kdb shell specification includes the ability to get and set architecture specific registers by name. For the time being individual register get and set will be implemented on a per architecture basis. If an architecture defines DBG_MAX_REG_NUM > 0 then kdb and the gdbstub will use the capability for individually getting and setting architecture specific registers. Signed-off-by: Jason Wessel --- include/linux/kgdb.h | 13 +++++ kernel/debug/gdbstub.c | 26 +++++++++ kernel/debug/kdb/kdb_main.c | 132 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 159 insertions(+), 12 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 9340f34d1bb5..d5eb882e01f3 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -90,6 +90,19 @@ struct kgdb_bkpt { enum kgdb_bpstate state; }; +struct dbg_reg_def_t { + char *name; + int size; + int offset; +}; + +#ifndef DBG_MAX_REG_NUM +#define DBG_MAX_REG_NUM 0 +#else +extern struct dbg_reg_def_t dbg_reg_def[]; +extern char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs); +extern int dbg_set_reg(int regno, void *mem, struct pt_regs *regs); +#endif #ifndef KGDB_MAX_BREAKPOINTS # define KGDB_MAX_BREAKPOINTS 1000 #endif
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index e117cfd75887..006bad8905d3 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -328,6 +328,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) return probe_kernel_write(mem, c, size); } +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Write memory due to an 'M' or 'X' packet. */ static int write_mem_msg(int binary) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ebe4a287419e..8577e45a9a58 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) if (endp == arg) { /* - * Try base 16, for us folks too lazy to type the + * Also try base 16, for us folks too lazy to type the * leading 0x... */ val = simple_strtoul(arg, &endp, 16); @@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) return 0; } +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + /* * kdb_set - This function implements the 'set' command. Alter an * existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int diag = kdb_check_regs(); - if (diag) - return diag; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) * kdb_rm - This function implements the 'rm' (register modify) command. * rm register-name new-contents * Remarks: - * Currently doesn't allow modification of control or - * debug registers. + * Allows register modification with the same restrictions as gdb */ static int kdb_rm(int argc, const char **argv) { +#if DBG_MAX_REG_NUM > 0 int diag; - int ind = 0; - unsigned long contents; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; if (argc != 2) return KDB_ARGCOUNT; /* * Allow presence or absence of leading '%' symbol. */ - if (argv[1][0] == '%') - ind = 1; + rname = argv[1]; + if (*rname == '%') + rname++; - diag = kdbgetularg(argv[2], &contents); + diag = kdbgetu64arg(argv[2], ®64); if (diag) return diag; diag = kdb_check_regs(); if (diag) return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; + return 0; +#endif } #if defined(CONFIG_MAGIC_SYSRQ) -- cgit v1.2.3 From 55751145dc1e08e16df418cdd101661f5c6ac991 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:21 -0500 Subject: gdbstub: Implement gdbserial 'p' and 'P' packets The gdbserial 'p' and 'P' packets allow gdb to individually get and set registers instead of querying for all the available registers. 
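For illustration only (this exchange is not part of the patch; the register number and values are invented), a gdb front end using the new packets might produce traffic of the following shape, where each packet is framed as $<payload>#<checksum> and the checksum is the modulo-256 sum of the payload characters, printed as two hex digits:

    -> $p10#d1                     read register 0x10
    <- $ffffffff8103c0de#58        stub returns the register bytes in hex
    -> $P10=ffffffff8103c0de#46    write register 0x10
    <- $OK#9a                      stub acknowledges the write

Because only the one register is transferred, a front end that single-steps and re-reads the program counter no longer pays for a full 'g'/'G' register-file round trip.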
Signed-off-by: Jason Wessel --- include/linux/kgdb.h | 2 +- kernel/debug/gdbstub.c | 97 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index d5eb882e01f3..cc96f0f23e04 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -294,7 +294,7 @@ extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); extern struct kgdb_io *dbg_io_ops; extern int kgdb_hex2long(char **ptr, unsigned long *long_val); -extern int kgdb_mem2hex(char *mem, char *buf, int count); +extern char *kgdb_mem2hex(char *mem, char *buf, int count); extern int kgdb_hex2mem(char *buf, char *mem, int count); extern int kgdb_isremovedbreak(unsigned long addr); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 006bad8905d3..4ef9dddf4588 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -225,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) * buf. Return a pointer to the last char put in buf (null). May * return an error. */ -int kgdb_mem2hex(char *mem, char *buf, int count) +char *kgdb_mem2hex(char *mem, char *buf, int count) { char *tmp; int err; @@ -237,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) tmp = buf + count; err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; } + *buf = 0; - return err; + return buf; } /* @@ -481,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) pack_hex_byte(&remcom_out_buffer[1], ks->signo); } -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) +static void gdb_get_regs_helper(struct kgdb_state *ks) { struct task_struct *thread; void *local_debuggerinfo; @@ -523,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) */ sleeping_thread_to_gdb_regs(gdb_regs, thread); } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); } @@ -545,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) char *ptr = &remcom_in_buffer[1]; unsigned long length; unsigned long addr; - int err; + char *err; if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && kgdb_hex2long(&ptr, &length) > 0) { err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); } else { error_packet(remcom_out_buffer, -EINVAL); } @@ -568,6 +572,52 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) strcpy(remcom_out_buffer, "OK"); } +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void 
gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + kgdb_hex2mem(ptr, (char *)gdb_regs, dbg_reg_def[regnum].size); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Handle the 'X' memory binary write bytes */ static void gdb_cmd_binwrite(struct kgdb_state *ks) { @@ -874,8 +924,11 @@ int gdb_serial_stub(struct kgdb_state *ks) int error = 0; int tmp; - /* Clear the out buffer. */ + /* Initialize comm buffer and globals. */ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; if (kgdb_connected) { unsigned char thref[BUF_THREAD_ID_SIZE]; @@ -892,10 +945,6 @@ int gdb_serial_stub(struct kgdb_state *ks) put_packet(remcom_out_buffer); } - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - while (1) { error = 0; @@ -920,6 +969,14 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_memwrite(ks); break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_binwrite(ks); break; -- cgit v1.2.3 From b45cfba4e9005d64d419718e7ff7f7cab44c1994 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Thu, 5 Aug 2010 09:22:30 -0500 Subject: vt,console,kdb: implement atomic console enter/leave functions These functions allow the kernel debugger to save and restore the state of the system console. Signed-off-by: Jesse Barnes Signed-off-by: Jason Wessel CC: David Airlie CC: Andrew Morton --- drivers/char/vt.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/console.h | 13 +++++++++++ 2 files changed, 74 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 7cdb6ee569cd..117ce99115dc 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -187,10 +187,15 @@ static DECLARE_WORK(console_work, console_callback); * fg_console is the current virtual console, * last_console is the last used one, * want_console is the console we want to switch to, + * saved_* variants are for save/restore around kernel debugger enter/leave */ int fg_console; int last_console; int want_console = -1; +int saved_fg_console; +int saved_last_console; +int saved_want_console; +int saved_vc_mode; /* * For each existing display, we have a pointer to console currently visible @@ -3413,6 +3418,62 @@ int con_is_bound(const struct consw *csw) } EXPORT_SYMBOL(con_is_bound); +/** + * con_debug_enter - prepare the console for the kernel debugger + * @sw: console driver + * + * Called when the console is taken over by the kernel debugger, this + * function needs to save the current console state, then put the console + * into a state suitable for the kernel debugger. + * + * RETURNS: + * Zero on success, nonzero if a failure occurred when trying to prepare + * the console for the debugger. 
+ */ +int con_debug_enter(struct vc_data *vc) +{ + int ret = 0; + + saved_fg_console = fg_console; + saved_last_console = last_console; + saved_want_console = want_console; + saved_vc_mode = vc->vc_mode; + vc->vc_mode = KD_TEXT; + console_blanked = 0; + if (vc->vc_sw->con_debug_enter) + ret = vc->vc_sw->con_debug_enter(vc); + return ret; +} +EXPORT_SYMBOL_GPL(con_debug_enter); + +/** + * con_debug_leave - restore console state + * @sw: console driver + * + * Restore the console state to what it was before the kernel debugger + * was invoked. + * + * RETURNS: + * Zero on success, nonzero if a failure occurred when trying to restore + * the console. + */ +int con_debug_leave(void) +{ + struct vc_data *vc; + int ret = 0; + + fg_console = saved_fg_console; + last_console = saved_last_console; + want_console = saved_want_console; + vc_cons[fg_console].d->vc_mode = saved_vc_mode; + + vc = vc_cons[fg_console].d; + if (vc->vc_sw->con_debug_leave) + ret = vc->vc_sw->con_debug_leave(vc); + return ret; +} +EXPORT_SYMBOL_GPL(con_debug_leave); + /** * register_con_driver - register console driver to console layer * @csw: console driver diff --git a/include/linux/console.h b/include/linux/console.h index dcca5339ceb3..f76fc297322d 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -55,6 +55,16 @@ struct consw { void (*con_invert_region)(struct vc_data *, u16 *, int); u16 *(*con_screen_pos)(struct vc_data *, int); unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); + /* + * Prepare the console for the debugger. This includes, but is not + * limited to, unblanking the console, loading an appropriate + * palette, and allowing debugger generated output. + */ + int (*con_debug_enter)(struct vc_data *); + /* + * Restore the console to its pre-debug state as closely as possible. + */ + int (*con_debug_leave)(struct vc_data *); }; extern const struct consw *conswitchp; @@ -69,6 +79,9 @@ int register_con_driver(const struct consw *csw, int first, int last); int unregister_con_driver(const struct consw *csw); int take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); +int con_debug_enter(struct vc_data *vc); +int con_debug_leave(void); + /* scroll */ #define SM_UP (1) #define SM_DOWN (2) -- cgit v1.2.3 From 81d4450732c68aa728f2c86c0c2993c6cfc3d032 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:30 -0500 Subject: vt,console,kdb: automatically set kdb LINES variable The kernel console interface stores the number of lines it is configured to use. The kdb debugger can greatly benefit by knowing how many lines there are on the console for the pager functionality without having the end user compile in the setting or have to repeatedly change it at run time. 
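A usage sketch (illustrative; this transcript is not from the patch): the hunk above seeds LINES from vc->vc_rows when the debugger is entered through a virtual console, so the pager matches the console geometry without user intervention. The value can still be overridden by hand at the prompt:

    kdb> set LINES 24

which is the same kdb_set() invocation the snippet performs internally through its setargs[] array.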
Signed-off-by: Jason Wessel Signed-off-by: Jesse Barnes CC: David Airlie CC: Andrew Morton --- drivers/char/vt.c | 17 +++++++++++++++++ include/linux/kdb.h | 4 ++++ kernel/debug/kdb/kdb_private.h | 2 -- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 117ce99115dc..4a9eb3044e52 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -104,6 +104,7 @@ #include #include #include +#include #define MAX_NR_CON_DRIVER 16 @@ -3442,6 +3443,22 @@ int con_debug_enter(struct vc_data *vc) console_blanked = 0; if (vc->vc_sw->con_debug_enter) ret = vc->vc_sw->con_debug_enter(vc); +#ifdef CONFIG_KGDB_KDB + /* Set the initial LINES variable if it is not already set */ + if (vc->vc_rows < 999) { + int linecount; + char lns[4]; + const char *setargs[3] = { + "set", + "LINES", + lns, + }; + if (kdbgetintenv(setargs[0], &linecount)) { + snprintf(lns, 4, "%i", vc->vc_rows); + kdb_set(2, setargs); + } + } +#endif /* CONFIG_KGDB_KDB */ return ret; } EXPORT_SYMBOL_GPL(con_debug_enter); diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ccb2b3ec0fe8..ea6e5244ed3f 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -114,4 +114,8 @@ enum { KDB_INIT_EARLY, KDB_INIT_FULL, }; + +extern int kdbgetintenv(const char *, int *); +extern int kdb_set(int, const char **); + #endif /* !_KDB_H */ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775d..c438f545a321 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); -extern int kdb_set(int, const char **); extern char *kdbgetenv(const char *); -extern int kdbgetintenv(const char *, int *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); -- cgit v1.2.3 From d219adc1228a3887486b58a430e736b0831f192c Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Mon, 2 Aug 2010 12:05:41 -0700 Subject: fb: add hooks to handle KDB enter/exit Add fb ops to handle enter/exit of the kernel debugger. If present, the fb core will register them with KGDB and they'll be called when the debugger is entered and exited. The new functions are responsible for switching to an appropriate debug framebuffer and restoring the interrupted state at exit time. 
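As a minimal sketch of the driver side (a hypothetical driver "mydrv"; the two functions and their names are assumptions, only the fb_debug_enter/fb_debug_leave fields come from this patch):

    #include <linux/fb.h>
    #include <linux/module.h>

    /*
     * Per the locking note added to fb.h below, these hooks run without the
     * console semaphore, so only direct, lock-free hardware writes belong here.
     */
    static int mydrv_debug_enter(struct fb_info *info)
    {
            /* save the current scanout state, switch to a mode kdb can draw on */
            return 0;
    }

    static int mydrv_debug_leave(struct fb_info *info)
    {
            /* restore whatever mydrv_debug_enter() saved */
            return 0;
    }

    static struct fb_ops mydrv_fb_ops = {
            .owner          = THIS_MODULE,
            /* ... the driver's usual fb_ops entries ... */
            .fb_debug_enter = mydrv_debug_enter,
            .fb_debug_leave = mydrv_debug_leave,
    };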
Signed-off-by: Jesse Barnes Signed-off-by: Jason Wessel --- drivers/video/console/fbcon.c | 26 ++++++++++++++++++++++++++ drivers/video/console/fbcon.h | 1 + include/linux/fb.h | 13 +++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index b0a3fa00706d..3b3f5749af92 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -2342,6 +2342,30 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch) return 0; } +static int fbcon_debug_enter(struct vc_data *vc) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + ops->save_graphics = ops->graphics; + ops->graphics = 0; + if (info->fbops->fb_debug_enter) + info->fbops->fb_debug_enter(info); + fbcon_set_palette(vc, color_table); + return 0; +} + +static int fbcon_debug_leave(struct vc_data *vc) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + ops->graphics = ops->save_graphics; + if (info->fbops->fb_debug_leave) + info->fbops->fb_debug_leave(info); + return 0; +} + static int fbcon_get_font(struct vc_data *vc, struct console_font *font) { u8 *fontdata = vc->vc_font.data; @@ -3276,6 +3300,8 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_debug_enter = fbcon_debug_enter, + .con_debug_leave = fbcon_debug_leave, }; static struct notifier_block fbcon_event_notifier = { diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h index 89a346880ec0..6bd2e0c7f209 100644 --- a/drivers/video/console/fbcon.h +++ b/drivers/video/console/fbcon.h @@ -74,6 +74,7 @@ struct fbcon_ops { int cursor_reset; int blank_state; int graphics; + int save_graphics; /* for debug enter/leave */ int flags; int rotate; int cur_rotate; diff --git a/include/linux/fb.h b/include/linux/fb.h index e7445df44d6c..0c5659c41b01 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -3,6 +3,9 @@ #include #include +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ /* Definitions of frame buffers */ @@ -607,6 +610,12 @@ struct fb_deferred_io { * LOCKING NOTE: those functions must _ALL_ be called with the console * semaphore held, this is the only suitable locking mechanism we have * in 2.6. Some may be called at interrupt time at this point though. + * + * The exception to this is the debug related hooks. Putting the fb + * into a debug state (e.g. flipping to the kernel console) and restoring + * it must be done in a lock-free manner, so low level drivers should + * keep track of the initial console (if applicable) and may need to + * perform direct, unlocked hardware writes in these hooks. */ struct fb_ops { @@ -676,6 +685,10 @@ struct fb_ops { /* teardown any resources to do with this framebuffer */ void (*fb_destroy)(struct fb_info *info); + + /* called at KDB enter and leave time to prepare the console */ + int (*fb_debug_enter)(struct fb_info *info); + int (*fb_debug_leave)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING -- cgit v1.2.3 From 1a4240f4764ac78adbf4b0ebb49b3bd8c72ffa11 Mon Sep 17 00:00:00 2001 From: Wang Lei Date: Wed, 4 Aug 2010 15:16:33 +0100 Subject: DNS: Separate out CIFS DNS Resolver code Separate out the DNS resolver key type from the CIFS filesystem into its own module so that it can be made available for general use, including the AFS filesystem module. 
This facility makes it possible for the kernel to upcall to userspace to have it issue DNS requests, package up the replies and present them to the kernel in a useful form. The kernel is then able to cache the DNS replies as keys can be retained in keyrings. Resolver keys are of type "dns_resolver" and have a case-insensitive description that is of the form "[<query_type>:]<to_resolve>". The optional <query_type> indicates the particular DNS lookup and packaging that's required. The <to_resolve> is the query to be made. If <query_type> isn't given, a basic hostname to IP address lookup is made, and the result is stored in the key in the form of a printable string consisting of a comma-separated list of IPv4 and IPv6 addresses. This key type is supported by userspace helpers driven from /sbin/request-key and configured through /etc/request-key.conf. The cifs.upcall utility is invoked for UNC path server name to IP address resolution. The CIFS functionality is encapsulated by the dns_resolve_server_name_to_ip() function, which is used to resolve a UNC path to an IP address for the CIFS filesystem. This part remains in the CIFS module for now. See the added Documentation/networking/dns_resolver.txt for more information. Signed-off-by: Wang Lei Signed-off-by: David Howells Acked-by: Jeff Layton Signed-off-by: Steve French --- Documentation/networking/dns_resolver.txt | 146 +++++++++++++++++++ fs/cifs/Kconfig | 17 +-- fs/cifs/cifsfs.c | 13 +- fs/cifs/dns_resolve.c | 229 ++++++------------------------ fs/cifs/dns_resolve.h | 2 - include/keys/dns_resolver-type.h | 23 +++ include/linux/dns_resolver.h | 34 +++++ net/Kconfig | 1 + net/Makefile | 1 + net/dns_resolver/Kconfig | 27 ++++ net/dns_resolver/Makefile | 7 + net/dns_resolver/dns_key.c | 210 +++++++++++++++++++++++++++ net/dns_resolver/dns_query.c | 159 +++++++++++++++++++++ net/dns_resolver/internal.h | 44 ++++++ 14 files changed, 708 insertions(+), 205 deletions(-) create mode 100644 Documentation/networking/dns_resolver.txt create mode 100644 include/keys/dns_resolver-type.h create mode 100644 include/linux/dns_resolver.h create mode 100644 net/dns_resolver/Kconfig create mode 100644 net/dns_resolver/Makefile create mode 100644 net/dns_resolver/dns_key.c create mode 100644 net/dns_resolver/dns_query.c create mode 100644 net/dns_resolver/internal.h (limited to 'include/linux')
diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt new file mode 100644 index 000000000000..d8e0ce1d38c7 --- /dev/null +++ b/Documentation/networking/dns_resolver.txt @@ -0,0 +1,146 @@ + =================== + DNS Resolver Module + =================== + +Contents: + + - Overview. + - Compilation. + - Setting up. + - Usage. + - Mechanism. + - Debugging. + + +======== +OVERVIEW +======== + +The DNS resolver module provides a way for kernel services to make DNS queries +by way of requesting a key of key type dns_resolver. These queries are +upcalled to userspace through /sbin/request-key. + +These routines must be supported by userspace tools dns.upcall, cifs.upcall and +request-key. It is under development and does not yet provide the full feature +set. The features it does support include: + + (*) Implements the dns_resolver key_type to contact userspace. + +It does not yet support the following AFS features: + + (*) DNS query support for the AFSDB resource record. + +This code is extracted from the CIFS filesystem.
+ + +=========== +COMPILATION +=========== + +The module should be enabled by turning on the kernel configuration options: + + CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" + + +========== +SETTING UP +========== + +To set up this facility, the /etc/request-key.conf file must be altered so that +/sbin/request-key can appropriately direct the upcalls. For example, to handle +basic dname to IPv4/IPv6 address resolution, the following line should be +added: + + #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... + #====== ============ ======= ======= ========================== + create dns_resolver * * /usr/sbin/cifs.upcall %k + +To direct a query for query type 'foo', a line like the following should be added +before the more general line given above, as the first match is the one taken. + + create dns_resolver foo:* * /usr/sbin/dns.foo %k + + + +===== +USAGE +===== + +To make use of this facility, one of the following functions that are +implemented in the module can be called after doing: + + #include <linux/dns_resolver.h> + + (1) int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); + + This is the basic access function. It looks for a cached DNS query and if + it doesn't find it, it upcalls to userspace to make a new DNS query, which + may then be cached. The key description is constructed as a string of the + form: + + [<query_type>:]<to_resolve> + + where <query_type> optionally specifies the particular upcall program to invoke, + and thus the type of query to do, and <to_resolve> specifies the string to be + looked up. The default query type is a straight hostname to IP address + set lookup. + + The name parameter is not required to be a NUL-terminated string, and its + length should be given by the namelen argument. + + The options parameter may be NULL or it may be a set of options + appropriate to the query type. + + The return value is a string appropriate to the query type. For instance, + for the default query type it is just a list of comma-separated IPv4 and + IPv6 addresses. The caller must free the result. + + The length of the result string is returned on success, and a -ve error + code is returned otherwise. -EKEYREJECTED will be returned if the DNS + lookup failed. + + If _expiry is non-NULL, the expiry time (TTL) of the result will be + returned also. + + +========= +MECHANISM +========= + +The dnsresolver module registers a key type called "dns_resolver". Keys of +this type are used to transport and cache DNS lookup results from userspace. + +When dns_query() is invoked, it calls request_key() to search the local +keyrings for a cached DNS result. If that fails to find one, it upcalls to +userspace to get a new result. + +Upcalls to userspace are made through the request_key() upcall vector, and are +directed by means of configuration lines in /etc/request-key.conf that tell +/sbin/request-key what program to run to instantiate the key. + +The upcall handler program is responsible for querying the DNS, processing the +result into a form suitable for passing to the keyctl_instantiate_key() +routine. This then passes the data to dns_resolver_instantiate() which strips +off and processes any options included in the data, and then attaches the +remainder of the string to the key as its payload. + +The upcall handler program should set the expiry time on the key to that of the +lowest TTL of all the records it has extracted a result from. This means that +the key will be discarded and recreated when the data it holds has expired.
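To accompany the USAGE section above, a sketch of a kernel-side caller (a hypothetical function; the dns_query() signature is the one this patch declares in include/linux/dns_resolver.h):

    #include <linux/dns_resolver.h>
    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int example_resolve(const char *hostname)
    {
            char *result = NULL;
            int len;

            /* NULL type: default hostname -> comma-separated address list lookup */
            len = dns_query(NULL, hostname, strlen(hostname), NULL, &result, NULL);
            if (len < 0)
                    return len;     /* e.g. -EKEYREJECTED if the lookup failed */

            printk(KERN_INFO "%s resolved to %s\n", hostname, result);
            kfree(result);          /* the caller must free the result */
            return 0;
    }

This is the same calling pattern that dns_resolve_server_name_to_ip() adopts in the fs/cifs/dns_resolve.c hunk later in this patch.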
+ +dns_query() returns a copy of the value attached to the key, or an error if +that is indicated instead. + +See for further information about +request-key function. + + +========= +DEBUGGING +========= + +Debugging messages can be turned on dynamically by writing a 1 into the +following file: + + /sys/module/dnsresolver/parameters/debug diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 5739fd7f88b4..57f0aa9f141f 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -71,14 +71,14 @@ config CIFS_WEAK_PW_HASH If unsure, say N. config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. config CIFS_XATTR bool "CIFS extended attributes" @@ -122,6 +122,7 @@ config CIFS_DEBUG2 config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS + select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8a2cf129e535..2a0c892959f4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -45,7 +45,6 @@ #include "cifs_fs_sb.h" #include #include -#include "dns_resolve.h" #include "cifs_spnego.h" #include "fscache.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ @@ -933,23 +932,14 @@ init_cifs(void) rc = register_key_type(&cifs_spnego_key_type); if (rc) goto out_unregister_filesystem; -#endif -#ifdef CONFIG_CIFS_DFS_UPCALL - rc = cifs_init_dns_resolver(); - if (rc) - goto out_unregister_key_type; #endif rc = slow_work_register_user(THIS_MODULE); if (rc) - goto out_unregister_resolver_key; + goto out_unregister_key_type; return 0; - out_unregister_resolver_key: -#ifdef CONFIG_CIFS_DFS_UPCALL - cifs_exit_dns_resolver(); out_unregister_key_type: -#endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); out_unregister_filesystem: @@ -976,7 +966,6 @@ exit_cifs(void) cifs_fscache_unregister(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - cifs_exit_dns_resolver(); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index aa967e7917f8..0eb87026cad3 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -4,6 +4,8 @@ * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) * * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. 
@@ -24,212 +26,73 @@ */ #include -#include -#include -#include +#include #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" -static const struct cred *dns_resolver_cache; - -/* Checks if supplied name is IP address - * returns: - * 1 - name is IP - * 0 - name is not IP - */ -static int -is_ip(const char *name, int len) -{ - struct sockaddr_storage ss; - - return cifs_convert_address((struct sockaddr *)&ss, name, len); -} - -static int -dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - /* make sure this looks like an address */ - if (!is_ip(data, datalen)) - return -EINVAL; - - ip = kmalloc(datalen + 1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - key->type_data.x[0] = datalen; - key->payload.data = ip; - - return rc; -} - -static void -dns_resolver_destroy(struct key *key) -{ - kfree(key->payload.data); -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .destroy = dns_resolver_destroy, - .match = user_match, -}; - -/* Resolves server name to ip address. - * input: - * unc - server UNC - * output: - * *ip_addr - pointer to server ip, caller responcible for freeing it. - * return the length of the returned string on success +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. */ int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { - const struct cred *saved_cred; - int rc = -EAGAIN; - struct key *rkey = ERR_PTR(-EAGAIN); + struct sockaddr_storage ss; + const char *hostname, *sep; char *name; - char *data = NULL; - int len; + int len, rc; if (!ip_addr || !unc) return -EINVAL; - /* search for server name delimiter */ len = strlen(unc); if (len < 3) { cFYI(1, "%s: unc is too short: %s", __func__, unc); return -EINVAL; } - len -= 2; - name = memchr(unc+2, '\\', len); - if (!name) { - cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); - } else { - len = (name - unc) - 2/* leading // */; - } - - name = kmalloc(len+1, GFP_KERNEL); - if (!name) { - rc = -ENOMEM; - return rc; - } - memcpy(name, unc+2, len); - name[len] = 0; - - if (is_ip(name, len)) { - cFYI(1, "%s: it is IP, skipping dns upcall: %s", - __func__, name); - data = name; - goto skip_upcall; - } - saved_cred = override_creds(dns_resolver_cache); - rkey = request_key(&key_type_dns_resolver, name, ""); - revert_creds(saved_cred); - if (!IS_ERR(rkey)) { - if (!(rkey->perm & KEY_USR_VIEW)) { - down_read(&rkey->sem); - rkey->perm |= KEY_USR_VIEW; - up_read(&rkey->sem); - } - len = rkey->type_data.x[0]; - data = rkey->payload.data; - } else { - cERROR(1, "%s: unable to resolve: %s", __func__, name); - goto out; - } - -skip_upcall: - if (data) { - *ip_addr = kmalloc(len + 1, GFP_KERNEL); - if (*ip_addr) { - memcpy(*ip_addr, data, len + 1); - if (!IS_ERR(rkey)) - cFYI(1, "%s: resolved: %s to %s", __func__, - name, - *ip_addr - ); - rc = len; - } else { - rc = -ENOMEM; - } - if (!IS_ERR(rkey)) - key_put(rkey); - } + /* Discount leading slashes for cifs */ + len -= 2; + hostname = unc + 2; -out: - kfree(name); + /* Search for server name delimiter 
*/ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - unc; + else + cFYI(1, "%s: probably server name is whole unc: %s", + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); return rc; -} -int __init cifs_init_dns_resolver(void) -{ - struct cred *cred; - struct key *keyring; - int ret; - - printk(KERN_NOTICE "Registering the %s key type\n", - key_type_dns_resolver.name); - - /* create an override credential set with a special thread keyring in - * which DNS requests are cached - * - * this is used to prevent malicious redirections from being installed - * with add_key(). - */ - cred = prepare_kernel_cred(NULL); - if (!cred) +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) return -ENOMEM; - - keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto failed_put_cred; - } - - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - - ret = register_key_type(&key_type_dns_resolver); - if (ret < 0) - goto failed_put_key; - - /* instruct request_key() to use this special keyring as a cache for - * the results it looks up */ - cred->thread_keyring = keyring; - cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; - dns_resolver_cache = cred; + memcpy(name, hostname, len); + name[len] = 0; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; return 0; - -failed_put_key: - key_put(keyring); -failed_put_cred: - put_cred(cred); - return ret; -} - -void cifs_exit_dns_resolver(void) -{ - key_revoke(dns_resolver_cache->thread_keyring); - unregister_key_type(&key_type_dns_resolver); - put_cred(dns_resolver_cache); - printk(KERN_NOTICE "Unregistered %s key type\n", - key_type_dns_resolver.name); } diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 5d7f291df162..d3f5d27f4d06 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,6 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -extern int __init cifs_init_dns_resolver(void); -extern void cifs_exit_dns_resolver(void); extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/include/keys/dns_resolver-type.h b/include/keys/dns_resolver-type.h new file mode 100644 index 000000000000..9284a19393aa --- /dev/null +++ b/include/keys/dns_resolver-type.h @@ -0,0 +1,23 @@ +/* DNS resolver key type + * + * Copyright (C) 2010 Wang Lei. All Rights Reserved. + * Written by Wang Lei (wang840925@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _KEYS_DNS_RESOLVER_TYPE_H +#define _KEYS_DNS_RESOLVER_TYPE_H + +#include + +extern struct key_type key_type_dns_resolver; + +extern int request_dns_resolver_key(const char *description, + const char *callout_info, + char **data); + +#endif /* _KEYS_DNS_RESOLVER_TYPE_H */ diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h new file mode 100644 index 000000000000..cc92268af89a --- /dev/null +++ b/include/linux/dns_resolver.h @@ -0,0 +1,34 @@ +/* + * DNS Resolver upcall management for CIFS DFS and AFS + * Handles host name to IP address resolution and DNS query for AFSDB RR. + * + * Copyright (c) International Business Machines Corp., 2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_DNS_RESOLVER_H +#define _LINUX_DNS_RESOLVER_H + +#ifdef __KERNEL__ + +extern int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); + +#endif /* KERNEL */ + +#endif /* _LINUX_DNS_RESOLVER_H */ diff --git a/net/Kconfig b/net/Kconfig index e24fa0873f32..e330594d3709 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -213,6 +213,7 @@ source "net/phonet/Kconfig" source "net/ieee802154/Kconfig" source "net/sched/Kconfig" source "net/dcb/Kconfig" +source "net/dns_resolver/Kconfig" config RPS boolean diff --git a/net/Makefile b/net/Makefile index 41d420070a38..ea60fbce9b1b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -67,3 +67,4 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig new file mode 100644 index 000000000000..2ec47cb5d0d6 --- /dev/null +++ b/net/dns_resolver/Kconfig @@ -0,0 +1,27 @@ +# +# Configuration for DNS Resolver +# +config DNS_RESOLVER + tristate "DNS Resolver support" + depends on NET && KEYS + help + Saying Y here will include support for the DNS Resolver key type + which can be used to make upcalls to perform DNS lookups in + userspace. + + DNS Resolver is used to query DNS server for information. Examples + being resolving a UNC hostname element to an IP address for CIFS or + performing a DNS query for AFSDB records so that AFS can locate a + cell's volume location database servers. + + DNS Resolver is used by the CIFS and AFS modules, and would support + samba4 later. DNS Resolver is supported by the userspace upcall + helper "/sbin/dns.resolver" via /etc/request-key.conf. + + See for further + information. + + To compile this as a module, choose M here: the module will be called + dnsresolver. + + If unsure, say N. 
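For completeness, a brief sketch of exercising the new module (the workflow is an assumption; the module name comes from the Kconfig help above and the debug path from the Debugging section of the added documentation):

    CONFIG_DNS_RESOLVER=m                                 # in .config
    # modprobe dnsresolver
    # echo 1 > /sys/module/dnsresolver/parameters/debug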
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile new file mode 100644 index 000000000000..c0ef4e71dc49 --- /dev/null +++ b/net/dns_resolver/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux DNS Resolver. +# + +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o + +dns_resolver-objs := dns_key.o dns_query.o diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c new file mode 100644 index 000000000000..1b1b411adcf1 --- /dev/null +++ b/net/dns_resolver/dns_key.c @@ -0,0 +1,210 @@ +/* Key type used to cache DNS lookups made by the kernel + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("DNS Resolver"); +MODULE_AUTHOR("Wang Lei"); +MODULE_LICENSE("GPL"); + +unsigned dns_resolver_debug; +module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "DNS Resolver debugging mask"); + +const struct cred *dns_resolver_cache; + +/* + * Instantiate a user defined key for dns_resolver. + * + * The data must be a NUL-terminated string, with the NUL char accounted in + * datalen. + * + * If the data contains a '#' character, then we take the clause after each + * one to be an option of the form 'key=value'. The actual data of interest is + * the string leading up to the first '#'. For instance: + * + * "ip1,ip2,...#foo=bar" + */ +static int +dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) +{ + struct user_key_payload *upayload; + int ret; + size_t result_len = 0; + const char *data = _data, *opt; + + kenter("%%%d,%s,'%s',%zu", + key->serial, key->description, data, datalen); + + if (datalen <= 1 || !data || data[datalen - 1] != '\0') + return -EINVAL; + datalen--; + + /* deal with any options embedded in the data */ + opt = memchr(data, '#', datalen); + if (!opt) { + kdebug("no options currently supported"); + return -EINVAL; + } + + result_len = datalen; + ret = key_payload_reserve(key, result_len); + if (ret < 0) + return -EINVAL; + + upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL); + if (!upayload) { + kleave(" = -ENOMEM"); + return -ENOMEM; + } + + upayload->datalen = result_len; + memcpy(upayload->data, data, result_len); + upayload->data[result_len] = '\0'; + rcu_assign_pointer(key->payload.data, upayload); + + kleave(" = 0"); + return 0; +} + +/* + * The description is of the form "[<query_type>:]<domain_name>" + * + * The domain name may be a simple name or an absolute domain name (which + * should end with a period).
The domain name is case-independent. + */ +static int +dns_resolver_match(const struct key *key, const void *description) +{ + int slen, dlen, ret = 0; + const char *src = key->description, *dsp = description; + + kenter("%s,%s", src, dsp); + + if (!src || !dsp) + goto no_match; + + if (strcasecmp(src, dsp) == 0) + goto matched; + + slen = strlen(src); + dlen = strlen(dsp); + if (slen <= 0 || dlen <= 0) + goto no_match; + if (src[slen - 1] == '.') + slen--; + if (dsp[dlen - 1] == '.') + dlen--; + if (slen != dlen || strncasecmp(src, dsp, slen) != 0) + goto no_match; + +matched: + ret = 1; +no_match: + kleave(" = %d", ret); + return ret; +} + +struct key_type key_type_dns_resolver = { + .name = "dns_resolver", + .instantiate = dns_resolver_instantiate, + .match = dns_resolver_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +static int __init init_dns_resolver(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + printk(KERN_NOTICE "Registering the %s key type\n", + key_type_dns_resolver.name); + + /* create an override credential set with a special thread keyring in + * which DNS requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_dns_resolver); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + dns_resolver_cache = cred; + + kdebug("DNS resolver keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +static void __exit exit_dns_resolver(void) +{ + key_revoke(dns_resolver_cache->thread_keyring); + unregister_key_type(&key_type_dns_resolver); + put_cred(dns_resolver_cache); + printk(KERN_NOTICE "Unregistered %s key type\n", + key_type_dns_resolver.name); +} + +module_init(init_dns_resolver) +module_exit(exit_dns_resolver) +MODULE_LICENSE("GPL"); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c new file mode 100644 index 000000000000..6c0cf31ea00d --- /dev/null +++ b/net/dns_resolver/dns_query.c @@ -0,0 +1,159 @@ +/* Upcall routine, designed to work as a key type and working through + * /sbin/request-key to contact userspace when handling DNS queries. + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * The upcall wrapper used to make an arbitrary DNS query. 
+ * + * This function requires the appropriate userspace tool dns.upcall to be + * installed and something like the following lines should be added to the + * /etc/request-key.conf file: + * + * create dns_resolver * * /sbin/dns.upcall %k + * + * For example, to use this module to query AFSDB RR: + * + * create dns_resolver afsdb:* * /sbin/dns.afsdb %k + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * dns_query - Query the DNS + * @type: Query type (or NULL for straight host->IP lookup) + * @name: Name to look up + * @namelen: Length of name + * @options: Request options (or NULL if no options) + * @_result: Where to place the returned data. + * @_expiry: Where to store the result expiry time (or NULL) + * + * The data will be returned in the pointer at *result, and the caller is + * responsible for freeing it. + * + * The description should be of the form "[<query_type>:]<domain_name>", and + * the options need to be appropriate for the query type requested. If no + * query_type is given, then the query is a straight hostname to IP address + * lookup. + * + * The DNS resolution lookup is performed by upcalling to userspace by way of + * requesting a key of type dns_resolver. + * + * Returns the size of the result on success, -ve error code otherwise.
+ */ +int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry) +{ + struct key *rkey; + struct user_key_payload *upayload; + const struct cred *saved_cred; + size_t typelen, desclen; + char *desc, *cp; + int ret, len; + + kenter("%s,%*.*s,%zu,%s", + type, (int)namelen, (int)namelen, name, namelen, options); + + if (!name || namelen == 0 || !_result) + return -EINVAL; + + /* construct the query key description as "[<query_type>:]<domain_name>" */ + typelen = 0; + desclen = 0; + if (type) { + typelen = strlen(type); + if (typelen < 1) + return -EINVAL; + desclen += typelen + 1; + } + + if (!namelen) + namelen = strlen(name); + if (namelen < 3) + return -EINVAL; + desclen += namelen + 1; + + desc = kmalloc(desclen, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + cp = desc; + if (type) { + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + } + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + + if (!options) + options = ""; + kdebug("call request_key(,%s,%s)", desc, options); + + /* make the upcall, using special credentials to prevent the use of + * add_key() to preinstall malicious redirections + */ + saved_cred = override_creds(dns_resolver_cache); + rkey = request_key(&key_type_dns_resolver, desc, options); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + down_read(&rkey->sem); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto put; + + upayload = rcu_dereference_protected(rkey->payload.data, + lockdep_is_held(&rkey->sem)); + len = upayload->datalen; + + ret = -ENOMEM; + *_result = kmalloc(len + 1, GFP_KERNEL); + if (!*_result) + goto put; + + memcpy(*_result, upayload->data, len + 1); + if (_expiry) + *_expiry = rkey->expiry; + + ret = len; +put: + up_read(&rkey->sem); + key_put(rkey); +out: + kleave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(dns_query); diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h new file mode 100644 index 000000000000..189ca9e9b785 --- /dev/null +++ b/net/dns_resolver/internal.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 Wang Lei + * Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved. + * + * Internal DNS Resolver stuff + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/sched.h> + +/* + * dns_key.c + */ +extern const struct cred *dns_resolver_cache; + +/* + * debug tracing + */ +extern unsigned dns_resolver_debug; + +#define kdebug(FMT, ...) \ do { \ if (unlikely(dns_resolver_debug)) \ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", \ current->comm, ##__VA_ARGS__); \ } while (0) + +#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...)
kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__) -- cgit v1.2.3 From cc7447a5fa92759b0856d6a83ba2539c6a94e67e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 16 Jun 2010 11:44:18 +0200 Subject: Driver core: Drop __must_check from bus_for_each_drv() There is little rationale for marking bus_for_each_drv() __must_check. It is more of an iteration helper than a real function. You don't know in advance which callback it will be used on, so you have no clue how important it can be to check the returned value. In practice, this helper function can be used for best-effort tasks. As a matter of fact, bus_for_each_dev() is not marked __must_check. So remove it from bus_for_each_drv() as well. This is the same change that was made back in October 2006 by Russell King for device_for_each_child(), for exactly the same reasons. Signed-off-by: Jean Delvare Cc: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 6a8276f683b6..ddffdf7da393 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -84,9 +84,8 @@ struct device *bus_find_device_by_name(struct bus_type *bus, struct device *start, const char *name); -int __must_check bus_for_each_drv(struct bus_type *bus, - struct device_driver *start, void *data, - int (*fn)(struct device_driver *, void *)); +int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, + void *data, int (*fn)(struct device_driver *, void *)); void bus_sort_breadthfirst(struct bus_type *bus, int (*compare)(const struct device *a, -- cgit v1.2.3 From 44f28bdea09415d40b4d73a7668db5961362ec53 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 21 Jun 2010 16:11:44 +0200 Subject: Driver core: reduce duplicated code for platform_device creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the two similar functions platform_device_register_simple and platform_device_register_data one-line inline functions using a new generic function platform_device_register_resndata. Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- Documentation/DocBook/device-drivers.tmpl | 1 + drivers/base/platform.c | 104 ++++++++---------------------- include/linux/platform_device.h | 62 ++++++++++++++++-- 3 files changed, 85 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 1b2dd4fc3db2..ecd35e9d4410 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -111,6 +111,7 @@ X!Edrivers/base/attribute_container.c +!Iinclude/linux/platform_device.h !Edrivers/base/platform.c !Edrivers/base/bus.c diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 26eb69d88eb6..ffcfd73fe8a6 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -344,108 +344,56 @@ void platform_device_unregister(struct platform_device *pdev) EXPORT_SYMBOL_GPL(platform_device_unregister); /** - * platform_device_register_simple - add a platform-level device and its resources - * @name: base name of the device we're adding - * @id: instance id - * @res: set of resources that needs to be allocated for the device - * @num: number of resources - * - * This function creates a simple platform device that requires minimal - * resource and memory management.
Canned release function freeing memory - * allocated for the device allows drivers using such devices to be - * unloaded without waiting for the last reference to the device to be - * dropped. + * platform_device_register_resndata - add a platform-level device with + * resources and platform-specific data * - * This interface is primarily intended for use with legacy drivers which - * probe hardware directly. Because such drivers create sysfs device nodes - * themselves, rather than letting system infrastructure handle such device - * enumeration tasks, they don't fully conform to the Linux driver model. - * In particular, when such drivers are built as modules, they can't be - * "hotplugged". - * - * Returns &struct platform_device pointer on success, or ERR_PTR() on error. - */ -struct platform_device *platform_device_register_simple(const char *name, - int id, - const struct resource *res, - unsigned int num) -{ - struct platform_device *pdev; - int retval; - - pdev = platform_device_alloc(name, id); - if (!pdev) { - retval = -ENOMEM; - goto error; - } - - if (num) { - retval = platform_device_add_resources(pdev, res, num); - if (retval) - goto error; - } - - retval = platform_device_add(pdev); - if (retval) - goto error; - - return pdev; - -error: - platform_device_put(pdev); - return ERR_PTR(retval); -} -EXPORT_SYMBOL_GPL(platform_device_register_simple); - -/** - * platform_device_register_data - add a platform-level device with platform-specific data * @parent: parent device for the device we're adding * @name: base name of the device we're adding * @id: instance id + * @res: set of resources that needs to be allocated for the device + * @num: number of resources * @data: platform specific data for this platform device * @size: size of platform specific data * - * This function creates a simple platform device that requires minimal - * resource and memory management. Canned release function freeing memory - * allocated for the device allows drivers using such devices to be - * unloaded without waiting for the last reference to the device to be - * dropped. - * * Returns &struct platform_device pointer on success, or ERR_PTR() on error. 
*/ -struct platform_device *platform_device_register_data( +struct platform_device *platform_device_register_resndata( struct device *parent, const char *name, int id, + const struct resource *res, unsigned int num, const void *data, size_t size) { + int ret = -ENOMEM; struct platform_device *pdev; - int retval; pdev = platform_device_alloc(name, id); - if (!pdev) { - retval = -ENOMEM; - goto error; - } + if (!pdev) + goto err; pdev->dev.parent = parent; - if (size) { - retval = platform_device_add_data(pdev, data, size); - if (retval) - goto error; + if (res) { + ret = platform_device_add_resources(pdev, res, num); + if (ret) + goto err; } - retval = platform_device_add(pdev); - if (retval) - goto error; + if (data) { + ret = platform_device_add_data(pdev, data, size); + if (ret) + goto err; + } - return pdev; + ret = platform_device_add(pdev); + if (ret) { +err: + platform_device_put(pdev); + return ERR_PTR(ret); + } -error: - platform_device_put(pdev); - return ERR_PTR(retval); + return pdev; } -EXPORT_SYMBOL_GPL(platform_device_register_data); +EXPORT_SYMBOL_GPL(platform_device_register_resndata); static int platform_drv_probe(struct device *_dev) { diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 5417944d3687..d7ecad0093bb 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -43,10 +43,64 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, u extern int platform_get_irq_byname(struct platform_device *, const char *); extern int platform_add_devices(struct platform_device **, int); -extern struct platform_device *platform_device_register_simple(const char *, int id, - const struct resource *, unsigned int); -extern struct platform_device *platform_device_register_data(struct device *, - const char *, int, const void *, size_t); +extern struct platform_device *platform_device_register_resndata( + struct device *parent, const char *name, int id, + const struct resource *res, unsigned int num, + const void *data, size_t size); + +/** + * platform_device_register_simple - add a platform-level device and its resources + * @name: base name of the device we're adding + * @id: instance id + * @res: set of resources that needs to be allocated for the device + * @num: number of resources + * + * This function creates a simple platform device that requires minimal + * resource and memory management. Canned release function freeing memory + * allocated for the device allows drivers using such devices to be + * unloaded without waiting for the last reference to the device to be + * dropped. + * + * This interface is primarily intended for use with legacy drivers which + * probe hardware directly. Because such drivers create sysfs device nodes + * themselves, rather than letting system infrastructure handle such device + * enumeration tasks, they don't fully conform to the Linux driver model. + * In particular, when such drivers are built as modules, they can't be + * "hotplugged". + * + * Returns &struct platform_device pointer on success, or ERR_PTR() on error. 
+ */ +static inline struct platform_device *platform_device_register_simple( + const char *name, int id, + const struct resource *res, unsigned int num) +{ + return platform_device_register_resndata(NULL, name, id, + res, num, NULL, 0); +} + +/** + * platform_device_register_data - add a platform-level device with platform-specific data + * @parent: parent device for the device we're adding + * @name: base name of the device we're adding + * @id: instance id + * @data: platform specific data for this platform device + * @size: size of platform specific data + * + * This function creates a simple platform device that requires minimal + * resource and memory management. Canned release function freeing memory + * allocated for the device allows drivers using such devices to be + * unloaded without waiting for the last reference to the device to be + * dropped. + * + * Returns &struct platform_device pointer on success, or ERR_PTR() on error. + */ +static inline struct platform_device *platform_device_register_data( + struct device *parent, const char *name, int id, + const void *data, size_t size) +{ + return platform_device_register_resndata(parent, name, id, + NULL, 0, data, size); +} extern struct platform_device *platform_device_alloc(const char *name, int id); extern int platform_device_add_resources(struct platform_device *pdev, -- cgit v1.2.3 From 49c19400f60bbe362202d7e7b3e68cc66040d0fa Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 Jul 2010 16:54:05 +0200 Subject: sysfs: sysfs_chmod_file's attr can be const sysfs_chmod_file doesn't change the attribute it operates on, so this attribute can be marked const. Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 3 ++- include/linux/sysfs.h | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1beaa739d0a6..1b27b5688f62 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); * @mode: file permissions. * */ -int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + mode_t mode) { struct sysfs_dirent *sd; struct iattr newattrs; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f2694eb4dd3d..8bf06b64487c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -136,8 +136,8 @@ int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute **attr); -int __must_check sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, - mode_t mode); +int __must_check sysfs_chmod_file(struct kobject *kobj, + const struct attribute *attr, mode_t mode); void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr); @@ -225,7 +225,7 @@ static inline int sysfs_create_files(struct kobject *kobj, } static inline int sysfs_chmod_file(struct kobject *kobj, - struct attribute *attr, mode_t mode) + const struct attribute *attr, mode_t mode) { return 0; } -- cgit v1.2.3 From 45daef0fdcc44f6af86fdebc4fc7eb7c79375398 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 23 Jul 2010 19:56:18 +0900 Subject: Driver core: Add BUS_NOTIFY_BIND_DRIVER Add BUS_NOTIFY_BIND_DRIVER as a bus notifier event. 
With this in place, we have the following bus notifier events for driver binding/unbinding: - BUS_NOTIFY_BIND_DRIVER - before ->probe() - BUS_NOTIFY_BOUND_DRIVER - after ->probe() - BUS_NOTIFY_UNBIND_DRIVER - before ->remove() - BUS_NOTIFY_UNBOUND_DRIVER - after ->remove() The event BUS_NOTIFY_BIND_DRIVER allows bus code to be notified that ->probe() is about to be called. Useful for bus code that needs to set up hardware before the driver gets to run. With this in place platform drivers can be loaded and unloaded as modules, and the new BIND event allows bus code to control, for instance, device clocks that must be enabled before the driver can be executed. Without this patch there is no way for the bus code to get notified that a modular driver is about to be probed. Signed-off-by: Magnus Damm Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 4 ++++ include/linux/device.h | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 503c2620bbcc..da57ee9d63fe 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -51,6 +51,10 @@ static int driver_sysfs_add(struct device *dev) { int ret; + if (dev->bus) + blocking_notifier_call_chain(&dev->bus->p->bus_notifier, + BUS_NOTIFY_BIND_DRIVER, dev); + ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret == 0) { diff --git a/include/linux/device.h b/include/linux/device.h index ddffdf7da393..0ca24e93304f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -109,10 +109,12 @@ extern int bus_unregister_notifier(struct bus_type *bus, */ #define BUS_NOTIFY_ADD_DEVICE 0x00000001 /* device added */ #define BUS_NOTIFY_DEL_DEVICE 0x00000002 /* device removed */ -#define BUS_NOTIFY_BOUND_DRIVER 0x00000003 /* driver bound to device */ -#define BUS_NOTIFY_UNBIND_DRIVER 0x00000004 /* driver about to be +#define BUS_NOTIFY_BIND_DRIVER 0x00000003 /* driver about to be + bound */ +#define BUS_NOTIFY_BOUND_DRIVER 0x00000004 /* driver bound to device */ +#define BUS_NOTIFY_UNBIND_DRIVER 0x00000005 /* driver about to be unbound */ -#define BUS_NOTIFY_UNBOUND_DRIVER 0x00000005 /* driver is unbound +#define BUS_NOTIFY_UNBOUND_DRIVER 0x00000006 /* driver is unbound from the device */ extern struct kset *bus_get_kset(struct bus_type *bus); -- cgit v1.2.3 From 6fd69dc578fa0b1bbc3aad70ae3af9a137211707 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 28 Jul 2010 22:09:26 -0700 Subject: sysfs: Remove owner field from sysfs struct attribute Signed-off-by: Guenter Roeck Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 8bf06b64487c..3c92121ba9af 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -22,14 +22,8 @@ struct kobject; struct module; enum kobj_ns_type; -/* FIXME - * The *owner field is no longer used. - * x86 tree has been cleaned up. The owner - * attribute is still left for other arches. - */ struct attribute { const char *name; - struct module *owner; mode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key *key; -- cgit v1.2.3 From 6937e8f8c0135f2325194c372ada6dc655499992 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 17:38:18 +0200 Subject: driver core: device_rename's new_name can be const The new_name argument to device_rename() can be const as kobject_rename's new_name argument is.
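A minimal sketch of what the const qualifier buys callers (hypothetical code; only device_rename() itself is from this patch): a string literal can now be passed without a cast and without a writable scratch copy of the name.

	#include <linux/device.h>

	/* Hypothetical caller: with the const-qualified prototype this
	 * compiles cleanly; previously it would warn about discarding
	 * the const qualifier. */
	static int example_rename(struct device *dev)
	{
		return device_rename(dev, "renamed-device");
	}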
Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 2 +- include/linux/device.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index f8e72724dd4b..d1b2c9adc271 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1599,7 +1599,7 @@ EXPORT_SYMBOL_GPL(device_destroy); * on the same device to ensure that new_name is valid and * won't conflict with other devices. */ -int device_rename(struct device *dev, char *new_name) +int device_rename(struct device *dev, const char *new_name) { char *old_class_name = NULL; char *new_class_name = NULL; diff --git a/include/linux/device.h b/include/linux/device.h index 0ca24e93304f..516fecacf27b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -552,7 +552,7 @@ extern int device_for_each_child(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); extern struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); -extern int device_rename(struct device *dev, char *new_name); +extern int device_rename(struct device *dev, const char *new_name); extern int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); extern const char *device_get_devnode(struct device *dev, -- cgit v1.2.3 From 31d1d48e199e99077fb30f6fb9a793be7bec756f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Aug 2010 16:34:43 +0100 Subject: Fix init ordering of /dev/console vs callers of modprobe Make /dev/console get initialised before any initialisation routine that invokes modprobe because if modprobe fails, it's going to want to open /dev/console, presumably to write an error message to. The problem with that is that if the /dev/console driver is not yet initialised, the chardev handler will call request_module() to invoke modprobe, which will fail, because we never compile /dev/console as a module. This will lead to a modprobe loop, showing the following in the kernel log: request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 This can happen, for example, when the built in md5 module can't find the built in cryptomgr module (because the latter fails to initialise). The md5 module comes before the call to tty_init(), presumably because 'crypto' comes before 'drivers' alphabetically. Fix this by calling tty_init() from chrdev_init(). Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 2 +- drivers/char/tty_io.c | 4 ++-- fs/char_dev.c | 1 + include/linux/tty.h | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index f54dab8acdcd..a398ecdbd758 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -916,7 +916,7 @@ static int __init chr_dev_init(void) NULL, devlist[minor].name); } - return 0; + return tty_init(); } fs_initcall(chr_dev_init); diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index d71f0fc34b46..507441ac6edb 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -3128,7 +3128,7 @@ static struct cdev tty_cdev, console_cdev; * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. 
*/ -static int __init tty_init(void) +int __init tty_init(void) { cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || @@ -3149,4 +3149,4 @@ static int __init tty_init(void) #endif return 0; } -module_init(tty_init); + diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933df2b2..f80a4f25123c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/tty.h> #include "internal.h" diff --git a/include/linux/tty.h b/include/linux/tty.h index 931078b73226..7802a243ee13 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -552,6 +552,9 @@ static inline void tty_audit_push_task(struct task_struct *tsk, } #endif +/* tty_io.c */ +extern int __init tty_init(void); + /* tty_ioctl.c */ extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From d5eff1a3412f6d75bf28f423c5015ece8055407a Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 3 Aug 2010 13:04:00 -0400 Subject: NFS: Fix /proc/mount for legacy binary interface Add a flag so we know if we mounted the NFS server using the legacy binary interface. If we used the legacy interface, then we should not show the mountd options. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 4 ++++ include/linux/nfs_mount.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9df16de4a56..f1ae39f6cb02 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, { struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + switch (sap->sa_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sap; @@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options, * can deal with. */ args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= NFS_MOUNT_LEGACY_INTERFACE; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index 4499016e6d0d..5d59ae861aa6 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -69,5 +69,6 @@ struct nfs_mount_data { #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 #define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 #define NFS_MOUNT_NORESVPORT 0x40000 +#define NFS_MOUNT_LEGACY_INTERFACE 0x80000 #endif -- cgit v1.2.3 From 9261ec1a8d7b17e2540bef7cad3470870d13b61e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 6 Aug 2010 15:36:47 -0500 Subject: console: Fix compilation regression A regression of building without CONFIG_HW_CONSOLE was introduced with commit b45cfba4e9005d64d419718e7ff7f7cab44c1994 (vt,console,kdb: implement atomic console enter/leave functions). ERROR: "con_debug_enter" [drivers/serial/kgdboc.ko] undefined! ERROR: "vc_cons" [drivers/serial/kgdboc.ko] undefined! ERROR: "fg_console" [drivers/serial/kgdboc.ko] undefined! ERROR: "con_debug_leave" [drivers/serial/kgdboc.ko] undefined! When there is no HW console the con_debug_enter and con_debug_leave functions should have no code.
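Note that the fix below makes the !CONFIG_HW_CONSOLE stubs function-like macros rather than inline functions, which is what clears all four link errors at once: the macro body never uses its parameter, so the argument expression is dropped at preprocessing time. A kgdboc-style call site (a simplified, hypothetical rendering, not the exact driver code) then no longer references vc_cons or fg_console either:

	#include <linux/console.h>
	#include <linux/console_struct.h>
	#include <linux/vt_kern.h>

	/* Hypothetical caller resembling kgdboc's debugger entry path. */
	static int example_debugger_enter(void)
	{
		/* Without CONFIG_HW_CONSOLE this expands to (0), so neither
		 * con_debug_enter nor vc_cons/fg_console need to exist in
		 * the final link. */
		return con_debug_enter(vc_cons[fg_console].d);
	}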
Signed-off-by: Jason Wessel CC: Jesse Barnes Reported-by: Randy Dunlap --- include/linux/console.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f76fc297322d..95cf6f08a59d 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -79,8 +79,13 @@ int register_con_driver(const struct consw *csw, int first, int last); int unregister_con_driver(const struct consw *csw); int take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); +#ifdef CONFIG_HW_CONSOLE int con_debug_enter(struct vc_data *vc); int con_debug_leave(void); +#else +#define con_debug_enter(vc) (0) +#define con_debug_leave() (0) +#endif /* scroll */ #define SM_UP (1) -- cgit v1.2.3 From 597781f3e51f48ef8e67be772196d9e9673752c4 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Mon, 9 Aug 2010 17:18:32 -0700 Subject: kmap_atomic: make kunmap_atomic() harder to misuse kunmap_atomic() is currently at level -4 on Rusty's "Hard To Misuse" list[1] ("Follow common convention and you'll get it wrong"), except in some architectures when CONFIG_DEBUG_HIGHMEM is set[2][3]. kunmap() takes a pointer to a struct page; kunmap_atomic(), however, takes a pointer to within the page itself. This seems to once in a while trip people up (the convention they are following is the one from kunmap()). Make it much harder to misuse, by moving it to level 9 on Rusty's list[4] ("The compiler/linker won't let you get it wrong"). This is done by refusing to build if the type of its first argument is a pointer to a struct page. The real kunmap_atomic() is renamed to kunmap_atomic_notypecheck() (which is what you would call in case, for some strange reason, calling it with a pointer to a struct page is not incorrect in your code). The previous version of this patch was compile tested on x86-64. [1] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html [2] In these cases, it is at level 5, "Do it right or it will always break at runtime." [3] At least mips and powerpc look very similar, and sparc also seems to share a common ancestor with both; there seems to be quite some degree of copy-and-paste coding here. The include/asm/highmem.h file for these three archs mentions x86 CPUs at its top. [4] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html [5] As an aside, could someone tell me why mn10300 uses unsigned long as the first parameter of kunmap_atomic() instead of void *? Signed-off-by: Cesar Eduardo Barros Cc: Russell King (arch/arm) Cc: Ralf Baechle (arch/mips) Cc: David Howells (arch/frv, arch/mn10300) Cc: Koichi Yasutake (arch/mn10300) Cc: Kyle McMartin (arch/parisc) Cc: Helge Deller (arch/parisc) Cc: "James E.J. Bottomley" (arch/parisc) Cc: Benjamin Herrenschmidt (arch/powerpc) Cc: Paul Mackerras (arch/powerpc) Cc: "David S. Miller" (arch/sparc) Cc: Thomas Gleixner (arch/x86) Cc: Ingo Molnar (arch/x86) Cc: "H.
Peter Anvin" (arch/x86) Cc: Arnd Bergmann (include/asm-generic) Cc: Rusty Russell ("Hard To Misuse" list) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/highmem.h | 2 +- arch/arm/mm/highmem.c | 4 ++-- arch/frv/include/asm/highmem.h | 2 +- arch/mips/include/asm/highmem.h | 4 ++-- arch/mips/mm/highmem.c | 4 ++-- arch/mn10300/include/asm/highmem.h | 2 +- arch/parisc/include/asm/cacheflush.h | 2 +- arch/powerpc/include/asm/highmem.h | 2 +- arch/powerpc/mm/highmem.c | 4 ++-- arch/sparc/include/asm/highmem.h | 2 +- arch/sparc/mm/highmem.c | 4 ++-- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- include/linux/highmem.h | 10 +++++++++- 14 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index feb988a7ec37..5aff58126602 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -36,7 +36,7 @@ extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte); extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic(void *kvaddr, enum km_type type); +extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); extern struct page *kmap_atomic_to_page(const void *ptr); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 6ab244062b4a..1fbdb55bfd1b 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -82,7 +82,7 @@ void *kmap_atomic(struct page *page, enum km_type type) } EXPORT_SYMBOL(kmap_atomic); -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; unsigned int idx = type + KM_TYPE_NR * smp_processor_id(); @@ -103,7 +103,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) } pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) { diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index 68e4677fb9e7..cb4c317eaecc 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -152,7 +152,7 @@ do { \ asm volatile("tlbpr %0,gr0,#4,#1" : : "r"(vaddr) : "memory"); \ } while(0) -static inline void kunmap_atomic(void *kvaddr, enum km_type type) +static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { switch (type) { case 0: __kunmap_atomic_primary(0, 2); break; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 25adfb02923d..75753ca73bfd 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,14 +48,14 @@ extern void kunmap_high(struct page *page); extern void *__kmap(struct page *page); extern void __kunmap(struct page *page); extern void *__kmap_atomic(struct page *page, enum km_type type); -extern void __kunmap_atomic(void *kvaddr, enum km_type type); +extern void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); extern struct page *__kmap_atomic_to_page(void *ptr); #define kmap __kmap #define kunmap __kunmap #define kmap_atomic __kmap_atomic -#define kunmap_atomic __kunmap_atomic +#define kunmap_atomic_notypecheck __kunmap_atomic_notypecheck #define 
kmap_atomic_to_page __kmap_atomic_to_page #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 127d732474bf..6a2b1bf9ef11 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page, enum km_type type) } EXPORT_SYMBOL(__kmap_atomic); -void __kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -87,7 +87,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type) pagefault_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic_notypecheck); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index 90f2abb04bfd..b0b187a29b88 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -91,7 +91,7 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type) return vaddr; } -static inline void kunmap_atomic(unsigned long vaddr, enum km_type type) +static inline void kunmap_atomic_notypecheck(unsigned long vaddr, enum km_type type) { #if HIGHMEM_DEBUG enum fixed_addresses idx = type + KM_TYPE_NR * smp_processor_id(); diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 4556d820128a..dba11aedce1b 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -132,7 +132,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) return page_address(page); } -static inline void kunmap_atomic(void *addr, enum km_type idx) +static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) { kunmap_parisc(addr); pagefault_enable(); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index a74c4ee6c020..d10d64a4be38 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -62,7 +62,7 @@ extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -extern void kunmap_atomic(void *kvaddr, enum km_type type); +extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); static inline void *kmap(struct page *page) { diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index c2186c74c85a..857d4173f9c6 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -52,7 +52,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -74,4 +74,4 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 3de42e776274..ec23b0a87b98 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -71,7 +71,7 @@ static inline void kunmap(struct page *page) } extern void *kmap_atomic(struct page *page, enum km_type type); -extern void kunmap_atomic(void *kvaddr, enum km_type type); +extern void 
kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); extern struct page *kmap_atomic_to_page(void *vaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 7916feba6e4a..e139e9cbf5f7 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -65,7 +65,7 @@ void *kmap_atomic(struct page *page, enum km_type type) } EXPORT_SYMBOL(kmap_atomic); -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -100,7 +100,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) pagefault_enable(); } -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); /* We may be fed a pagetable here by ptep_to_xxx and others. */ struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a726650fc80f..8caac76ac324 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba66cbe0..5e8fa12ef861 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type) return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index caafd0561aa1..67460f010224 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -2,6 +2,7 @@ #define _LINUX_HIGHMEM_H #include +#include #include #include @@ -72,7 +73,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) } #define kmap_atomic_prot(page, idx, prot) kmap_atomic(page, idx) -#define kunmap_atomic(addr, idx) do { pagefault_enable(); } while (0) +#define kunmap_atomic_notypecheck(addr, idx) do { pagefault_enable(); } while (0) #define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) @@ -81,6 +82,13 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) #endif /* CONFIG_HIGHMEM */ +/* Prevent people trying to call kunmap_atomic() as if it were kunmap() */ +/* kunmap_atomic() should get the return value of kmap_atomic, not the page. 
*/ +#define kunmap_atomic(addr, idx) do { \ + BUILD_BUG_ON(__same_type((addr), struct page *)); \ + kunmap_atomic_notypecheck((addr), (idx)); \ + } while (0) + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) -- cgit v1.2.3 From bb4a340e075b7897ece109686bfa177f8518d2db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:37 -0700 Subject: mm: rename anon_vma_lock to vma_lock_anon_vma Rename anon_vma_lock to vma_lock_anon_vma. This matches the naming style used in page_lock_anon_vma and will come in really handy further down in this patch series. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 ++-- mm/mmap.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77216742c178..80cd162a8aa6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -99,14 +99,14 @@ static inline struct anon_vma *page_anon_vma(struct page *page) return page_rmapping(page); } -static inline void anon_vma_lock(struct vm_area_struct *vma) +static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) spin_lock(&anon_vma->lock); } -static inline void anon_vma_unlock(struct vm_area_struct *vma) +static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) diff --git a/mm/mmap.c b/mm/mmap.c index e38e910cb756..e26f1ea7c904 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -452,12 +452,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mapping->i_mmap_lock); vma->vm_truncate_count = mapping->truncate_count; } - anon_vma_lock(vma); + vma_lock_anon_vma(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -1710,7 +1710,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1721,7 +1721,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (address < PAGE_ALIGN(address+4)) address = PAGE_ALIGN(address+4); else { - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return -ENOMEM; } error = 0; @@ -1739,7 +1739,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) perf_event_mmap(vma); } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1764,7 +1764,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (error) return error; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1786,7 +1786,7 @@ static int expand_downwards(struct vm_area_struct *vma, perf_event_mmap(vma); } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } -- cgit v1.2.3 From cba48b98f2348c814316c4b4f411a07a0e4a2bf9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:38 -0700 Subject: mm: 
change direct call of spin_lock(anon_vma->lock) to inline function Substitute a direct call of spin_lock(anon_vma->lock) with an inline function doing exactly the same. This makes it easier to do the substitution to the root anon_vma lock in a following patch. We will deal with the handful of special locks (nested, dec_and_lock, etc) separately. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 10 ++++++++++ mm/ksm.c | 18 +++++++++--------- mm/migrate.c | 2 +- mm/mmap.c | 2 +- mm/rmap.c | 20 ++++++++++---------- 5 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 80cd162a8aa6..5f981be61416 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -113,6 +113,16 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) spin_unlock(&anon_vma->lock); } +static inline void anon_vma_lock(struct anon_vma *anon_vma) +{ + spin_lock(&anon_vma->lock); +} + +static inline void anon_vma_unlock(struct anon_vma *anon_vma) +{ + spin_unlock(&anon_vma->lock); +} + /* * anon_vma helper functions. */ diff --git a/mm/ksm.c b/mm/ksm.c index 6c3e99b4ae7c..eb9f6806ed51 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -327,7 +327,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item) if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); } @@ -1566,7 +1566,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1589,7 +1589,7 @@ again: if (!search_new_forks || !mapcount) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (!mapcount) goto out; } @@ -1619,7 +1619,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1637,11 +1637,11 @@ again: ret = try_to_unmap_one(page, vma, rmap_item->address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; @@ -1671,7 +1671,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || @@ -1688,11 +1688,11 @@ again: ret = rmap_one(page, vma, rmap_item->address, arg); if (ret != SWAP_AGAIN) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); goto out; } } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } if (!search_new_forks++) goto again; diff --git a/mm/migrate.c b/mm/migrate.c index 4205b1d6049e..1855f869917d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { int empty =
list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); } diff --git a/mm/mmap.c b/mm/mmap.c index e26f1ea7c904..f5db18decc2e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2593,7 +2593,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->head.next)) BUG(); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } } diff --git a/mm/rmap.c b/mm/rmap.c index 38a336e2eea1..b65f00d1707f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -134,7 +134,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -147,7 +147,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc = NULL; } spin_unlock(&mm->page_table_lock); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (unlikely(allocated)) anon_vma_free(allocated); @@ -170,9 +170,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_add_tail(&avc->same_anon_vma, &anon_vma->head); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } /* @@ -246,12 +246,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) if (!anon_vma) return; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (empty) anon_vma_free(anon_vma); @@ -302,7 +302,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); return anon_vma; out: rcu_read_unlock(); @@ -311,7 +311,7 @@ out: void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); rcu_read_unlock(); } @@ -1389,7 +1389,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1399,7 +1399,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); return ret; } -- cgit v1.2.3 From 5c341ee1dfc8fe69d66b1c8b19e463c6d7201ae1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:39 -0700 Subject: mm: track the root (oldest) anon_vma Track the root (oldest) anon_vma in each anon_vma tree. Because we only take the lock on the root anon_vma, we cannot use the lock on higher-up anon_vmas to lock anything. This makes it impossible to do an indirect lookup of the root anon_vma, since the data structures could go away from under us. However, a direct pointer is safe because the root anon_vma is always the last one that gets freed on munmap or exit, by virtue of the same_vma list order and unlink_anon_vmas walking the list forward. 
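Stated as code, the invariant this patch establishes is roughly the following (an illustrative assertion, not part of the patch): every anon_vma in a tree points at the same root, and the root anon_vma is its own root.

	#include <linux/bug.h>
	#include <linux/rmap.h>

	/* Hypothetical sanity check of the root-tracking invariant. */
	static void example_check_root(struct anon_vma *anon_vma)
	{
		BUG_ON(anon_vma->root != anon_vma->root->root);
	}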
[akpm@linux-foundation.org: fix typo] Signed-off-by: Rik van Riel Acked-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + mm/rmap.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5f981be61416..41fa6ddc6214 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -26,6 +26,7 @@ */ struct anon_vma { spinlock_t lock; /* Serialize access to vma list */ + struct anon_vma *root; /* Root of this anon_vma tree */ #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) /* diff --git a/mm/rmap.c b/mm/rmap.c index b65f00d1707f..caa48b27371b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -132,6 +132,11 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; + /* + * This VMA had no anon_vma yet. This anon_vma is + * the root of any anon_vma tree that might form. + */ + anon_vma->root = anon_vma; } anon_vma_lock(anon_vma); @@ -224,9 +229,15 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) avc = anon_vma_chain_alloc(); if (!avc) goto out_error_free_anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + + /* + * The root anon_vma's spinlock is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); return 0; @@ -261,7 +272,10 @@ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - /* Unlink each anon_vma chained to the VMA. */ + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { anon_vma_unlink(avc); list_del(&avc->same_vma); -- cgit v1.2.3 From 012f18004da33ba672e3c60838cc4898126174d3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:40 -0700 Subject: mm: always lock the root (oldest) anon_vma Always (and only) lock the root (oldest) anon_vma whenever we do something in an anon_vma. The recently introduced anon_vma scalability is due to the rmap code scanning only the VMAs that need to be scanned. Many common operations still took the anon_vma lock on the root anon_vma, so always taking that lock is not expected to introduce any scalability issues. However, always taking the same lock does mean we only need to take one lock, which means rmap_walk on pages from any anon_vma in the vma is excluded from occurring during an munmap, expand_stack or other operation that needs to exclude rmap_walk and similar functions. Also add the proper locking to vma_adjust. 
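Since every anon_vma in a tree shares its root, taking the lock through any member acquires one and the same spinlock. A minimal sketch of the resulting exclusion (assuming the anon_vma_lock()/anon_vma_unlock() helpers introduced earlier in this series; example_update is a hypothetical caller):

static void example_update(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	anon_vma_lock(anon_vma);	/* really spin_lock(&anon_vma->root->lock) */
	/* rmap_walk and friends on any anon_vma of this tree are excluded here */
	anon_vma_unlock(anon_vma);
}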
Signed-off-by: Rik van Riel Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 8 ++++---- mm/ksm.c | 2 +- mm/migrate.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++-------- 4 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 41fa6ddc6214..af43cb9a0506 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -104,24 +104,24 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_lock(&anon_vma->lock); + spin_lock(&anon_vma->root->lock); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_unlock(&anon_vma->lock); + spin_unlock(&anon_vma->root->lock); } static inline void anon_vma_lock(struct anon_vma *anon_vma) { - spin_lock(&anon_vma->lock); + spin_lock(&anon_vma->root->lock); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); + spin_unlock(&anon_vma->root->lock); } /* diff --git a/mm/ksm.c b/mm/ksm.c index eb9f6806ed51..da6037c261f1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -325,7 +325,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { int empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); if (empty) diff --git a/mm/migrate.c b/mm/migrate.c index 1855f869917d..5208fa1d9712 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -682,7 +682,7 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { int empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); if (empty) diff --git a/mm/mmap.c b/mm/mmap.c index f5db18decc2e..fb89360a2120 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -506,6 +506,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct prio_tree_root *root = NULL; + struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; int remove_next = 0; @@ -578,6 +579,17 @@ again: remove_next = 1 + (end > next->vm_end); } } + /* + * When changing only vma->vm_end, we don't really need anon_vma + * lock. This is a fairly rare case by itself, but the anon_vma + * lock may be shared between many sibling processes. Skipping + * the lock for brk adjustments makes a difference sometimes. 
+ */ + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } + if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -617,6 +629,8 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -2470,23 +2484,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->lock. If some other vma in this mm shares + * anon_vma->root->lock. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->lock. + * anon_vma->root->lock. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); } } @@ -2577,7 +2591,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -2588,10 +2602,10 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->lock. + * anon_vma->root->lock. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); anon_vma_unlock(anon_vma); } -- cgit v1.2.3 From 76545066c8521f3e32c849744744842b4df25b79 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:18:41 -0700 Subject: mm: extend KSM refcounts to the anon_vma root KSM reference counts can cause an anon_vma to exist after the process it belongs to has already exited. Because the anon_vma lock now lives in the root anon_vma, we need to ensure that the root anon_vma stays around until after all the "child" anon_vmas have been freed. The obvious way to do this is to have a "child" anon_vma take a reference to the root in anon_vma_fork. When the anon_vma is freed at munmap or process exit, we drop the refcount in anon_vma_unlink and possibly free the root anon_vma. The KSM anon_vma reference count function also needs to be modified to deal with the possibility of freeing 2 levels of anon_vma. The easiest way to do this is to break out the KSM magic and make it generic. When compiling without CONFIG_KSM, this code is compiled out.
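In outline, the lifetime pairing works as in the sketch below (child and parent are hypothetical pointers; the authoritative code is in the anon_vma_fork and anon_vma_unlink hunks that follow):

/* At fork: the child shares the parent's root and pins it. */
child->root = parent->root;
get_anon_vma(child->root);		/* the root must outlive the child */

/* At unlink, once the child anon_vma is empty: unpin, then free. */
if (child->root != child)
	drop_anon_vma(child->root);	/* may free the root as well */
anon_vma_free(child);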
Signed-off-by: Rik van Riel Tested-by: Larry Woodman Acked-by: Larry Woodman Reviewed-by: Minchan Kim Cc: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Linus Torvalds Tested-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 15 +++++++++++++++ mm/ksm.c | 17 ++++++----------- mm/migrate.c | 10 +++------- mm/rmap.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 69 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index af43cb9a0506..dc9b3c0bf5d4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -81,6 +81,13 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { return atomic_read(&anon_vma->external_refcount); } + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->external_refcount); +} + +void drop_anon_vma(struct anon_vma *); #else static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma) { @@ -90,6 +97,14 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma) { return 0; } + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ +} + +static inline void drop_anon_vma(struct anon_vma *anon_vma) +{ +} #endif /* CONFIG_KSM */ static inline struct anon_vma *page_anon_vma(struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index da6037c261f1..9f2acc998a37 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -318,19 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item, struct anon_vma *anon_vma) { rmap_item->anon_vma = anon_vma; - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } -static void drop_anon_vma(struct rmap_item *rmap_item) +static void ksm_drop_anon_vma(struct rmap_item *rmap_item) { struct anon_vma *anon_vma = rmap_item->anon_vma; - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - int empty = list_empty(&anon_vma->head); - anon_vma_unlock(anon_vma); - if (empty) - anon_vma_free(anon_vma); - } + drop_anon_vma(anon_vma); } /* @@ -415,7 +410,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. 
*/ - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -470,7 +465,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -558,7 +553,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - drop_anon_vma(rmap_item); + ksm_drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { diff --git a/mm/migrate.c b/mm/migrate.c index 5208fa1d9712..38e7cad782f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * exist when the page is remapped later */ anon_vma = page_anon_vma(page); - atomic_inc(&anon_vma->external_refcount); + get_anon_vma(anon_vma); } } @@ -682,12 +682,8 @@ skip_unmap: rcu_unlock: /* Drop an anon_vma reference if we took one */ - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - int empty = list_empty(&anon_vma->head); - anon_vma_unlock(anon_vma); - if (empty) - anon_vma_free(anon_vma); - } + if (anon_vma) + drop_anon_vma(anon_vma); if (rcu_locked) rcu_read_unlock(); diff --git a/mm/rmap.c b/mm/rmap.c index caa48b27371b..07e9814c7a41 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -235,6 +235,12 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; + /* + * With KSM refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned + * until this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); @@ -264,8 +270,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); anon_vma_unlock(anon_vma); - if (empty) + if (empty) { + /* We no longer need the root anon_vma */ + if (anon_vma->root != anon_vma) + drop_anon_vma(anon_vma->root); anon_vma_free(anon_vma); + } } void unlink_anon_vmas(struct vm_area_struct *vma) @@ -1382,6 +1392,40 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +/* + * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root + * if necessary. Be careful to do all the tests under the lock. Once + * we know we are the last user, nobody else can get a reference and we + * can do the freeing without the lock. + */ +void drop_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + struct anon_vma *root = anon_vma->root; + int empty = list_empty(&anon_vma->head); + int last_root_user = 0; + int root_empty = 0; + + /* + * The refcount on a non-root anon_vma got dropped. Drop + * the refcount on the root and check if we need to free it. 
+ */ + if (empty && anon_vma != root) { + last_root_user = atomic_dec_and_test(&root->external_refcount); + root_empty = list_empty(&root->head); + } + anon_vma_unlock(anon_vma); + + if (empty) { + anon_vma_free(anon_vma); + if (root_empty && last_root_user) + anon_vma_free(root); + } + } +} +#endif + #ifdef CONFIG_MIGRATION /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): -- cgit v1.2.3 From a9877cc2937889e5669114f28612b494384152a4 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 9 Aug 2010 17:18:42 -0700 Subject: buffer_head: remove redundant test from wait_on_buffer The comment suggests that when b_count equals zero it is calling __wait_on_buffer to trigger some debug, but as there is no debug in __wait_on_buffer the whole thing is redundant. AFAICT from the git log this has been the case for at least 5 years, so it seems safe just to remove this. Signed-off-by: Richard Kennedy Cc: Nick Piggin Cc: Jens Axboe Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1b9ba193b789..2ce51fac7d3d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -314,15 +314,10 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) bh->b_size = sb->s_blocksize; } -/* - * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into - * __wait_on_buffer() just to trip a debug check. Because debug code in inline - * functions is bloaty. - */ static inline void wait_on_buffer(struct buffer_head *bh) { might_sleep(); - if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0) + if (buffer_locked(bh)) __wait_on_buffer(bh); } -- cgit v1.2.3 From 6f48d0ebd907ae419387f27b602ee98870cfa7bb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:52 -0700 Subject: oom: select task from tasklist for mempolicy ooms The oom killer presently kills current whenever there is no more memory free or reclaimable on its mempolicy's nodes. There is no guarantee that current is a memory-hogging task or that killing it will free any substantial amount of memory, however. In such situations, it is better to scan the tasklist for tasks that are allowed to allocate on current's set of nodes and kill the task with the highest badness() score. This ensures that the most memory-hogging task, or the one configured by the user with /proc/pid/oom_adj, is always selected in such scenarios.
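The selection rule being added can be summarized by the following sketch (simplified; the real filtering lives in select_bad_process and has_intersects_mems_allowed below):

for_each_process(p) {
	const nodemask_t *mask =
		constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : NULL;

	if (!has_intersects_mems_allowed(p, mask))
		continue;	/* p cannot hold memory on current's nodes */
	/* ... evaluate badness(p) and remember the highest scorer ... */
}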
Signed-off-by: David Rientjes Reviewed-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 13 +++++- mm/mempolicy.c | 44 ++++++++++++++++++++ mm/oom_kill.c | 104 ++++++++++++++++++++++++++++++---------------- 3 files changed, 124 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7b9ef6bf45aa..31ac26ca4acf 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -210,6 +210,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); +extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -338,7 +340,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, return node_zonelist(0, gfp_flags); } -static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } +static inline bool init_nodemask_of_mempolicy(nodemask_t *m) +{ + return false; +} + +static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + return false; +} static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5bc0a96beb51..8a73708d59bb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) } #endif +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. + */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fallback to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c8488f6a3f5..13ceed78bc45 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -27,6 +27,7 @@ #include #include #include +#include #include int sysctl_panic_on_oom; @@ -35,23 +36,57 @@ int sysctl_oom_dump_tasks; static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ -/* - * Is all threads of the target process nodes overlap ours? 
+#ifdef CONFIG_NUMA +/** + * has_intersects_mems_allowed() - check task eligiblity for kill + * @tsk: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. */ -static int has_intersects_mems_allowed(struct task_struct *tsk) +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) { - struct task_struct *t; + struct task_struct *start = tsk; - t = tsk; do { - if (cpuset_mems_allowed_intersects(current, t)) - return 1; - t = next_thread(t); - } while (t != tsk); - - return 0; + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + if (mempolicy_nodemask_intersects(tsk, mask)) + return true; + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + if (cpuset_mems_allowed_intersects(current, tsk)) + return true; + } + tsk = next_thread(tsk); + } while (tsk != start); + return false; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + return true; } +#endif /* CONFIG_NUMA */ +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ static struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t = p; @@ -106,10 +141,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * The memory size of the process is the basis for the badness. */ points = p->mm->total_vm; - - /* - * After this unlock we can no longer dereference local variable `mm' - */ task_unlock(p); /* @@ -253,7 +284,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem) + struct mem_cgroup *mem, enum oom_constraint constraint, + const nodemask_t *mask) { struct task_struct *p; struct task_struct *chosen = NULL; @@ -269,7 +301,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, continue; if (mem && !task_in_mem_cgroup(p, mem)) continue; - if (!has_intersects_mems_allowed(p)) + if (!has_intersects_mems_allowed(p, + constraint == CONSTRAINT_MEMORY_POLICY ? mask : + NULL)) continue; /* @@ -497,7 +531,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) panic("out of memory(memcg). panic_on_oom is selected.\n"); read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem); + p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; @@ -576,7 +610,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) /* * Must be called with tasklist_lock held for read. 
-static void __out_of_memory(gfp_t gfp_mask, int order) +static void __out_of_memory(gfp_t gfp_mask, int order, + enum oom_constraint constraint, const nodemask_t *mask) { struct task_struct *p; unsigned long points; @@ -590,7 +625,7 @@ retry: * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points, NULL); + p = select_bad_process(&points, NULL, constraint, mask); if (PTR_ERR(p) == -1UL) return; @@ -624,7 +659,8 @@ void pagefault_out_of_memory(void) panic("out of memory from page fault. panic_on_oom is selected.\n"); read_lock(&tasklist_lock); - __out_of_memory(0, 0); /* unknown gfp_mask and order */ + /* unknown gfp_mask and order */ + __out_of_memory(0, 0, CONSTRAINT_NONE, NULL); read_unlock(&tasklist_lock); /* @@ -640,6 +676,7 @@ void pagefault_out_of_memory(void) * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -678,24 +715,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ constraint = constrained_alloc(zonelist, gfp_mask, nodemask); read_lock(&tasklist_lock); - - switch (constraint) { - case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, - "No available memory (MPOL_BIND)"); - break; - - case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) { + if (unlikely(sysctl_panic_on_oom)) { + /* + * panic_on_oom only affects CONSTRAINT_NONE, the kernel + * should not panic for cpuset or mempolicy induced memory + * failures. + */ + if (constraint == CONSTRAINT_NONE) { dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. panic_on_oom is selected\n"); + read_unlock(&tasklist_lock); + panic("Out of memory: panic_on_oom is enabled\n"); } - /* Fall-through */ - case CONSTRAINT_CPUSET: - __out_of_memory(gfp_mask, order); - break; } - + __out_of_memory(gfp_mask, order, constraint, nodemask); read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From 309ed882508cc471320ff79265e7340774d6746c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:54 -0700 Subject: oom: extract panic helper function There are various points in the oom killer where the kernel must determine whether to panic or not. It's better to extract this to a helper function to remove all the confusion as to its semantics. Also fix a call to dump_header() where tasklist_lock is not read-locked, as required. There's no functional change with this patch.
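The decision the helper centralizes amounts to this sketch (should_panic is a hypothetical condensation of check_panic_on_oom in the diff below):

static bool should_panic(enum oom_constraint constraint)
{
	if (!sysctl_panic_on_oom)
		return false;			/* 0: never panic */
	if (sysctl_panic_on_oom != 2 && constraint != CONSTRAINT_NONE)
		return false;			/* 1: only system-wide ooms panic */
	return true;				/* 2: panic for any constraint */
}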
Acked-by: KOSAKI Motohiro Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 1 + mm/oom_kill.c | 53 +++++++++++++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 537662315627..3ae6d94d0540 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -22,6 +22,7 @@ enum oom_constraint { CONSTRAINT_NONE, CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, }; extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 01b5e01e52cb..fca886d8b5fb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -521,17 +521,40 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, return oom_kill_task(victim); } +/* + * Determines whether the kernel must panic because of the panic_on_oom sysctl. + */ +static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order) +{ + if (likely(!sysctl_panic_on_oom)) + return; + if (sysctl_panic_on_oom != 2) { + /* + * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel + * does not panic for cpuset, mempolicy, or memcg allocation + * failures. + */ + if (constraint != CONSTRAINT_NONE) + return; + } + read_lock(&tasklist_lock); + dump_header(NULL, gfp_mask, order, NULL); + read_unlock(&tasklist_lock); + panic("Out of memory: %s panic_on_oom is enabled\n", + sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); +} + #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { unsigned long points = 0; struct task_struct *p; - if (sysctl_panic_on_oom == 2) - panic("out of memory(memcg). panic_on_oom is selected.\n"); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL); + p = select_bad_process(&points, mem, CONSTRAINT_MEMCG, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; @@ -632,8 +655,8 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { - read_unlock(&tasklist_lock); dump_header(NULL, gfp_mask, order, NULL); + read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } @@ -655,9 +678,7 @@ void pagefault_out_of_memory(void) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom) - panic("out of memory from page fault. panic_on_oom is selected.\n"); - + check_panic_on_oom(CONSTRAINT_NONE, 0, 0); read_lock(&tasklist_lock); /* unknown gfp_mask and order */ __out_of_memory(0, 0, CONSTRAINT_NONE, NULL); @@ -704,29 +725,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, return; } - if (sysctl_panic_on_oom == 2) { - dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. Compulsory panic_on_oom is selected.\n"); - } - /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ constraint = constrained_alloc(zonelist, gfp_mask, nodemask); + check_panic_on_oom(constraint, gfp_mask, order); read_lock(&tasklist_lock); - if (unlikely(sysctl_panic_on_oom)) { - /* - * panic_on_oom only affects CONSTRAINT_NONE, the kernel - * should not panic for cpuset or mempolicy induced memory - * failures. 
- */ - if (constraint == CONSTRAINT_NONE) { dump_header(NULL, gfp_mask, order, NULL); - read_unlock(&tasklist_lock); - panic("Out of memory: panic_on_oom is enabled\n"); - } - } __out_of_memory(gfp_mask, order, constraint, nodemask); read_unlock(&tasklist_lock); -- cgit v1.2.3 From 8e4228e1edb922afa83366803a1e5b3fa8e987c2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:56 -0700 Subject: oom: move sysctl declarations to oom.h The three oom killer sysctl variables (sysctl_oom_dump_tasks, sysctl_oom_kill_allocating_task, and sysctl_panic_on_oom) are better declared in include/linux/oom.h rather than kernel/sysctl.c. Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 +++++ kernel/sysctl.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 3ae6d94d0540..1a8e407a1a53 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -44,5 +44,10 @@ static inline void oom_killer_enable(void) { oom_killer_disabled = false; } + +/* sysctls */ +extern int sysctl_oom_dump_tasks; +extern int sysctl_oom_kill_allocating_task; +extern int sysctl_panic_on_oom; #endif /* __KERNEL__*/ #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6d850bf0a517..6b005e4912b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -85,9 +86,6 @@ /* External variables not in a header file. */ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; -- cgit v1.2.3 From ff321feac22313cf53ffceb69224b09ac19ff22b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 9 Aug 2010 17:18:57 -0700 Subject: mm: rename try_set_zone_oom() to try_set_zonelist_oom() We have been using the names try_set_zone_oom and clear_zonelist_oom. The role of these functions is to lock the zonelist to prevent parallel OOM handling. clear_zonelist_oom makes sense, but try_set_zone_oom is awkward and does not match clear_zonelist_oom. Let's rename it to try_set_zonelist_oom. Signed-off-by: Minchan Kim Acked-by: David Rientjes Reviewed-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- mm/oom_kill.c | 4 ++-- mm/page_alloc.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 1a8e407a1a53..9d7c34a741e7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -25,7 +25,7 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; -extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2f37f6113d9e..c29cf00d9084 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -549,7 +549,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
-int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; @@ -566,7 +566,7 @@ int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zone_oom() doesn't succeed + * parallel invocation of try_set_zonelist_oom() doesn't succeed * when it shouldn't. */ zone_set_flag(zone, ZONE_OOM_LOCKED); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 527f73e4c63f..33c6b4c1277b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct page *page; /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zone_oom(zonelist, gfp_mask)) { + if (!try_set_zonelist_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } -- cgit v1.2.3 From b645bd1286f2fbcd2eb4ab3bed5884f63c42e363 Mon Sep 17 00:00:00 2001 From: Alexander Nevenchannyy Date: Mon, 9 Aug 2010 17:19:00 -0700 Subject: mmzone.h: remove dead prototype get_zone_counts() was dropped from kernel tree, see: http://www.mail-archive.com/mm-commits@vger.kernel.org/msg07313.html but its prototype remains. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b4d109e389b8..9ed9c459b14c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -651,8 +651,6 @@ typedef struct pglist_data { #include extern struct mutex zonelists_mutex; -void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, -- cgit v1.2.3 From 251060006003b79b788f8ce5a827ee5354a42910 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 9 Aug 2010 17:19:00 -0700 Subject: topology: alternate fix for ia64 tiger_defconfig build breakage Define stubs for the numa_*_id() generic percpu related functions for non-NUMA configurations in <asm-generic/topology.h> where the other non-numa stubs live. Fixes ia64 !NUMA build breakage -- e.g., tiger_defconfig Back out now unneeded '#ifndef CONFIG_NUMA' guards from ia64 smpboot.c Signed-off-by: Lee Schermerhorn Tested-by: Tony Luck Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 3 +-- arch/ia64/kernel/smpboot.c | 4 ---- include/asm-generic/topology.h | 20 +++++++++++++++++++- include/linux/topology.h | 4 ---- 4 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 8711d13cd79f..ba22849ee3ec 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -499,8 +499,7 @@ config USE_PERCPU_NUMA_NODE_ID depends on NUMA config HAVE_MEMORYLESS_NODES - def_bool y - depends on NUMA + def_bool NUMA config ARCH_PROC_KCORE_TEXT def_bool y diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 99dcc85193c9..1d85d8cfaa7d 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -390,13 +390,11 @@ smp_callin (void) fix_b0_for_bsp(); -#ifdef CONFIG_NUMA /* * numa_node_id() works after this.
*/ set_numa_node(cpu_to_node_map[cpuid]); set_numa_mem(local_memory_node(cpu_to_node_map[cpuid])); -#endif ipi_call_lock_irq(); spin_lock(&vector_lock); @@ -640,9 +638,7 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callin_map); -#ifdef CONFIG_NUMA set_numa_node(cpu_to_node_map[smp_processor_id()]); -#endif per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; paravirt_post_smp_prepare_boot_cpu(); } diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index fd60700503c8..fc824e2828f3 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -5,7 +5,7 @@ * * Copyright (C) 2002, IBM Corp. * - * All rights reserved. + * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,9 +34,16 @@ #ifndef cpu_to_node #define cpu_to_node(cpu) ((void)(cpu),0) #endif +#ifndef set_numa_node +#define set_numa_node(node) +#endif +#ifndef set_cpu_numa_node +#define set_cpu_numa_node(cpu, node) +#endif #ifndef cpu_to_mem #define cpu_to_mem(cpu) ((void)(cpu),0) #endif + #ifndef parent_node #define parent_node(node) ((void)(node),0) #endif @@ -55,4 +62,15 @@ #endif /* CONFIG_NUMA */ +#if !defined(CONFIG_NUMA) || !defined(CONFIG_HAVE_MEMORYLESS_NODES) + +#ifndef set_numa_mem +#define set_numa_mem(node) +#endif +#ifndef set_cpu_numa_mem +#define set_cpu_numa_mem(cpu, node) +#endif + +#endif /* !CONFIG_NUMA || !CONFIG_HAVE_MEMORYLESS_NODES */ + #endif /* _ASM_GENERIC_TOPOLOGY_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index b572e432d2f3..64e084ff5e5c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -292,10 +292,6 @@ static inline void set_cpu_numa_mem(int cpu, int node) #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ -static inline void set_numa_mem(int node) {} - -static inline void set_cpu_numa_mem(int cpu, int node) {} - #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) -- cgit v1.2.3 From 627295e492638936e76f3d8fcb1e0a3367b88341 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 9 Aug 2010 17:19:02 -0700 Subject: gcc-4.6: pagemap: avoid unused-but-set variable Avoid quite a lot of warnings in header files in gcc 4.6 -Wall builds Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3c62ed408492..78a702ce4fcb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -423,8 +423,10 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) + ((unsigned long)end & PAGE_MASK)) { ret = __get_user(c, end); + (void)c; + } } return ret; } -- cgit v1.2.3 From 4e60c86bd9e5a7110ed28874d0b6592186550ae8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 9 Aug 2010 17:19:03 -0700 Subject: gcc-4.6: mm: fix unused but set warnings No real bugs, just some dead code and some fixups.
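The recurring idiom in these fixes is to keep a store that exists only for its side effect and tell the compiler so with a cast to void. A minimal sketch of the pattern (probe_user_byte is hypothetical):

static inline int probe_user_byte(const char __user *uaddr)
{
	char c;
	int ret = __get_user(c, uaddr);	/* read purely to fault the page in */

	(void)c;	/* silences gcc 4.6's set-but-not-used warning */
	return ret;
}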
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 4 ++-- include/linux/highmem.h | 6 +++++- include/linux/mmdebug.h | 2 +- mm/filemap.c | 2 -- mm/memory.c | 2 -- mm/slab.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 181be528c612..076052cd62be 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_unmap(pte) ((void)(pte))/* NOP */ +#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 67460f010224..e3060ef85b6d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -73,7 +73,11 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) } #define kmap_atomic_prot(page, idx, prot) kmap_atomic(page, idx) -#define kunmap_atomic_notypecheck(addr, idx) do { pagefault_enable(); } while (0) +static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx) +{ + pagefault_enable(); +} + #define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index ee24ef8ab616..c04ecfe03f7f 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -4,7 +4,7 @@ #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) #else -#define VM_BUG_ON(cond) do { } while (0) +#define VM_BUG_ON(cond) do { (void)(cond); } while (0) #endif #ifdef CONFIG_DEBUG_VIRTUAL diff --git a/mm/filemap.c b/mm/filemap.c index 20e5642e9f9f..3d4df44e4221 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); diff --git a/mm/memory.c b/mm/memory.c index bde42c6d3633..6b0c37dcfd16 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* * The next few lines have given us lots of grief... 
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb, if (addr > end - 1) return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); diff --git a/mm/slab.c b/mm/slab.c index 736e497733d6..88435fcc8387 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) -- cgit v1.2.3 From 27f5e0f694fd0600274a76854636c0749e3bb1f6 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 9 Aug 2010 17:19:04 -0700 Subject: tmpfs: add accurate compare function to percpu_counter library Add percpu_counter_compare that allows for a quick but accurate comparison of a percpu_counter with a given value. A rough count is provided by the count field in the percpu_counter structure, without accounting for the values stored in the individual cpu counters. The actual count is the sum of count and the cpu counters. However, the count field never differs from the actual value by more than batch*num_online_cpus(). If count differs from the given value by more than this margin, we do not need the actual count for the comparison, which allows for a quick comparison without summing up all the per cpu counters. Signed-off-by: Tim Chen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu_counter.h | 11 +++++++++++ lib/percpu_counter.c | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index c88d67b59394..8a7d510ffa9c 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -40,6 +40,7 @@ void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -98,6 +99,16 @@ static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount) fbc->count = amount; } +static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +{ + if (fbc->count > rhs) + return 1; + else if (fbc->count < rhs) + return -1; + else + return 0; +} + static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index aeaa6d734447..ec9048e74f44 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -137,6 +137,33 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Compare counter against given value.
+ * Return 1 if greater, 0 if equal and -1 if less + */ +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +{ + s64 count; + + count = percpu_counter_read(fbc); + /* Check to see if rough count will be sufficient for comparison */ + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { + if (count > rhs) + return 1; + else + return -1; + } + /* Need to use precise count */ + count = percpu_counter_sum(fbc); + if (count > rhs) + return 1; + else if (count < rhs) + return -1; + else + return 0; +} +EXPORT_SYMBOL(percpu_counter_compare); + static int __init percpu_counter_startup(void) { compute_batch_value(); -- cgit v1.2.3 From 7e496299d4d2ad8083effed6c5a18313a919edc6 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 9 Aug 2010 17:19:05 -0700 Subject: tmpfs: make tmpfs scalable with percpu_counter for used blocks The current implementation of tmpfs is not scalable. We found that stat_lock is contended by multiple threads when we need to get a new page, leading to useless spinning inside this spin lock. This patch makes use of the percpu_counter library to maintain local count of used blocks to speed up getting and returning of pages. So the acquisition of stat_lock is unnecessary for getting and returning blocks, improving the performance of tmpfs on systems with a large number of cpus. On a 4 socket 32 core NHM-EX system, we saw an improvement of 270%. The implementation below has a slight chance of race between threads causing a slight overshoot of the maximum configured blocks. However, any overshoot is small, and is bounded by the number of cpus. This happens when the number of used blocks is slightly below the maximum configured blocks when a thread checks the used block count, and another thread allocates the last block before the current thread does. This should not be a problem for tmpfs, as the overshoot is most likely to be a few blocks and bounded. If a strict limit is really desired, then configure the max blocks to be the limit less the number of cpus in the system.
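The resulting block-allocation fast path looks roughly like this sketch (simplified; the exact off-by-one handling differs per call site in the shmem.c hunks below):

if (sbinfo->max_blocks) {
	if (percpu_counter_compare(&sbinfo->used_blocks,
				   sbinfo->max_blocks) > 0)
		return -ENOSPC;		/* limit reached, within nr_cpus slack */
	percpu_counter_inc(&sbinfo->used_blocks);	/* no stat_lock taken */
}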
Signed-off-by: Tim Chen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 40 +++++++++++++++++----------------------- 2 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e164291fb3e7..399be5ad2f99 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -3,6 +3,7 @@ #include #include +#include /* inode in-kernel data */ @@ -23,7 +24,7 @@ struct shmem_inode_info { struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ - unsigned long free_blocks; /* How many are left for allocation */ + struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ diff --git a/mm/shmem.c b/mm/shmem.c index f65f84062db5..0618fdad406c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include static struct vfsmount *shm_mnt; @@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks += pages; + percpu_counter_add(&sbinfo->used_blocks, -pages); + spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } } @@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long if (sgp == SGP_READ) return shmem_swp_map(ZERO_PAGE(0)); /* - * Test free_blocks against 1 not 0, since we have 1 data + * Test used_blocks against 1 less max_blocks, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. 
*/ if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks <= 1) { - spin_unlock(&sbinfo->stat_lock); + if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) return ERR_PTR(-ENOSPC); - } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); @@ -1387,17 +1386,16 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks == 0 || + if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || shmem_acct_block(info->flags)) { - spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); error = -ENOSPC; goto failed; } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; @@ -1791,17 +1789,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; - spin_lock(&sbinfo->stat_lock); if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_bavail = buf->f_bfree = + sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ - spin_unlock(&sbinfo->stat_lock); return 0; } @@ -2242,7 +2239,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info config = *sbinfo; - unsigned long blocks; unsigned long inodes; int error = -EINVAL; @@ -2250,9 +2246,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) return error; spin_lock(&sbinfo->stat_lock); - blocks = sbinfo->max_blocks - sbinfo->free_blocks; inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (config.max_blocks < blocks) + if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) goto out; if (config.max_inodes < inodes) goto out; @@ -2269,7 +2264,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) error = 0; sbinfo->max_blocks = config.max_blocks; - sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2344,7 +2338,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - sbinfo->free_blocks = sbinfo->max_blocks; + percpu_counter_init(&sbinfo->used_blocks, 0); sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = SHMEM_MAX_BYTES; -- cgit v1.2.3 From ba6f0ff3981e6263ab81ac512f04cca55b85ec81 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 9 Aug 2010 17:19:08 -0700 Subject: ksm: fix ksm swapin time optimization The new anon-vma code was suboptimal and led to erratic invocation of ksm_does_need_to_copy. That leads to host hangs or guest vnc lockup, or weird behavior. It's unclear why ksm_does_need_to_copy is unstable but the point is that when KSM is not in use, ksm_does_need_to_copy must never run or we bounce pages for no good reason.
I suspect the same hangs will happen with KVM swaps. But this at least fixes the regression in the new-anon-vma code and it only lets KSM bugs trigger when KSM is in use. The code in do_swap_page likely doesn't cope well with a non-swapcache page, especially the memcg code. Signed-off-by: Andrea Arcangeli Signed-off-by: Rik van Riel Cc: Izik Eidus Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 43bdab769fc3..74d691ee9121 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -78,7 +78,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, struct anon_vma *anon_vma = page_anon_vma(page); if (!anon_vma || - (anon_vma == vma->anon_vma && + (anon_vma->root == vma->anon_vma->root && page->index == linear_page_index(vma, address))) return page; -- cgit v1.2.3 From ebf8aa44beed48cd17893a83d92a4403e5f9d9e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 9 Aug 2010 17:19:11 -0700 Subject: radix-tree: implement function radix_tree_range_tag_if_tagged Implement a function for setting one tag if another tag is set, for each item in a given range. Signed-off-by: Jan Kara Cc: Dave Chinner Cc: Nick Piggin Cc: Chris Mason Cc: Theodore Ts'o Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 ++ lib/radix-tree.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 55ca73cf25e5..a4b00e9cca90 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -192,6 +192,10 @@ unsigned int radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); +unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, + unsigned long *first_indexp, unsigned long last_index, + unsigned long nr_to_tag, + unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); static inline void radix_tree_preload_end(void) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 05da38bcc298..e907858498a6 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -608,6 +608,100 @@ int radix_tree_tag_get(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); +/** + * radix_tree_range_tag_if_tagged - for each item in given range set given + * tag if item has another tag set + * @root: radix tree root + * @first_indexp: pointer to a starting index of a range to scan + * @last_index: last index of a range to scan + * @nr_to_tag: maximum number items to tag + * @iftag: tag index to test + * @settag: tag index to set if tested tag is set + * + * This function scans range of radix tree from first_index to last_index + * (inclusive). For each item in the range if iftag is set, the function sets + * also settag. The function stops either after tagging nr_to_tag items or + * after reaching last_index. + * + * The function returns number of leaves where the tag was set and sets + * *first_indexp to the first unscanned index.
+ */ +unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, + unsigned long *first_indexp, unsigned long last_index, + unsigned long nr_to_tag, + unsigned int iftag, unsigned int settag) +{ + unsigned int height = root->height, shift; + unsigned long tagged = 0, index = *first_indexp; + struct radix_tree_node *open_slots[height], *slot; + + last_index = min(last_index, radix_tree_maxindex(height)); + if (index > last_index) + return 0; + if (!nr_to_tag) + return 0; + if (!root_tag_get(root, iftag)) { + *first_indexp = last_index + 1; + return 0; + } + if (height == 0) { + *first_indexp = last_index + 1; + root_tag_set(root, settag); + return 1; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = radix_tree_indirect_to_ptr(root->rnode); + + for (;;) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!slot->slots[offset]) + goto next; + if (!tag_get(slot, iftag, offset)) + goto next; + tag_set(slot, settag, offset); + if (height == 1) { + tagged++; + goto next; + } + /* Go down one level */ + height--; + shift -= RADIX_TREE_MAP_SHIFT; + open_slots[height] = slot; + slot = slot->slots[offset]; + continue; next: + /* Go to next item at level determined by 'shift' */ + index = ((index >> shift) + 1) << shift; + if (index > last_index) + break; + if (tagged >= nr_to_tag) + break; + while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) { + /* + * We've fully scanned this node. Go up. Because + * last_index is guaranteed to be in the tree, what + * we do below cannot wander astray. + */ + slot = open_slots[height]; + height++; + shift += RADIX_TREE_MAP_SHIFT; + } + } + /* + * The iftag must have been set somewhere because otherwise + * we would return immediated at the beginning of the function + */ + root_tag_set(root, settag); + *first_indexp = index; + + return tagged; +} +EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); + + /** * radix_tree_next_hole - find the next hole (not-present entry) * @root: tree root -- cgit v1.2.3 From f446daaea9d4a420d16c606f755f3689dcb2d0ce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 9 Aug 2010 17:19:12 -0700 Subject: mm: implement writeback livelock avoidance using page tagging We try to avoid livelocks of writeback when some process steadily creates dirty pages in a mapping we are writing out. For memory-cleaning writeback, using nr_to_write works reasonably well, but we cannot really use it for data integrity writeback. This patch tries to solve the problem. The idea is simple: tag all pages that should be written back with a special tag (TOWRITE) in the radix tree. This can be done rather quickly, and thus livelocks should not happen in practice. Then we start doing the hard work of locking pages and sending them to disk only for those pages that have the TOWRITE tag set. Note: adding the new radix tree tag grows the radix tree node from 288 to 296 bytes for 32-bit archs and from 552 to 560 bytes for 64-bit archs. However, the number of slab/slub items per page remains the same (13 and 7 respectively).
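The mechanism is easiest to see reduced to its core: phase one snapshots the dirty set by copying the DIRTY tag to TOWRITE, phase two writes only TOWRITE pages, so pages dirtied after the snapshot cannot extend the walk. A self-contained toy model of the scheme (plain C, with the page cache stood in by a flag array; this is illustrative only, not the mm/page-writeback.c code):

#include <stdio.h>

#define NPAGES  8
#define DIRTY   0x1
#define TOWRITE 0x2

static unsigned char tags[NPAGES];

int main(void)
{
	int i, written = 0;

	for (i = 0; i < NPAGES; i += 2)	/* some pages start out dirty */
		tags[i] |= DIRTY;

	/* Phase 1: tag every currently-dirty page TOWRITE (the snapshot). */
	for (i = 0; i < NPAGES; i++)
		if (tags[i] & DIRTY)
			tags[i] |= TOWRITE;

	/* A concurrent writer dirties a new page mid-writeback: it gets
	 * DIRTY but never TOWRITE, so it cannot livelock this pass. */
	tags[1] |= DIRTY;

	/* Phase 2: write back only the snapshot. */
	for (i = 0; i < NPAGES; i++)
		if (tags[i] & TOWRITE) {
			tags[i] &= (unsigned char)~(DIRTY | TOWRITE);
			written++;
		}

	printf("wrote %d pages; new page left for next pass: %d\n",
	       written, !!(tags[1] & DIRTY));
	return 0;
}

In the real patch the snapshot is taken in batches of 4096 pages per tree_lock hold (via radix_tree_range_tag_if_tagged above), which keeps lock latency bounded while preserving the livelock-free property.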
Signed-off-by: Jan Kara Cc: Dave Chinner Cc: Nick Piggin Cc: Chris Mason Cc: Theodore Ts'o Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + include/linux/radix-tree.h | 2 +- mm/page-writeback.c | 70 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 55 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5106e49bd2c..488efec09d14 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -687,6 +687,7 @@ struct block_device { */ #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 +#define PAGECACHE_TAG_TOWRITE 2 int mapping_tagged(struct address_space *mapping, int tag); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index a4b00e9cca90..634b8e674ac5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) /*** radix-tree API starts here ***/ -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef61548..df8202ebc7b8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -804,6 +804,41 @@ void __init page_writeback_init(void) prop_descriptor_init(&vm_dirties, shift); } +/** + * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * @mapping: address space structure to write + * @start: starting page index + * @end: ending page index (inclusive) + * + * This function scans the page range from @start to @end (inclusive) and tags + * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is + * that write_cache_pages (or whoever calls this function) will then use + * TOWRITE tag to identify pages eligible for writeback. This mechanism is + * used to avoid livelocking of writeback by a process steadily creating new + * dirty pages in the file (thus it is important for this function to be quick + * so that it can tag pages faster than a dirtying process can create them). + */ +/* + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + */ +#define WRITEBACK_TAG_BATCH 4096 +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + unsigned long tagged; + + do { + spin_lock_irq(&mapping->tree_lock); + tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, + &start, end, WRITEBACK_TAG_BATCH, + PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); + cond_resched(); + } while (tagged >= WRITEBACK_TAG_BATCH); +} +EXPORT_SYMBOL(tag_pages_for_writeback); + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -818,6 +853,13 @@ void __init page_writeback_init(void) * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. + * + * To avoid livelocks (when other process dirties new pages), we first tag + * pages which should be written back with TOWRITE tag and only then start + * writing them. For data-integrity sync we have to be careful so that we do + * not miss some pages (e.g., because some other process has cleared TOWRITE + * tag we set). 
The rule we follow is that TOWRITE tag can be cleared only + * by the process clearing the DIRTY tag (and submitting the page for IO). */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ - - /* - * If this is a data integrity sync, cap the writeback to the - * current end of file. Any extension to the file that occurs - * after this is a new write and we don't need to write those - * pages out to fulfil our data integrity requirements. If we - * try to write them out, we can get stuck in this scan until - * the concurrent writer stops adding dirty pages and extending - * EOF. - */ - if (wbc->sync_mode == WB_SYNC_ALL && - wbc->range_end == LLONG_MAX) { - end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT; - } } - + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); -- cgit v1.2.3 From 25edde0332916ae706ccf83de688be57bcc844b7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 9 Aug 2010 17:19:27 -0700 Subject: vmscan: kill prev_priority completely Since 2.6.28 zone->prev_priority has been unused, so it can be removed safely; doing so also reduces stack usage slightly. Now I have to say that I'm sorry: two years ago I thought prev_priority could be integrated again because it looked useful, but four (or more) attempts at tuning it never produced good performance numbers, so I have given up on that approach. The rest of this changelog is notes on prev_priority: why it existed in the first place and why it may no longer be necessary. This information is based heavily on discussions between Andrew Morton, Rik van Riel and Kosaki Motohiro, who is heavily quoted from. Historically prev_priority was important because it determined when the VM would start unmapping PTE pages, at a time when there were no other balances of note within the VM (Anon vs File, Mapped vs Unmapped). Without prev_priority, there is a potential risk of unnecessarily increasing minor faults, as a large amount of read activity on use-once pages could push mapped pages to the end of the LRU and get them unmapped. There is no proof this is still a problem, but currently it is not considered to be one: active files are not deactivated if the active file list is smaller than the inactive list, reducing the likelihood that file-mapped pages are pushed off the LRU, and referenced executable pages are kept on the active list to avoid them getting pushed out by read activity.
Even if it were still a problem, prev_priority wouldn't work nowadays. First of all, current vmscan still has a lot of UP-centric code, which exposes weaknesses on machines with dozens of CPUs; more improvement is needed there. The problem is that current vmscan mixes up per-system pressure, per-zone pressure and per-task pressure. For example, prev_priority tries to boost the priority of other concurrent reclaimers, but if another task has a mempolicy restriction this is unnecessary, and it also causes large latencies and excessive reclaim. Per-task priority plus the prev_priority adjustment emulates per-system pressure, but that has two issues: 1) the emulation is too rough and brutal, and 2) we need per-zone pressure, not per-system pressure. Another example: DEF_PRIORITY is currently 12, which means the LRU is rotated about 2 full cycles (1/4096 + 1/2048 + 1/1024 + .. + 1 is approximately 2) before the OOM killer is invoked. But if 10,000 threads enter DEF_PRIORITY reclaim at the same time, the system is under higher memory pressure than priority==0 would indicate (10,000 * 1/4096 > 2). prev_priority can't solve such multithreaded workload issues; in other words, the prev_priority concept assumes the system doesn't have lots of threads. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Dave Chinner Cc: Chris Mason Cc: Nick Piggin Cc: Rik van Riel Cc: Johannes Weiner Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Andrea Arcangeli Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ---- include/linux/mmzone.h | 15 ------------ mm/memcontrol.c | 31 ------------------------- mm/page_alloc.c | 2 -- mm/vmscan.c | 57 ---------------------------------------- mm/vmstat.c | 2 -- 6 files changed, 112 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9411d32840b0..9f1afd361583 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); -extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, - int priority); -extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, - int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9ed9c459b14c..6e6e62648a4d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -347,21 +347,6 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - /* - * prev_priority holds the scanning priority for this zone. It is - * defined as the scanning priority at which we achieved our reclaim - * target at the previous try_to_free_pages() or balance_pgdat() - * invocation. - * - * We use prev_priority as a measure of how much stress page reclaim is - * under - it drives the swappiness decision: whether to unmap mapped - * pages. - * - * Access to both this field is quite racy even on uniprocessor. But - * it is expected to average out OK. - */ - int prev_priority; - /* * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on * this zone's LRU. Maintained by the pageout code.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20a8193a7af8..31abd1c2c0c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -211,8 +211,6 @@ struct mem_cgroup { */ spinlock_t reclaim_param_lock; - int prev_priority; /* for recording reclaim priority */ - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -858,35 +856,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * prev_priority control...this will be used in memory reclaim path. - */ -int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) -{ - int prev_priority; - - spin_lock(&mem->reclaim_param_lock); - prev_priority = mem->prev_priority; - spin_unlock(&mem->reclaim_param_lock); - - return prev_priority; -} - -void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - if (priority < mem->prev_priority) - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - -void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33c6b4c1277b..a9649f4b261e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4100,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); diff --git a/mm/vmscan.c b/mm/vmscan.c index b7a4e6a3cf89..594eba8a44c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1289,20 +1289,6 @@ done: return nr_reclaimed; } -/* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. - */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; -} - /* * This moves pages from the active list to the inactive list. * @@ -1766,17 +1752,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. 
- */ - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); @@ -1877,17 +1854,6 @@ out: if (priority < 0) priority = 0; - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - zone->prev_priority = priority; - } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); - delayacct_freepages_end(); put_mems_allowed(); @@ -2053,22 +2019,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .order = order, .mem_cgroup = NULL, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -2136,9 +2092,7 @@ loop_again: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); nid = pgdat->node_id; zid = zone_idx(zone); @@ -2211,16 +2165,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2639,7 +2583,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 15a14b16e176..f389168f9a83 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n all_unreclaimable: %u" - "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", zone->all_unreclaimable, - zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3 From 74bcbf40546bb7500f2a7ba4ff3cc056a6bd004a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Aug 2010 17:19:43 -0700 Subject: oom: move badness() declaration into oom.h Cc: Minchan Kim Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 --- include/linux/oom.h | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index fc23f62bb0b8..5949d0ac30f2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -427,9 +427,6 @@ static const struct file_operations proc_lstats_operations = { #endif -/* The badness from the OOM killer */ -unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, - nodemask_t *nodemask, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long points = 0; diff --git a/include/linux/oom.h b/include/linux/oom.h index 9d7c34a741e7..40e5e3a6bc20 100644 
--- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -14,6 +14,8 @@ struct zonelist; struct notifier_block; +struct mem_cgroup; +struct task_struct; /* * Types of limitations to the nodes from which allocations may occur @@ -45,6 +47,10 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } +/* The badness from the OOM killer */ +extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long uptime); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; -- cgit v1.2.3 From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:46 -0700 Subject: oom: badness heuristic rewrite This is a complete rewrite of the oom killer's badness() heuristic, which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 then the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory, just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatibility. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score, so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa.
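The rescaling is linear: the -17..15 range maps onto -1000..1000 by multiplying with OOM_SCORE_ADJ_MAX and dividing by -OOM_DISABLE (= 17), with the endpoints pinned so OOM_ADJUST_MAX and OOM_DISABLE remain exactly attainable. A small stand-alone sketch of the forward mapping, using the constants and arithmetic from the fs/proc/base.c hunk below (illustrative only):

#include <stdio.h>

#define OOM_DISABLE        (-17)
#define OOM_ADJUST_MAX       15
#define OOM_SCORE_ADJ_MAX  1000

int main(void)
{
	int oom_adj, oom_score_adj;

	for (oom_adj = -17; oom_adj <= 15; oom_adj += 8) {
		/* Pin the top end, as oom_adjust_write() does below. */
		if (oom_adj == OOM_ADJUST_MAX)
			oom_score_adj = OOM_SCORE_ADJ_MAX;
		else
			oom_score_adj = (oom_adj * OOM_SCORE_ADJ_MAX) /
					-OOM_DISABLE;
		printf("oom_adj %3d -> oom_score_adj %5d\n",
		       oom_adj, oom_score_adj);
	}
	return 0;	/* -17 maps to exactly -1000, +15 to exactly 1000 */
}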
Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 94 ++++++++------ fs/proc/base.c | 94 +++++++++++++- include/linux/memcontrol.h | 8 ++ include/linux/oom.h | 14 +- include/linux/sched.h | 3 +- kernel/fork.c | 1 + mm/memcontrol.c | 18 +++ mm/oom_kill.c | 259 ++++++++++++++++--------------------- 8 files changed, 300 insertions(+), 191 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 8fe8895894d8..cf1295c2bb66 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,8 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc//oom_adj - Adjust the oom-killer score + 3.1 /proc//oom_adj & /proc//oom_score_adj - Adjust the oom-killer + score 3.2 /proc//oom_score - Display current oom-killer score 3.3 /proc//io - Display the IO accounting fields 3.4 /proc//coredump_filter - Core dump filtering settings @@ -1234,42 +1235,61 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------- - -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. - -The process to be killed in an out-of-memory situation is selected among all others -based on its badness score. This value equals the original memory size of the process -and is then updated according to its CPU time (utime + stime) and the -run time (uptime - start time). The longer it runs the smaller is the score. -Badness score is divided by the square root of the CPU time and then by -the double square root of the run time. - -Swapped out tasks are killed first. Half of each child's memory size is added to -the parent's score if they do not share the same memory. Thus forking servers -are the prime candidates to be killed. Having only one 'hungry' child will make -parent less preferable than the child. - -/proc//oom_score shows process' current badness score. - -The following heuristics are then applied: - * if the task was reniced, its score doubles - * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE - or CAP_SYS_RAWIO) have their score divided by 4 - * if oom condition happened in one cpuset and checked process does not belong - to it, its score is divided by 8 - * the resulting score is multiplied by two to the power of oom_adj, i.e. 
- points <<= oom_adj when it is positive and - points >>= -(oom_adj) otherwise - -The task with the highest badness score is then selected and its children -are killed, process itself will be killed in an OOM situation when it does -not have children or some of them disabled oom like described above. +3.1 /proc//oom_adj & /proc//oom_score_adj- Adjust the oom-killer score +-------------------------------------------------------------------------------- + +These file can be used to adjust the badness heuristic used to select which +process gets killed in out of memory conditions. + +The badness heuristic assigns a value to each candidate task ranging from 0 +(never kill) to 1000 (always kill) to determine which process is targeted. The +units are roughly a proportion along that range of allowed memory the process +may allocate from based on an estimation of its current memory and swap use. +For example, if a task is using all allowed memory, its badness score will be +1000. If it is using half of its allowed memory, its score will be 500. + +There is an additional factor included in the badness score: root +processes are given 3% extra memory over other tasks. + +The amount of "allowed" memory depends on the context in which the oom killer +was called. If it is due to the memory assigned to the allocating task's cpuset +being exhausted, the allowed memory represents the set of mems assigned to that +cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed +memory represents the set of mempolicy nodes. If it is due to a memory +limit (or swap limit) being reached, the allowed memory is that configured +limit. Finally, if it is due to the entire system being out of memory, the +allowed memory represents all allocatable resources. + +The value of /proc//oom_score_adj is added to the badness score before it +is used to determine which task to kill. Acceptable values range from -1000 +(OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX). This allows userspace to +polarize the preference for oom killing either by always preferring a certain +task or completely disabling it. The lowest possible value, -1000, is +equivalent to disabling oom killing entirely for that task since it will always +report a badness score of 0. + +Consequently, it is very simple for userspace to define the amount of memory to +consider for each task. Setting a /proc//oom_score_adj value of +500, for +example, is roughly equivalent to allowing the remainder of tasks sharing the +same system, cpuset, mempolicy, or memory controller resources to use at least +50% more memory. A value of -500, on the other hand, would be roughly +equivalent to discounting 50% of the task's allowed memory from being considered +as scoring against the task. + +For backwards compatibility with previous kernels, /proc//oom_adj may also +be used to tune the badness score. Its acceptable values range from -16 +(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 +(OOM_DISABLE) to disable oom killing entirely for that task. Its value is +scaled linearly with /proc//oom_score_adj. + +Writing to /proc//oom_score_adj or /proc//oom_adj will change the +other with its scaled value. + +Caveat: when a parent task is selected, the oom killer will sacrifice any first +generation children with seperate address spaces instead, if possible. This +avoids servers and important system daemons from being killed and loses the +minimal amount of work. 
+ 3.2 /proc//oom_score - Display current oom-killer score ------------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index 5949d0ac30f2..f923b728388a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long points = 0; - struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); if (pid_alive(task)) - points = badness(task, NULL, NULL, uptime.tv_sec); + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, } task->signal->oom_adj = oom_adjust; - + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = { .llseek = generic_file_llseek, }; +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + long oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + if (err) + return -EINVAL; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) + return -EINVAL; + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EACCES; + } + + task->signal->oom_score_adj = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. 
+ */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + put_task_struct(task); + return count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, +}; + #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUSR, proc_sessionid_operations), diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9f1afd361583..73564cac38c7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val); unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, int nid, int zid); +u64 mem_cgroup_get_limit(struct mem_cgroup *mem); + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return 0; } +static inline +u64 mem_cgroup_get_limit(struct mem_cgroup *mem) +{ + return 0; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 40e5e3a6bc20..73b8d7b6dd19 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -1,14 +1,24 @@ #ifndef __INCLUDE_LINUX_OOM_H #define __INCLUDE_LINUX_OOM_H -/* /proc//oom_adj set to -17 protects from the oom-killer */ +/* + * /proc//oom_adj set to -17 protects from the oom-killer + */ #define OOM_DISABLE (-17) /* inclusive */ #define OOM_ADJUST_MIN (-16) #define OOM_ADJUST_MAX 15 +/* + * /proc//oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for + * pid. 
+ */ +#define OOM_SCORE_ADJ_MIN (-1000) +#define OOM_SCORE_ADJ_MAX 1000 + #ifdef __KERNEL__ +#include #include #include @@ -27,6 +37,8 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9591907c4f79..ce160d68f5e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -621,7 +621,8 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif - int oom_adj; /* OOM kill score adjustment (bit shift) */ + int oom_adj; /* OOM kill score adjustment (bit shift) */ + int oom_score_adj; /* OOM kill score adjustment */ }; /* Context switch must be unlocked if interrupts are to be enabled */ diff --git a/kernel/fork.c b/kernel/fork.c index a82a65cef741..98b450876f93 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31abd1c2c0c5..de54ea0094a1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1126,6 +1126,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem) return num; } +/* + * Return the memory (and swap, if configured) limit for a memcg. + */ +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + u64 limit; + u64 memsw; + + limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + + total_swap_pages; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + /* + * If memsw is finite and limits the amount of swap space available + * to this memcg, return that limit. + */ + return min(limit, memsw); +} + /* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0a4ca8a0234b..d3def05a33d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -4,6 +4,8 @@ * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes * * The routines in this file are used to kill a process when * we're seriously out of memory. This gets called from __alloc_pages() @@ -34,7 +36,6 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* #define DEBUG */ #ifdef CONFIG_NUMA /** @@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, } /** - * badness - calculate a numeric value for how bad this task has been + * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate - * @uptime: current uptime in seconds + * @totalpages: total present RAM allowed for page allocation * - * The formula used is relatively simple and documented inline in the - * function. The main rationale is that we want to select a good task - * to kill when we run out of memory. 
- * - * Good in this context means that: - * 1) we lose the minimum amount of work done - * 2) we recover a large amount of memory - * 3) we don't kill anything innocent of eating tons of memory - * 4) we want to kill the minimum amount of processes (one) - * 5) we try to kill the process the user expects us to kill, this - * algorithm has been meticulously tuned to meet the principle - * of least surprise ... (be careful when you change it) + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. */ -unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask, unsigned long uptime) +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points, cpu_time, run_time; - struct task_struct *child; - struct task_struct *c, *t; - int oom_adj = p->signal->oom_adj; - struct task_cputime task_time; - unsigned long utime; - unsigned long stime; + int points; if (oom_unkillable_task(p, mem, nodemask)) return 0; - if (oom_adj == OOM_DISABLE) - return 0; p = find_lock_task_mm(p); if (!p) return 0; /* - * The memory size of the process is the basis for the badness. - */ - points = p->mm->total_vm; - task_unlock(p); - - /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_OOM_ORIGIN) - return ULONG_MAX; - - /* - * Processes which fork a lot of child processes are likely - * a good choice. We add half the vmsize of the children if they - * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children. In case a single - * child is eating the vast majority of memory, adding only half - * to the parents will make the child our kill candidate of choice. + * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't + * need to be executed for something that cannot be killed. */ - t = p; - do { - list_for_each_entry(c, &t->children, sibling) { - child = find_lock_task_mm(c); - if (child) { - if (child->mm != p->mm) - points += child->mm->total_vm/2 + 1; - task_unlock(child); - } - } - } while_each_thread(p, t); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; + } /* - * CPU time is in tens of seconds and run time is in thousands - * of seconds. There is no particular reason for this other than - * that it turned out to work very well in practice. + * When the PF_OOM_ORIGIN bit is set, it indicates the task should have + * priority for oom killing. */ - thread_group_cputime(p, &task_time); - utime = cputime_to_jiffies(task_time.utime); - stime = cputime_to_jiffies(task_time.stime); - cpu_time = (utime + stime) >> (SHIFT_HZ + 3); - - - if (uptime >= p->start_time.tv_sec) - run_time = (uptime - p->start_time.tv_sec) >> 10; - else - run_time = 0; - - if (cpu_time) - points /= int_sqrt(cpu_time); - if (run_time) - points /= int_sqrt(int_sqrt(run_time)); + if (p->flags & PF_OOM_ORIGIN) { + task_unlock(p); + return 1000; + } /* - * Niced processes are most likely less important, so double - * their badness points. + * The memory controller may have a limit of 0 bytes, so avoid a divide + * by zero, if necessary. */ - if (task_nice(p) > 0) - points *= 2; + if (!totalpages) + totalpages = 1; /* - * Superuser processes are usually more important, so we make it - * less likely that we kill those. 
+ * The baseline for the badness score is the proportion of RAM that each + * task's rss and swap space use. */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN) || - has_capability_noaudit(p, CAP_SYS_RESOURCE)) - points /= 4; + points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / + totalpages; + task_unlock(p); /* - * We don't want to kill a process with direct hardware access. - * Not only could that mess up the hardware, but usually users - * tend to only have this flag set on applications they think - * of as important. + * Root processes get 3% bonus, just like the __vm_enough_memory() + * implementation used by LSMs. */ - if (has_capability_noaudit(p, CAP_SYS_RAWIO)) - points /= 4; + if (has_capability_noaudit(p, CAP_SYS_ADMIN)) + points -= 30; /* - * Adjust the score by oom_adj. + * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may + * either completely disable oom killing or always prefer a certain + * task. */ - if (oom_adj) { - if (oom_adj > 0) { - if (!points) - points = 1; - points <<= oom_adj; - } else - points >>= -(oom_adj); - } + points += p->signal->oom_score_adj; -#ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", - p->pid, p->comm, points); -#endif - return points; + if (points < 0) + return 0; + return (points < 1000) ? points : 1000; } /* @@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, */ #ifdef CONFIG_NUMA static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); + bool cpuset_limited = false; + int nid; + /* Default to all available memory */ + *totalpages = totalram_pages + total_swap_pages; + + if (!zonelist) + return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. @@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; /* - * The nodemask here is a nodemask passed to alloc_pages(). Now, - * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy - * feature. mempolicy is an only user of nodemask here. - * check mempolicy's nodemask contains all N_HIGH_MEMORY + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). 
*/ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, *nodemask) + *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; + } /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) - return CONSTRAINT_CPUSET; + cpuset_limited = true; + if (cpuset_limited) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + *totalpages += node_spanned_pages(nid); + return CONSTRAINT_CPUSET; + } return CONSTRAINT_NONE; } #else static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask) + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) { + *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif @@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem, const nodemask_t *nodemask) +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, struct mem_cgroup *mem, + const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *chosen = NULL; - struct timespec uptime; *ppoints = 0; - do_posix_clock_monotonic_gettime(&uptime); for_each_process(p) { - unsigned long points; + unsigned int points; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + *ppoints = 1000; } - points = badness(p, mem, nodemask, uptime.tv_sec); - if (points > *ppoints || !chosen) { + points = oom_badness(p, mem, nodemask, totalpages); + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, * * Dumps the current memory state of all system tasks, excluding kernel threads. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * score, and name. + * value, oom_score_adj value, and name. * * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are * shown. 
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem) struct task_struct *p; struct task_struct *task; - printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " - "name\n"); + pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { if (p->flags & PF_KTHREAD) continue; @@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem) continue; } - printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u %3d %s\n", - task->pid, __task_cred(task)->uid, task->tgid, - task->mm->total_vm, get_mm_rss(task->mm), - task_cpu(task), task->signal->oom_adj, task->comm); + pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + task->pid, __task_cred(task)->uid, task->tgid, + task->mm->total_vm, get_mm_rss(task->mm), + task_cpu(task), task->signal->oom_adj, + task->signal->oom_score_adj, task->comm); task_unlock(task); } } @@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj); + "oom_adj=%d, oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj, + current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) #undef K static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, - nodemask_t *nodemask, const char *message) + unsigned int points, unsigned long totalpages, + struct mem_cgroup *mem, nodemask_t *nodemask, + const char *message) { struct task_struct *victim = p; struct task_struct *child; struct task_struct *t = p; - unsigned long victim_points = 0; - struct timespec uptime; + unsigned int victim_points = 0; if (printk_ratelimit()) dump_header(p, gfp_mask, order, mem); @@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } task_lock(p); - pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n", + pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", message, task_pid_nr(p), p->comm, points); task_unlock(p); @@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. This attempts to lose the minimal amount of work done while * still freeing memory. 
- do_posix_clock_monotonic_gettime(&uptime); do { list_for_each_entry(child, &t->children, sibling) { - unsigned long child_points; + unsigned int child_points; - /* badness() returns 0 if the thread is unkillable */ - child_points = badness(child, mem, nodemask, - uptime.tv_sec); + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, mem, nodemask, + totalpages); if (child_points > victim_points) { victim = child; victim_points = child_points; @@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { - unsigned long points = 0; + unsigned long limit; + unsigned int points = 0; struct task_struct *p; check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); + limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem, NULL); + p = select_bad_process(&points, limit, mem, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; - if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL, + if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, "Memory cgroup out of memory")) goto retry; out: @@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { struct task_struct *p; + unsigned long totalpages; unsigned long freed = 0; - unsigned long points; + unsigned int points; enum oom_constraint constraint = CONSTRAINT_NONE; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); @@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - if (zonelist) - constraint = constrained_alloc(zonelist, gfp_mask, nodemask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask, + &totalpages); check_panic_on_oom(constraint, gfp_mask, order); read_lock(&tasklist_lock); @@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * non-zero, current could not be killed so we must fallback to * the tasklist scan. */ - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, - nodemask, + if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, + NULL, nodemask, "Out of memory (oom_kill_allocating_task)")) return; } retry: - p = select_bad_process(&points, NULL, + p = select_bad_process(&points, totalpages, NULL, constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : NULL); if (PTR_ERR(p) == -1UL) @@ -738,8 +701,8 @@ retry: panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask, - "Out of memory")) + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory")) goto retry; read_unlock(&tasklist_lock); -- cgit v1.2.3 From 51b1bd2ace1595b72956224deda349efa880b693 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:47 -0700 Subject: oom: deprecate oom_adj tunable /proc/pid/oom_adj is now deprecated so that it may eventually be removed. The target date for removal is August 2012. A warning will be printed to the kernel log if a task attempts to use this interface. Future warnings will be suppressed until the kernel is rebooted to prevent spamming the kernel log.
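The once-per-boot suppression comes from printk_once() in the patch below: a static flag in the macro expansion guards the message, so only the first writer after boot triggers it. A stand-alone sketch of that idiom (plain C; warn_once here is an invented name for illustration, not the kernel macro):

#include <stdio.h>
#include <stdbool.h>

/* Expands to a block with its own static flag per call site, so each
 * call site warns at most once per program run (per boot, in-kernel). */
#define warn_once(...) do {				\
		static bool warned;			\
		if (!warned) {				\
			warned = true;			\
			fprintf(stderr, __VA_ARGS__);	\
		}					\
	} while (0)

int main(void)
{
	for (int i = 0; i < 3; i++)	/* three writes, one warning */
		warn_once("oom_adj is deprecated, use oom_score_adj instead\n");
	return 0;
}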
Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 25 +++++++++++++++++++++++++ Documentation/filesystems/proc.txt | 3 +++ fs/proc/base.c | 8 ++++++++ include/linux/oom.h | 3 +++ 4 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 71f0fea1058f..56cee4727b1a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -151,6 +151,31 @@ Who: Eric Biederman --------------------------- +What: /proc//oom_adj +When: August 2012 +Why: /proc//oom_adj allows userspace to influence the oom killer's + badness heuristic used to determine which task to kill when the kernel + is out of memory. + + The badness heuristic has since been rewritten since the introduction of + this tunable such that its meaning is deprecated. The value was + implemented as a bitshift on a score generated by the badness() + function that did not have any precise units of measure. With the + rewrite, the score is given as a proportion of available memory to the + task allocating pages, so using a bitshift which grows the score + exponentially is, thus, impossible to tune with fine granularity. + + A much more powerful interface, /proc//oom_score_adj, was + introduced with the oom killer rewrite that allows users to increase or + decrease the badness() score linearly. This interface will replace + /proc//oom_adj. + + A warning will be emitted to the kernel log if an application uses this + deprecated interface. After it is printed once, future warnings will be + suppressed until the kernel is rebooted. + +--------------------------- + What: remove EXPORT_SYMBOL(kernel_thread) When: August 2006 Files: arch/*/kernel/*_ksyms.c diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cf1295c2bb66..a6aca8740883 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1285,6 +1285,9 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +NOTICE: /proc//oom_adj is deprecated and will be removed, please see +Documentation/feature-removal-schedule.txt. + Caveat: when a parent task is selected, the oom killer will sacrifice any first generation children with seperate address spaces instead, if possible. This avoids servers and important system daemons from being killed and loses the diff --git a/fs/proc/base.c b/fs/proc/base.c index f923b728388a..69254a365ce2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1037,6 +1037,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. 
+ */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " + "please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), + task_pid_nr(task), task_pid_nr(task)); task->signal->oom_adj = oom_adjust; /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum diff --git a/include/linux/oom.h b/include/linux/oom.h index 73b8d7b6dd19..f209b683e118 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -2,6 +2,9 @@ #define __INCLUDE_LINUX_OOM_H /* + * /proc/<pid>/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. + * * /proc/<pid>/oom_adj set to -17 protects from the oom-killer */ #define OOM_DISABLE (-17) -- cgit v1.2.3 From ad8c2ee801ad7a52d919b478d9b2c7b39a72d295 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Aug 2010 17:19:48 -0700 Subject: rmap: add exclusive page to private anon_vma on swapin On swapin it is fairly common for a page to be owned exclusively by one process. In that case we want to add the page to the anon_vma of that process's VMA, instead of to the root anon_vma. This will reduce the amount of rmap searching that the swapout code needs to do. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ mm/memory.c | 4 +++- mm/rmap.c | 13 ++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc9b3c0bf5d4..d6661de56f30 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -162,6 +162,8 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); diff --git a/mm/memory.c b/mm/memory.c index 6b0c37dcfd16..6bc039486e9f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2628,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; struct mem_cgroup *ptr = NULL; + int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2722,10 +2723,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; + exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); diff --git a/mm/rmap.c b/mm/rmap.c index 4d152a6d3a89..a7d0f5482634 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -829,6 +829,17 @@ static void __page_check_anon_rmap(struct page *page, */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) +{ + do_page_add_anon_rmap(page, vma, address, 0); +} + +/* + * Special version of the above for do_swap_page, which often runs + * into pages that are exclusively owned by the current process. + * Everybody else should continue to use page_add_anon_rmap above.
+ */ +void do_page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); if (first) @@ -839,7 +850,7 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (first) - __page_set_anon_rmap(page, vma, address, 0); + __page_set_anon_rmap(page, vma, address, exclusive); else __page_check_anon_rmap(page, vma, address); } -- cgit v1.2.3 From d2997b1042ec150616c1963b5e5e919ffd0b0ebf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 9 Aug 2010 17:20:11 -0700 Subject: hibernation: freeze swap at hibernation When taking a memory snapshot in hibernate_snapshot(), all (directly called) memory allocations use GFP_ATOMIC. Hence swap misusage during hibernation never occurs. But from a pessimistic point of view, there is no guarantee that no page allocation has __GFP_WAIT. It is better to have a global indication "we enter hibernation, don't use swap!". This patch tries to freeze new-swap-allocation during hibernation. (All user processes are frozen, so swapin is not a concern). This way, no updates will happen to swap_map[] between hibernate_snapshot() and save_image(). Swap is thawed when swsusp_free() is called. We can be assured that swap corruption will not occur. Signed-off-by: KAMEZAWA Hiroyuki Cc: "Rafael J. Wysocki" Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Ondrej Zary Cc: Balbir Singh Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 ++++- kernel/power/hibernate.c | 1 + kernel/power/snapshot.c | 1 + kernel/power/swap.c | 6 ++-- mm/swapfile.c | 94 ++++++++++++++++++++++++++++++++++++------------ 5 files changed, 84 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ff4acea9bbdb..91c9d3fc8513 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,7 +316,6 @@ extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); -extern swp_entry_t get_swap_page_of_type(int); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); @@ -333,6 +332,13 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +#ifdef CONFIG_HIBERNATION +void hibernation_freeze_swap(void); +void hibernation_thaw_swap(void); +swp_entry_t get_swap_for_hibernation(int type); +void swap_free_for_hibernation(swp_entry_t val); +#endif + /* linux/mm/thrash.c */ extern struct mm_struct *swap_token_mm; extern void grab_swap_token(struct mm_struct *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..c77963938bca 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,6 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f6cd6faf84fd..5e7edfb05e66 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1086,6 +1086,7 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory.
*/ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e6a5bdf61a37..5d0059eed3e4 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(get_swap_for_hibernation(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index f08d165871b3..1f3f9c59a73a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,6 +47,8 @@ long nr_swap_pages; long total_swap_pages; static int least_priority; +static bool swap_for_hibernation; + static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; @@ -451,6 +453,8 @@ swp_entry_t get_swap_page(void) spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; + if (swap_for_hibernation) + goto noswap; nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { @@ -483,28 +487,6 @@ noswap: return (swp_entry_t) {0}; } -/* The only caller of this function is now susupend routine */ -swp_entry_t get_swap_page_of_type(int type) -{ - struct swap_info_struct *si; - pgoff_t offset; - - spin_lock(&swap_lock); - si = swap_info[type]; - if (si && (si->flags & SWP_WRITEOK)) { - nr_swap_pages--; - /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, 1); - if (offset) { - spin_unlock(&swap_lock); - return swp_entry(type, offset); - } - nr_swap_pages++; - } - spin_unlock(&swap_lock); - return (swp_entry_t) {0}; -} - static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -764,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) #endif #ifdef CONFIG_HIBERNATION + +static pgoff_t hibernation_offset[MAX_SWAPFILES]; +/* + * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise, + * the swap_map[] image saved to disk will be incomplete because it is + * changing without synchronization with the hibernation snapshot. + * At resume, we just make swap_for_hibernation=false. We can forget + * used maps easily. + */ +void hibernation_freeze_swap(void) +{ + int i; + + spin_lock(&swap_lock); + + printk(KERN_INFO "PM: Freeze Swap\n"); + swap_for_hibernation = true; + for (i = 0; i < MAX_SWAPFILES; i++) + hibernation_offset[i] = 1; + spin_unlock(&swap_lock); +} + +void hibernation_thaw_swap(void) +{ + spin_lock(&swap_lock); + if (swap_for_hibernation) { + printk(KERN_INFO "PM: Thaw Swap\n"); + swap_for_hibernation = false; + } + spin_unlock(&swap_lock); +} + +/* + * Because updating swap_map[] can make changes that are not saved, + * we use our own simple allocator. + * Please see kernel/power/swap.c; used swaps are recorded into an + * RB-tree.
+ */ +swp_entry_t get_swap_for_hibernation(int type) +{ + pgoff_t off; + swp_entry_t val = {0}; + struct swap_info_struct *si; + + spin_lock(&swap_lock); + + si = swap_info[type]; + if (!si || !(si->flags & SWP_WRITEOK)) + goto done; + + for (off = hibernation_offset[type]; off < si->max; ++off) { + if (!si->swap_map[off]) + break; + } + if (off < si->max) { + val = swp_entry(type, off); + hibernation_offset[type] = off + 1; + } +done: + spin_unlock(&swap_lock); + return val; +} + +void swap_free_for_hibernation(swp_entry_t ent) +{ + /* Nothing to do */ +} + /* * Find the swap type that corresponds to given device (if any). * -- cgit v1.2.3 From 71abbbf856a0e70ca478782505c800891260ba84 Mon Sep 17 00:00:00 2001 From: Ai Li Date: Mon, 9 Aug 2010 17:20:13 -0700 Subject: cpuidle: extend cpuidle and menu governor to handle dynamic states On some SoC chips, HW resources may be in use during any particular idle period. As a consequence, the cpuidle states that the SoC is safe to enter can change from idle period to idle period. In addition, the latency and threshold of each cpuidle state can vary, depending on the operating condition when the CPU becomes idle, e.g. the current cpu frequency, the current state of the HW blocks, etc. cpuidle core and the menu governor, in the current form, are geared towards cpuidle states that are static, i.e. the availability of the states, their latencies, their thresholds are non-changing during run time. cpuidle does not provide any hook that cpuidle drivers can use to adjust those values on the fly for the current idle period before the menu governor selects the target cpuidle state. This patch extends cpuidle core and the menu governor to handle states that are dynamic. There are three additions in the patch and the patch maintains backwards-compatibility with existing cpuidle drivers. 1) add prepare() to struct cpuidle_device. A cpuidle driver can hook into the callback and cpuidle will call prepare() before calling the governor's select function. The callback gives the cpuidle driver a chance to update the dynamic information of the cpuidle states for the current idle period, e.g. state availability, latencies, thresholds, power values, etc. 2) add CPUIDLE_FLAG_IGNORE as one of the state flags. In the prepare() function, a cpuidle driver can set/clear the flag to indicate to the menu governor whether a cpuidle state should be ignored, i.e. not available, during the current idle period. 3) add power_specified bit to struct cpuidle_device. The menu governor currently assumes that the cpuidle states are arranged in the order of increasing latency, threshold, and power savings. This is true or can be made true for static states. Once the state parameters are dynamic, the latencies, thresholds, and power savings for the cpuidle states can increase or decrease by different amounts from idle period to idle period. So the assumption of increasing latency, threshold, and power savings from Cn to C(n+1) can no longer be guaranteed. It can be straightforward to calculate the power consumption of each available state and to specify it in power_usage for the idle period. Using the power_usage fields, the menu governor then selects the state that has the lowest power consumption and that still satisfies all other criteria. The power_specified bit defaults to 0. For existing cpuidle drivers, cpuidle detects that power_specified is 0 and fills in a dummy set of power_usage values.
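To make the driver-side contract concrete, a hypothetical SoC driver could use the new hooks roughly as follows (a sketch only: the soc_* helpers and the choice of which state to update are invented; dev->prepare, dev->power_specified, CPUIDLE_FLAG_IGNORE, and the cpuidle_state fields come from this patch and the existing cpuidle API):

#include <linux/cpuidle.h>

/* hypothetical platform queries -- not a real API */
extern bool soc_deep_state_blocked(void);
extern unsigned int soc_deep_exit_latency_us(void);
extern unsigned int soc_deep_power_mw(void);

static int soc_cpuidle_prepare(struct cpuidle_device *dev)
{
	/* assume the deepest state is the last one registered */
	struct cpuidle_state *deep = &dev->states[dev->state_count - 1];

	/* re-evaluate availability for this idle period */
	if (soc_deep_state_blocked())
		deep->flags |= CPUIDLE_FLAG_IGNORE;
	else
		deep->flags &= ~CPUIDLE_FLAG_IGNORE;

	/* refresh the per-period parameters the menu governor reads */
	deep->exit_latency = soc_deep_exit_latency_us();
	deep->power_usage = soc_deep_power_mw();
	return 0;
}

static int soc_cpuidle_register(struct cpuidle_device *dev)
{
	dev->prepare = soc_cpuidle_prepare;
	dev->power_specified = 1;	/* we supply real power_usage numbers */
	return cpuidle_register_device(dev);
}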
Signed-off-by: Ai Li Cc: Len Brown Acked-by: Arjan van de Ven Cc: Ingo Molnar Cc: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cpuidle/cpuidle.c | 31 +++++++++++++++++++++++++++++++ drivers/cpuidle/governors/menu.c | 23 ++++++++++++++++------- include/linux/cpuidle.h | 4 ++++ 3 files changed, 51 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index dbefe15bd582..a50710843378 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -74,6 +74,17 @@ static void cpuidle_idle_call(void) */ hrtimer_peek_ahead_timers(); #endif + + /* + * Call the device's prepare function before calling the + * governor's select function. ->prepare gives the device's + * cpuidle driver a chance to update any dynamic information + * of its cpuidle states for the current idle period, e.g. + * state availability, latencies, residencies, etc. + */ + if (dev->prepare) + dev->prepare(dev); + /* ask the governor for the next state */ next_state = cpuidle_curr_governor->select(dev); if (need_resched()) { @@ -282,6 +293,26 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) poll_idle_init(dev); + /* + * cpuidle driver should set the dev->power_specified bit + * before registering the device if the driver provides + * power_usage numbers. + * + * For those devices whose ->power_specified is not set, + * we fill in power_usage with decreasing values as the + * cpuidle code has an implicit assumption that state Cn + * uses less power than C(n-1). + * + * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned + * a power value of -1. So we use -2, -3, etc., for other + * c-states. + */ + if (!dev->power_specified) { + int i; + for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) + dev->states[i].power_usage = -1 - i; + } + per_cpu(cpuidle_devices, dev->cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); if ((ret = cpuidle_add_sysfs(sys_dev))) { diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 1b128702d300..c2408bbe9c2e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -234,6 +234,7 @@ static int menu_select(struct cpuidle_device *dev) { struct menu_device *data = &__get_cpu_var(menu_devices); int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); + unsigned int power_usage = -1; int i; int multiplier; @@ -278,19 +279,27 @@ static int menu_select(struct cpuidle_device *dev) if (data->expected_us > 5) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; - - /* find the deepest idle state that satisfies our constraints */ + /* + * Find the idle state with the lowest power while satisfying + * our constraints.
+ */ for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) { struct cpuidle_state *s = &dev->states[i]; + if (s->flags & CPUIDLE_FLAG_IGNORE) + continue; if (s->target_residency > data->predicted_us) - break; + continue; if (s->exit_latency > latency_req) - break; + continue; if (s->exit_latency * multiplier > data->predicted_us) - break; - data->exit_us = s->exit_latency; - data->last_state_idx = i; + continue; + + if (s->power_usage < power_usage) { + power_usage = s->power_usage; + data->last_state_idx = i; + data->exit_us = s->exit_latency; + } } return data->last_state_idx; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 55215cce5005..36ca9721a0c2 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -52,6 +52,7 @@ struct cpuidle_state { #define CPUIDLE_FLAG_SHALLOW (0x20) /* low latency, minimal savings */ #define CPUIDLE_FLAG_BALANCED (0x40) /* medium latency, moderate savings */ #define CPUIDLE_FLAG_DEEP (0x80) /* high latency, large savings */ +#define CPUIDLE_FLAG_IGNORE (0x100) /* ignore during this idle period */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) @@ -84,6 +85,7 @@ struct cpuidle_state_kobj { struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int power_specified:1; unsigned int cpu; int last_residency; @@ -97,6 +99,8 @@ struct cpuidle_device { struct completion kobj_unregister; void *governor_data; struct cpuidle_state *safe_state; + + int (*prepare) (struct cpuidle_device *dev); }; DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); -- cgit v1.2.3 From ea6b101d8a3ea4e1dec29df31188c2f9852296fe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 9 Aug 2010 17:20:18 -0700 Subject: include/linux/compiler-gcc.h: use __same_type() in __must_be_array() We should use the __same_type() helper in __must_be_array(). Signed-off-by: Rusty Russell Reported-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0da5b187f124..16508bcddacc 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -35,8 +35,7 @@ (typeof(ptr)) (__ptr + (off)); }) /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) \ - BUILD_BUG_ON_ZERO(__builtin_types_compatible_p(typeof(a), typeof(&a[0]))) +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) /* * Force always-inline if the user requests it so via the .config, -- cgit v1.2.3 From cf4ca4874fc45166198424384275f443a672d0b7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 9 Aug 2010 17:20:20 -0700 Subject: kernel.h: remove unused NIPQUAD and NIPQUAD_FMT There are no more uses of NIPQUAD or NIPQUAD_FMT. Remove the definitions. Signed-off-by: Joe Perches Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7d5b10ff63e0..5b57236dfbd0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -615,17 +615,6 @@ ftrace_vprintk(const char *fmt, va_list ap) static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ -/* - * Display an IP address in readable format. 
*/ - -#define NIPQUAD(addr) \ - ((unsigned char *)&addr)[0], \ - ((unsigned char *)&addr)[1], \ - ((unsigned char *)&addr)[2], \ - ((unsigned char *)&addr)[3] -#define NIPQUAD_FMT "%u.%u.%u.%u" - /* * min()/max()/clamp() macros that also do * strict type-checking.. See the -- cgit v1.2.3 From e269b085175acf03fc687a7416b9fd84aa9c6c23 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 9 Aug 2010 17:20:23 -0700 Subject: iommu: inline iommu_num_pages A profile of a network benchmark showed iommu_num_pages rather high up: 0.52% iommu_num_pages Looking at the profile, an integer divide is taking almost all of the time: % : c000000000376ea4 <.iommu_num_pages>: 1.93 : c000000000376ea4: fb e1 ff f8 std r31,-8(r1) 0.00 : c000000000376ea8: f8 21 ff c1 stdu r1,-64(r1) 0.00 : c000000000376eac: 7c 3f 0b 78 mr r31,r1 3.86 : c000000000376eb0: 38 84 ff ff addi r4,r4,-1 0.00 : c000000000376eb4: 38 05 ff ff addi r0,r5,-1 0.00 : c000000000376eb8: 7c 84 2a 14 add r4,r4,r5 46.95 : c000000000376ebc: 7c 00 18 38 and r0,r0,r3 45.66 : c000000000376ec0: 7c 84 02 14 add r4,r4,r0 0.00 : c000000000376ec4: 7c 64 2b 92 divdu r3,r4,r5 0.00 : c000000000376ec8: 38 3f 00 40 addi r1,r31,64 0.00 : c000000000376ecc: eb e1 ff f8 ld r31,-8(r1) 1.61 : c000000000376ed0: 4e 80 00 20 blr Since every caller of iommu_num_pages passes in a constant power of two we can inline this such that the divide is replaced by a shift. The entire function is only a few instructions once optimised, so it is a good candidate for inlining overall. Signed-off-by: Anton Blanchard Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iommu-helper.h | 12 ++++++++++-- lib/iommu-helper.c | 9 --------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h index 64d1b638745d..86bdeffe43ad 100644 --- a/include/linux/iommu-helper.h +++ b/include/linux/iommu-helper.h @@ -1,6 +1,8 @@ #ifndef _LINUX_IOMMU_HELPER_H #define _LINUX_IOMMU_HELPER_H +#include <linux/kernel.h> + static inline unsigned long iommu_device_max_index(unsigned long size, unsigned long offset, u64 dma_mask) @@ -20,7 +22,13 @@ extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size, unsigned long boundary_size, unsigned long align_mask); -extern unsigned long iommu_num_pages(unsigned long addr, unsigned long len, - unsigned long io_page_size); +static inline unsigned long iommu_num_pages(unsigned long addr, + unsigned long len, + unsigned long io_page_size) +{ + unsigned long size = (addr & (io_page_size - 1)) + len; + + return DIV_ROUND_UP(size, io_page_size); +} #endif diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c0251f4ad08b..da053313ee5c 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -38,12 +38,3 @@ again: return -1; } EXPORT_SYMBOL(iommu_area_alloc); - -unsigned long iommu_num_pages(unsigned long addr, unsigned long len, - unsigned long io_page_size) -{ - unsigned long size = (addr & (io_page_size - 1)) + len; - - return DIV_ROUND_UP(size, io_page_size); -} -EXPORT_SYMBOL(iommu_num_pages); -- cgit v1.2.3 From ea98eed9bcb62d1319db8b1210712c6a110a886c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 9 Aug 2010 17:20:56 -0700 Subject: flex_array: add helpers to get and put to make pointers easy to use Getting and putting arrays of pointers with flex arrays is a PITA. You have to remember to pass &ptr to the _put and you have to do weird and wacky casting to get the ptr back from the _get.
Add two functions flex_array_get_ptr() and flex_array_put_ptr() to handle all of the magic. [akpm@linux-foundation.org: simplification suggested by Joe] Signed-off-by: Eric Paris Cc: David Rientjes Cc: Dave Hansen Cc: Joe Perches Cc: James Morris Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/flex_array.h | 5 +++++ lib/flex_array.c | 25 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 1d747f72298b..631b77f2ac70 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -70,4 +70,9 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr); void *flex_array_get(struct flex_array *fa, unsigned int element_nr); int flex_array_shrink(struct flex_array *fa); +#define flex_array_put_ptr(fa, nr, src, gfp) \ + flex_array_put(fa, nr, (void *)&(src), gfp) + +void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr); + #endif /* _FLEX_ARRAY_H */ diff --git a/lib/flex_array.c b/lib/flex_array.c index 41b1804fa728..77a6fea7481e 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -171,6 +171,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) * Note that this *copies* the contents of @src into * the array. If you are trying to store an array of * pointers, make sure to pass in &ptr instead of ptr. + * You may instead wish to use the flex_array_put_ptr() + * helper function. * * Locking must be provided by the caller. */ @@ -265,7 +267,8 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, * * Returns a pointer to the data at index @element_nr. Note * that this is a copy of the data that was passed in. If you - * are using this to store pointers, you'll get back &ptr. + * are using this to store pointers, you'll get back &ptr. You + * may instead wish to use the flex_array_get_ptr helper. * * Locking must be provided by the caller. */ @@ -286,6 +289,26 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) return &part->elements[index_inside_part(fa, element_nr)]; } +/** + * flex_array_get_ptr - pull a ptr back out of the array + * @fa: the flex array from which to extract data + * @element_nr: index of the element to fetch from the array + * + * Returns the pointer placed in the flex array at element_nr using + * flex_array_put_ptr(). This function should not be called if the + * element in question was not set using the _put_ptr() helper. + */ +void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) +{ + void **tmp; + + tmp = flex_array_get(fa, element_nr); + if (!tmp) + return NULL; + + return *tmp; +} + static int part_is_free(struct flex_array_part *part) { int i; -- cgit v1.2.3
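A short usage sketch of the new helpers (illustrative only; the example function, its sizes, and its error handling are not part of the patch):

#include <linux/flex_array.h>
#include <linux/slab.h>
#include <linux/string.h>

static int example_store_ptr(void)
{
	struct flex_array *fa;
	char *msg, *out;
	int err = -ENOMEM;

	msg = kstrdup("hello", GFP_KERNEL);
	if (!msg)
		return err;

	/* each element holds one pointer */
	fa = flex_array_alloc(sizeof(void *), 128, GFP_KERNEL);
	if (!fa)
		goto out_msg;

	/* no &msg and no casting needed with the _ptr helpers */
	err = flex_array_put_ptr(fa, 0, msg, GFP_KERNEL);
	if (err)
		goto out_fa;

	out = flex_array_get_ptr(fa, 0);	/* returns msg itself */
	err = (out == msg) ? 0 : -EINVAL;

out_fa:
	flex_array_free(fa);
out_msg:
	kfree(msg);
	return err;
}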