From 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 6 Sep 2005 15:16:27 -0700 Subject: [PATCH] detect soft lockups This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP. When enabled, per-CPU watchdog threads are started; they try to run once per second. If they get delayed by more than 10 seconds, a callback from the timer interrupt detects this condition and prints out a warning message and a stack dump (once per lockup incident). The feature is otherwise non-intrusive: it doesn't try to unlock the box in any way, it only gets the debug info out, automatically, on all CPUs affected by the lockup. Signed-off-by: Ingo Molnar Signed-off-by: Nishanth Aravamudan Signed-Off-By: Matthias Urlichs Signed-off-by: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index dec5827c7742..5fb31bede103 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -176,6 +176,23 @@ extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_DETECT_SOFTLOCKUP +extern void softlockup_tick(struct pt_regs *regs); +extern void spawn_softlockup_task(void); +extern void touch_softlockup_watchdog(void); +#else +static inline void softlockup_tick(struct pt_regs *regs) +{ +} +static inline void spawn_softlockup_task(void) +{ +} +static inline void touch_softlockup_watchdog(void) +{ +} +#endif + + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) /* Is this address in the __sched functions? */ -- cgit v1.2.3 From 36d57ac4a818cb4aa3edbdf63ad2ebc31106f925 Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Tue, 6 Sep 2005 15:16:49 -0700 Subject: [PATCH] auxiliary vector cleanups The size of the auxiliary vector is fixed at 42 in linux/sched.h, but that isn't very obvious when looking at linux/elf.h. This patch adds AT_VECTOR_SIZE so that we can change it if necessary when a new vector is added. Because of include file ordering problems, doing this necessitated the extraction of the AT_* symbols into a standalone header file.
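For context, the auxiliary vector these AT_* values describe is a table of (type, value) pairs that the kernel builds on a process's initial stack, just past the environment strings; glibc and the dynamic linker read it to learn the page size, clock tick rate, vDSO location, and so on. The following minimal user-space sketch is not part of this patch; it assumes a 64-bit system and that environ still points at the original stack (true at program start), and simply walks the vector and prints a few entries:

#include <stdio.h>
#include <elf.h>

extern char **environ;

int main(void)
{
	char **p = environ;
	Elf64_auxv_t *auxv;

	/* The auxiliary vector starts right after the NULL that
	 * terminates the environment array. */
	while (*p)
		p++;
	p++;

	for (auxv = (Elf64_auxv_t *)p; auxv->a_type != AT_NULL; auxv++) {
		if (auxv->a_type == AT_PAGESZ)
			printf("AT_PAGESZ:   %lu\n", (unsigned long)auxv->a_un.a_val);
		if (auxv->a_type == AT_CLKTCK)
			printf("AT_CLKTCK:   %lu\n", (unsigned long)auxv->a_un.a_val);
		if (auxv->a_type == AT_PLATFORM)
			printf("AT_PLATFORM: %s\n", (char *)auxv->a_un.a_val);
	}
	return 0;
}

AT_VECTOR_SIZE bounds how many such (type, value) slots the kernel saves per mm in saved_auxv for /proc/PID/auxv, which is why the magic 42 below becomes a named constant.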
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/auxvec.h | 24 ++++++++++++++++++++++++ include/asm-alpha/elf.h | 22 ++-------------------- include/asm-arm/auxvec.h | 4 ++++ include/asm-arm26/auxvec.h | 4 ++++ include/asm-cris/auxvec.h | 4 ++++ include/asm-frv/auxvec.h | 4 ++++ include/asm-h8300/auxvec.h | 4 ++++ include/asm-i386/auxvec.h | 11 +++++++++++ include/asm-i386/elf.h | 8 +------- include/asm-ia64/auxvec.h | 11 +++++++++++ include/asm-ia64/elf.h | 8 +------- include/asm-m32r/auxvec.h | 4 ++++ include/asm-m68k/auxvec.h | 4 ++++ include/asm-m68knommu/auxvec.h | 4 ++++ include/asm-mips/auxvec.h | 4 ++++ include/asm-parisc/auxvec.h | 4 ++++ include/asm-ppc/auxvec.h | 14 ++++++++++++++ include/asm-ppc/elf.h | 11 +---------- include/asm-ppc64/auxvec.h | 19 +++++++++++++++++++ include/asm-ppc64/elf.h | 16 +--------------- include/asm-s390/auxvec.h | 4 ++++ include/asm-sh/auxvec.h | 4 ++++ include/asm-sh64/auxvec.h | 4 ++++ include/asm-sparc/auxvec.h | 4 ++++ include/asm-sparc64/auxvec.h | 4 ++++ include/asm-um/auxvec.h | 4 ++++ include/asm-v850/auxvec.h | 4 ++++ include/asm-x86_64/auxvec.h | 4 ++++ include/asm-xtensa/auxvec.h | 4 ++++ include/linux/auxvec.h | 31 +++++++++++++++++++++++++++++++ include/linux/elf.h | 24 +----------------------- include/linux/sched.h | 4 +++- 32 files changed, 196 insertions(+), 83 deletions(-) create mode 100644 include/asm-alpha/auxvec.h create mode 100644 include/asm-arm/auxvec.h create mode 100644 include/asm-arm26/auxvec.h create mode 100644 include/asm-cris/auxvec.h create mode 100644 include/asm-frv/auxvec.h create mode 100644 include/asm-h8300/auxvec.h create mode 100644 include/asm-i386/auxvec.h create mode 100644 include/asm-ia64/auxvec.h create mode 100644 include/asm-m32r/auxvec.h create mode 100644 include/asm-m68k/auxvec.h create mode 100644 include/asm-m68knommu/auxvec.h create mode 100644 include/asm-mips/auxvec.h create mode 100644 include/asm-parisc/auxvec.h create mode 100644 include/asm-ppc/auxvec.h create mode 100644 include/asm-ppc64/auxvec.h create mode 100644 include/asm-s390/auxvec.h create mode 100644 include/asm-sh/auxvec.h create mode 100644 include/asm-sh64/auxvec.h create mode 100644 include/asm-sparc/auxvec.h create mode 100644 include/asm-sparc64/auxvec.h create mode 100644 include/asm-um/auxvec.h create mode 100644 include/asm-v850/auxvec.h create mode 100644 include/asm-x86_64/auxvec.h create mode 100644 include/asm-xtensa/auxvec.h create mode 100644 include/linux/auxvec.h (limited to 'include/linux/sched.h') diff --git a/include/asm-alpha/auxvec.h b/include/asm-alpha/auxvec.h new file mode 100644 index 000000000000..e96fe880e310 --- /dev/null +++ b/include/asm-alpha/auxvec.h @@ -0,0 +1,24 @@ +#ifndef __ASM_ALPHA_AUXVEC_H +#define __ASM_ALPHA_AUXVEC_H + +/* Reserve these numbers for any future use of a VDSO. */ +#if 0 +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 +#endif + +/* More complete cache descriptions than AT_[DIU]CACHEBSIZE. If the + value is -1, then the cache doesn't exist. Otherwise: + + bit 0-3: Cache set-associativity; 0 means fully associative. + bit 4-7: Log2 of cacheline size. + bit 8-31: Size of the entire cache >> 8. + bit 32-63: Reserved. 
+*/ + +#define AT_L1I_CACHESHAPE 34 +#define AT_L1D_CACHESHAPE 35 +#define AT_L2_CACHESHAPE 36 +#define AT_L3_CACHESHAPE 37 + +#endif /* __ASM_ALPHA_AUXVEC_H */ diff --git a/include/asm-alpha/elf.h b/include/asm-alpha/elf.h index e94a945a2314..6c2d78fba264 100644 --- a/include/asm-alpha/elf.h +++ b/include/asm-alpha/elf.h @@ -1,6 +1,8 @@ #ifndef __ASM_ALPHA_ELF_H #define __ASM_ALPHA_ELF_H +#include + /* Special values for the st_other field in the symbol table. */ #define STO_ALPHA_NOPV 0x80 @@ -142,26 +144,6 @@ extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task); : amask (AMASK_CIX) ? "ev6" : "ev67"); \ }) -/* Reserve these numbers for any future use of a VDSO. */ -#if 0 -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 -#endif - -/* More complete cache descriptions than AT_[DIU]CACHEBSIZE. If the - value is -1, then the cache doesn't exist. Otherwise: - - bit 0-3: Cache set-associativity; 0 means fully associative. - bit 4-7: Log2 of cacheline size. - bit 8-31: Size of the entire cache >> 8. - bit 32-63: Reserved. -*/ - -#define AT_L1I_CACHESHAPE 34 -#define AT_L1D_CACHESHAPE 35 -#define AT_L2_CACHESHAPE 36 -#define AT_L3_CACHESHAPE 37 - #ifdef __KERNEL__ #define SET_PERSONALITY(EX, IBCS2) \ diff --git a/include/asm-arm/auxvec.h b/include/asm-arm/auxvec.h new file mode 100644 index 000000000000..c0536f6b29a7 --- /dev/null +++ b/include/asm-arm/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMARM_AUXVEC_H +#define __ASMARM_AUXVEC_H + +#endif diff --git a/include/asm-arm26/auxvec.h b/include/asm-arm26/auxvec.h new file mode 100644 index 000000000000..c0536f6b29a7 --- /dev/null +++ b/include/asm-arm26/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMARM_AUXVEC_H +#define __ASMARM_AUXVEC_H + +#endif diff --git a/include/asm-cris/auxvec.h b/include/asm-cris/auxvec.h new file mode 100644 index 000000000000..cb30b01bf19f --- /dev/null +++ b/include/asm-cris/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMCRIS_AUXVEC_H +#define __ASMCRIS_AUXVEC_H + +#endif diff --git a/include/asm-frv/auxvec.h b/include/asm-frv/auxvec.h new file mode 100644 index 000000000000..07710778fa10 --- /dev/null +++ b/include/asm-frv/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __FRV_AUXVEC_H +#define __FRV_AUXVEC_H + +#endif diff --git a/include/asm-h8300/auxvec.h b/include/asm-h8300/auxvec.h new file mode 100644 index 000000000000..1d36fe38b088 --- /dev/null +++ b/include/asm-h8300/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMH8300_AUXVEC_H +#define __ASMH8300_AUXVEC_H + +#endif diff --git a/include/asm-i386/auxvec.h b/include/asm-i386/auxvec.h new file mode 100644 index 000000000000..395e13016bfb --- /dev/null +++ b/include/asm-i386/auxvec.h @@ -0,0 +1,11 @@ +#ifndef __ASMi386_AUXVEC_H +#define __ASMi386_AUXVEC_H + +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +#endif diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 130bdc8c68cf..fa11117d3cfa 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -9,6 +9,7 @@ #include #include #include /* for savesegment */ +#include #include @@ -109,13 +110,6 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #define ELF_PLATFORM (system_utsname.machine) -/* - * Architecture-neutral AT_ values in 0-17, leave some room - * for more of them, start the x86-specific ones at 32. 
- */ -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 - #ifdef __KERNEL__ #define SET_PERSONALITY(ex, ibcs2) do { } while (0) diff --git a/include/asm-ia64/auxvec.h b/include/asm-ia64/auxvec.h new file mode 100644 index 000000000000..23cebe5685b9 --- /dev/null +++ b/include/asm-ia64/auxvec.h @@ -0,0 +1,11 @@ +#ifndef _ASM_IA64_AUXVEC_H +#define _ASM_IA64_AUXVEC_H + +/* + * Architecture-neutral AT_ values are in the range 0-17. Leave some room for more of + * them, start the architecture-specific ones at 32. + */ +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +#endif /* _ASM_IA64_AUXVEC_H */ diff --git a/include/asm-ia64/elf.h b/include/asm-ia64/elf.h index 7d4ccc4b976e..446fce036fd9 100644 --- a/include/asm-ia64/elf.h +++ b/include/asm-ia64/elf.h @@ -12,6 +12,7 @@ #include #include +#include /* * This is used to ensure we don't load something for the wrong architecture. @@ -177,13 +178,6 @@ extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst); relevant until we have real hardware to play with... */ #define ELF_PLATFORM NULL -/* - * Architecture-neutral AT_ values are in the range 0-17. Leave some room for more of - * them, start the architecture-specific ones at 32. - */ -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 - #ifdef __KERNEL__ #define SET_PERSONALITY(ex, ibcs2) set_personality(PER_LINUX) #define elf_read_implies_exec(ex, executable_stack) \ diff --git a/include/asm-m32r/auxvec.h b/include/asm-m32r/auxvec.h new file mode 100644 index 000000000000..f76dcc860fae --- /dev/null +++ b/include/asm-m32r/auxvec.h @@ -0,0 +1,4 @@ +#ifndef _ASM_M32R__AUXVEC_H +#define _ASM_M32R__AUXVEC_H + +#endif /* _ASM_M32R__AUXVEC_H */ diff --git a/include/asm-m68k/auxvec.h b/include/asm-m68k/auxvec.h new file mode 100644 index 000000000000..844d6d52204b --- /dev/null +++ b/include/asm-m68k/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMm68k_AUXVEC_H +#define __ASMm68k_AUXVEC_H + +#endif diff --git a/include/asm-m68knommu/auxvec.h b/include/asm-m68knommu/auxvec.h new file mode 100644 index 000000000000..844d6d52204b --- /dev/null +++ b/include/asm-m68knommu/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMm68k_AUXVEC_H +#define __ASMm68k_AUXVEC_H + +#endif diff --git a/include/asm-mips/auxvec.h b/include/asm-mips/auxvec.h new file mode 100644 index 000000000000..7cf7f2d21943 --- /dev/null +++ b/include/asm-mips/auxvec.h @@ -0,0 +1,4 @@ +#ifndef _ASM_AUXVEC_H +#define _ASM_AUXVEC_H + +#endif /* _ASM_AUXVEC_H */ diff --git a/include/asm-parisc/auxvec.h b/include/asm-parisc/auxvec.h new file mode 100644 index 000000000000..9c3ac4b89dc9 --- /dev/null +++ b/include/asm-parisc/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMPARISC_AUXVEC_H +#define __ASMPARISC_AUXVEC_H + +#endif diff --git a/include/asm-ppc/auxvec.h b/include/asm-ppc/auxvec.h new file mode 100644 index 000000000000..172358df29c8 --- /dev/null +++ b/include/asm-ppc/auxvec.h @@ -0,0 +1,14 @@ +#ifndef __PPC_AUXVEC_H +#define __PPC_AUXVEC_H + +/* + * We need to put in some extra aux table entries to tell glibc what + * the cache block size is, so it can use the dcbz instruction safely. + */ +#define AT_DCACHEBSIZE 19 +#define AT_ICACHEBSIZE 20 +#define AT_UCACHEBSIZE 21 +/* A special ignored type value for PPC, for glibc compatibility. 
*/ +#define AT_IGNOREPPC 22 + +#endif diff --git a/include/asm-ppc/elf.h b/include/asm-ppc/elf.h index 2c056966efd3..c25cc35e6ab5 100644 --- a/include/asm-ppc/elf.h +++ b/include/asm-ppc/elf.h @@ -7,6 +7,7 @@ #include #include #include +#include /* PowerPC relocations defined by the ABIs */ #define R_PPC_NONE 0 @@ -122,16 +123,6 @@ extern int dump_task_fpu(struct task_struct *t, elf_fpregset_t *fpu); #define SET_PERSONALITY(ex, ibcs2) set_personality((ibcs2)?PER_SVR4:PER_LINUX) -/* - * We need to put in some extra aux table entries to tell glibc what - * the cache block size is, so it can use the dcbz instruction safely. - */ -#define AT_DCACHEBSIZE 19 -#define AT_ICACHEBSIZE 20 -#define AT_UCACHEBSIZE 21 -/* A special ignored type value for PPC, for glibc compatibility. */ -#define AT_IGNOREPPC 22 - extern int dcache_bsize; extern int icache_bsize; extern int ucache_bsize; diff --git a/include/asm-ppc64/auxvec.h b/include/asm-ppc64/auxvec.h new file mode 100644 index 000000000000..ac6381a106e1 --- /dev/null +++ b/include/asm-ppc64/auxvec.h @@ -0,0 +1,19 @@ +#ifndef __PPC64_AUXVEC_H +#define __PPC64_AUXVEC_H + +/* + * We need to put in some extra aux table entries to tell glibc what + * the cache block size is, so it can use the dcbz instruction safely. + */ +#define AT_DCACHEBSIZE 19 +#define AT_ICACHEBSIZE 20 +#define AT_UCACHEBSIZE 21 +/* A special ignored type value for PPC, for glibc compatibility. */ +#define AT_IGNOREPPC 22 + +/* The vDSO location. We have to use the same value as x86 for glibc's + * sake :-) + */ +#define AT_SYSINFO_EHDR 33 + +#endif /* __PPC64_AUXVEC_H */ diff --git a/include/asm-ppc64/elf.h b/include/asm-ppc64/elf.h index 085eedb956fe..c919a89343db 100644 --- a/include/asm-ppc64/elf.h +++ b/include/asm-ppc64/elf.h @@ -4,6 +4,7 @@ #include #include #include +#include /* PowerPC relocations defined by the ABIs */ #define R_PPC_NONE 0 @@ -237,21 +238,6 @@ do { \ #endif -/* - * We need to put in some extra aux table entries to tell glibc what - * the cache block size is, so it can use the dcbz instruction safely. - */ -#define AT_DCACHEBSIZE 19 -#define AT_ICACHEBSIZE 20 -#define AT_UCACHEBSIZE 21 -/* A special ignored type value for PPC, for glibc compatibility. */ -#define AT_IGNOREPPC 22 - -/* The vDSO location. 
We have to use the same value as x86 for glibc's - * sake :-) - */ -#define AT_SYSINFO_EHDR 33 - extern int dcache_bsize; extern int icache_bsize; extern int ucache_bsize; diff --git a/include/asm-s390/auxvec.h b/include/asm-s390/auxvec.h new file mode 100644 index 000000000000..0d340720fd99 --- /dev/null +++ b/include/asm-s390/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMS390_AUXVEC_H +#define __ASMS390_AUXVEC_H + +#endif diff --git a/include/asm-sh/auxvec.h b/include/asm-sh/auxvec.h new file mode 100644 index 000000000000..fc21e4db5881 --- /dev/null +++ b/include/asm-sh/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASM_SH_AUXVEC_H +#define __ASM_SH_AUXVEC_H + +#endif /* __ASM_SH_AUXVEC_H */ diff --git a/include/asm-sh64/auxvec.h b/include/asm-sh64/auxvec.h new file mode 100644 index 000000000000..1ad5a44bdc76 --- /dev/null +++ b/include/asm-sh64/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASM_SH64_AUXVEC_H +#define __ASM_SH64_AUXVEC_H + +#endif /* __ASM_SH64_AUXVEC_H */ diff --git a/include/asm-sparc/auxvec.h b/include/asm-sparc/auxvec.h new file mode 100644 index 000000000000..ad6f360261f6 --- /dev/null +++ b/include/asm-sparc/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASMSPARC_AUXVEC_H +#define __ASMSPARC_AUXVEC_H + +#endif /* !(__ASMSPARC_AUXVEC_H) */ diff --git a/include/asm-sparc64/auxvec.h b/include/asm-sparc64/auxvec.h new file mode 100644 index 000000000000..436a29129828 --- /dev/null +++ b/include/asm-sparc64/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASM_SPARC64_AUXVEC_H +#define __ASM_SPARC64_AUXVEC_H + +#endif /* !(__ASM_SPARC64_AUXVEC_H) */ diff --git a/include/asm-um/auxvec.h b/include/asm-um/auxvec.h new file mode 100644 index 000000000000..1e5e1c2fc9b1 --- /dev/null +++ b/include/asm-um/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __UM_AUXVEC_H +#define __UM_AUXVEC_H + +#endif diff --git a/include/asm-v850/auxvec.h b/include/asm-v850/auxvec.h new file mode 100644 index 000000000000..f493232d0224 --- /dev/null +++ b/include/asm-v850/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __V850_AUXVEC_H__ +#define __V850_AUXVEC_H__ + +#endif /* __V850_AUXVEC_H__ */ diff --git a/include/asm-x86_64/auxvec.h b/include/asm-x86_64/auxvec.h new file mode 100644 index 000000000000..2403c4cfced2 --- /dev/null +++ b/include/asm-x86_64/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __ASM_X86_64_AUXVEC_H +#define __ASM_X86_64_AUXVEC_H + +#endif diff --git a/include/asm-xtensa/auxvec.h b/include/asm-xtensa/auxvec.h new file mode 100644 index 000000000000..257dec75c5af --- /dev/null +++ b/include/asm-xtensa/auxvec.h @@ -0,0 +1,4 @@ +#ifndef __XTENSA_AUXVEC_H +#define __XTENSA_AUXVEC_H + +#endif diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h new file mode 100644 index 000000000000..9a7b374c9fb4 --- /dev/null +++ b/include/linux/auxvec.h @@ -0,0 +1,31 @@ +#ifndef _LINUX_AUXVEC_H +#define _LINUX_AUXVEC_H + +#include + +/* Symbolic values for the entries in the auxiliary table + put on the initial stack */ +#define AT_NULL 0 /* end of vector */ +#define AT_IGNORE 1 /* entry should be ignored */ +#define AT_EXECFD 2 /* file descriptor of program */ +#define AT_PHDR 3 /* program headers for program */ +#define AT_PHENT 4 /* size of program header entry */ +#define AT_PHNUM 5 /* number of program headers */ +#define AT_PAGESZ 6 /* system page size */ +#define AT_BASE 7 /* base address of interpreter */ +#define AT_FLAGS 8 /* flags */ +#define AT_ENTRY 9 /* entry point of program */ +#define AT_NOTELF 10 /* program is not ELF */ +#define AT_UID 11 /* real uid */ +#define AT_EUID 12 /* effective uid */ +#define AT_GID 13 /* real gid */ +#define AT_EGID 14 /* 
effective gid */ +#define AT_PLATFORM 15 /* string identifying CPU for optimizations */ +#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ +#define AT_CLKTCK 17 /* frequency at which times() increments */ + +#define AT_SECURE 23 /* secure mode boolean */ + +#define AT_VECTOR_SIZE 42 /* Size of auxiliary table. */ + +#endif /* _LINUX_AUXVEC_H */ diff --git a/include/linux/elf.h b/include/linux/elf.h index f5b3ba5a317d..ff955dbf510d 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -2,6 +2,7 @@ #define _LINUX_ELF_H #include +#include #include #ifndef elf_read_implies_exec @@ -158,29 +159,6 @@ typedef __s64 Elf64_Sxword; #define ELF64_ST_BIND(x) ELF_ST_BIND(x) #define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) -/* Symbolic values for the entries in the auxiliary table - put on the initial stack */ -#define AT_NULL 0 /* end of vector */ -#define AT_IGNORE 1 /* entry should be ignored */ -#define AT_EXECFD 2 /* file descriptor of program */ -#define AT_PHDR 3 /* program headers for program */ -#define AT_PHENT 4 /* size of program header entry */ -#define AT_PHNUM 5 /* number of program headers */ -#define AT_PAGESZ 6 /* system page size */ -#define AT_BASE 7 /* base address of interpreter */ -#define AT_FLAGS 8 /* flags */ -#define AT_ENTRY 9 /* entry point of program */ -#define AT_NOTELF 10 /* program is not ELF */ -#define AT_UID 11 /* real uid */ -#define AT_EUID 12 /* effective uid */ -#define AT_GID 13 /* real gid */ -#define AT_EGID 14 /* effective gid */ -#define AT_PLATFORM 15 /* string identifying CPU for optimizations */ -#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ -#define AT_CLKTCK 17 /* frequency at which times() increments */ - -#define AT_SECURE 23 /* secure mode boolean */ - typedef struct dynamic{ Elf32_Sword d_tag; union{ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5fb31bede103..b5a22ea80045 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -35,6 +35,8 @@ #include #include +#include /* For AT_VECTOR_SIZE */ + struct exec_domain; /* @@ -261,7 +263,7 @@ struct mm_struct { mm_counter_t _rss; mm_counter_t _anon_rss; - unsigned long saved_auxv[42]; /* for /proc/PID/auxv */ + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ unsigned dumpable:2; cpumask_t cpu_vm_mask; -- cgit v1.2.3 From 9c1cfda20a508b181bdda8c0045f7c0c333880a5 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Tue, 6 Sep 2005 15:18:14 -0700 Subject: [PATCH] cpusets: Move the ia64 domain setup code to the generic code Signed-off-by: John Hawkes Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/Makefile | 2 +- arch/ia64/kernel/domain.c | 444 ------------------------------------------- include/asm-ia64/processor.h | 3 - include/asm-ia64/topology.h | 23 --- include/linux/sched.h | 7 - include/linux/topology.h | 23 +++ kernel/sched.c | 290 ++++++++++++++++++++++------ 7 files changed, 260 insertions(+), 532 deletions(-) delete mode 100644 arch/ia64/kernel/domain.c (limited to 'include/linux/sched.h') diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index b242594be55b..307514f7a282 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o +obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o 
obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c deleted file mode 100644 index e907109983f1..000000000000 --- a/arch/ia64/kernel/domain.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * arch/ia64/kernel/domain.c - * Architecture specific sched-domains builder. - * - * Copyright (C) 2004 Jesse Barnes - * Copyright (C) 2004 Silicon Graphics, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define SD_NODES_PER_DOMAIN 16 - -#ifdef CONFIG_NUMA -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, unsigned long *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < MAX_NUMNODES; i++) { - /* Start at @node */ - n = (node + i) % MAX_NUMNODES; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (test_bit(n, used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - set_bit(best_node, used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static cpumask_t sched_domain_node_span(int node) -{ - int i; - cpumask_t span, nodemask; - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); - - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); - } - - return span; -} -#endif - -/* - * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we - * can switch it on easily if needed. - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; -static int cpu_to_cpu_group(int cpu) -{ - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; -static int cpu_to_phys_group(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - return first_cpu(cpu_sibling_map[cpu]); -#else - return cpu; -#endif -} - -#ifdef CONFIG_NUMA -/* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. 
- */ -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; - -static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; - -static int cpu_to_allnodes_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -void build_sched_domains(const cpumask_t *cpu_map) -{ - int i; -#ifdef CONFIG_NUMA - struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; - - /* - * Allocate the per-node list of sched groups - */ - sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, - GFP_ATOMIC); - if (!sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return; - } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; -#endif - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu_mask(i, *cpu_map) { - int group; - struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - - cpus_and(nodemask, nodemask, *cpu_map); - -#ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) - > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - break; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } - sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; - sd->span = *cpu_map; - group = cpu_to_allnodes_group(i); - sd->groups = &sched_group_allnodes[group]; - p = sd; - } else - p = NULL; - - sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; - sd->span = sched_domain_node_span(cpu_to_node(i)); - sd->parent = p; - cpus_and(sd->span, sd->span, *cpu_map); -#endif - - p = sd; - sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i); - *sd = SD_CPU_INIT; - sd->span = nodemask; - sd->parent = p; - sd->groups = &sched_group_phys[group]; - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i); - *sd = SD_SIBLING_INIT; - sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, *cpu_map); - sd->parent = p; - sd->groups = &sched_group_cpus[group]; -#endif - } - -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) - continue; - - init_sched_build_groups(sched_group_cpus, this_sibling_map, - &cpu_to_cpu_group); - } -#endif - - /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - - init_sched_build_groups(sched_group_phys, nodemask, - &cpu_to_phys_group); - } - -#ifdef CONFIG_NUMA - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - &cpu_to_allnodes_group); - - for (i = 0; i < MAX_NUMNODES; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; - int j; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { - sched_group_nodes[i] = NULL; - continue; 
- } - - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); - - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); - sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { - struct sched_domain *sd; - sd = &per_cpu(node_domains, j); - sd->groups = sg; - if (sd->groups == NULL) { - /* Turn off balancing if we have no groups */ - sd->flags = 0; - } - } - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", i); - continue; - } - sg->cpu_power = 0; - sg->cpumask = nodemask; - cpus_or(covered, covered, nodemask); - prev = sg; - - for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; - int n = (i + j) % MAX_NUMNODES; - - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) - break; - - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); - if (cpus_empty(tmp)) - continue; - - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - break; - } - sg->cpu_power = 0; - sg->cpumask = tmp; - cpus_or(covered, covered, tmp); - prev->next = sg; - prev = sg; - } - prev->next = sched_group_nodes[i]; - } -#endif - - /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif - - sd = &per_cpu(phys_domains, i); - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - -#ifdef CONFIG_NUMA - sd = &per_cpu(allnodes_domains, i); - if (sd->groups) { - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - } -#endif - } - -#ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *sg = sched_group_nodes[i]; - int j; - - if (sg == NULL) - continue; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; - int power; - - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - - sg->cpu_power += power; - } - sg = sg->next; - if (sg != sched_group_nodes[i]) - goto next_sg; - } -#endif - - /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); -#else - sd = &per_cpu(phys_domains, i); -#endif - cpu_attach_domain(sd, i); - } -} -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - */ -void arch_init_sched_domains(const cpumask_t *cpu_map) -{ - cpumask_t cpu_default_map; - - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ - cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - - build_sched_domains(&cpu_default_map); -} - -void arch_destroy_sched_domains(const cpumask_t *cpu_map) -{ -#ifdef CONFIG_NUMA - int i; - int cpu; - - for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - - if (!sched_group_nodes) - continue; - - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -#endif -} diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index 91bbd1f22461..94e07e727395 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -20,9 +20,6 @@ #include #include -/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */ -#define ARCH_HAS_SCHED_DOMAIN - #define IA64_NUM_DBG_REGS 8 /* * Limits for PMC and PMD are set to less than maximum architected values diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 399bc29729fd..a9f738bf18a7 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -98,29 +98,6 @@ void build_cpu_to_node_map(void); .nr_balance_failed = 0, \ } -/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */ -#define SD_ALLNODES_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ - .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .newidle_idx = 0, /* unused */ \ - .wake_idx = 0, /* unused */ \ - .forkexec_idx = 0, /* unused */ \ - .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 64, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #include diff --git a/include/linux/sched.h b/include/linux/sched.h index b5a22ea80045..ea1b5f32ec5c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -564,13 +564,6 @@ struct sched_domain { extern void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2); -#ifdef ARCH_HAS_SCHED_DOMAIN -/* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ -extern cpumask_t cpu_isolated_map; -extern void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)); -extern void cpu_attach_domain(struct sched_domain *sd, int cpu); -#endif /* ARCH_HAS_SCHED_DOMAIN */ #endif /* CONFIG_SMP */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 0320225e96da..3df1d474e5c5 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -135,6 +135,29 @@ } #endif +/* sched_domains SD_ALLNODES_INIT for NUMA machines */ +#define SD_ALLNODES_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 64, \ + .max_interval = 64*num_online_cpus(), \ + .busy_factor = 128, \ + .imbalance_pct = 133, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 3, \ + .newidle_idx = 0, /* unused */ \ + .wake_idx = 0, /* unused */ \ + .forkexec_idx = 0, /* unused */ \ + .per_cpu_gain = 100, \ + .flags = SD_LOAD_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 64, \ + .nr_balance_failed = 0, \ +} + #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfcc..50860ad5b624 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; @@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[], last->next = first; } +#define SD_NODES_PER_DOMAIN 16 -#ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); -#else +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. 
+ */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. + */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu) } #ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) /* - * The domains setup code relies on siblings not spanning - * multiple nodes. Make sure the architecture has a proper - * siblings map: + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. 
*/ -static void check_sibling_maps(void) -{ - int i, j; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group *sched_group_nodes[MAX_NUMNODES]; - for_each_online_cpu(i) { - for_each_cpu_mask(j, cpu_sibling_map[i]) { - if (cpu_to_node(i) != cpu_to_node(j)) { - printk(KERN_INFO "warning: CPU %d siblings map " - "to different node - isolating " - "them.\n", i); - cpu_sibling_map[i] = cpumask_of_cpu(i); - break; - } - } - } +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group sched_group_allnodes[MAX_NUMNODES]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); } #endif @@ -4928,7 +4985,7 @@ static void check_sibling_maps(void) * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static void build_sched_domains(const cpumask_t *cpu_map) +void build_sched_domains(const cpumask_t *cpu_map) { int i; @@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA + if (num_online_cpus() + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + sd = &per_cpu(node_domains, i); - group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; - sd->groups = &sched_group_nodes[group]; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) @@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, - &cpu_to_node_group); + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct 
sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } #endif /* Calculate CPU power for physical packages and nodes */ @@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) sd->groups->cpu_power = power; #ifdef CONFIG_NUMA - if (i == first_cpu(sd->groups->cpumask)) { - /* Only add "power" once for each physical package. */ - sd = &per_cpu(node_domains, i); - sd->groups->cpu_power += power; + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; } #endif } +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; @@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(cpumask_t *cpu_map) +static void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { - /* Do nothing: everything is statically allocated. */ -} +#ifdef CONFIG_NUMA + int i; + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; -#endif /* ARCH_HAS_SCHED_DOMAIN */ + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + sched_group_nodes[i] = NULL; + } +#endif +} /* * Detach sched domains from a group of cpus specified in cpu_map -- cgit v1.2.3
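To make the soft-lockup mechanism from the first patch above more concrete: the kernel-side implementation is not shown in this diff (only the declarations in sched.h are), but the idea can be sketched as a user-space analogue, with a pthread standing in for the per-CPU watchdog task and a polling loop standing in for the timer-interrupt callback. The names and the 10-second threshold mirror the changelog; everything else here is illustrative only, not the kernel code.

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile time_t touch_timestamp;	/* last time the watchdog ran */
static volatile int print_once = 1;

/* Analogue of touch_softlockup_watchdog(): note that progress was made. */
static void touch_watchdog(void)
{
	touch_timestamp = time(NULL);
}

/* Analogue of the per-CPU watchdog thread: wake up once per second and
 * refresh the timestamp.  If it cannot run, the timestamp goes stale. */
static void *watchdog_thread(void *unused)
{
	(void)unused;
	for (;;) {
		touch_watchdog();
		sleep(1);
	}
	return NULL;
}

/* Analogue of softlockup_tick(), which the kernel calls from the timer
 * interrupt: warn once if the watchdog has not run for over 10 seconds. */
static void softlockup_tick(void)
{
	if (print_once && time(NULL) - touch_timestamp > 10) {
		fprintf(stderr, "BUG: soft lockup detected!\n");
		print_once = 0;
	}
}

int main(void)
{
	pthread_t tid;

	touch_watchdog();
	pthread_create(&tid, NULL, watchdog_thread, NULL);
	for (;;) {
		softlockup_tick();
		usleep(100 * 1000);	/* poll at 10 Hz */
	}
}

Built with cc -pthread, this never warns unless the watchdog thread is kept from running for 10 seconds (for example by stopping it with SIGSTOP), which plays the role of a CPU stuck in kernel mode; like the real feature, it only reports the condition and does not try to recover from it.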