From 8315eca25583c369e28f48909d3341dc21d6214d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Some clarifications for Documention/x86_64/mm.txt

I got some questions on this, so just fix up the documentation.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/x86_64/mm.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'Documentation')

diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt
index 662b73971a67..133561b9cb0c 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86_64/mm.txt
@@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40bits) guard hole
-ffff810000000000 - ffffc0ffffffffff (=46bits) direct mapping of phys. memory
+ffff810000000000 - ffffc0ffffffffff (=46bits) direct mapping of all phys. memory
 ffffc10000000000 - ffffc1ffffffffff (=40bits) hole
 ffffc20000000000 - ffffe1ffffffffff (=45bits) vmalloc/ioremap space
 ... unused hole ...
@@ -14,6 +14,10 @@ ffffffff80000000 - ffffffff82800000 (=40MB)   kernel text mapping, from phys 0
 ... unused hole ...
 ffffffff88000000 - fffffffffff00000 (=1919MB) module mapping space
 
+The direct mapping covers all memory in the system upto the highest
+memory address (this means in some cases it can also include PCI memory
+holes)
+
 vmalloc space is lazily synchronized into the different PML4 pages of
 the processes using the page fault handler, with init_level4_pgt as
 reference.
-- 
cgit v1.2.3


From 420f8f68c9c5148dddf946bebdbc7eacde2172cb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: New heuristics to find out hotpluggable CPUs.

With a NR_CPUS==128 kernel with CPU hotplug enabled we would waste 4MB
on per CPU data of all possible CPUs.  The reason was that HOTPLUG
always set up possible map to NR_CPUS cpus and then we need to allocate
that much (each per CPU data is roughly ~32k now)

The underlying problem is that ACPI didn't tell us how many hotplug CPUs
the platform supports.  So the old code just assumed all, which would
lead to this memory wastage.

This implements some new heuristics:

 - If the BIOS specified disabled CPUs in the ACPI/mptables assume they
   can be enabled later (this is bending the ACPI specification a bit,
   but seems like a obvious extension)
 - The user can overwrite it with a new additionals_cpus=NUM option
 - Otherwise use half of the available CPUs or 2, whatever is more.

Cc: ashok.raj@intel.com
Cc: len.brown@intel.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/x86_64/boot-options.txt |  3 +++
 arch/x86_64/kernel/mpparse.c          |  8 +++++--
 arch/x86_64/kernel/smpboot.c          | 39 ++++++++++++++++++++++++++++++++---
 include/asm-x86_64/smp.h              |  2 ++
 4 files changed, 47 insertions(+), 5 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index ffe1c062088b..a83139692cdf 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -122,6 +122,9 @@ SMP
 
   cpumask=MASK   only use cpus with bits set in mask
 
+  additional_cpus=NUM Allow NUM more CPUs for hotplug
+		 (defaults are specified by the BIOS or half the available CPUs)
+
 NUMA
 
   numa=off	Only set up a single NUMA node spanning all memory.
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 4a581d1cefbd..1d61f10a92c6 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_id = -1U;
 /* Internal processor count */
-static unsigned int num_processors = 0;
+unsigned int num_processors __initdata = 0;
+
+unsigned disabled_cpus __initdata;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
@@ -109,8 +111,10 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	int ver, cpu;
 	static int found_bsp=0;
 
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
+	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+		disabled_cpus++;
 		return;
+	}
 
 	printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
 		m->mpc_apicid,
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 3393fc08823b..f74319a80659 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -880,6 +880,9 @@ static __init void disable_smp(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+int additional_cpus __initdata = -1;
+
 /*
  * cpu_possible_map should be static, it cannot change as cpu's
  * are onlined, or offlined. The reason is per-cpu data-structures
@@ -888,14 +891,38 @@ static __init void disable_smp(void)
  * cpu_present_map on the other hand can change dynamically.
  * In case when cpu_hotplug is not compiled, then we resort to current
  * behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
  * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - otherwise use half of the available CPUs or 2, whatever is more.
+ * - The user can overwrite it with additional_cpus=NUM
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
  */
 __init void prefill_possible_map(void)
 {
 	int i;
-	for (i = 0; i < NR_CPUS; i++)
+	int possible;
+
+ 	if (additional_cpus == -1) {
+ 		if (disabled_cpus > 0) {
+ 			additional_cpus = disabled_cpus;
+ 		} else {
+ 			additional_cpus = num_processors / 2;
+ 			if (additional_cpus == 0)
+ 				additional_cpus = 2;
+ 		}
+ 	}
+	possible = num_processors + additional_cpus;
+	if (possible > NR_CPUS) 
+		possible = NR_CPUS;
+
+	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+		possible,
+	        max_t(int, possible - num_processors, 0));
+
+	for (i = 0; i < possible; i++)
 		cpu_set(i, cpu_possible_map);
 }
 #endif
@@ -1151,6 +1178,12 @@ void __cpu_die(unsigned int cpu)
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
 
+static __init int setup_additional_cpus(char *s)
+{
+	return get_option(&s, &additional_cpus);
+}
+__setup("additional_cpus=", setup_additional_cpus);
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
 
 int __cpu_disable(void)
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 592161e979e5..cf8f969f9020 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -81,6 +81,8 @@ extern int safe_smp_processor_id(void);
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
+extern unsigned num_processors;
+extern unsigned disabled_cpus;
 
 #endif /* !ASSEMBLY */
 
-- 
cgit v1.2.3


From e583538f077d5f70191670b47a046ba436ec3428 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Log machine checks from boot on Intel systems

The logging for boot errors was turned off because it was broken
on some AMD systems. But give Intel EM64T systems a chance because they are
supposed to be correct there.

The advantage is that there is a chance to actually log uncorrected
machine checks after the reset.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/x86_64/boot-options.txt |  6 ++++--
 arch/x86_64/kernel/mce.c              | 14 +++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index a83139692cdf..aaabb5883ab8 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -7,10 +7,12 @@ Machine check
 
    mce=off disable machine check
    mce=bootlog Enable logging of machine checks left over from booting.
-               Disabled by default because some BIOS leave bogus ones.
+               Disabled by default on AMD because some BIOS leave bogus ones.
                If your BIOS doesn't do that it's a good idea to enable though
                to make sure you log even machine check events that result
-               in a reboot.
+               in a reboot. On Intel systems it is enabled by default.
+   mce=nobootlog
+		Disable boot machine check logging.
    mce=tolerancelevel (number)
 		0: always panic, 1: panic if deadlock possible,
 		2: try to avoid panic, 3: never panic or exit (for testing)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index cf8a76f0f47e..183dc6105429 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
 static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
-static int mce_bootlog;
+static int mce_bootlog = 1;
 
 /*
  * Lockless MCE logging infrastructure.
@@ -347,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 		/* disable GART TBL walk error reporting, which trips off 
 		   incorrectly with the IOMMU & 3ware & Cerberus. */
 		clear_bit(10, &bank[4]);
+		/* Lots of broken BIOS around that don't clear them
+		   by default and leave crap in there. Don't log. */
+		mce_bootlog = 0;
 	}
+
 }			
 
 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
@@ -498,16 +502,16 @@ static int __init mcheck_disable(char *str)
 /* mce=off disables machine check. Note you can reenable it later
    using sysfs.
    mce=TOLERANCELEVEL (number, see above)
-   mce=bootlog Log MCEs from before booting. Disabled by default to work
-   around buggy BIOS that leave bogus MCEs.  */
+   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+   mce=nobootlog Don't log MCEs from before booting. */
 static int __init mcheck_enable(char *str)
 {
 	if (*str == '=')
 		str++;
 	if (!strcmp(str, "off"))
 		mce_dont_init = 1;
-	else if (!strcmp(str, "bootlog"))
-		mce_bootlog = 1;
+	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+		mce_bootlog = str[0] == 'b';
 	else if (isdigit(str[0]))
 		get_option(&str, &tolerant);
 	else
-- 
cgit v1.2.3


From 9e43e1b7c7c9872da032442d8e4bb112a02d16f4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Remove CONFIG_CHECKING and add command line option
 for pagefault tracing

CONFIG_CHECKING covered some debugging code used in the early times
of the port. But it wasn't even SMP safe for quite some time
and the bugs it checked for seem to be gone.

This patch removes all the code to verify GS at kernel entry. There
haven't been any new bugs in this area for a long time.

Previously it also covered the sysctl for the page fault tracing.
That didn't make much sense because that code was unconditionally
compiled in. I made that a boot option now because it is typically
only useful at boot.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/x86_64/boot-options.txt |  3 +++
 arch/x86_64/Kconfig.debug             |  9 --------
 arch/x86_64/kernel/traps.c            | 40 -----------------------------------
 arch/x86_64/mm/fault.c                | 19 ++++++-----------
 arch/x86_64/mm/init.c                 |  4 ----
 5 files changed, 10 insertions(+), 65 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index aaabb5883ab8..e566affeed7f 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -193,6 +193,9 @@ Debugging
 
   kstack=N   Print that many words from the kernel stack in oops dumps.
 
+  pagefaulttrace Dump all page faults. Only useful for extreme debugging
+		and will create a lot of output.
+
 Misc
 
   noreplacement  Don't replace instructions with more appropiate ones
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index 9cf1410d2f5a..3ccf6f4d1068 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -2,15 +2,6 @@ menu "Kernel hacking"
 
 source "lib/Kconfig.debug"
 
-# !SMP for now because the context switch early causes GPF in segment reloading
-# and the GS base checking does the wrong thing then, causing a hang.
-config CHECKING
-	bool "Additional run-time checks"
-	depends on DEBUG_KERNEL && !SMP
-	help
-	  Enables some internal consistency checks for kernel debugging.
-	  You should normally say N.
-
 config INIT_DEBUG
 	bool "Debug __init statements"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4a836384dd0f..bf337f493189 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
 {
 	conditional_sti(regs);
 
-#ifdef CONFIG_CHECKING
-       { 
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-                       printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
-			      regs->rip);
-               }
-       }
-#endif
-
 	if (user_mode(regs)) {
 		struct task_struct *tsk = current;
 
@@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
 {
 	conditional_sti(regs);
 
-#ifdef CONFIG_CHECKING
-       { 
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-		       oops_in_progress++;
-                       printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
-		       oops_in_progress--;
-               }
-       }
-#endif
-
 	if (user_mode(regs)) {
 		struct task_struct *tsk = current;
 
@@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 	struct task_struct *tsk = current;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-       { 
-	       /* RED-PEN interaction with debugger - could destroy gs */
-               unsigned long gs; 
-               struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); 
-               rdmsrl(MSR_GS_BASE, gs); 
-               if (gs != (unsigned long)pda) { 
-                       wrmsrl(MSR_GS_BASE, pda); 
-                       printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
-               }
-       }
-#endif
-
 	get_debugreg(condition, 6);
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872ec154..3a63707a698b 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -308,18 +308,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	unsigned long flags;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-	{ 
-		unsigned long gs; 
-		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); 
-		rdmsrl(MSR_GS_BASE, gs); 
-		if (gs != (unsigned long)pda) { 
-			wrmsrl(MSR_GS_BASE, pda); 
-			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
-		}
-	}
-#endif
-
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -571,3 +559,10 @@ do_sigbus:
 	force_sig_info(SIGBUS, &info, tsk);
 	return;
 }
+
+static int __init enable_pagefaulttrace(char *str)
+{
+	page_fault_trace = 1;
+	return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 854a41b8372b..286f6a624c3a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -565,10 +565,6 @@ extern int exception_trace, page_fault_trace;
 static ctl_table debug_table2[] = {
 	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 	  proc_dointvec },
-#ifdef CONFIG_CHECKING
-	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
-	  proc_dointvec },
-#endif
 	{ 0, }
 }; 
 
-- 
cgit v1.2.3