From 57b165948fe0e1cca78bca1fe1e1ae9a38baedd2 Mon Sep 17 00:00:00 2001
From: Michael D Labriola <michael.d.labriola@gmail.com>
Date: Wed, 1 Feb 2012 10:05:00 -0500
Subject: x86/reboot: Reduce to a single DMI table for reboot quirks

This commit reduces the X86_32 reboot_dmi_table and the X86_64
pci_reboot_dmi_table into a single table with a single set of
functions (e.g., only 1 call to core_initcall).

The table entries that use set_bios_reboot are grouped together
inside a #define CONFIG_X86_32 block.

Note that there's a single entry that uses set_kbd_reboot, which
used to be available only on X86_32.  This commit moves that
entry outside the X86_32 block because it seems it never should
have been in there.  There's multiple places in reboot.c that
assume KBD is valid regardless of X86_32/X86_64.

Signed-off-by: Michael D Labriola <michael.d.labriola@gmail.com>
Cc: Matthew Garrett <mjg@redhat.com>
Link: http://lkml.kernel.org/n/tip-lv3aliubas2l3aenq8v3uklk@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 176 ++++++++++++++++++++++-------------------------
 1 file changed, 83 insertions(+), 93 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d840e69a853c..e73973769076 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -150,6 +150,80 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
 	return 0;
 }
 
+extern const unsigned char machine_real_restart_asm[];
+extern const u64 machine_real_restart_gdt[3];
+
+void machine_real_restart(unsigned int type)
+{
+	void *restart_va;
+	unsigned long restart_pa;
+	void (*restart_lowmem)(unsigned int);
+	u64 *lowmem_gdt;
+
+	local_irq_disable();
+
+	/* Write zero to CMOS register number 0x0f, which the BIOS POST
+	   routine will recognize as telling it to do a proper reboot.  (Well
+	   that's what this book in front of me says -- it may only apply to
+	   the Phoenix BIOS though, it's not clear).  At the same time,
+	   disable NMIs by setting the top bit in the CMOS address register,
+	   as we're about to do peculiar things to the CPU.  I'm not sure if
+	   `outb_p' is needed instead of just `outb'.  Use it to be on the
+	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
+	 */
+	spin_lock(&rtc_lock);
+	CMOS_WRITE(0x00, 0x8f);
+	spin_unlock(&rtc_lock);
+
+	/*
+	 * Switch back to the initial page table.
+	 */
+	load_cr3(initial_page_table);
+
+	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
+	   this on booting to tell it to "Bypass memory test (also warm
+	   boot)".  This seems like a fairly standard thing that gets set by
+	   REBOOT.COM programs, and the previous reset routine did this
+	   too. */
+	*((unsigned short *)0x472) = reboot_mode;
+
+	/* Patch the GDT in the low memory trampoline */
+	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
+
+	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
+	restart_pa = virt_to_phys(restart_va);
+	restart_lowmem = (void (*)(unsigned int))restart_pa;
+
+	/* GDT[0]: GDT self-pointer */
+	lowmem_gdt[0] =
+		(u64)(sizeof(machine_real_restart_gdt) - 1) +
+		((u64)virt_to_phys(lowmem_gdt) << 16);
+	/* GDT[1]: 64K real mode code segment */
+	lowmem_gdt[1] =
+		GDT_ENTRY(0x009b, restart_pa, 0xffff);
+
+	/* Jump to the identity-mapped low memory code */
+	restart_lowmem(type);
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_CF9) {
+		reboot_type = BOOT_CF9;
+		printk(KERN_INFO "%s series board detected. "
+		       "Selecting PCI-method for reboots.\n", d->ident);
+	}
+	return 0;
+}
+
 static int __init set_kbd_reboot(const struct dmi_system_id *d)
 {
 	if (reboot_type != BOOT_KBD) {
@@ -159,7 +233,11 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
 	return 0;
 }
 
+/* This is a single dmi_table handling all reboot quirks.  Note that
+ * REBOOT_BIOS is only available for 32bit
+ */
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
+#ifdef CONFIG_X86_32
 	{	/* Handle problems with rebooting on Dell E520's */
 		.callback = set_bios_reboot,
 		.ident = "Dell E520",
@@ -309,6 +387,8 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
 		},
 	},
+#endif /* CONFIG_X86_32 */
+
 	{ /* Handle reboot issue on Acer Aspire one */
 		.callback = set_kbd_reboot,
 		.ident = "Acer Aspire One A110",
@@ -317,96 +397,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
 		},
 	},
-	{ }
-};
-
-static int __init reboot_init(void)
-{
-	/* Only do the DMI check if reboot_type hasn't been overridden
-	 * on the command line
-	 */
-	if (reboot_default) {
-		dmi_check_system(reboot_dmi_table);
-	}
-	return 0;
-}
-core_initcall(reboot_init);
-
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
-void machine_real_restart(unsigned int type)
-{
-	void *restart_va;
-	unsigned long restart_pa;
-	void (*restart_lowmem)(unsigned int);
-	u64 *lowmem_gdt;
-
-	local_irq_disable();
-
-	/* Write zero to CMOS register number 0x0f, which the BIOS POST
-	   routine will recognize as telling it to do a proper reboot.  (Well
-	   that's what this book in front of me says -- it may only apply to
-	   the Phoenix BIOS though, it's not clear).  At the same time,
-	   disable NMIs by setting the top bit in the CMOS address register,
-	   as we're about to do peculiar things to the CPU.  I'm not sure if
-	   `outb_p' is needed instead of just `outb'.  Use it to be on the
-	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
-	 */
-	spin_lock(&rtc_lock);
-	CMOS_WRITE(0x00, 0x8f);
-	spin_unlock(&rtc_lock);
-
-	/*
-	 * Switch back to the initial page table.
-	 */
-	load_cr3(initial_page_table);
-
-	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
-	   this on booting to tell it to "Bypass memory test (also warm
-	   boot)".  This seems like a fairly standard thing that gets set by
-	   REBOOT.COM programs, and the previous reset routine did this
-	   too. */
-	*((unsigned short *)0x472) = reboot_mode;
-
-	/* Patch the GDT in the low memory trampoline */
-	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
-	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
-	restart_pa = virt_to_phys(restart_va);
-	restart_lowmem = (void (*)(unsigned int))restart_pa;
-
-	/* GDT[0]: GDT self-pointer */
-	lowmem_gdt[0] =
-		(u64)(sizeof(machine_real_restart_gdt) - 1) +
-		((u64)virt_to_phys(lowmem_gdt) << 16);
-	/* GDT[1]: 64K real mode code segment */
-	lowmem_gdt[1] =
-		GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
-	/* Jump to the identity-mapped low memory code */
-	restart_lowmem(type);
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
-
-#endif /* CONFIG_X86_32 */
-
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
-	if (reboot_type != BOOT_CF9) {
-		reboot_type = BOOT_CF9;
-		printk(KERN_INFO "%s series board detected. "
-		       "Selecting PCI-method for reboots.\n", d->ident);
-	}
-	return 0;
-}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 	{	/* Handle problems with rebooting on Apple MacBook5 */
 		.callback = set_pci_reboot,
 		.ident = "Apple MacBook5",
@@ -474,17 +464,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 	{ }
 };
 
-static int __init pci_reboot_init(void)
+static int __init reboot_init(void)
 {
 	/* Only do the DMI check if reboot_type hasn't been overridden
 	 * on the command line
 	 */
 	if (reboot_default) {
-		dmi_check_system(pci_reboot_dmi_table);
+		dmi_check_system(reboot_dmi_table);
 	}
 	return 0;
 }
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
 
 static inline void kb_wait(void)
 {
-- 
cgit v1.2.3


From 144d102b926f887d3d9f909b69a5c4f504ae0d40 Mon Sep 17 00:00:00 2001
From: Michael D Labriola <michael.d.labriola@gmail.com>
Date: Wed, 1 Feb 2012 10:06:34 -0500
Subject: x86/reboot: Clean up coding style

This commit simply cleans up the style used in this file to fall
in line with what's specified in CodingStyle.  Mostly comment
changes, with a single removal of unneeded braces.

Note that the comments for all the DMI quirks in
reboot_dmi_table now all line up consistently using tabs instead
of spaces.

Signed-off-by: Michael D Labriola <michael.d.labriola@gmail.com>
Link: http://lkml.kernel.org/n/tip-lde9yy7qsomh0sdqevn7xp56@git.kernel.org
[ Fixed a few small details. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 129 +++++++++++++++++++++++++----------------------
 1 file changed, 70 insertions(+), 59 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e73973769076..77215c23fba1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -39,7 +39,8 @@ static int reboot_mode;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
-/* This variable is used privately to keep track of whether or not
+/*
+ * This variable is used privately to keep track of whether or not
  * reboot_type is still set to its default value (i.e., reboot= hasn't
  * been set on the command line).  This is needed so that we can
  * suppress DMI scanning for reboot quirks.  Without it, it's
@@ -51,7 +52,8 @@ static int reboot_default = 1;
 static int reboot_cpu = -1;
 #endif
 
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
  * When machine_emergency_restart() is called, we may be on
  * an inconsistent state and won't be able to do a clean cleanup
  */
@@ -60,22 +62,24 @@ static int reboot_emergency;
 /* This is set by the PCI code if either type 1 or type 2 PCI is detected */
 bool port_cf9_safe = false;
 
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
-   warm   Don't set the cold reboot flag
-   cold   Set the cold reboot flag
-   bios   Reboot by jumping through the BIOS (only for X86_32)
-   smp    Reboot by executing reset on BSP or other CPU (only for X86_32)
-   triple Force a triple fault (init)
-   kbd    Use the keyboard controller. cold reset (default)
-   acpi   Use the RESET_REG in the FADT
-   efi    Use efi reset_system runtime service
-   pci    Use the so-called "PCI reset register", CF9
-   force  Avoid anything that could hang.
+/*
+ * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
+ * warm   Don't set the cold reboot flag
+ * cold   Set the cold reboot flag
+ * bios   Reboot by jumping through the BIOS (only for X86_32)
+ * smp    Reboot by executing reset on BSP or other CPU (only for X86_32)
+ * triple Force a triple fault (init)
+ * kbd    Use the keyboard controller. cold reset (default)
+ * acpi   Use the RESET_REG in the FADT
+ * efi    Use efi reset_system runtime service
+ * pci    Use the so-called "PCI reset register", CF9
+ * force  Avoid anything that could hang.
  */
 static int __init reboot_setup(char *str)
 {
 	for (;;) {
-		/* Having anything passed on the command line via
+		/*
+		 * Having anything passed on the command line via
 		 * reboot= will cause us to disable DMI checking
 		 * below.
 		 */
@@ -98,9 +102,11 @@ static int __init reboot_setup(char *str)
 				if (isdigit(*(str+2)))
 					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
 			}
-				/* we will leave sorting out the final value
-				   when we are ready to reboot, since we might not
-				   have detected BSP APIC ID or smp_num_cpu */
+			/*
+			 * We will leave sorting out the final value
+			 * when we are ready to reboot, since we might not
+			 * have detected BSP APIC ID or smp_num_cpu
+			 */
 			break;
 #endif /* CONFIG_SMP */
 
@@ -162,14 +168,15 @@ void machine_real_restart(unsigned int type)
 
 	local_irq_disable();
 
-	/* Write zero to CMOS register number 0x0f, which the BIOS POST
-	   routine will recognize as telling it to do a proper reboot.  (Well
-	   that's what this book in front of me says -- it may only apply to
-	   the Phoenix BIOS though, it's not clear).  At the same time,
-	   disable NMIs by setting the top bit in the CMOS address register,
-	   as we're about to do peculiar things to the CPU.  I'm not sure if
-	   `outb_p' is needed instead of just `outb'.  Use it to be on the
-	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
+	/*
+	 * Write zero to CMOS register number 0x0f, which the BIOS POST
+	 * routine will recognize as telling it to do a proper reboot.  (Well
+	 * that's what this book in front of me says -- it may only apply to
+	 * the Phoenix BIOS though, it's not clear).  At the same time,
+	 * disable NMIs by setting the top bit in the CMOS address register,
+	 * as we're about to do peculiar things to the CPU.  I'm not sure if
+	 * `outb_p' is needed instead of just `outb'.  Use it to be on the
+	 * safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
 	 */
 	spin_lock(&rtc_lock);
 	CMOS_WRITE(0x00, 0x8f);
@@ -180,11 +187,12 @@ void machine_real_restart(unsigned int type)
 	 */
 	load_cr3(initial_page_table);
 
-	/* Write 0x1234 to absolute memory location 0x472.  The BIOS reads
-	   this on booting to tell it to "Bypass memory test (also warm
-	   boot)".  This seems like a fairly standard thing that gets set by
-	   REBOOT.COM programs, and the previous reset routine did this
-	   too. */
+	/*
+	 * Write 0x1234 to absolute memory location 0x472.  The BIOS reads
+	 * this on booting to tell it to "Bypass memory test (also warm
+	 * boot)".  This seems like a fairly standard thing that gets set by
+	 * REBOOT.COM programs, and the previous reset routine did this
+	 * too. */
 	*((unsigned short *)0x472) = reboot_mode;
 
 	/* Patch the GDT in the low memory trampoline */
@@ -233,7 +241,8 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
 	return 0;
 }
 
-/* This is a single dmi_table handling all reboot quirks.  Note that
+/*
+ * This is a single dmi_table handling all reboot quirks.  Note that
  * REBOOT_BIOS is only available for 32bit
  */
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
@@ -262,7 +271,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
 		},
 	},
-	{       /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+	{	/* Handle problems with rebooting on Dell Optiplex 745's SFF */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 745",
 		.matches = {
@@ -270,7 +279,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
 		},
 	},
-	{       /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+	{	/* Handle problems with rebooting on Dell Optiplex 745's DFF */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 745",
 		.matches = {
@@ -279,7 +288,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
 		},
 	},
-	{       /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+	{	/* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 745",
 		.matches = {
@@ -288,7 +297,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
 		},
 	},
-	{   /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+	{	/* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 330",
 		.matches = {
@@ -297,7 +306,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
 		},
 	},
-	{   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+	{	/* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 360",
 		.matches = {
@@ -306,7 +315,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
 		},
 	},
-	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
 		.callback = set_bios_reboot,
 		.ident = "Dell OptiPlex 760",
 		.matches = {
@@ -379,7 +388,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
 		},
 	},
-	{       /* Handle problems with rebooting on ASUS P4S800 */
+	{	/* Handle problems with rebooting on ASUS P4S800 */
 		.callback = set_bios_reboot,
 		.ident = "ASUS P4S800",
 		.matches = {
@@ -389,7 +398,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 	},
 #endif /* CONFIG_X86_32 */
 
-	{ /* Handle reboot issue on Acer Aspire one */
+	{	/* Handle reboot issue on Acer Aspire one */
 		.callback = set_kbd_reboot,
 		.ident = "Acer Aspire One A110",
 		.matches = {
@@ -466,12 +475,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 
 static int __init reboot_init(void)
 {
-	/* Only do the DMI check if reboot_type hasn't been overridden
+	/*
+	 * Only do the DMI check if reboot_type hasn't been overridden
 	 * on the command line
 	 */
-	if (reboot_default) {
+	if (reboot_default)
 		dmi_check_system(reboot_dmi_table);
-	}
 	return 0;
 }
 core_initcall(reboot_init);
@@ -492,14 +501,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)
 	cpu_emergency_vmxoff();
 }
 
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
- */
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
 static void emergency_vmx_disable_all(void)
 {
 	/* Just make sure we won't change CPUs while doing this */
 	local_irq_disable();
 
-	/* We need to disable VMX on all CPUs before rebooting, otherwise
+	/*
+	 * We need to disable VMX on all CPUs before rebooting, otherwise
 	 * we risk hanging up the machine, because the CPU ignore INIT
 	 * signals when VMX is enabled.
 	 *
@@ -518,8 +527,7 @@ static void emergency_vmx_disable_all(void)
 	 * is still enabling VMX.
 	 */
 	if (cpu_has_vmx() && cpu_vmx_enabled()) {
-		/* Disable VMX on this CPU.
-		 */
+		/* Disable VMX on this CPU. */
 		cpu_vmxoff();
 
 		/* Halt and disable VMX on the other CPUs */
@@ -564,12 +572,12 @@ static void native_machine_emergency_restart(void)
 		/* Could also try the reset bit in the Hammer NB */
 		switch (reboot_type) {
 		case BOOT_KBD:
-			mach_reboot_fixups(); /* for board specific fixups */
+			mach_reboot_fixups(); /* For board specific fixups */
 
 			for (i = 0; i < 10; i++) {
 				kb_wait();
 				udelay(50);
-				outb(0xfe, 0x64); /* pulse reset low */
+				outb(0xfe, 0x64); /* Pulse reset low */
 				udelay(50);
 			}
 			if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
@@ -611,7 +619,7 @@ static void native_machine_emergency_restart(void)
 
 		case BOOT_CF9:
 			port_cf9_safe = true;
-			/* fall through */
+			/* Fall through */
 
 		case BOOT_CF9_COND:
 			if (port_cf9_safe) {
@@ -649,7 +657,8 @@ void native_machine_shutdown(void)
 	/* Make certain I only run on the appropriate processor */
 	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
 
-	/* O.K Now that I'm on the appropriate processor,
+	/*
+	 * O.K Now that I'm on the appropriate processor,
 	 * stop all of the others.
 	 */
 	stop_other_cpus();
@@ -687,12 +696,11 @@ static void native_machine_restart(char *__unused)
 
 static void native_machine_halt(void)
 {
-	/* stop other cpus and apics */
+	/* Stop other cpus and apics */
 	machine_shutdown();
 
 	tboot_shutdown(TB_SHUTDOWN_HALT);
 
-	/* stop this cpu */
 	stop_this_cpu(NULL);
 }
 
@@ -703,7 +711,7 @@ static void native_machine_power_off(void)
 			machine_shutdown();
 		pm_power_off();
 	}
-	/* a fallback in case there is no PM info available */
+	/* A fallback in case there is no PM info available */
 	tboot_shutdown(TB_SHUTDOWN_HALT);
 }
 
@@ -765,7 +773,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 
 	cpu = raw_smp_processor_id();
 
-	/* Don't do anything if this handler is invoked on crashing cpu.
+	/*
+	 * Don't do anything if this handler is invoked on crashing cpu.
 	 * Otherwise, system will completely hang. Crashing cpu can get
 	 * an NMI if system was initially booted with nmi_watchdog parameter.
 	 */
@@ -789,7 +798,8 @@ static void smp_send_nmi_allbutself(void)
 	apic->send_IPI_allbutself(NMI_VECTOR);
 }
 
-/* Halt all other CPUs, calling the specified function on each of them
+/*
+ * Halt all other CPUs, calling the specified function on each of them
  *
  * This function can be used to halt all other CPUs on crash
  * or emergency reboot time. The function passed as parameter
@@ -800,7 +810,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 	unsigned long msecs;
 	local_irq_disable();
 
-	/* Make a note of crashing cpu. Will be used in NMI callback.*/
+	/* Make a note of crashing cpu. Will be used in NMI callback. */
 	crashing_cpu = safe_smp_processor_id();
 
 	shootdown_callback = callback;
@@ -809,8 +819,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 	/* Would it be better to replace the trap vector here? */
 	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
 				 NMI_FLAG_FIRST, "crash"))
-		return;		/* return what? */
-	/* Ensure the new callback function is set before sending
+		return;		/* Return what? */
+	/*
+	 * Ensure the new callback function is set before sending
 	 * out the NMI
 	 */
 	wmb();
-- 
cgit v1.2.3


From 2b144498350860b6ee9dc57ff27a93ad488de5dc Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 9 Feb 2012 14:56:42 +0530
Subject: uprobes, mm, x86: Add the ability to install and remove uprobes
 breakpoints

Add uprobes support to the core kernel, with x86 support.

This commit adds the kernel facilities, the actual uprobes
user-space ABI and perf probe support comes in later commits.

General design:

Uprobes are maintained in an rb-tree indexed by inode and offset
(the offset here is from the start of the mapping). For a unique
(inode, offset) tuple, there can be at most one uprobe in the
rb-tree.

Since the (inode, offset) tuple identifies a unique uprobe, more
than one user may be interested in the same uprobe. This provides
the ability to connect multiple 'consumers' to the same uprobe.

Each consumer defines a handler and a filter (optional). The
'handler' is run every time the uprobe is hit, if it matches the
'filter' criteria.

The first consumer of a uprobe causes the breakpoint to be
inserted at the specified address and subsequent consumers are
appended to this list.  On subsequent probes, the consumer gets
appended to the existing list of consumers. The breakpoint is
removed when the last consumer unregisters. For all other
unregisterations, the consumer is removed from the list of
consumers.

Given a inode, we get a list of the mms that have mapped the
inode. Do the actual registration if mm maps the page where a
probe needs to be inserted/removed.

We use a temporary list to walk through the vmas that map the
inode.

- The number of maps that map the inode, is not known before we
  walk the rmap and keeps changing.
- extending vm_area_struct wasn't recommended, it's a
  size-critical data structure.
- There can be more than one maps of the inode in the same mm.

We add callbacks to the mmap methods to keep an eye on text vmas
that are of interest to uprobes.  When a vma of interest is mapped,
we insert the breakpoint at the right address.

Uprobe works by replacing the instruction at the address defined
by (inode, offset) with the arch specific breakpoint
instruction. We save a copy of the original instruction at the
uprobed address.

This is needed for:

 a. executing the instruction out-of-line (xol).
 b. instruction analysis for any subsequent fixups.
 c. restoring the instruction back when the uprobe is unregistered.

We insert or delete a breakpoint instruction, and this
breakpoint instruction is assumed to be the smallest instruction
available on the platform. For fixed size instruction platforms
this is trivially true, for variable size instruction platforms
the breakpoint instruction is typically the smallest (often a
single byte).

Writing the instruction is done by COWing the page and changing
the instruction during the copy, this even though most platforms
allow atomic writes of the breakpoint instruction. This also
mirrors the behaviour of a ptrace() memory write to a PRIVATE
file map.

The core worker is derived from KSM's replace_page() logic.

In essence, similar to KSM:

 a. allocate a new page and copy over contents of the page that
    has the uprobed vaddr
 b. modify the copy and insert the breakpoint at the required
    address
 c. switch the original page with the copy containing the
    breakpoint
 d. flush page tables.

replace_page() is being replicated here because of some minor
changes in the type of pages and also because Hugh Dickins had
plans to improve replace_page() for KSM specific work.

Instruction analysis on x86 is based on instruction decoder and
determines if an instruction can be probed and determines the
necessary fixups after singlestep.  Instruction analysis is done
at probe insertion time so that we avoid having to repeat the
same analysis every time a probe is hit.

A lot of code here is due to the improvement/suggestions/inputs
from Peter Zijlstra.

Changelog:

(v10):
 - Add code to clear REX.B prefix as suggested by Denys Vlasenko
   and Masami Hiramatsu.

(v9):
 - Use insn_offset_modrm as suggested by Masami Hiramatsu.

(v7):

 Handle comments from Peter Zijlstra:

 - Dont take reference to inode. (expect inode to uprobe_register to be sane).
 - Use PTR_ERR to set the return value.
 - No need to take reference to inode.
 - use PTR_ERR to return error value.
 - register and uprobe_unregister share code.

(v5):

 - Modified del_consumer as per comments from Peter.
 - Drop reference to inode before dropping reference to uprobe.
 - Use i_size_read(inode) instead of inode->i_size.
 - Ensure uprobe->consumers is NULL, before __uprobe_unregister() is called.
 - Includes errno.h as recommended by Stephen Rothwell to fix a build issue
   on sparc defconfig
 - Remove restrictions while unregistering.
 - Earlier code leaked inode references under some conditions while
   registering/unregistering.
 - Continue the vma-rmap walk even if the intermediate vma doesnt
   meet the requirements.
 - Validate the vma found by find_vma before inserting/removing the
   breakpoint
 - Call del_consumer under mutex_lock.
 - Use hash locks.
 - Handle mremap.
 - Introduce find_least_offset_node() instead of close match logic in
   find_uprobe
 - Uprobes no more depends on MM_OWNER; No reference to task_structs
   while inserting/removing a probe.
 - Uses read_mapping_page instead of grab_cache_page so that the pages
   have valid content.
 - pass NULL to get_user_pages for the task parameter.
 - call SetPageUptodate on the new page allocated in write_opcode.
 - fix leaking a reference to the new page under certain conditions.
 - Include Instruction Decoder if Uprobes gets defined.
 - Remove const attributes for instruction prefix arrays.
 - Uses mm_context to know if the application is 32 bit.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Also-written-by: Jim Keniston <jkenisto@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Anton Arapov <anton@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linux-mm <linux-mm@kvack.org>
Link: http://lkml.kernel.org/r/20120209092642.GE16600@linux.vnet.ibm.com
[ Made various small edits to the commit log ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                   |  11 +
 arch/x86/Kconfig               |   5 +-
 arch/x86/include/asm/uprobes.h |  42 ++
 arch/x86/kernel/Makefile       |   1 +
 arch/x86/kernel/uprobes.c      | 412 +++++++++++++++++
 include/linux/uprobes.h        |  98 +++++
 kernel/Makefile                |   1 +
 kernel/uprobes.c               | 976 +++++++++++++++++++++++++++++++++++++++++
 mm/mmap.c                      |  23 +
 9 files changed, 1568 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/uprobes.h
 create mode 100644 arch/x86/kernel/uprobes.c
 create mode 100644 include/linux/uprobes.h
 create mode 100644 kernel/uprobes.c

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 4f55c736be11..284f5898f526 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -65,6 +65,17 @@ config OPTPROBES
 	depends on KPROBES && HAVE_OPTPROBES
 	depends on !PREEMPT
 
+config UPROBES
+	bool "User-space probes (EXPERIMENTAL)"
+	depends on ARCH_SUPPORTS_UPROBES
+	default n
+	help
+	  Uprobes enables kernel subsystems to establish probepoints
+	  in user applications and execute handler functions when
+	  the probepoints are hit.
+
+	  If in doubt, say "N".
+
 config HAVE_EFFICIENT_UNALIGNED_ACCESS
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..481dbfcf14ed 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,7 +84,7 @@ config X86
 	select GENERIC_IOMAP
 
 config INSTRUCTION_DECODER
-	def_bool (KPROBES || PERF_EVENTS)
+	def_bool (KPROBES || PERF_EVENTS || UPROBES)
 
 config OUTPUT_FORMAT
 	string
@@ -240,6 +240,9 @@ config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
 
+config ARCH_SUPPORTS_UPROBES
+	def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 000000000000..8208234391ff
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_UPROBES_H
+#define _ASM_UPROBES_H
+/*
+ * Userspace Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+typedef u8 uprobe_opcode_t;
+#define MAX_UINSN_BYTES 16
+#define UPROBES_XOL_SLOT_BYTES	128	/* to keep it cache aligned */
+
+#define UPROBES_BKPT_INSN 0xcc
+#define UPROBES_BKPT_INSN_SIZE 1
+
+struct uprobe_arch_info {
+	u16			fixups;
+#ifdef CONFIG_X86_64
+	unsigned long rip_rela_target_address;
+#endif
+};
+
+struct uprobe;
+extern int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe);
+#endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..8c8c365a3bc3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 obj-$(CONFIG_OF)			+= devicetree.o
+obj-$(CONFIG_UPROBES)			+= uprobes.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..2a301bb91bdb
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,412 @@
+/*
+ * Userspace Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+
+#include <linux/kdebug.h>
+#include <asm/insn.h>
+
+/* Post-execution fixups. */
+
+/* No fixup needed */
+#define UPROBES_FIX_NONE	0x0
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBES_FIX_IP		0x1
+/* Adjust the return address of a call insn */
+#define UPROBES_FIX_CALL	0x2
+
+#define UPROBES_FIX_RIP_AX	0x8000
+#define UPROBES_FIX_RIP_CX	0x4000
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+	 << (row % 32))
+
+#ifdef CONFIG_X86_64
+static volatile u32 good_insns_64[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
+	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+#endif
+
+/* Good-instruction tables for 32-bit apps */
+
+static volatile u32 good_insns_32[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+
+#undef W
+
+/*
+ * opcodes we'll probably never support:
+ * 6c-6d, e4-e5, ec-ed - in
+ * 6e-6f, e6-e7, ee-ef - out
+ * cc, cd - int3, int
+ * cf - iret
+ * d6 - illegal instruction
+ * f1 - int1/icebp
+ * f4 - hlt
+ * fa, fb - cli, sti
+ * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
+ *
+ * invalid opcodes in 64-bit mode:
+ * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
+ *
+ * 63 - we support this opcode in x86_64 but not in i386.
+ *
+ * opcodes we may need to refine support for:
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field.  On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes.  These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+	int i;
+
+	for (i = 0; i < insn->prefixes.nbytes; i++) {
+		switch (insn->prefixes.bytes[i]) {
+		case 0x26:	/*INAT_PFX_ES   */
+		case 0x2E:	/*INAT_PFX_CS   */
+		case 0x36:	/*INAT_PFX_DS   */
+		case 0x3E:	/*INAT_PFX_SS   */
+		case 0xF0:	/*INAT_PFX_LOCK */
+			return true;
+		}
+	}
+	return false;
+}
+
+static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
+{
+	insn_init(insn, uprobe->insn, false);
+
+	/* Skip good instruction prefixes; reject "bad" ones. */
+	insn_get_opcode(insn);
+	if (is_prefix_bad(insn))
+		return -ENOTSUPP;
+	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+		return 0;
+	if (insn->opcode.nbytes == 2) {
+		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+			return 0;
+	}
+	return -ENOTSUPP;
+}
+
+/*
+ * Figure out which fixups post_xol() will need to perform, and annotate
+ * uprobe->arch_info.fixups accordingly.  To start with,
+ * uprobe->arch_info.fixups is either zero or it reflects rip-related
+ * fixups.
+ */
+static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
+{
+	bool fix_ip = true, fix_call = false;	/* defaults */
+	int reg;
+
+	insn_get_opcode(insn);	/* should be a nop */
+
+	switch (OPCODE1(insn)) {
+	case 0xc3:		/* ret/lret */
+	case 0xcb:
+	case 0xc2:
+	case 0xca:
+		/* ip is correct */
+		fix_ip = false;
+		break;
+	case 0xe8:		/* call relative - Fix return addr */
+		fix_call = true;
+		break;
+	case 0x9a:		/* call absolute - Fix return addr, not ip */
+		fix_call = true;
+		fix_ip = false;
+		break;
+	case 0xff:
+		insn_get_modrm(insn);
+		reg = MODRM_REG(insn);
+		if (reg == 2 || reg == 3) {
+			/* call or lcall, indirect */
+			/* Fix return addr; ip is correct. */
+			fix_call = true;
+			fix_ip = false;
+		} else if (reg == 4 || reg == 5) {
+			/* jmp or ljmp, indirect */
+			/* ip is correct. */
+			fix_ip = false;
+		}
+		break;
+	case 0xea:		/* jmp absolute -- ip is correct */
+		fix_ip = false;
+		break;
+	default:
+		break;
+	}
+	if (fix_ip)
+		uprobe->arch_info.fixups |= UPROBES_FIX_IP;
+	if (fix_call)
+		uprobe->arch_info.fixups |= UPROBES_FIX_CALL;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If uprobe->insn doesn't use rip-relative addressing, return
+ * immediately.  Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register.  Set
+ * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address
+ * accordingly.  (The contents of the scratch register will be saved
+ * before we single-step the modified instruction, and restored
+ * afterward.)
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area.  At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ * - There's always a modrm byte.
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ */
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	u8 *cursor;
+	u8 reg;
+
+	if (mm->context.ia32_compat)
+		return;
+
+	uprobe->arch_info.rip_rela_target_address = 0x0;
+	if (!insn_rip_relative(insn))
+		return;
+
+	/*
+	 * insn_rip_relative() would have decoded rex_prefix, modrm.
+	 * Clear REX.b bit (extension of MODRM.rm field):
+	 * we want to encode rax/rcx, not r8/r9.
+	 */
+	if (insn->rex_prefix.nbytes) {
+		cursor = uprobe->insn + insn_offset_rex_prefix(insn);
+		*cursor &= 0xfe;	/* Clearing REX.B bit */
+	}
+
+	/*
+	 * Point cursor at the modrm byte.  The next 4 bytes are the
+	 * displacement.  Beyond the displacement, for some instructions,
+	 * is the immediate operand.
+	 */
+	cursor = uprobe->insn + insn_offset_modrm(insn);
+	insn_get_length(insn);
+
+	/*
+	 * Convert from rip-relative addressing to indirect addressing
+	 * via a scratch register.  Change the r/m field from 0x5 (%rip)
+	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+	 */
+	reg = MODRM_REG(insn);
+	if (reg == 0) {
+		/*
+		 * The register operand (if any) is either the A register
+		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
+		 * REX prefix) %r8.  In any case, we know the C register
+		 * is NOT the register operand, so we use %rcx (register
+		 * #1) for the scratch register.
+		 */
+		uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX;
+		/* Change modrm from 00 000 101 to 00 000 001. */
+		*cursor = 0x1;
+	} else {
+		/* Use %rax (register #0) for the scratch register. */
+		uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX;
+		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
+		*cursor = (reg << 3);
+	}
+
+	/* Target address = address of next instruction + (signed) offset */
+	uprobe->arch_info.rip_rela_target_address = (long)insn->length
+					+ insn->displacement.value;
+	/* Displacement field is gone; slide immediate field (if any) over. */
+	if (insn->immediate.nbytes) {
+		cursor++;
+		memmove(cursor, cursor + insn->displacement.nbytes,
+						insn->immediate.nbytes);
+	}
+	return;
+}
+
+static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
+{
+	insn_init(insn, uprobe->insn, true);
+
+	/* Skip good instruction prefixes; reject "bad" ones. */
+	insn_get_opcode(insn);
+	if (is_prefix_bad(insn))
+		return -ENOTSUPP;
+	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
+		return 0;
+	if (insn->opcode.nbytes == 2) {
+		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+			return 0;
+	}
+	return -ENOTSUPP;
+}
+
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
+				struct insn *insn)
+{
+	if (mm->context.ia32_compat)
+		return validate_insn_32bits(uprobe, insn);
+	return validate_insn_64bits(uprobe, insn);
+}
+#else
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
+							struct insn *insn)
+{
+	return;
+}
+
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
+				struct insn *insn)
+{
+	return validate_insn_32bits(uprobe, insn);
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @uprobe: the probepoint information.
+ * Return 0 on success or a -ve number on error.
+ */
+int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
+{
+	int ret;
+	struct insn insn;
+
+	uprobe->arch_info.fixups = 0;
+	ret = validate_insn_bits(mm, uprobe, &insn);
+	if (ret != 0)
+		return ret;
+	handle_riprel_insn(mm, uprobe, &insn);
+	prepare_fixups(uprobe, &insn);
+	return 0;
+}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
new file mode 100644
index 000000000000..f1d13fd140f2
--- /dev/null
+++ b/include/linux/uprobes.h
@@ -0,0 +1,98 @@
+#ifndef _LINUX_UPROBES_H
+#define _LINUX_UPROBES_H
+/*
+ * Userspace Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+
+struct vm_area_struct;
+#ifdef CONFIG_ARCH_SUPPORTS_UPROBES
+#include <asm/uprobes.h>
+#else
+
+typedef u8 uprobe_opcode_t;
+struct uprobe_arch_info {};
+
+#define MAX_UINSN_BYTES 4
+#endif
+
+#define uprobe_opcode_sz sizeof(uprobe_opcode_t)
+
+/* flags that denote/change uprobes behaviour */
+/* Have a copy of original instruction */
+#define UPROBES_COPY_INSN	0x1
+/* Dont run handlers when first register/ last unregister in progress*/
+#define UPROBES_RUN_HANDLER	0x2
+
+struct uprobe_consumer {
+	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	/*
+	 * filter is optional; If a filter exists, handler is run
+	 * if and only if filter returns true.
+	 */
+	bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
+
+	struct uprobe_consumer *next;
+};
+
+struct uprobe {
+	struct rb_node		rb_node;	/* node in the rb tree */
+	atomic_t		ref;
+	struct rw_semaphore	consumer_rwsem;
+	struct list_head	pending_list;
+	struct uprobe_arch_info arch_info;
+	struct uprobe_consumer	*consumers;
+	struct inode		*inode;		/* Also hold a ref to inode */
+	loff_t			offset;
+	int			flags;
+	u8			insn[MAX_UINSN_BYTES];
+};
+
+#ifdef CONFIG_UPROBES
+extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
+							unsigned long vaddr);
+extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
+					unsigned long vaddr, bool verify);
+extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
+extern int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer);
+extern void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer);
+extern int mmap_uprobe(struct vm_area_struct *vma);
+#else /* CONFIG_UPROBES is not defined */
+static inline int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	return -ENOSYS;
+}
+static inline void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+}
+static inline int mmap_uprobe(struct vm_area_struct *vma)
+{
+	return 0;
+}
+#endif /* CONFIG_UPROBES */
+#endif	/* _LINUX_UPROBES_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..8609dd3d875a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_UPROBES) += uprobes.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/uprobes.c b/kernel/uprobes.c
new file mode 100644
index 000000000000..72e8bb3b52cd
--- /dev/null
+++ b/kernel/uprobes.c
@@ -0,0 +1,976 @@
+/*
+ * Userspace Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ *	Srikar Dronamraju
+ *	Jim Keniston
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>	/* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>		/* anon_vma_prepare */
+#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
+#include <linux/swap.h>		/* try_to_free_swap */
+#include <linux/uprobes.h>
+
+static struct rb_root uprobes_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
+
+#define UPROBES_HASH_SZ	13
+/* serialize (un)register */
+static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
+#define uprobes_hash(v)	(&uprobes_mutex[((unsigned long)(v)) %\
+						UPROBES_HASH_SZ])
+
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) %\
+						UPROBES_HASH_SZ])
+
+/*
+ * uprobe_events allows us to skip the mmap_uprobe if there are no uprobe
+ * events active at this time.  Probably a fine grained per inode count is
+ * better?
+ */
+static atomic_t uprobe_events = ATOMIC_INIT(0);
+
+/*
+ * Maintain a temporary per vma info that can be used to search if a vma
+ * has already been handled. This structure is introduced since extending
+ * vm_area_struct wasnt recommended.
+ */
+struct vma_info {
+	struct list_head probe_list;
+	struct mm_struct *mm;
+	loff_t vaddr;
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ *	- is_register: indicates if we are in register context.
+ *	- Return 1 if the specified virtual address is in an
+ *	  executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+	if (!vma->vm_file)
+		return false;
+
+	if (!is_register)
+		return true;
+
+	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) ==
+						(VM_READ|VM_EXEC))
+		return true;
+
+	return false;
+}
+
+static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
+{
+	loff_t vaddr;
+
+	vaddr = vma->vm_start + offset;
+	vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+	return vaddr;
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma:      vma that holds the pte pointing to page
+ * @page:     the cowed page we are replacing by kpage
+ * @kpage:    the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, struct page *page,
+					struct page *kpage)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	unsigned long addr;
+	int err = -EFAULT;
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!ptep)
+		goto out;
+
+	get_page(kpage);
+	page_add_new_anon_rmap(kpage, vma, addr);
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush(vma, addr, ptep);
+	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+
+out:
+	return err;
+}
+
+/**
+ * is_bkpt_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_bkpt_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
+{
+	return (*insn == UPROBES_BKPT_INSN);
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify read_opcode /
+ * write_opcode accordingly. This would never be a problem for archs that
+ * have fixed length instructions.
+ */
+
+/*
+ * write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @uprobe: the breakpointing information.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm).
+ *
+ * For mm @mm, write the opcode at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
+			unsigned long vaddr, uprobe_opcode_t opcode)
+{
+	struct page *old_page, *new_page;
+	struct address_space *mapping;
+	void *vaddr_old, *vaddr_new;
+	struct vm_area_struct *vma;
+	loff_t addr;
+	int ret;
+
+	/* Read the page with vaddr into memory */
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+	if (ret <= 0)
+		return ret;
+	ret = -EINVAL;
+
+	/*
+	 * We are interested in text pages only. Our pages of interest
+	 * should be mapped for read and execute only. We desist from
+	 * adding probes in write mapped pages since the breakpoints
+	 * might end up in the file copy.
+	 */
+	if (!valid_vma(vma, is_bkpt_insn(&opcode)))
+		goto put_out;
+
+	mapping = uprobe->inode->i_mapping;
+	if (mapping != vma->vm_file->f_mapping)
+		goto put_out;
+
+	addr = vma_address(vma, uprobe->offset);
+	if (vaddr != (unsigned long)addr)
+		goto put_out;
+
+	ret = -ENOMEM;
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+	if (!new_page)
+		goto put_out;
+
+	__SetPageUptodate(new_page);
+
+	/*
+	 * lock page will serialize against do_wp_page()'s
+	 * PageAnon() handling
+	 */
+	lock_page(old_page);
+	/* copy the page now that we've got it stable */
+	vaddr_old = kmap_atomic(old_page);
+	vaddr_new = kmap_atomic(new_page);
+
+	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+	/* poke the new insn in, ASSUMES we don't cross page boundary */
+	vaddr &= ~PAGE_MASK;
+	BUG_ON(vaddr + uprobe_opcode_sz > PAGE_SIZE);
+	memcpy(vaddr_new + vaddr, &opcode, uprobe_opcode_sz);
+
+	kunmap_atomic(vaddr_new);
+	kunmap_atomic(vaddr_old);
+
+	ret = anon_vma_prepare(vma);
+	if (ret)
+		goto unlock_out;
+
+	lock_page(new_page);
+	ret = __replace_page(vma, old_page, new_page);
+	unlock_page(new_page);
+
+unlock_out:
+	unlock_page(old_page);
+	page_cache_release(new_page);
+
+put_out:
+	put_page(old_page);	/* we did a get_page in the beginning */
+	return ret;
+}
+
+/**
+ * read_opcode - read the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to read the opcode.
+ * @opcode: location to store the read opcode.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm.
+ *
+ * For mm @mm, read the opcode at @vaddr and store it in @opcode.
+ * Return 0 (success) or a negative errno.
+ */
+static int read_opcode(struct mm_struct *mm, unsigned long vaddr,
+						uprobe_opcode_t *opcode)
+{
+	struct page *page;
+	void *vaddr_new;
+	int ret;
+
+	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+	if (ret <= 0)
+		return ret;
+
+	lock_page(page);
+	vaddr_new = kmap_atomic(page);
+	vaddr &= ~PAGE_MASK;
+	memcpy(opcode, vaddr_new + vaddr, uprobe_opcode_sz);
+	kunmap_atomic(vaddr_new);
+	unlock_page(page);
+	put_page(page);		/* we did a get_user_pages in the beginning */
+	return 0;
+}
+
+static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t opcode;
+	int result = read_opcode(mm, vaddr, &opcode);
+
+	if (result)
+		return result;
+
+	if (is_bkpt_insn(&opcode))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * set_bkpt - store breakpoint at a given address.
+ * @mm: the probed process address space.
+ * @uprobe: the probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
+						unsigned long vaddr)
+{
+	int result = is_bkpt_at_addr(mm, vaddr);
+
+	if (result == 1)
+		return -EEXIST;
+
+	if (result)
+		return result;
+
+	return write_opcode(mm, uprobe, vaddr, UPROBES_BKPT_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @uprobe: the probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ * @verify: if true, verify existance of breakpoint instruction.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
+					unsigned long vaddr, bool verify)
+{
+	if (verify) {
+		int result = is_bkpt_at_addr(mm, vaddr);
+
+		if (!result)
+			return -EINVAL;
+
+		if (result != 1)
+			return result;
+	}
+	return write_opcode(mm, uprobe, vaddr,
+				*(uprobe_opcode_t *)uprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+	if (l->inode < r->inode)
+		return -1;
+	if (l->inode > r->inode)
+		return 1;
+	else {
+		if (l->offset < r->offset)
+			return -1;
+
+		if (l->offset > r->offset)
+			return 1;
+	}
+
+	return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe u = { .inode = inode, .offset = offset };
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+		if (!match) {
+			atomic_inc(&uprobe->ref);
+			return uprobe;
+		}
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	uprobe = __find_uprobe(inode, offset);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+	struct rb_node **p = &uprobes_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct uprobe *u;
+	int match;
+
+	while (*p) {
+		parent = *p;
+		u = rb_entry(parent, struct uprobe, rb_node);
+		match = match_uprobe(uprobe, u);
+		if (!match) {
+			atomic_inc(&u->ref);
+			return u;
+		}
+
+		if (match < 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+
+	}
+	u = NULL;
+	rb_link_node(&uprobe->rb_node, parent, p);
+	rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+	/* get access + creation ref */
+	atomic_set(&uprobe->ref, 2);
+	return u;
+}
+
+/*
+ * Acquires uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ *	increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ *	get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+	struct uprobe *u;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	u = __insert_uprobe(uprobe);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+	if (atomic_dec_and_test(&uprobe->ref))
+		kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+	struct uprobe *uprobe, *cur_uprobe;
+
+	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+	if (!uprobe)
+		return NULL;
+
+	uprobe->inode = igrab(inode);
+	uprobe->offset = offset;
+	init_rwsem(&uprobe->consumer_rwsem);
+	INIT_LIST_HEAD(&uprobe->pending_list);
+
+	/* add to uprobes_tree, sorted on inode:offset */
+	cur_uprobe = insert_uprobe(uprobe);
+
+	/* a uprobe exists for this inode:offset combination */
+	if (cur_uprobe) {
+		kfree(uprobe);
+		uprobe = cur_uprobe;
+		iput(inode);
+	} else
+		atomic_inc(&uprobe_events);
+	return uprobe;
+}
+
+/* Returns the previous consumer */
+static struct uprobe_consumer *add_consumer(struct uprobe *uprobe,
+				struct uprobe_consumer *consumer)
+{
+	down_write(&uprobe->consumer_rwsem);
+	consumer->next = uprobe->consumers;
+	uprobe->consumers = consumer;
+	up_write(&uprobe->consumer_rwsem);
+	return consumer->next;
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @consumer.
+ * Return true if the @consumer is deleted successfully
+ * or return false.
+ */
+static bool del_consumer(struct uprobe *uprobe,
+				struct uprobe_consumer *consumer)
+{
+	struct uprobe_consumer **con;
+	bool ret = false;
+
+	down_write(&uprobe->consumer_rwsem);
+	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+		if (*con == consumer) {
+			*con = consumer->next;
+			ret = true;
+			break;
+		}
+	}
+	up_write(&uprobe->consumer_rwsem);
+	return ret;
+}
+
+static int __copy_insn(struct address_space *mapping,
+			struct vm_area_struct *vma, char *insn,
+			unsigned long nbytes, unsigned long offset)
+{
+	struct file *filp = vma->vm_file;
+	struct page *page;
+	void *vaddr;
+	unsigned long off1;
+	unsigned long idx;
+
+	if (!filp)
+		return -EINVAL;
+
+	idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
+	off1 = offset &= ~PAGE_MASK;
+
+	/*
+	 * Ensure that the page that has the original instruction is
+	 * populated and in page-cache.
+	 */
+	page = read_mapping_page(mapping, idx, filp);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	vaddr = kmap_atomic(page);
+	memcpy(insn, vaddr + off1, nbytes);
+	kunmap_atomic(vaddr);
+	page_cache_release(page);
+	return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma,
+					unsigned long addr)
+{
+	struct address_space *mapping;
+	int bytes;
+	unsigned long nbytes;
+
+	addr &= ~PAGE_MASK;
+	nbytes = PAGE_SIZE - addr;
+	mapping = uprobe->inode->i_mapping;
+
+	/* Instruction at end of binary; copy only available bytes */
+	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
+		bytes = uprobe->inode->i_size - uprobe->offset;
+	else
+		bytes = MAX_UINSN_BYTES;
+
+	/* Instruction at the page-boundary; copy bytes in second page */
+	if (nbytes < bytes) {
+		if (__copy_insn(mapping, vma, uprobe->insn + nbytes,
+				bytes - nbytes, uprobe->offset + nbytes))
+			return -ENOMEM;
+
+		bytes = nbytes;
+	}
+	return __copy_insn(mapping, vma, uprobe->insn, bytes, uprobe->offset);
+}
+
+static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
+				struct vm_area_struct *vma, loff_t vaddr)
+{
+	unsigned long addr;
+	int ret;
+
+	/*
+	 * If probe is being deleted, unregister thread could be done with
+	 * the vma-rmap-walk through. Adding a probe now can be fatal since
+	 * nobody will be able to cleanup. Also we could be from fork or
+	 * mremap path, where the probe might have already been inserted.
+	 * Hence behave as if probe already existed.
+	 */
+	if (!uprobe->consumers)
+		return -EEXIST;
+
+	addr = (unsigned long)vaddr;
+	if (!(uprobe->flags & UPROBES_COPY_INSN)) {
+		ret = copy_insn(uprobe, vma, addr);
+		if (ret)
+			return ret;
+
+		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn))
+			return -EEXIST;
+
+		ret = analyze_insn(mm, uprobe);
+		if (ret)
+			return ret;
+
+		uprobe->flags |= UPROBES_COPY_INSN;
+	}
+	ret = set_bkpt(mm, uprobe, addr);
+
+	return ret;
+}
+
+static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
+							loff_t vaddr)
+{
+	set_orig_insn(mm, uprobe, (unsigned long)vaddr, true);
+}
+
+static void delete_uprobe(struct uprobe *uprobe)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	rb_erase(&uprobe->rb_node, &uprobes_tree);
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+	iput(uprobe->inode);
+	put_uprobe(uprobe);
+	atomic_dec(&uprobe_events);
+}
+
+static struct vma_info *__find_next_vma_info(struct list_head *head,
+			loff_t offset, struct address_space *mapping,
+			struct vma_info *vi, bool is_register)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+	struct vma_info *tmpvi;
+	loff_t vaddr;
+	unsigned long pgoff = offset >> PAGE_SHIFT;
+	int existing_vma;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (!valid_vma(vma, is_register))
+			continue;
+
+		existing_vma = 0;
+		vaddr = vma_address(vma, offset);
+		list_for_each_entry(tmpvi, head, probe_list) {
+			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
+				existing_vma = 1;
+				break;
+			}
+		}
+
+		/*
+		 * Another vma needs a probe to be installed. However skip
+		 * installing the probe if the vma is about to be unlinked.
+		 */
+		if (!existing_vma &&
+				atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
+			vi->mm = vma->vm_mm;
+			vi->vaddr = vaddr;
+			list_add(&vi->probe_list, head);
+			return vi;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Iterate in the rmap prio tree  and find a vma where a probe has not
+ * yet been inserted.
+ */
+static struct vma_info *find_next_vma_info(struct list_head *head,
+			loff_t offset, struct address_space *mapping,
+			bool is_register)
+{
+	struct vma_info *vi, *retvi;
+	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
+	if (!vi)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	retvi = __find_next_vma_info(head, offset, mapping, vi, is_register);
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	if (!retvi)
+		kfree(vi);
+	return retvi;
+}
+
+static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+{
+	struct list_head try_list;
+	struct vm_area_struct *vma;
+	struct address_space *mapping;
+	struct vma_info *vi, *tmpvi;
+	struct mm_struct *mm;
+	loff_t vaddr;
+	int ret = 0;
+
+	mapping = uprobe->inode->i_mapping;
+	INIT_LIST_HEAD(&try_list);
+	while ((vi = find_next_vma_info(&try_list, uprobe->offset,
+					mapping, is_register)) != NULL) {
+		if (IS_ERR(vi)) {
+			ret = PTR_ERR(vi);
+			break;
+		}
+		mm = vi->mm;
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, (unsigned long)vi->vaddr);
+		if (!vma || !valid_vma(vma, is_register)) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+		vaddr = vma_address(vma, uprobe->offset);
+		if (vma->vm_file->f_mapping->host != uprobe->inode ||
+						vaddr != vi->vaddr) {
+			list_del(&vi->probe_list);
+			kfree(vi);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			continue;
+		}
+
+		if (is_register)
+			ret = install_breakpoint(mm, uprobe, vma, vi->vaddr);
+		else
+			remove_breakpoint(mm, uprobe, vi->vaddr);
+
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		if (is_register) {
+			if (ret && ret == -EEXIST)
+				ret = 0;
+			if (ret)
+				break;
+		}
+	}
+	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
+		list_del(&vi->probe_list);
+		kfree(vi);
+	}
+	return ret;
+}
+
+static int __register_uprobe(struct uprobe *uprobe)
+{
+	return register_for_each_vma(uprobe, true);
+}
+
+static void __unregister_uprobe(struct uprobe *uprobe)
+{
+	if (!register_for_each_vma(uprobe, false))
+		delete_uprobe(uprobe);
+
+	/* TODO : cant unregister? schedule a worker thread */
+}
+
+/*
+ * register_uprobe - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @consumer: information on howto handle the probe..
+ *
+ * Apart from the access refcount, register_uprobe() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple).  Creation refcount stops unregister_uprobe from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @consumer for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int register_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	struct uprobe *uprobe;
+	int ret = -EINVAL;
+
+	if (!inode || !consumer || consumer->next)
+		return ret;
+
+	if (offset > i_size_read(inode))
+		return ret;
+
+	ret = 0;
+	mutex_lock(uprobes_hash(inode));
+	uprobe = alloc_uprobe(inode, offset);
+	if (uprobe && !add_consumer(uprobe, consumer)) {
+		ret = __register_uprobe(uprobe);
+		if (ret) {
+			uprobe->consumers = NULL;
+			__unregister_uprobe(uprobe);
+		} else
+			uprobe->flags |= UPROBES_RUN_HANDLER;
+	}
+
+	mutex_unlock(uprobes_hash(inode));
+	put_uprobe(uprobe);
+
+	return ret;
+}
+
+/*
+ * unregister_uprobe - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @consumer: identify which probe if multiple probes are colocated.
+ */
+void unregister_uprobe(struct inode *inode, loff_t offset,
+				struct uprobe_consumer *consumer)
+{
+	struct uprobe *uprobe = NULL;
+
+	if (!inode || !consumer)
+		return;
+
+	uprobe = find_uprobe(inode, offset);
+	if (!uprobe)
+		return;
+
+	mutex_lock(uprobes_hash(inode));
+	if (!del_consumer(uprobe, consumer))
+		goto unreg_out;
+
+	if (!uprobe->consumers) {
+		__unregister_uprobe(uprobe);
+		uprobe->flags &= ~UPROBES_RUN_HANDLER;
+	}
+
+unreg_out:
+	mutex_unlock(uprobes_hash(inode));
+	if (uprobe)
+		put_uprobe(uprobe);
+}
+
+/*
+ * Of all the nodes that correspond to the given inode, return the node
+ * with the least offset.
+ */
+static struct rb_node *find_least_offset_node(struct inode *inode)
+{
+	struct uprobe u = { .inode = inode, .offset = 0};
+	struct rb_node *n = uprobes_tree.rb_node;
+	struct rb_node *close_node = NULL;
+	struct uprobe *uprobe;
+	int match;
+
+	while (n) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		match = match_uprobe(&u, uprobe);
+		if (uprobe->inode == inode)
+			close_node = n;
+
+		if (!match)
+			return close_node;
+
+		if (match < 0)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+	}
+	return close_node;
+}
+
+/*
+ * For a given inode, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode, struct list_head *head)
+{
+	struct uprobe *uprobe;
+	struct rb_node *n;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uprobes_treelock, flags);
+	n = find_least_offset_node(inode);
+	for (; n; n = rb_next(n)) {
+		uprobe = rb_entry(n, struct uprobe, rb_node);
+		if (uprobe->inode != inode)
+			break;
+
+		list_add(&uprobe->pending_list, head);
+		atomic_inc(&uprobe->ref);
+	}
+	spin_unlock_irqrestore(&uprobes_treelock, flags);
+}
+
+/*
+ * Called from mmap_region.
+ * called with mm->mmap_sem acquired.
+ *
+ * Return -ve no if we fail to insert probes and we cannot
+ * bail-out.
+ * Return 0 otherwise. i.e :
+ *	- successful insertion of probes
+ *	- (or) no possible probes to be inserted.
+ *	- (or) insertion of probes failed but we can bail-out.
+ */
+int mmap_uprobe(struct vm_area_struct *vma)
+{
+	struct list_head tmp_list;
+	struct uprobe *uprobe, *u;
+	struct inode *inode;
+	int ret = 0;
+
+	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+		return ret;	/* Bail-out */
+
+	inode = vma->vm_file->f_mapping->host;
+	if (!inode)
+		return ret;
+
+	INIT_LIST_HEAD(&tmp_list);
+	mutex_lock(uprobes_mmap_hash(inode));
+	build_probe_list(inode, &tmp_list);
+	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+		loff_t vaddr;
+
+		list_del(&uprobe->pending_list);
+		if (!ret) {
+			vaddr = vma_address(vma, uprobe->offset);
+			if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
+				put_uprobe(uprobe);
+				continue;
+			}
+			ret = install_breakpoint(vma->vm_mm, uprobe, vma,
+								vaddr);
+			if (ret == -EEXIST)
+				ret = 0;
+		}
+		put_uprobe(uprobe);
+	}
+
+	mutex_unlock(uprobes_mmap_hash(inode));
+
+	return ret;
+}
+
+static int __init init_uprobes(void)
+{
+	int i;
+
+	for (i = 0; i < UPROBES_HASH_SZ; i++) {
+		mutex_init(&uprobes_mutex[i]);
+		mutex_init(&uprobes_mmap_mutex[i]);
+	}
+	return 0;
+}
+
+static void __exit exit_uprobes(void)
+{
+}
+
+module_init(init_uprobes);
+module_exit(exit_uprobes);
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f758c7f4c81..1aed183636d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
 #include <linux/perf_event.h>
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
+#include <linux/uprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -616,6 +617,13 @@ again:			remove_next = 1 + (end > next->vm_end);
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 
+	if (root) {
+		mmap_uprobe(vma);
+
+		if (adjust_next)
+			mmap_uprobe(next);
+	}
+
 	if (remove_next) {
 		if (file) {
 			fput(file);
@@ -637,6 +645,8 @@ again:			remove_next = 1 + (end > next->vm_end);
 			goto again;
 		}
 	}
+	if (insert && file)
+		mmap_uprobe(insert);
 
 	validate_mm(mm);
 
@@ -1329,6 +1339,11 @@ out:
 			mm->locked_vm += (len >> PAGE_SHIFT);
 	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
+
+	if (file && mmap_uprobe(vma))
+		/* matching probes but cannot insert */
+		goto unmap_and_free_vma;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2285,6 +2300,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if ((vma->vm_flags & VM_ACCOUNT) &&
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
 		return -ENOMEM;
+
+	if (vma->vm_file && mmap_uprobe(vma))
+		return -EINVAL;
+
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 	return 0;
 }
@@ -2354,6 +2373,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			new_vma->vm_pgoff = pgoff;
 			if (new_vma->vm_file) {
 				get_file(new_vma->vm_file);
+
+				if (mmap_uprobe(new_vma))
+					goto out_free_mempol;
+
 				if (vma->vm_flags & VM_EXECUTABLE)
 					added_exe_file_vma(mm);
 			}
-- 
cgit v1.2.3


From 7b2d81d48a2d8e37efb6ce7b4d5ef58822b30d89 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Feb 2012 09:27:41 +0100
Subject: uprobes/core: Clean up, refactor and improve the code

Make the uprobes code readable to me:

 - improve the Kconfig text so that a mere mortal gets some idea
   what CONFIG_UPROBES=y is really about

 - do trivial renames to standardize around the uprobes_*() namespace

 - clean up and simplify various code flow details

 - separate basic blocks of functionality

 - line break artifact and white space related removal

 - use standard local varible definition blocks

 - use vertical spacing to make things more readable

 - remove unnecessary volatile

 - restructure comment blocks to make them more uniform and
   more readable in general

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Anton Arapov <anton@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Link: http://lkml.kernel.org/n/tip-ewbwhb8o6navvllsauu7k07p@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                   |  14 ++-
 arch/x86/include/asm/uprobes.h |  17 ++--
 arch/x86/kernel/uprobes.c      | 129 ++++++++++++------------
 include/linux/uprobes.h        |  28 +++---
 kernel/uprobes.c               | 219 ++++++++++++++++++++++++-----------------
 mm/mmap.c                      |  12 +--
 6 files changed, 233 insertions(+), 186 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 284f5898f526..cca5b545d806 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -66,13 +66,19 @@ config OPTPROBES
 	depends on !PREEMPT
 
 config UPROBES
-	bool "User-space probes (EXPERIMENTAL)"
+	bool "Transparent user-space probes (EXPERIMENTAL)"
 	depends on ARCH_SUPPORTS_UPROBES
 	default n
 	help
-	  Uprobes enables kernel subsystems to establish probepoints
-	  in user applications and execute handler functions when
-	  the probepoints are hit.
+	  Uprobes is the user-space counterpart to kprobes: they
+	  enable instrumentation applications (such as 'perf probe')
+	  to establish unintrusive probes in user-space binaries and
+	  libraries, by executing handler functions when the probes
+	  are hit by user-space applications.
+
+	  ( These probes come in the form of single-byte breakpoints,
+	    managed by the kernel and kept transparent to the probed
+	    application. )
 
 	  If in doubt, say "N".
 
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 8208234391ff..072df3902636 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_UPROBES_H
 #define _ASM_UPROBES_H
 /*
- * Userspace Probes (UProbes) for x86
+ * User-space Probes (UProbes) for x86
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,19 +24,20 @@
  */
 
 typedef u8 uprobe_opcode_t;
-#define MAX_UINSN_BYTES 16
-#define UPROBES_XOL_SLOT_BYTES	128	/* to keep it cache aligned */
 
-#define UPROBES_BKPT_INSN 0xcc
-#define UPROBES_BKPT_INSN_SIZE 1
+#define MAX_UINSN_BYTES			  16
+#define UPROBES_XOL_SLOT_BYTES		 128	/* to keep it cache aligned */
+
+#define UPROBES_BKPT_INSN		0xcc
+#define UPROBES_BKPT_INSN_SIZE		   1
 
 struct uprobe_arch_info {
-	u16			fixups;
+	u16				fixups;
 #ifdef CONFIG_X86_64
-	unsigned long rip_rela_target_address;
+	unsigned long			rip_rela_target_address;
 #endif
 };
 
 struct uprobe;
-extern int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe);
+extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe);
 #endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2a301bb91bdb..cf2a18498425 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1,5 +1,5 @@
 /*
- * Userspace Probes (UProbes) for x86
+ * User-space Probes (UProbes) for x86
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -20,7 +20,6 @@
  *	Srikar Dronamraju
  *	Jim Keniston
  */
-
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/ptrace.h>
@@ -42,10 +41,10 @@
 #define UPROBES_FIX_RIP_CX	0x4000
 
 /* Adaptations for mhiramat x86 decoder v14. */
-#define OPCODE1(insn) ((insn)->opcode.bytes[0])
-#define OPCODE2(insn) ((insn)->opcode.bytes[1])
-#define OPCODE3(insn) ((insn)->opcode.bytes[2])
-#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+#define OPCODE1(insn)		((insn)->opcode.bytes[0])
+#define OPCODE2(insn)		((insn)->opcode.bytes[1])
+#define OPCODE3(insn)		((insn)->opcode.bytes[2])
+#define MODRM_REG(insn)		X86_MODRM_REG(insn->modrm.value)
 
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
 	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
@@ -55,7 +54,7 @@
 	 << (row % 32))
 
 #ifdef CONFIG_X86_64
-static volatile u32 good_insns_64[256 / 32] = {
+static u32 good_insns_64[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
 	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
@@ -81,7 +80,7 @@ static volatile u32 good_insns_64[256 / 32] = {
 
 /* Good-instruction tables for 32-bit apps */
 
-static volatile u32 good_insns_32[256 / 32] = {
+static u32 good_insns_32[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
 	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
@@ -105,7 +104,7 @@ static volatile u32 good_insns_32[256 / 32] = {
 };
 
 /* Using this for both 64-bit and 32-bit apps */
-static volatile u32 good_2byte_insns[256 / 32] = {
+static u32 good_2byte_insns[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
 	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
@@ -132,42 +131,47 @@ static volatile u32 good_2byte_insns[256 / 32] = {
 
 /*
  * opcodes we'll probably never support:
- * 6c-6d, e4-e5, ec-ed - in
- * 6e-6f, e6-e7, ee-ef - out
- * cc, cd - int3, int
- * cf - iret
- * d6 - illegal instruction
- * f1 - int1/icebp
- * f4 - hlt
- * fa, fb - cli, sti
- * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
+ *
+ *  6c-6d, e4-e5, ec-ed - in
+ *  6e-6f, e6-e7, ee-ef - out
+ *  cc, cd - int3, int
+ *  cf - iret
+ *  d6 - illegal instruction
+ *  f1 - int1/icebp
+ *  f4 - hlt
+ *  fa, fb - cli, sti
+ *  0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
  *
  * invalid opcodes in 64-bit mode:
- * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
  *
- * 63 - we support this opcode in x86_64 but not in i386.
+ *  06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
+ *  63 - we support this opcode in x86_64 but not in i386.
  *
  * opcodes we may need to refine support for:
- * 0f - 2-byte instructions: For many of these instructions, the validity
- * depends on the prefix and/or the reg field.  On such instructions, we
- * just consider the opcode combination valid if it corresponds to any
- * valid instruction.
- * 8f - Group 1 - only reg = 0 is OK
- * c6-c7 - Group 11 - only reg = 0 is OK
- * d9-df - fpu insns with some illegal encodings
- * f2, f3 - repnz, repz prefixes.  These are also the first byte for
- * certain floating-point instructions, such as addsd.
- * fe - Group 4 - only reg = 0 or 1 is OK
- * ff - Group 5 - only reg = 0-6 is OK
+ *
+ *  0f - 2-byte instructions: For many of these instructions, the validity
+ *  depends on the prefix and/or the reg field.  On such instructions, we
+ *  just consider the opcode combination valid if it corresponds to any
+ *  valid instruction.
+ *
+ *  8f - Group 1 - only reg = 0 is OK
+ *  c6-c7 - Group 11 - only reg = 0 is OK
+ *  d9-df - fpu insns with some illegal encodings
+ *  f2, f3 - repnz, repz prefixes.  These are also the first byte for
+ *  certain floating-point instructions, such as addsd.
+ *
+ *  fe - Group 4 - only reg = 0 or 1 is OK
+ *  ff - Group 5 - only reg = 0-6 is OK
  *
  * others -- Do we need to support these?
- * 0f - (floating-point?) prefetch instructions
- * 07, 17, 1f - pop es, pop ss, pop ds
- * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ *
+ *  0f - (floating-point?) prefetch instructions
+ *  07, 17, 1f - pop es, pop ss, pop ds
+ *  26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
  *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
- * 67 - addr16 prefix
- * ce - into
- * f0 - lock prefix
+ *  67 - addr16 prefix
+ *  ce - into
+ *  f0 - lock prefix
  */
 
 /*
@@ -182,11 +186,11 @@ static bool is_prefix_bad(struct insn *insn)
 
 	for (i = 0; i < insn->prefixes.nbytes; i++) {
 		switch (insn->prefixes.bytes[i]) {
-		case 0x26:	/*INAT_PFX_ES   */
-		case 0x2E:	/*INAT_PFX_CS   */
-		case 0x36:	/*INAT_PFX_DS   */
-		case 0x3E:	/*INAT_PFX_SS   */
-		case 0xF0:	/*INAT_PFX_LOCK */
+		case 0x26:	/* INAT_PFX_ES   */
+		case 0x2E:	/* INAT_PFX_CS   */
+		case 0x36:	/* INAT_PFX_DS   */
+		case 0x3E:	/* INAT_PFX_SS   */
+		case 0xF0:	/* INAT_PFX_LOCK */
 			return true;
 		}
 	}
@@ -201,12 +205,15 @@ static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
 	insn_get_opcode(insn);
 	if (is_prefix_bad(insn))
 		return -ENOTSUPP;
+
 	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
 		return 0;
+
 	if (insn->opcode.nbytes == 2) {
 		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
 			return 0;
 	}
+
 	return -ENOTSUPP;
 }
 
@@ -282,12 +289,12 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
  * disastrous.
  *
  * Some useful facts about rip-relative instructions:
- * - There's always a modrm byte.
- * - There's never a SIB byte.
- * - The displacement is always 4 bytes.
+ *
+ *  - There's always a modrm byte.
+ *  - There's never a SIB byte.
+ *  - The displacement is always 4 bytes.
  */
-static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
-							struct insn *insn)
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
 {
 	u8 *cursor;
 	u8 reg;
@@ -342,13 +349,12 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
 	}
 
 	/* Target address = address of next instruction + (signed) offset */
-	uprobe->arch_info.rip_rela_target_address = (long)insn->length
-					+ insn->displacement.value;
+	uprobe->arch_info.rip_rela_target_address = (long)insn->length + insn->displacement.value;
+
 	/* Displacement field is gone; slide immediate field (if any) over. */
 	if (insn->immediate.nbytes) {
 		cursor++;
-		memmove(cursor, cursor + insn->displacement.nbytes,
-						insn->immediate.nbytes);
+		memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
 	}
 	return;
 }
@@ -361,8 +367,10 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
 	insn_get_opcode(insn);
 	if (is_prefix_bad(insn))
 		return -ENOTSUPP;
+
 	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
 		return 0;
+
 	if (insn->opcode.nbytes == 2) {
 		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
 			return 0;
@@ -370,34 +378,31 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
 	return -ENOTSUPP;
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
-				struct insn *insn)
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
 {
 	if (mm->context.ia32_compat)
 		return validate_insn_32bits(uprobe, insn);
 	return validate_insn_64bits(uprobe, insn);
 }
-#else
-static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
-							struct insn *insn)
+#else /* 32-bit: */
+static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
 {
-	return;
+	/* No RIP-relative addressing on 32-bit */
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
-				struct insn *insn)
+static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
 {
 	return validate_insn_32bits(uprobe, insn);
 }
 #endif /* CONFIG_X86_64 */
 
 /**
- * analyze_insn - instruction analysis including validity and fixups.
+ * arch_uprobes_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
  * @uprobe: the probepoint information.
  * Return 0 on success or a -ve number on error.
  */
-int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
+int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
 {
 	int ret;
 	struct insn insn;
@@ -406,7 +411,9 @@ int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
 	ret = validate_insn_bits(mm, uprobe, &insn);
 	if (ret != 0)
 		return ret;
+
 	handle_riprel_insn(mm, uprobe, &insn);
 	prepare_fixups(uprobe, &insn);
+
 	return 0;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f1d13fd140f2..64e45f116b2a 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UPROBES_H
 #define _LINUX_UPROBES_H
 /*
- * Userspace Probes (UProbes)
+ * User-space Probes (UProbes)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,8 +40,10 @@ struct uprobe_arch_info {};
 #define uprobe_opcode_sz sizeof(uprobe_opcode_t)
 
 /* flags that denote/change uprobes behaviour */
+
 /* Have a copy of original instruction */
 #define UPROBES_COPY_INSN	0x1
+
 /* Dont run handlers when first register/ last unregister in progress*/
 #define UPROBES_RUN_HANDLER	0x2
 
@@ -70,27 +72,23 @@ struct uprobe {
 };
 
 #ifdef CONFIG_UPROBES
-extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
-							unsigned long vaddr);
-extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
-					unsigned long vaddr, bool verify);
+extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr);
+extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify);
 extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
-extern int register_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer);
-extern void unregister_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer);
-extern int mmap_uprobe(struct vm_area_struct *vma);
+extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
+extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
+extern int uprobe_mmap(struct vm_area_struct *vma);
 #else /* CONFIG_UPROBES is not defined */
-static inline int register_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer)
+static inline int
+uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
 {
 	return -ENOSYS;
 }
-static inline void unregister_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer)
+static inline void
+uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
 {
 }
-static inline int mmap_uprobe(struct vm_area_struct *vma)
+static inline int uprobe_mmap(struct vm_area_struct *vma)
 {
 	return 0;
 }
diff --git a/kernel/uprobes.c b/kernel/uprobes.c
index 72e8bb3b52cd..884817f1b0d3 100644
--- a/kernel/uprobes.c
+++ b/kernel/uprobes.c
@@ -1,5 +1,5 @@
 /*
- * Userspace Probes (UProbes)
+ * User-space Probes (UProbes)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -29,24 +29,26 @@
 #include <linux/rmap.h>		/* anon_vma_prepare */
 #include <linux/mmu_notifier.h>	/* set_pte_at_notify */
 #include <linux/swap.h>		/* try_to_free_swap */
+
 #include <linux/uprobes.h>
 
 static struct rb_root uprobes_tree = RB_ROOT;
+
 static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
 
 #define UPROBES_HASH_SZ	13
+
 /* serialize (un)register */
 static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-#define uprobes_hash(v)	(&uprobes_mutex[((unsigned long)(v)) %\
-						UPROBES_HASH_SZ])
+
+#define uprobes_hash(v)		(&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
 
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
-#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) %\
-						UPROBES_HASH_SZ])
+#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
 
 /*
- * uprobe_events allows us to skip the mmap_uprobe if there are no uprobe
+ * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
  * events active at this time.  Probably a fine grained per inode count is
  * better?
  */
@@ -58,9 +60,9 @@ static atomic_t uprobe_events = ATOMIC_INIT(0);
  * vm_area_struct wasnt recommended.
  */
 struct vma_info {
-	struct list_head probe_list;
-	struct mm_struct *mm;
-	loff_t vaddr;
+	struct list_head	probe_list;
+	struct mm_struct	*mm;
+	loff_t			vaddr;
 };
 
 /*
@@ -79,8 +81,7 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 	if (!is_register)
 		return true;
 
-	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) ==
-						(VM_READ|VM_EXEC))
+	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
 		return true;
 
 	return false;
@@ -92,6 +93,7 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
 
 	vaddr = vma->vm_start + offset;
 	vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+
 	return vaddr;
 }
 
@@ -105,8 +107,7 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
  *
  * Returns 0 on success, -EFAULT on failure.
  */
-static int __replace_page(struct vm_area_struct *vma, struct page *page,
-					struct page *kpage)
+static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -163,7 +164,7 @@ out:
  */
 bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
 {
-	return (*insn == UPROBES_BKPT_INSN);
+	return *insn == UPROBES_BKPT_INSN;
 }
 
 /*
@@ -203,6 +204,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
 	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
 	if (ret <= 0)
 		return ret;
+
 	ret = -EINVAL;
 
 	/*
@@ -239,6 +241,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
 	vaddr_new = kmap_atomic(new_page);
 
 	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+
 	/* poke the new insn in, ASSUMES we don't cross page boundary */
 	vaddr &= ~PAGE_MASK;
 	BUG_ON(vaddr + uprobe_opcode_sz > PAGE_SIZE);
@@ -260,7 +263,8 @@ unlock_out:
 	page_cache_release(new_page);
 
 put_out:
-	put_page(old_page);	/* we did a get_page in the beginning */
+	put_page(old_page);
+
 	return ret;
 }
 
@@ -276,8 +280,7 @@ put_out:
  * For mm @mm, read the opcode at @vaddr and store it in @opcode.
  * Return 0 (success) or a negative errno.
  */
-static int read_opcode(struct mm_struct *mm, unsigned long vaddr,
-						uprobe_opcode_t *opcode)
+static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
 {
 	struct page *page;
 	void *vaddr_new;
@@ -293,15 +296,18 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr,
 	memcpy(opcode, vaddr_new + vaddr, uprobe_opcode_sz);
 	kunmap_atomic(vaddr_new);
 	unlock_page(page);
-	put_page(page);		/* we did a get_user_pages in the beginning */
+
+	put_page(page);
+
 	return 0;
 }
 
 static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
 {
 	uprobe_opcode_t opcode;
-	int result = read_opcode(mm, vaddr, &opcode);
+	int result;
 
+	result = read_opcode(mm, vaddr, &opcode);
 	if (result)
 		return result;
 
@@ -320,11 +326,11 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
  * For mm @mm, store the breakpoint instruction at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
-						unsigned long vaddr)
+int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr)
 {
-	int result = is_bkpt_at_addr(mm, vaddr);
+	int result;
 
+	result = is_bkpt_at_addr(mm, vaddr);
 	if (result == 1)
 		return -EEXIST;
 
@@ -344,35 +350,35 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe,
  * For mm @mm, restore the original opcode (opcode) at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe,
-					unsigned long vaddr, bool verify)
+int __weak
+set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify)
 {
 	if (verify) {
-		int result = is_bkpt_at_addr(mm, vaddr);
+		int result;
 
+		result = is_bkpt_at_addr(mm, vaddr);
 		if (!result)
 			return -EINVAL;
 
 		if (result != 1)
 			return result;
 	}
-	return write_opcode(mm, uprobe, vaddr,
-				*(uprobe_opcode_t *)uprobe->insn);
+	return write_opcode(mm, uprobe, vaddr, *(uprobe_opcode_t *)uprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
 {
 	if (l->inode < r->inode)
 		return -1;
+
 	if (l->inode > r->inode)
 		return 1;
-	else {
-		if (l->offset < r->offset)
-			return -1;
 
-		if (l->offset > r->offset)
-			return 1;
-	}
+	if (l->offset < r->offset)
+		return -1;
+
+	if (l->offset > r->offset)
+		return 1;
 
 	return 0;
 }
@@ -391,6 +397,7 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
 			atomic_inc(&uprobe->ref);
 			return uprobe;
 		}
+
 		if (match < 0)
 			n = n->rb_left;
 		else
@@ -411,6 +418,7 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
 	spin_lock_irqsave(&uprobes_treelock, flags);
 	uprobe = __find_uprobe(inode, offset);
 	spin_unlock_irqrestore(&uprobes_treelock, flags);
+
 	return uprobe;
 }
 
@@ -436,16 +444,18 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
 			p = &parent->rb_right;
 
 	}
+
 	u = NULL;
 	rb_link_node(&uprobe->rb_node, parent, p);
 	rb_insert_color(&uprobe->rb_node, &uprobes_tree);
 	/* get access + creation ref */
 	atomic_set(&uprobe->ref, 2);
+
 	return u;
 }
 
 /*
- * Acquires uprobes_treelock.
+ * Acquire uprobes_treelock.
  * Matching uprobe already exists in rbtree;
  *	increment (access refcount) and return the matching uprobe.
  *
@@ -460,6 +470,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 	spin_lock_irqsave(&uprobes_treelock, flags);
 	u = __insert_uprobe(uprobe);
 	spin_unlock_irqrestore(&uprobes_treelock, flags);
+
 	return u;
 }
 
@@ -490,19 +501,22 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 		kfree(uprobe);
 		uprobe = cur_uprobe;
 		iput(inode);
-	} else
+	} else {
 		atomic_inc(&uprobe_events);
+	}
+
 	return uprobe;
 }
 
 /* Returns the previous consumer */
-static struct uprobe_consumer *add_consumer(struct uprobe *uprobe,
-				struct uprobe_consumer *consumer)
+static struct uprobe_consumer *
+consumer_add(struct uprobe *uprobe, struct uprobe_consumer *consumer)
 {
 	down_write(&uprobe->consumer_rwsem);
 	consumer->next = uprobe->consumers;
 	uprobe->consumers = consumer;
 	up_write(&uprobe->consumer_rwsem);
+
 	return consumer->next;
 }
 
@@ -511,8 +525,7 @@ static struct uprobe_consumer *add_consumer(struct uprobe *uprobe,
  * Return true if the @consumer is deleted successfully
  * or return false.
  */
-static bool del_consumer(struct uprobe *uprobe,
-				struct uprobe_consumer *consumer)
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer)
 {
 	struct uprobe_consumer **con;
 	bool ret = false;
@@ -526,6 +539,7 @@ static bool del_consumer(struct uprobe *uprobe,
 		}
 	}
 	up_write(&uprobe->consumer_rwsem);
+
 	return ret;
 }
 
@@ -557,15 +571,15 @@ static int __copy_insn(struct address_space *mapping,
 	memcpy(insn, vaddr + off1, nbytes);
 	kunmap_atomic(vaddr);
 	page_cache_release(page);
+
 	return 0;
 }
 
-static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma,
-					unsigned long addr)
+static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping;
-	int bytes;
 	unsigned long nbytes;
+	int bytes;
 
 	addr &= ~PAGE_MASK;
 	nbytes = PAGE_SIZE - addr;
@@ -605,6 +619,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 		return -EEXIST;
 
 	addr = (unsigned long)vaddr;
+
 	if (!(uprobe->flags & UPROBES_COPY_INSN)) {
 		ret = copy_insn(uprobe, vma, addr);
 		if (ret)
@@ -613,7 +628,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn))
 			return -EEXIST;
 
-		ret = analyze_insn(mm, uprobe);
+		ret = arch_uprobes_analyze_insn(mm, uprobe);
 		if (ret)
 			return ret;
 
@@ -624,8 +639,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 	return ret;
 }
 
-static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
-							loff_t vaddr)
+static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr)
 {
 	set_orig_insn(mm, uprobe, (unsigned long)vaddr, true);
 }
@@ -649,9 +663,11 @@ static struct vma_info *__find_next_vma_info(struct list_head *head,
 	struct prio_tree_iter iter;
 	struct vm_area_struct *vma;
 	struct vma_info *tmpvi;
-	loff_t vaddr;
-	unsigned long pgoff = offset >> PAGE_SHIFT;
+	unsigned long pgoff;
 	int existing_vma;
+	loff_t vaddr;
+
+	pgoff = offset >> PAGE_SHIFT;
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
@@ -659,6 +675,7 @@ static struct vma_info *__find_next_vma_info(struct list_head *head,
 
 		existing_vma = 0;
 		vaddr = vma_address(vma, offset);
+
 		list_for_each_entry(tmpvi, head, probe_list) {
 			if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
 				existing_vma = 1;
@@ -670,14 +687,15 @@ static struct vma_info *__find_next_vma_info(struct list_head *head,
 		 * Another vma needs a probe to be installed. However skip
 		 * installing the probe if the vma is about to be unlinked.
 		 */
-		if (!existing_vma &&
-				atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
+		if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
 			vi->mm = vma->vm_mm;
 			vi->vaddr = vaddr;
 			list_add(&vi->probe_list, head);
+
 			return vi;
 		}
 	}
+
 	return NULL;
 }
 
@@ -685,11 +703,12 @@ static struct vma_info *__find_next_vma_info(struct list_head *head,
  * Iterate in the rmap prio tree  and find a vma where a probe has not
  * yet been inserted.
  */
-static struct vma_info *find_next_vma_info(struct list_head *head,
-			loff_t offset, struct address_space *mapping,
-			bool is_register)
+static struct vma_info *
+find_next_vma_info(struct list_head *head, loff_t offset, struct address_space *mapping,
+		   bool is_register)
 {
 	struct vma_info *vi, *retvi;
+
 	vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
 	if (!vi)
 		return ERR_PTR(-ENOMEM);
@@ -700,6 +719,7 @@ static struct vma_info *find_next_vma_info(struct list_head *head,
 
 	if (!retvi)
 		kfree(vi);
+
 	return retvi;
 }
 
@@ -711,16 +731,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 	struct vma_info *vi, *tmpvi;
 	struct mm_struct *mm;
 	loff_t vaddr;
-	int ret = 0;
+	int ret;
 
 	mapping = uprobe->inode->i_mapping;
 	INIT_LIST_HEAD(&try_list);
-	while ((vi = find_next_vma_info(&try_list, uprobe->offset,
-					mapping, is_register)) != NULL) {
+
+	ret = 0;
+
+	for (;;) {
+		vi = find_next_vma_info(&try_list, uprobe->offset, mapping, is_register);
+		if (!vi)
+			break;
+
 		if (IS_ERR(vi)) {
 			ret = PTR_ERR(vi);
 			break;
 		}
+
 		mm = vi->mm;
 		down_read(&mm->mmap_sem);
 		vma = find_vma(mm, (unsigned long)vi->vaddr);
@@ -755,19 +782,21 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 				break;
 		}
 	}
+
 	list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
 		list_del(&vi->probe_list);
 		kfree(vi);
 	}
+
 	return ret;
 }
 
-static int __register_uprobe(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe)
 {
 	return register_for_each_vma(uprobe, true);
 }
 
-static void __unregister_uprobe(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe)
 {
 	if (!register_for_each_vma(uprobe, false))
 		delete_uprobe(uprobe);
@@ -776,15 +805,15 @@ static void __unregister_uprobe(struct uprobe *uprobe)
 }
 
 /*
- * register_uprobe - register a probe
+ * uprobe_register - register a probe
  * @inode: the file in which the probe has to be placed.
  * @offset: offset from the start of the file.
  * @consumer: information on howto handle the probe..
  *
- * Apart from the access refcount, register_uprobe() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
  * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
  * inserted into the rbtree (i.e first consumer for a @inode:@offset
- * tuple).  Creation refcount stops unregister_uprobe from freeing the
+ * tuple).  Creation refcount stops uprobe_unregister from freeing the
  * @uprobe even before the register operation is complete. Creation
  * refcount is released when the last @consumer for the @uprobe
  * unregisters.
@@ -792,28 +821,29 @@ static void __unregister_uprobe(struct uprobe *uprobe)
  * Return errno if it cannot successully install probes
  * else return 0 (success)
  */
-int register_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer)
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
 {
 	struct uprobe *uprobe;
-	int ret = -EINVAL;
+	int ret;
 
 	if (!inode || !consumer || consumer->next)
-		return ret;
+		return -EINVAL;
 
 	if (offset > i_size_read(inode))
-		return ret;
+		return -EINVAL;
 
 	ret = 0;
 	mutex_lock(uprobes_hash(inode));
 	uprobe = alloc_uprobe(inode, offset);
-	if (uprobe && !add_consumer(uprobe, consumer)) {
-		ret = __register_uprobe(uprobe);
+
+	if (uprobe && !consumer_add(uprobe, consumer)) {
+		ret = __uprobe_register(uprobe);
 		if (ret) {
 			uprobe->consumers = NULL;
-			__unregister_uprobe(uprobe);
-		} else
+			__uprobe_unregister(uprobe);
+		} else {
 			uprobe->flags |= UPROBES_RUN_HANDLER;
+		}
 	}
 
 	mutex_unlock(uprobes_hash(inode));
@@ -823,15 +853,14 @@ int register_uprobe(struct inode *inode, loff_t offset,
 }
 
 /*
- * unregister_uprobe - unregister a already registered probe.
+ * uprobe_unregister - unregister a already registered probe.
  * @inode: the file in which the probe has to be removed.
  * @offset: offset from the start of the file.
  * @consumer: identify which probe if multiple probes are colocated.
  */
-void unregister_uprobe(struct inode *inode, loff_t offset,
-				struct uprobe_consumer *consumer)
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
 {
-	struct uprobe *uprobe = NULL;
+	struct uprobe *uprobe;
 
 	if (!inode || !consumer)
 		return;
@@ -841,15 +870,14 @@ void unregister_uprobe(struct inode *inode, loff_t offset,
 		return;
 
 	mutex_lock(uprobes_hash(inode));
-	if (!del_consumer(uprobe, consumer))
-		goto unreg_out;
 
-	if (!uprobe->consumers) {
-		__unregister_uprobe(uprobe);
-		uprobe->flags &= ~UPROBES_RUN_HANDLER;
+	if (consumer_del(uprobe, consumer)) {
+		if (!uprobe->consumers) {
+			__uprobe_unregister(uprobe);
+			uprobe->flags &= ~UPROBES_RUN_HANDLER;
+		}
 	}
 
-unreg_out:
 	mutex_unlock(uprobes_hash(inode));
 	if (uprobe)
 		put_uprobe(uprobe);
@@ -870,6 +898,7 @@ static struct rb_node *find_least_offset_node(struct inode *inode)
 	while (n) {
 		uprobe = rb_entry(n, struct uprobe, rb_node);
 		match = match_uprobe(&u, uprobe);
+
 		if (uprobe->inode == inode)
 			close_node = n;
 
@@ -881,6 +910,7 @@ static struct rb_node *find_least_offset_node(struct inode *inode)
 		else
 			n = n->rb_right;
 	}
+
 	return close_node;
 }
 
@@ -890,11 +920,13 @@ static struct rb_node *find_least_offset_node(struct inode *inode)
 static void build_probe_list(struct inode *inode, struct list_head *head)
 {
 	struct uprobe *uprobe;
-	struct rb_node *n;
 	unsigned long flags;
+	struct rb_node *n;
 
 	spin_lock_irqsave(&uprobes_treelock, flags);
+
 	n = find_least_offset_node(inode);
+
 	for (; n; n = rb_next(n)) {
 		uprobe = rb_entry(n, struct uprobe, rb_node);
 		if (uprobe->inode != inode)
@@ -903,6 +935,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
 		list_add(&uprobe->pending_list, head);
 		atomic_inc(&uprobe->ref);
 	}
+
 	spin_unlock_irqrestore(&uprobes_treelock, flags);
 }
 
@@ -912,42 +945,44 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
  *
  * Return -ve no if we fail to insert probes and we cannot
  * bail-out.
- * Return 0 otherwise. i.e :
+ * Return 0 otherwise. i.e:
+ *
  *	- successful insertion of probes
  *	- (or) no possible probes to be inserted.
  *	- (or) insertion of probes failed but we can bail-out.
  */
-int mmap_uprobe(struct vm_area_struct *vma)
+int uprobe_mmap(struct vm_area_struct *vma)
 {
 	struct list_head tmp_list;
 	struct uprobe *uprobe, *u;
 	struct inode *inode;
-	int ret = 0;
+	int ret;
 
 	if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
-		return ret;	/* Bail-out */
+		return 0;
 
 	inode = vma->vm_file->f_mapping->host;
 	if (!inode)
-		return ret;
+		return 0;
 
 	INIT_LIST_HEAD(&tmp_list);
 	mutex_lock(uprobes_mmap_hash(inode));
 	build_probe_list(inode, &tmp_list);
+
+	ret = 0;
+
 	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
 		loff_t vaddr;
 
 		list_del(&uprobe->pending_list);
 		if (!ret) {
 			vaddr = vma_address(vma, uprobe->offset);
-			if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
-				put_uprobe(uprobe);
-				continue;
+			if (vaddr >= vma->vm_start && vaddr < vma->vm_end) {
+				ret = install_breakpoint(vma->vm_mm, uprobe, vma, vaddr);
+				/* Ignore double add: */
+				if (ret == -EEXIST)
+					ret = 0;
 			}
-			ret = install_breakpoint(vma->vm_mm, uprobe, vma,
-								vaddr);
-			if (ret == -EEXIST)
-				ret = 0;
 		}
 		put_uprobe(uprobe);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
index 1aed183636d7..5a863d328a44 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -618,10 +618,10 @@ again:			remove_next = 1 + (end > next->vm_end);
 		mutex_unlock(&mapping->i_mmap_mutex);
 
 	if (root) {
-		mmap_uprobe(vma);
+		uprobe_mmap(vma);
 
 		if (adjust_next)
-			mmap_uprobe(next);
+			uprobe_mmap(next);
 	}
 
 	if (remove_next) {
@@ -646,7 +646,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		}
 	}
 	if (insert && file)
-		mmap_uprobe(insert);
+		uprobe_mmap(insert);
 
 	validate_mm(mm);
 
@@ -1340,7 +1340,7 @@ out:
 	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
 
-	if (file && mmap_uprobe(vma))
+	if (file && uprobe_mmap(vma))
 		/* matching probes but cannot insert */
 		goto unmap_and_free_vma;
 
@@ -2301,7 +2301,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
 		return -ENOMEM;
 
-	if (vma->vm_file && mmap_uprobe(vma))
+	if (vma->vm_file && uprobe_mmap(vma))
 		return -EINVAL;
 
 	vma_link(mm, vma, prev, rb_link, rb_parent);
@@ -2374,7 +2374,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			if (new_vma->vm_file) {
 				get_file(new_vma->vm_file);
 
-				if (mmap_uprobe(new_vma))
+				if (uprobe_mmap(new_vma))
 					goto out_free_mempol;
 
 				if (vma->vm_flags & VM_EXECUTABLE)
-- 
cgit v1.2.3


From 04a3d984d32e47983770d314cdb4e4d8f38fccb7 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 22 Feb 2012 14:45:35 +0530
Subject: uprobes/core: Make instruction tables volatile

Some versions of gcc spits a warning about the asm operand for
test_bit and also causes the first long of the instruction table
to be output.

Fix is similar to 7115e3fc on arch/x86/kernel/kprobes.c

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Anton Arapov <anton@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Josh Stone <jistone@redhat.com>
Link: http://lkml.kernel.org/r/20120222091535.15880.12502.sendpatchset@srdronam.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uprobes.c | 61 +++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index cf2a18498425..13d616d6519b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -53,34 +53,12 @@
 	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
 	 << (row % 32))
 
-#ifdef CONFIG_X86_64
-static u32 good_insns_64[256 / 32] = {
-	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
-	/*      ----------------------------------------------         */
-	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
-	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
-	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
-	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
-	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
-	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
-	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
-	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
-	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
-	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
-	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
-	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
-	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
-	/*      ----------------------------------------------         */
-	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
-};
-#endif
-
-/* Good-instruction tables for 32-bit apps */
-
-static u32 good_insns_32[256 / 32] = {
+/*
+ * Good-instruction tables for 32-bit apps.  This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc to think only *(unsigned long*) is used.
+ */
+static volatile u32 good_insns_32[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
 	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
@@ -104,7 +82,7 @@ static u32 good_insns_32[256 / 32] = {
 };
 
 /* Using this for both 64-bit and 32-bit apps */
-static u32 good_2byte_insns[256 / 32] = {
+static volatile u32 good_2byte_insns[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 	/*      ----------------------------------------------         */
 	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
@@ -127,6 +105,31 @@ static u32 good_2byte_insns[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 };
 
+#ifdef CONFIG_X86_64
+/* Good-instruction tables for 64-bit apps */
+static volatile u32 good_insns_64[256 / 32] = {
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+	/*      ----------------------------------------------         */
+	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
+	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
+	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+	W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
+	/*      ----------------------------------------------         */
+	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
+};
+#endif
 #undef W
 
 /*
-- 
cgit v1.2.3


From 3ff54efdfaace9e9b2b7c1959a865be6b91de96c Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 22 Feb 2012 14:46:02 +0530
Subject: uprobes/core: Move insn to arch specific structure

Few cleanups suggested by Ingo Molnar.

- Rename struct uprobe_arch_info to struct arch_uprobe.
- Move insn from struct uprobe to struct arch_uprobe.
- Make arch specific uprobe functions to accept struct arch_uprobe
  instead of  struct uprobe.
- Move struct uprobe to kernel/uprobes.c from include/linux/uprobes.h

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Anton Arapov <anton@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Josh Stone <jistone@redhat.com>
Link: http://lkml.kernel.org/r/20120222091602.15880.40249.sendpatchset@srdronam.in.ibm.com
[ Made various small improvements ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uprobes.h |  6 ++---
 arch/x86/kernel/uprobes.c      | 60 +++++++++++++++++++++---------------------
 include/linux/uprobes.h        | 23 ++--------------
 kernel/events/uprobes.c        | 38 +++++++++++++++++---------
 4 files changed, 61 insertions(+), 66 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 072df3902636..f7ce310a429d 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -31,13 +31,13 @@ typedef u8 uprobe_opcode_t;
 #define UPROBES_BKPT_INSN		0xcc
 #define UPROBES_BKPT_INSN_SIZE		   1
 
-struct uprobe_arch_info {
+struct arch_uprobe {
 	u16				fixups;
+	u8				insn[MAX_UINSN_BYTES];
 #ifdef CONFIG_X86_64
 	unsigned long			rip_rela_target_address;
 #endif
 };
 
-struct uprobe;
-extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe);
+extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *arch_uprobe);
 #endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 13d616d6519b..04dfcef2d028 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -200,9 +200,9 @@ static bool is_prefix_bad(struct insn *insn)
 	return false;
 }
 
-static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
+static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
 {
-	insn_init(insn, uprobe->insn, false);
+	insn_init(insn, auprobe->insn, false);
 
 	/* Skip good instruction prefixes; reject "bad" ones. */
 	insn_get_opcode(insn);
@@ -222,11 +222,11 @@ static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
 
 /*
  * Figure out which fixups post_xol() will need to perform, and annotate
- * uprobe->arch_info.fixups accordingly.  To start with,
- * uprobe->arch_info.fixups is either zero or it reflects rip-related
+ * arch_uprobe->fixups accordingly.  To start with,
+ * arch_uprobe->fixups is either zero or it reflects rip-related
  * fixups.
  */
-static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
+static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
 {
 	bool fix_ip = true, fix_call = false;	/* defaults */
 	int reg;
@@ -269,17 +269,17 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
 		break;
 	}
 	if (fix_ip)
-		uprobe->arch_info.fixups |= UPROBES_FIX_IP;
+		auprobe->fixups |= UPROBES_FIX_IP;
 	if (fix_call)
-		uprobe->arch_info.fixups |= UPROBES_FIX_CALL;
+		auprobe->fixups |= UPROBES_FIX_CALL;
 }
 
 #ifdef CONFIG_X86_64
 /*
- * If uprobe->insn doesn't use rip-relative addressing, return
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
  * its memory operand indirectly through a scratch register.  Set
- * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address
+ * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
  * accordingly.  (The contents of the scratch register will be saved
  * before we single-step the modified instruction, and restored
  * afterward.)
@@ -297,7 +297,7 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
  *  - There's never a SIB byte.
  *  - The displacement is always 4 bytes.
  */
-static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
+static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
 {
 	u8 *cursor;
 	u8 reg;
@@ -305,7 +305,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru
 	if (mm->context.ia32_compat)
 		return;
 
-	uprobe->arch_info.rip_rela_target_address = 0x0;
+	auprobe->rip_rela_target_address = 0x0;
 	if (!insn_rip_relative(insn))
 		return;
 
@@ -315,7 +315,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru
 	 * we want to encode rax/rcx, not r8/r9.
 	 */
 	if (insn->rex_prefix.nbytes) {
-		cursor = uprobe->insn + insn_offset_rex_prefix(insn);
+		cursor = auprobe->insn + insn_offset_rex_prefix(insn);
 		*cursor &= 0xfe;	/* Clearing REX.B bit */
 	}
 
@@ -324,7 +324,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru
 	 * displacement.  Beyond the displacement, for some instructions,
 	 * is the immediate operand.
 	 */
-	cursor = uprobe->insn + insn_offset_modrm(insn);
+	cursor = auprobe->insn + insn_offset_modrm(insn);
 	insn_get_length(insn);
 
 	/*
@@ -341,18 +341,18 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru
 		 * is NOT the register operand, so we use %rcx (register
 		 * #1) for the scratch register.
 		 */
-		uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX;
+		auprobe->fixups = UPROBES_FIX_RIP_CX;
 		/* Change modrm from 00 000 101 to 00 000 001. */
 		*cursor = 0x1;
 	} else {
 		/* Use %rax (register #0) for the scratch register. */
-		uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX;
+		auprobe->fixups = UPROBES_FIX_RIP_AX;
 		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
 		*cursor = (reg << 3);
 	}
 
 	/* Target address = address of next instruction + (signed) offset */
-	uprobe->arch_info.rip_rela_target_address = (long)insn->length + insn->displacement.value;
+	auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
 
 	/* Displacement field is gone; slide immediate field (if any) over. */
 	if (insn->immediate.nbytes) {
@@ -362,9 +362,9 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru
 	return;
 }
 
-static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
+static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
 {
-	insn_init(insn, uprobe->insn, true);
+	insn_init(insn, auprobe->insn, true);
 
 	/* Skip good instruction prefixes; reject "bad" ones. */
 	insn_get_opcode(insn);
@@ -381,42 +381,42 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
 	return -ENOTSUPP;
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
+static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
 {
 	if (mm->context.ia32_compat)
-		return validate_insn_32bits(uprobe, insn);
-	return validate_insn_64bits(uprobe, insn);
+		return validate_insn_32bits(auprobe, insn);
+	return validate_insn_64bits(auprobe, insn);
 }
 #else /* 32-bit: */
-static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
+static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
 {
 	/* No RIP-relative addressing on 32-bit */
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn)
+static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
 {
-	return validate_insn_32bits(uprobe, insn);
+	return validate_insn_32bits(auprobe, insn);
 }
 #endif /* CONFIG_X86_64 */
 
 /**
  * arch_uprobes_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
- * @uprobe: the probepoint information.
+ * @arch_uprobe: the probepoint information.
  * Return 0 on success or a -ve number on error.
  */
-int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
+int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *auprobe)
 {
 	int ret;
 	struct insn insn;
 
-	uprobe->arch_info.fixups = 0;
-	ret = validate_insn_bits(mm, uprobe, &insn);
+	auprobe->fixups = 0;
+	ret = validate_insn_bits(mm, auprobe, &insn);
 	if (ret != 0)
 		return ret;
 
-	handle_riprel_insn(mm, uprobe, &insn);
-	prepare_fixups(uprobe, &insn);
+	handle_riprel_insn(mm, auprobe, &insn);
+	prepare_fixups(auprobe, &insn);
 
 	return 0;
 }
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index fd45b70750d4..9c6be62787ed 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -29,12 +29,6 @@
 struct vm_area_struct;
 #ifdef CONFIG_ARCH_SUPPORTS_UPROBES
 #include <asm/uprobes.h>
-#else
-
-typedef u8 uprobe_opcode_t;
-struct uprobe_arch_info {};
-
-#define MAX_UINSN_BYTES 4
 #endif
 
 /* flags that denote/change uprobes behaviour */
@@ -56,22 +50,9 @@ struct uprobe_consumer {
 	struct uprobe_consumer *next;
 };
 
-struct uprobe {
-	struct rb_node		rb_node;	/* node in the rb tree */
-	atomic_t		ref;
-	struct rw_semaphore	consumer_rwsem;
-	struct list_head	pending_list;
-	struct uprobe_arch_info arch_info;
-	struct uprobe_consumer	*consumers;
-	struct inode		*inode;		/* Also hold a ref to inode */
-	loff_t			offset;
-	int			flags;
-	u8			insn[MAX_UINSN_BYTES];
-};
-
 #ifdef CONFIG_UPROBES
-extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr);
-extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify);
+extern int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr);
+extern int __weak set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify);
 extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ee496ad95db3..13f1b5909af4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -65,6 +65,18 @@ struct vma_info {
 	loff_t			vaddr;
 };
 
+struct uprobe {
+	struct rb_node		rb_node;	/* node in the rb tree */
+	atomic_t		ref;
+	struct rw_semaphore	consumer_rwsem;
+	struct list_head	pending_list;
+	struct uprobe_consumer	*consumers;
+	struct inode		*inode;		/* Also hold a ref to inode */
+	loff_t			offset;
+	int			flags;
+	struct arch_uprobe	arch;
+};
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
@@ -180,7 +192,7 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
 /*
  * write_opcode - write the opcode at a given virtual address.
  * @mm: the probed process address space.
- * @uprobe: the breakpointing information.
+ * @arch_uprobe: the breakpointing information.
  * @vaddr: the virtual address to store the opcode.
  * @opcode: opcode to be written at @vaddr.
  *
@@ -190,13 +202,14 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
  * For mm @mm, write the opcode at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
+static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe,
 			unsigned long vaddr, uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
 	struct address_space *mapping;
 	void *vaddr_old, *vaddr_new;
 	struct vm_area_struct *vma;
+	struct uprobe *uprobe;
 	loff_t addr;
 	int ret;
 
@@ -216,6 +229,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe,
 	if (!valid_vma(vma, is_bkpt_insn(&opcode)))
 		goto put_out;
 
+	uprobe = container_of(auprobe, struct uprobe, arch);
 	mapping = uprobe->inode->i_mapping;
 	if (mapping != vma->vm_file->f_mapping)
 		goto put_out;
@@ -326,7 +340,7 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
  * For mm @mm, store the breakpoint instruction at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr)
+int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr)
 {
 	int result;
 
@@ -337,7 +351,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long v
 	if (result)
 		return result;
 
-	return write_opcode(mm, uprobe, vaddr, UPROBES_BKPT_INSN);
+	return write_opcode(mm, auprobe, vaddr, UPROBES_BKPT_INSN);
 }
 
 /**
@@ -351,7 +365,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long v
  * Return 0 (success) or a negative errno.
  */
 int __weak
-set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify)
+set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify)
 {
 	if (verify) {
 		int result;
@@ -363,7 +377,7 @@ set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr,
 		if (result != 1)
 			return result;
 	}
-	return write_opcode(mm, uprobe, vaddr, *(uprobe_opcode_t *)uprobe->insn);
+	return write_opcode(mm, auprobe, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -593,13 +607,13 @@ static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned
 
 	/* Instruction at the page-boundary; copy bytes in second page */
 	if (nbytes < bytes) {
-		if (__copy_insn(mapping, vma, uprobe->insn + nbytes,
+		if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
 				bytes - nbytes, uprobe->offset + nbytes))
 			return -ENOMEM;
 
 		bytes = nbytes;
 	}
-	return __copy_insn(mapping, vma, uprobe->insn, bytes, uprobe->offset);
+	return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
 }
 
 static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
@@ -625,23 +639,23 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 		if (ret)
 			return ret;
 
-		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn))
+		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->arch.insn))
 			return -EEXIST;
 
-		ret = arch_uprobes_analyze_insn(mm, uprobe);
+		ret = arch_uprobes_analyze_insn(mm, &uprobe->arch);
 		if (ret)
 			return ret;
 
 		uprobe->flags |= UPROBES_COPY_INSN;
 	}
-	ret = set_bkpt(mm, uprobe, addr);
+	ret = set_bkpt(mm, &uprobe->arch, addr);
 
 	return ret;
 }
 
 static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr)
 {
-	set_orig_insn(mm, uprobe, (unsigned long)vaddr, true);
+	set_orig_insn(mm, &uprobe->arch, (unsigned long)vaddr, true);
 }
 
 static void delete_uprobe(struct uprobe *uprobe)
-- 
cgit v1.2.3


From 722bc6b16771ed80871e1fd81c86d3627dda2ac8 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 5 Mar 2012 15:05:13 -0800
Subject: x86/mm: Fix the size calculation of mapping tables

For machines that enable PSE, the first 2/4M memory region still uses
4K pages, so needs more PTEs in this case, but
find_early_table_space() doesn't count this.

This patch fixes it.

The bug was found via code review, no misbehavior of the kernel
was observed.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: <ianfang.cn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-kq6a00qe33h7c7ais2xsywnh@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6cabf6570d64..2e92fdcbea86 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -30,8 +30,14 @@ int direct_gbpages
 #endif
 ;
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-					  int use_gbpages)
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+static void __init find_early_table_space(struct map_range *mr, unsigned long end,
+					  int use_pse, int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
 	phys_addr_t base;
@@ -56,6 +62,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
 		extra += PMD_SIZE;
 #endif
+		/* The first 2/4M doesn't use large pages. */
+		extra += mr->end - mr->start;
+
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -85,12 +94,6 @@ void __init native_pagetable_reserve(u64 start, u64 end)
 	memblock_reserve(start, end - start);
 }
 
-struct map_range {
-	unsigned long start;
-	unsigned long end;
-	unsigned page_size_mask;
-};
-
 #ifdef CONFIG_X86_32
 #define NR_RANGE_MR 3
 #else /* CONFIG_X86_64 */
@@ -262,7 +265,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	 * nodes are discovered.
 	 */
 	if (!after_bootmem)
-		find_early_table_space(end, use_pse, use_gbpages);
+		find_early_table_space(&mr[0], end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-- 
cgit v1.2.3


From 510419435c6948fb32959d691bf84eaba41ca474 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 15 Dec 2011 17:56:36 +0100
Subject: perf/x86: Implement IBS event configuration

This patch implements perf configuration for AMD IBS. The IBS
pmu is selected using the type attribute in sysfs. There are two
types of ibs pmus, for instruction fetch (IBS_FETCH) and for
instruction execution (IBS_OP):

 /sys/bus/event_source/devices/ibs_fetch/type
 /sys/bus/event_source/devices/ibs_op/type

Except for the sample period IBS can only be set up with raw
config values and raw data samples. The event attributes for the
syscall should be programmed like this (IBS_FETCH):

        type = get_pmu_type("/sys/bus/event_source/devices/ibs_fetch/type");

        memset(&attr, 0, sizeof(attr));
        attr.type        = type;
        attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
        attr.config      = IBS_FETCH_CONFIG_DEFAULT;

This implementation does not yet support 64 bit counters. It is
limited to the hardware counter bit width which is 20 bits. 64
bit support can be added later.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1323968199-9326-2-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 92 +++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d30d14e..36684eb248de 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -16,12 +16,67 @@ static u32 ibs_caps;
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
 
-static struct pmu perf_ibs;
+#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
+
+struct perf_ibs {
+	struct pmu	pmu;
+	unsigned int	msr;
+	u64		config_mask;
+	u64		cnt_mask;
+	u64		enable_mask;
+};
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+	if (perf_ibs_fetch.pmu.type == type)
+		return &perf_ibs_fetch;
+	if (perf_ibs_op.pmu.type == type)
+		return &perf_ibs_op;
+	return NULL;
+}
 
 static int perf_ibs_init(struct perf_event *event)
 {
-	if (perf_ibs.type != event->attr.type)
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs;
+	u64 max_cnt, config;
+
+	perf_ibs = get_ibs_pmu(event->attr.type);
+	if (!perf_ibs)
 		return -ENOENT;
+
+	config = event->attr.config;
+	if (config & ~perf_ibs->config_mask)
+		return -EINVAL;
+
+	if (hwc->sample_period) {
+		if (config & perf_ibs->cnt_mask)
+			/* raw max_cnt may not be set */
+			return -EINVAL;
+		if (hwc->sample_period & 0x0f)
+			/* lower 4 bits can not be set in ibs max cnt */
+			return -EINVAL;
+		max_cnt = hwc->sample_period >> 4;
+		if (max_cnt & ~perf_ibs->cnt_mask)
+			/* out of range */
+			return -EINVAL;
+		config |= max_cnt;
+	} else {
+		max_cnt = config & perf_ibs->cnt_mask;
+		event->attr.sample_period = max_cnt << 4;
+		hwc->sample_period = event->attr.sample_period;
+	}
+
+	if (!max_cnt)
+		return -EINVAL;
+
+	hwc->config_base = perf_ibs->msr;
+	hwc->config = config;
+
 	return 0;
 }
 
@@ -34,10 +89,32 @@ static void perf_ibs_del(struct perf_event *event, int flags)
 {
 }
 
-static struct pmu perf_ibs = {
-	.event_init= perf_ibs_init,
-	.add= perf_ibs_add,
-	.del= perf_ibs_del,
+static struct perf_ibs perf_ibs_fetch = {
+	.pmu = {
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= perf_ibs_init,
+		.add		= perf_ibs_add,
+		.del		= perf_ibs_del,
+	},
+	.msr			= MSR_AMD64_IBSFETCHCTL,
+	.config_mask		= IBS_FETCH_CONFIG_MASK,
+	.cnt_mask		= IBS_FETCH_MAX_CNT,
+	.enable_mask		= IBS_FETCH_ENABLE,
+};
+
+static struct perf_ibs perf_ibs_op = {
+	.pmu = {
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= perf_ibs_init,
+		.add		= perf_ibs_add,
+		.del		= perf_ibs_del,
+	},
+	.msr			= MSR_AMD64_IBSOPCTL,
+	.config_mask		= IBS_OP_CONFIG_MASK,
+	.cnt_mask		= IBS_OP_MAX_CNT,
+	.enable_mask		= IBS_OP_ENABLE,
 };
 
 static __init int perf_event_ibs_init(void)
@@ -45,7 +122,8 @@ static __init int perf_event_ibs_init(void)
 	if (!ibs_caps)
 		return -ENODEV;	/* ibs not supported by the cpu */
 
-	perf_pmu_register(&perf_ibs, "ibs", -1);
+	perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1);
+	perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1);
 	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
 
 	return 0;
-- 
cgit v1.2.3


From b7074f1fbd6149eac1ec25063e4a364c39a85473 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 15 Dec 2011 17:56:37 +0100
Subject: perf/x86: Implement IBS interrupt handler

This patch implements code to handle ibs interrupts. If ibs data
is available a raw perf_event data sample is created and sent
back to the userland. This patch only implements the storage of
ibs data in the raw sample, but this could be extended in a
later patch by generating generic event data such as the rip
from the ibs sampling data.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1323968199-9326-3-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h         |  5 ++
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 84 ++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a6962d9161a0..4e3cd382a06f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -127,6 +127,8 @@
 #define MSR_AMD64_IBSFETCHCTL		0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD		0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD	0xc0011032
+#define MSR_AMD64_IBSFETCH_REG_COUNT	3
+#define MSR_AMD64_IBSFETCH_REG_MASK	((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
 #define MSR_AMD64_IBSOPCTL		0xc0011033
 #define MSR_AMD64_IBSOPRIP		0xc0011034
 #define MSR_AMD64_IBSOPDATA		0xc0011035
@@ -134,8 +136,11 @@
 #define MSR_AMD64_IBSOPDATA3		0xc0011037
 #define MSR_AMD64_IBSDCLINAD		0xc0011038
 #define MSR_AMD64_IBSDCPHYSAD		0xc0011039
+#define MSR_AMD64_IBSOP_REG_COUNT	7
+#define MSR_AMD64_IBSOP_REG_MASK	((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
 #define MSR_AMD64_IBSCTL		0xc001103a
 #define MSR_AMD64_IBSBRTARGET		0xc001103b
+#define MSR_AMD64_IBS_REG_COUNT_MAX	8 /* includes MSR_AMD64_IBSBRTARGET */
 
 /* Fam 15h MSRs */
 #define MSR_F15H_PERF_CTL		0xc0010200
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 36684eb248de..a7ec6bdf0a63 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -16,6 +16,11 @@ static u32 ibs_caps;
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
 
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
 #define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
 #define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
 
@@ -25,6 +30,18 @@ struct perf_ibs {
 	u64		config_mask;
 	u64		cnt_mask;
 	u64		enable_mask;
+	u64		valid_mask;
+	unsigned long	offset_mask[1];
+	int		offset_max;
+};
+
+struct perf_ibs_data {
+	u32		size;
+	union {
+		u32	data[0];	/* data buffer starts here */
+		u32	caps;
+	};
+	u64		regs[MSR_AMD64_IBS_REG_COUNT_MAX];
 };
 
 static struct perf_ibs perf_ibs_fetch;
@@ -101,6 +118,9 @@ static struct perf_ibs perf_ibs_fetch = {
 	.config_mask		= IBS_FETCH_CONFIG_MASK,
 	.cnt_mask		= IBS_FETCH_MAX_CNT,
 	.enable_mask		= IBS_FETCH_ENABLE,
+	.valid_mask		= IBS_FETCH_VAL,
+	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
+	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
 };
 
 static struct perf_ibs perf_ibs_op = {
@@ -115,8 +135,71 @@ static struct perf_ibs perf_ibs_op = {
 	.config_mask		= IBS_OP_CONFIG_MASK,
 	.cnt_mask		= IBS_OP_MAX_CNT,
 	.enable_mask		= IBS_OP_ENABLE,
+	.valid_mask		= IBS_OP_VAL,
+	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
+	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
 };
 
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+	struct perf_event *event = NULL;
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	struct perf_ibs_data ibs_data;
+	int offset, size;
+	unsigned int msr;
+	u64 *buf;
+
+	msr = hwc->config_base;
+	buf = ibs_data.regs;
+	rdmsrl(msr, *buf);
+	if (!(*buf++ & perf_ibs->valid_mask))
+		return 0;
+
+	perf_sample_data_init(&data, 0);
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		ibs_data.caps = ibs_caps;
+		size = 1;
+		offset = 1;
+		do {
+		    rdmsrl(msr + offset, *buf++);
+		    size++;
+		    offset = find_next_bit(perf_ibs->offset_mask,
+					   perf_ibs->offset_max,
+					   offset + 1);
+		} while (offset < perf_ibs->offset_max);
+		raw.size = sizeof(u32) + sizeof(u64) * size;
+		raw.data = ibs_data.data;
+		data.raw = &raw;
+	}
+
+	regs = *iregs; /* XXX: update ip from ibs sample */
+
+	if (perf_event_overflow(event, &data, &regs))
+		; /* stop */
+	else
+		/* reenable */
+		wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask);
+
+	return 1;
+}
+
+static int __kprobes
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	int handled = 0;
+
+	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
+
 static __init int perf_event_ibs_init(void)
 {
 	if (!ibs_caps)
@@ -124,6 +207,7 @@ static __init int perf_event_ibs_init(void)
 
 	perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1);
 	perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1);
+	register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs");
 	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
 
 	return 0;
-- 
cgit v1.2.3


From 4db2e8e6500d9ba6406f2714fa3968b39a325274 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 15 Dec 2011 17:56:38 +0100
Subject: perf/x86: Implement IBS pmu control ops

Add code to control the IBS pmu. We need to maintain per-cpu
states. Since some states are used and changed by the nmi
handler, access to these states must be atomic.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1323968199-9326-4-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 106 ++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a7ec6bdf0a63..40a6d9d5dd23 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -24,6 +24,19 @@ static u32 ibs_caps;
 #define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
 #define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
 
+enum ibs_states {
+	IBS_ENABLED	= 0,
+	IBS_STARTED	= 1,
+	IBS_STOPPING	= 2,
+
+	IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+	struct perf_event	*event;
+	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
 struct perf_ibs {
 	struct pmu	pmu;
 	unsigned int	msr;
@@ -33,6 +46,7 @@ struct perf_ibs {
 	u64		valid_mask;
 	unsigned long	offset_mask[1];
 	int		offset_max;
+	struct cpu_perf_ibs __percpu *pcpu;
 };
 
 struct perf_ibs_data {
@@ -97,15 +111,66 @@ static int perf_ibs_init(struct perf_event *event)
 	return 0;
 }
 
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	if (test_and_set_bit(IBS_STARTED, pcpu->state))
+		return;
+
+	wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	u64 val;
+
+	if (!test_and_clear_bit(IBS_STARTED, pcpu->state))
+		return;
+
+	set_bit(IBS_STOPPING, pcpu->state);
+
+	rdmsrl(hwc->config_base, val);
+	val &= ~perf_ibs->enable_mask;
+	wrmsrl(hwc->config_base, val);
+}
+
 static int perf_ibs_add(struct perf_event *event, int flags)
 {
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+		return -ENOSPC;
+
+	pcpu->event = event;
+
+	if (flags & PERF_EF_START)
+		perf_ibs_start(event, PERF_EF_RELOAD);
+
 	return 0;
 }
 
 static void perf_ibs_del(struct perf_event *event, int flags)
 {
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+		return;
+
+	perf_ibs_stop(event, 0);
+
+	pcpu->event = NULL;
 }
 
+static void perf_ibs_read(struct perf_event *event) { }
+
 static struct perf_ibs perf_ibs_fetch = {
 	.pmu = {
 		.task_ctx_nr	= perf_invalid_context,
@@ -113,6 +178,9 @@ static struct perf_ibs perf_ibs_fetch = {
 		.event_init	= perf_ibs_init,
 		.add		= perf_ibs_add,
 		.del		= perf_ibs_del,
+		.start		= perf_ibs_start,
+		.stop		= perf_ibs_stop,
+		.read		= perf_ibs_read,
 	},
 	.msr			= MSR_AMD64_IBSFETCHCTL,
 	.config_mask		= IBS_FETCH_CONFIG_MASK,
@@ -130,6 +198,9 @@ static struct perf_ibs perf_ibs_op = {
 		.event_init	= perf_ibs_init,
 		.add		= perf_ibs_add,
 		.del		= perf_ibs_del,
+		.start		= perf_ibs_start,
+		.stop		= perf_ibs_stop,
+		.read		= perf_ibs_read,
 	},
 	.msr			= MSR_AMD64_IBSOPCTL,
 	.config_mask		= IBS_OP_CONFIG_MASK,
@@ -142,7 +213,8 @@ static struct perf_ibs perf_ibs_op = {
 
 static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 {
-	struct perf_event *event = NULL;
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	struct perf_event *event = pcpu->event;
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_sample_data data;
 	struct perf_raw_record raw;
@@ -152,6 +224,14 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	unsigned int msr;
 	u64 *buf;
 
+	if (!test_bit(IBS_STARTED, pcpu->state)) {
+		/* Catch spurious interrupts after stopping IBS: */
+		if (!test_and_clear_bit(IBS_STOPPING, pcpu->state))
+			return 0;
+		rdmsrl(perf_ibs->msr, *ibs_data.regs);
+		return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0;
+	}
+
 	msr = hwc->config_base;
 	buf = ibs_data.regs;
 	rdmsrl(msr, *buf);
@@ -200,13 +280,33 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 	return handled;
 }
 
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+	struct cpu_perf_ibs __percpu *pcpu;
+	int ret;
+
+	pcpu = alloc_percpu(struct cpu_perf_ibs);
+	if (!pcpu)
+		return -ENOMEM;
+
+	perf_ibs->pcpu = pcpu;
+
+	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+	if (ret) {
+		perf_ibs->pcpu = NULL;
+		free_percpu(pcpu);
+	}
+
+	return ret;
+}
+
 static __init int perf_event_ibs_init(void)
 {
 	if (!ibs_caps)
 		return -ENODEV;	/* ibs not supported by the cpu */
 
-	perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1);
-	perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1);
+	perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+	perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
 	register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs");
 	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
 
-- 
cgit v1.2.3


From db98c5faf8cb350212ea3af786cb3ba0d4e7a01e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 15 Dec 2011 17:56:39 +0100
Subject: perf/x86: Implement 64-bit counter support for IBS

This patch implements 64 bit counter support for IBS. The
sampling period is no longer limited to the hw counter width.

The functions perf_event_set_period() and
perf_event_try_update() can be used as generic functions. They
can replace similar code that is duplicate across architectures.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1323968199-9326-5-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h        |   2 +
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 204 +++++++++++++++++++++++++++----
 2 files changed, 185 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e8fb2c7a5f4f..9cf66965141d 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -177,6 +177,8 @@ struct x86_pmu_capability {
 #define IBS_FETCH_MAX_CNT	0x0000FFFFULL
 
 /* IbsOpCtl bits */
+/* lower 4 bits of the current count are ignored: */
+#define IBS_OP_CUR_CNT		(0xFFFF0ULL<<32)
 #define IBS_OP_CNT_CTL		(1ULL<<19)
 #define IBS_OP_VAL		(1ULL<<18)
 #define IBS_OP_ENABLE		(1ULL<<17)
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 40a6d9d5dd23..573d24873459 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -44,9 +44,11 @@ struct perf_ibs {
 	u64		cnt_mask;
 	u64		enable_mask;
 	u64		valid_mask;
+	u64		max_period;
 	unsigned long	offset_mask[1];
 	int		offset_max;
 	struct cpu_perf_ibs __percpu *pcpu;
+	u64		(*get_count)(u64 config);
 };
 
 struct perf_ibs_data {
@@ -58,6 +60,78 @@ struct perf_ibs_data {
 	u64		regs[MSR_AMD64_IBS_REG_COUNT_MAX];
 };
 
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count)
+{
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int overflow = 0;
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		overflow = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		overflow = 1;
+	}
+
+	if (unlikely(left < min))
+		left = min;
+
+	if (left > max)
+		left = max;
+
+	*count = (u64)left;
+
+	return overflow;
+}
+
+static  int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int shift = 64 - width;
+	u64 prev_raw_count;
+	u64 delta;
+
+	/*
+	 * Careful: an NMI might modify the previous event value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic event atomically:
+	 */
+	prev_raw_count = local64_read(&hwc->prev_count);
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		return 0;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+
+	return 1;
+}
+
 static struct perf_ibs perf_ibs_fetch;
 static struct perf_ibs perf_ibs_op;
 
@@ -91,18 +165,14 @@ static int perf_ibs_init(struct perf_event *event)
 		if (hwc->sample_period & 0x0f)
 			/* lower 4 bits can not be set in ibs max cnt */
 			return -EINVAL;
-		max_cnt = hwc->sample_period >> 4;
-		if (max_cnt & ~perf_ibs->cnt_mask)
-			/* out of range */
-			return -EINVAL;
-		config |= max_cnt;
 	} else {
 		max_cnt = config & perf_ibs->cnt_mask;
+		config &= ~perf_ibs->cnt_mask;
 		event->attr.sample_period = max_cnt << 4;
 		hwc->sample_period = event->attr.sample_period;
 	}
 
-	if (!max_cnt)
+	if (!hwc->sample_period)
 		return -EINVAL;
 
 	hwc->config_base = perf_ibs->msr;
@@ -111,16 +181,71 @@ static int perf_ibs_init(struct perf_event *event)
 	return 0;
 }
 
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+			       struct hw_perf_event *hwc, u64 *period)
+{
+	int ret;
+
+	/* ignore lower 4 bits in min count: */
+	ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+	local64_set(&hwc->prev_count, 0);
+
+	return ret;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+	return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+	return (config & IBS_OP_CUR_CNT) >> 32;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+		      u64 config)
+{
+	u64 count = perf_ibs->get_count(config);
+
+	while (!perf_event_try_update(event, count, 20)) {
+		rdmsrl(event->hw.config_base, config);
+		count = perf_ibs->get_count(config);
+	}
+}
+
+/* Note: The enable mask must be encoded in the config argument. */
+static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config)
+{
+	wrmsrl(hwc->config_base, hwc->config | config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
 static void perf_ibs_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	u64 config;
 
-	if (test_and_set_bit(IBS_STARTED, pcpu->state))
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
 		return;
 
-	wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask);
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	perf_ibs_set_period(perf_ibs, hwc, &config);
+	config = (config >> 4) | perf_ibs->enable_mask;
+	set_bit(IBS_STARTED, pcpu->state);
+	perf_ibs_enable_event(hwc, config);
+
+	perf_event_update_userpage(event);
 }
 
 static void perf_ibs_stop(struct perf_event *event, int flags)
@@ -129,15 +254,28 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 	u64 val;
+	int stopping;
 
-	if (!test_and_clear_bit(IBS_STARTED, pcpu->state))
-		return;
+	stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
 
-	set_bit(IBS_STOPPING, pcpu->state);
+	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+		return;
 
 	rdmsrl(hwc->config_base, val);
-	val &= ~perf_ibs->enable_mask;
-	wrmsrl(hwc->config_base, val);
+
+	if (stopping) {
+		set_bit(IBS_STOPPING, pcpu->state);
+		val &= ~perf_ibs->enable_mask;
+		wrmsrl(hwc->config_base, val);
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	perf_ibs_event_update(perf_ibs, event, val);
+	hwc->state |= PERF_HES_UPTODATE;
 }
 
 static int perf_ibs_add(struct perf_event *event, int flags)
@@ -148,6 +286,8 @@ static int perf_ibs_add(struct perf_event *event, int flags)
 	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
 		return -ENOSPC;
 
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
 	pcpu->event = event;
 
 	if (flags & PERF_EF_START)
@@ -164,9 +304,11 @@ static void perf_ibs_del(struct perf_event *event, int flags)
 	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
 		return;
 
-	perf_ibs_stop(event, 0);
+	perf_ibs_stop(event, PERF_EF_UPDATE);
 
 	pcpu->event = NULL;
+
+	perf_event_update_userpage(event);
 }
 
 static void perf_ibs_read(struct perf_event *event) { }
@@ -187,8 +329,11 @@ static struct perf_ibs perf_ibs_fetch = {
 	.cnt_mask		= IBS_FETCH_MAX_CNT,
 	.enable_mask		= IBS_FETCH_ENABLE,
 	.valid_mask		= IBS_FETCH_VAL,
+	.max_period		= IBS_FETCH_MAX_CNT << 4,
 	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
 	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
+
+	.get_count		= get_ibs_fetch_count,
 };
 
 static struct perf_ibs perf_ibs_op = {
@@ -207,8 +352,11 @@ static struct perf_ibs perf_ibs_op = {
 	.cnt_mask		= IBS_OP_MAX_CNT,
 	.enable_mask		= IBS_OP_ENABLE,
 	.valid_mask		= IBS_OP_VAL,
+	.max_period		= IBS_OP_MAX_CNT << 4,
 	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
 	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
+
+	.get_count		= get_ibs_op_count,
 };
 
 static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
@@ -220,9 +368,9 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	struct perf_raw_record raw;
 	struct pt_regs regs;
 	struct perf_ibs_data ibs_data;
-	int offset, size;
+	int offset, size, overflow, reenable;
 	unsigned int msr;
-	u64 *buf;
+	u64 *buf, config;
 
 	if (!test_bit(IBS_STARTED, pcpu->state)) {
 		/* Catch spurious interrupts after stopping IBS: */
@@ -257,11 +405,25 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 
 	regs = *iregs; /* XXX: update ip from ibs sample */
 
-	if (perf_event_overflow(event, &data, &regs))
-		; /* stop */
-	else
-		/* reenable */
-		wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask);
+	/*
+	 * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not
+	 * supported in all cpus. As this triggered an interrupt, we
+	 * set the current count to the max count.
+	 */
+	config = ibs_data.regs[0];
+	if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) {
+		config &= ~IBS_OP_CUR_CNT;
+		config |= (config & IBS_OP_MAX_CNT) << 36;
+	}
+
+	perf_ibs_event_update(perf_ibs, event, config);
+
+	overflow = perf_ibs_set_period(perf_ibs, hwc, &config);
+	reenable = !(overflow && perf_event_overflow(event, &data, &regs));
+	config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0);
+	perf_ibs_enable_event(hwc, config);
+
+	perf_event_update_userpage(event);
 
 	return 1;
 }
-- 
cgit v1.2.3


From 900771a483ef28915a48066d7895d8252315607a Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Mon, 12 Mar 2012 14:55:14 +0530
Subject: uprobes/core: Make macro names consistent

Rename macros that refer to individual uprobe to start with
UPROBE_ instead of UPROBES_.

This is pure cleanup, no functional change intended.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120312092514.5379.36595.sendpatchset@srdronam.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uprobes.h |  6 +++---
 arch/x86/kernel/uprobes.c      | 18 +++++++++---------
 include/linux/uprobes.h        |  4 ++--
 kernel/events/uprobes.c        | 18 +++++++++---------
 4 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index f7ce310a429d..5c399e446512 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -26,10 +26,10 @@
 typedef u8 uprobe_opcode_t;
 
 #define MAX_UINSN_BYTES			  16
-#define UPROBES_XOL_SLOT_BYTES		 128	/* to keep it cache aligned */
+#define UPROBE_XOL_SLOT_BYTES		 128	/* to keep it cache aligned */
 
-#define UPROBES_BKPT_INSN		0xcc
-#define UPROBES_BKPT_INSN_SIZE		   1
+#define UPROBE_BKPT_INSN		0xcc
+#define UPROBE_BKPT_INSN_SIZE		   1
 
 struct arch_uprobe {
 	u16				fixups;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 04dfcef2d028..6dfa89e6f24a 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -31,14 +31,14 @@
 /* Post-execution fixups. */
 
 /* No fixup needed */
-#define UPROBES_FIX_NONE	0x0
+#define UPROBE_FIX_NONE	0x0
 /* Adjust IP back to vicinity of actual insn */
-#define UPROBES_FIX_IP		0x1
+#define UPROBE_FIX_IP		0x1
 /* Adjust the return address of a call insn */
-#define UPROBES_FIX_CALL	0x2
+#define UPROBE_FIX_CALL	0x2
 
-#define UPROBES_FIX_RIP_AX	0x8000
-#define UPROBES_FIX_RIP_CX	0x4000
+#define UPROBE_FIX_RIP_AX	0x8000
+#define UPROBE_FIX_RIP_CX	0x4000
 
 /* Adaptations for mhiramat x86 decoder v14. */
 #define OPCODE1(insn)		((insn)->opcode.bytes[0])
@@ -269,9 +269,9 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
 		break;
 	}
 	if (fix_ip)
-		auprobe->fixups |= UPROBES_FIX_IP;
+		auprobe->fixups |= UPROBE_FIX_IP;
 	if (fix_call)
-		auprobe->fixups |= UPROBES_FIX_CALL;
+		auprobe->fixups |= UPROBE_FIX_CALL;
 }
 
 #ifdef CONFIG_X86_64
@@ -341,12 +341,12 @@ static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe
 		 * is NOT the register operand, so we use %rcx (register
 		 * #1) for the scratch register.
 		 */
-		auprobe->fixups = UPROBES_FIX_RIP_CX;
+		auprobe->fixups = UPROBE_FIX_RIP_CX;
 		/* Change modrm from 00 000 101 to 00 000 001. */
 		*cursor = 0x1;
 	} else {
 		/* Use %rax (register #0) for the scratch register. */
-		auprobe->fixups = UPROBES_FIX_RIP_AX;
+		auprobe->fixups = UPROBE_FIX_RIP_AX;
 		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
 		*cursor = (reg << 3);
 	}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f85797e1ccd4..838fb312926a 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,10 +35,10 @@ struct vm_area_struct;
 /* flags that denote/change uprobes behaviour */
 
 /* Have a copy of original instruction */
-#define UPROBES_COPY_INSN	0x1
+#define UPROBE_COPY_INSN	0x1
 
 /* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBES_RUN_HANDLER	0x2
+#define UPROBE_RUN_HANDLER	0x2
 
 struct uprobe_consumer {
 	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5ce32e3ae9e9..0d36bf3920ba 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -177,7 +177,7 @@ out:
  */
 bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
 {
-	return *insn == UPROBES_BKPT_INSN;
+	return *insn == UPROBE_BKPT_INSN;
 }
 
 /*
@@ -259,8 +259,8 @@ static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe,
 
 	/* poke the new insn in, ASSUMES we don't cross page boundary */
 	vaddr &= ~PAGE_MASK;
-	BUG_ON(vaddr + UPROBES_BKPT_INSN_SIZE > PAGE_SIZE);
-	memcpy(vaddr_new + vaddr, &opcode, UPROBES_BKPT_INSN_SIZE);
+	BUG_ON(vaddr + UPROBE_BKPT_INSN_SIZE > PAGE_SIZE);
+	memcpy(vaddr_new + vaddr, &opcode, UPROBE_BKPT_INSN_SIZE);
 
 	kunmap_atomic(vaddr_new);
 	kunmap_atomic(vaddr_old);
@@ -308,7 +308,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
 	lock_page(page);
 	vaddr_new = kmap_atomic(page);
 	vaddr &= ~PAGE_MASK;
-	memcpy(opcode, vaddr_new + vaddr, UPROBES_BKPT_INSN_SIZE);
+	memcpy(opcode, vaddr_new + vaddr, UPROBE_BKPT_INSN_SIZE);
 	kunmap_atomic(vaddr_new);
 	unlock_page(page);
 
@@ -352,7 +352,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned
 	if (result)
 		return result;
 
-	return write_opcode(mm, auprobe, vaddr, UPROBES_BKPT_INSN);
+	return write_opcode(mm, auprobe, vaddr, UPROBE_BKPT_INSN);
 }
 
 /**
@@ -635,7 +635,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 
 	addr = (unsigned long)vaddr;
 
-	if (!(uprobe->flags & UPROBES_COPY_INSN)) {
+	if (!(uprobe->flags & UPROBE_COPY_INSN)) {
 		ret = copy_insn(uprobe, vma, addr);
 		if (ret)
 			return ret;
@@ -647,7 +647,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 		if (ret)
 			return ret;
 
-		uprobe->flags |= UPROBES_COPY_INSN;
+		uprobe->flags |= UPROBE_COPY_INSN;
 	}
 	ret = set_bkpt(mm, &uprobe->arch, addr);
 
@@ -857,7 +857,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 			uprobe->consumers = NULL;
 			__uprobe_unregister(uprobe);
 		} else {
-			uprobe->flags |= UPROBES_RUN_HANDLER;
+			uprobe->flags |= UPROBE_RUN_HANDLER;
 		}
 	}
 
@@ -889,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 	if (consumer_del(uprobe, consumer)) {
 		if (!uprobe->consumers) {
 			__uprobe_unregister(uprobe);
-			uprobe->flags &= ~UPROBES_RUN_HANDLER;
+			uprobe->flags &= ~UPROBE_RUN_HANDLER;
 		}
 	}
 
-- 
cgit v1.2.3


From e3343e6a2819ff5d0dfc4bb5c9fb7f9a4d04da73 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Mon, 12 Mar 2012 14:55:30 +0530
Subject: uprobes/core: Make order of function parameters consistent across
 functions

If a function takes struct uprobe or struct arch_uprobe, then it
is passed as the first parameter.

This is pure cleanup, no functional change intended.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120312092530.5379.18394.sendpatchset@srdronam.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uprobes.h |  2 +-
 arch/x86/kernel/uprobes.c      | 15 +++----
 include/linux/uprobes.h        | 12 +++---
 kernel/events/uprobes.c        | 93 ++++++++++++++++++++++--------------------
 4 files changed, 63 insertions(+), 59 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 5c399e446512..384f1bebf884 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -39,5 +39,5 @@ struct arch_uprobe {
 #endif
 };
 
-extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *arch_uprobe);
+extern int arch_uprobes_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm);
 #endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6dfa89e6f24a..851a11b0d38c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -297,7 +297,8 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
  *  - There's never a SIB byte.
  *  - The displacement is always 4 bytes.
  */
-static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
+static void
+handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
 {
 	u8 *cursor;
 	u8 reg;
@@ -381,19 +382,19 @@ static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
 	return -ENOTSUPP;
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
+static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
 {
 	if (mm->context.ia32_compat)
 		return validate_insn_32bits(auprobe, insn);
 	return validate_insn_64bits(auprobe, insn);
 }
 #else /* 32-bit: */
-static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
+static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
 {
 	/* No RIP-relative addressing on 32-bit */
 }
 
-static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn)
+static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,  struct insn *insn)
 {
 	return validate_insn_32bits(auprobe, insn);
 }
@@ -405,17 +406,17 @@ static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe,
  * @arch_uprobe: the probepoint information.
  * Return 0 on success or a -ve number on error.
  */
-int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *auprobe)
+int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
 {
 	int ret;
 	struct insn insn;
 
 	auprobe->fixups = 0;
-	ret = validate_insn_bits(mm, auprobe, &insn);
+	ret = validate_insn_bits(auprobe, mm, &insn);
 	if (ret != 0)
 		return ret;
 
-	handle_riprel_insn(mm, auprobe, &insn);
+	handle_riprel_insn(auprobe, mm, &insn);
 	prepare_fixups(auprobe, &insn);
 
 	return 0;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 838fb312926a..58699182e9a7 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -52,20 +52,20 @@ struct uprobe_consumer {
 };
 
 #ifdef CONFIG_UPROBES
-extern int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr);
-extern int __weak set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify);
+extern int __weak set_bkpt(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
+extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm,  unsigned long vaddr, bool verify);
 extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
-extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer);
+extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 #else /* CONFIG_UPROBES is not defined */
 static inline int
-uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
+uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
 	return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
+uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0d36bf3920ba..9c5ddff1c8da 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -192,8 +192,8 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
 
 /*
  * write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch breakpointing information.
  * @mm: the probed process address space.
- * @arch_uprobe: the breakpointing information.
  * @vaddr: the virtual address to store the opcode.
  * @opcode: opcode to be written at @vaddr.
  *
@@ -203,7 +203,7 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
  * For mm @mm, write the opcode at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe,
+static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 			unsigned long vaddr, uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
@@ -334,14 +334,14 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
 
 /**
  * set_bkpt - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
  * @mm: the probed process address space.
- * @uprobe: the probepoint information.
  * @vaddr: the virtual address to insert the opcode.
  *
  * For mm @mm, store the breakpoint instruction at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr)
+int __weak set_bkpt(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
 	int result;
 
@@ -352,13 +352,13 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned
 	if (result)
 		return result;
 
-	return write_opcode(mm, auprobe, vaddr, UPROBE_BKPT_INSN);
+	return write_opcode(auprobe, mm, vaddr, UPROBE_BKPT_INSN);
 }
 
 /**
  * set_orig_insn - Restore the original instruction.
  * @mm: the probed process address space.
- * @uprobe: the probepoint information.
+ * @auprobe: arch specific probepoint information.
  * @vaddr: the virtual address to insert the opcode.
  * @verify: if true, verify existance of breakpoint instruction.
  *
@@ -366,7 +366,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned
  * Return 0 (success) or a negative errno.
  */
 int __weak
-set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify)
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
 {
 	if (verify) {
 		int result;
@@ -378,7 +378,7 @@ set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long v
 		if (result != 1)
 			return result;
 	}
-	return write_opcode(mm, auprobe, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+	return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -525,30 +525,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 
 /* Returns the previous consumer */
 static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *consumer)
+consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	down_write(&uprobe->consumer_rwsem);
-	consumer->next = uprobe->consumers;
-	uprobe->consumers = consumer;
+	uc->next = uprobe->consumers;
+	uprobe->consumers = uc;
 	up_write(&uprobe->consumer_rwsem);
 
-	return consumer->next;
+	return uc->next;
 }
 
 /*
- * For uprobe @uprobe, delete the consumer @consumer.
- * Return true if the @consumer is deleted successfully
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
  * or return false.
  */
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer)
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 	struct uprobe_consumer **con;
 	bool ret = false;
 
 	down_write(&uprobe->consumer_rwsem);
 	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
-		if (*con == consumer) {
-			*con = consumer->next;
+		if (*con == uc) {
+			*con = uc->next;
 			ret = true;
 			break;
 		}
@@ -558,8 +558,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer
 	return ret;
 }
 
-static int __copy_insn(struct address_space *mapping,
-			struct vm_area_struct *vma, char *insn,
+static int
+__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
 			unsigned long nbytes, unsigned long offset)
 {
 	struct file *filp = vma->vm_file;
@@ -590,7 +590,8 @@ static int __copy_insn(struct address_space *mapping,
 	return 0;
 }
 
-static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
+static int
+copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping;
 	unsigned long nbytes;
@@ -617,8 +618,9 @@ static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned
 	return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
 }
 
-static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
-				struct vm_area_struct *vma, loff_t vaddr)
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+			struct vm_area_struct *vma, loff_t vaddr)
 {
 	unsigned long addr;
 	int ret;
@@ -643,20 +645,21 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe,
 		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->arch.insn))
 			return -EEXIST;
 
-		ret = arch_uprobes_analyze_insn(mm, &uprobe->arch);
+		ret = arch_uprobes_analyze_insn(&uprobe->arch, mm);
 		if (ret)
 			return ret;
 
 		uprobe->flags |= UPROBE_COPY_INSN;
 	}
-	ret = set_bkpt(mm, &uprobe->arch, addr);
+	ret = set_bkpt(&uprobe->arch, mm, addr);
 
 	return ret;
 }
 
-static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr)
+static void
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
 {
-	set_orig_insn(mm, &uprobe->arch, (unsigned long)vaddr, true);
+	set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true);
 }
 
 static void delete_uprobe(struct uprobe *uprobe)
@@ -671,9 +674,9 @@ static void delete_uprobe(struct uprobe *uprobe)
 	atomic_dec(&uprobe_events);
 }
 
-static struct vma_info *__find_next_vma_info(struct list_head *head,
-			loff_t offset, struct address_space *mapping,
-			struct vma_info *vi, bool is_register)
+static struct vma_info *
+__find_next_vma_info(struct address_space *mapping, struct list_head *head,
+			struct vma_info *vi, loff_t offset, bool is_register)
 {
 	struct prio_tree_iter iter;
 	struct vm_area_struct *vma;
@@ -719,8 +722,8 @@ static struct vma_info *__find_next_vma_info(struct list_head *head,
  * yet been inserted.
  */
 static struct vma_info *
-find_next_vma_info(struct list_head *head, loff_t offset, struct address_space *mapping,
-		   bool is_register)
+find_next_vma_info(struct address_space *mapping, struct list_head *head,
+		loff_t offset, bool is_register)
 {
 	struct vma_info *vi, *retvi;
 
@@ -729,7 +732,7 @@ find_next_vma_info(struct list_head *head, loff_t offset, struct address_space *
 		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&mapping->i_mmap_mutex);
-	retvi = __find_next_vma_info(head, offset, mapping, vi, is_register);
+	retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
 	mutex_unlock(&mapping->i_mmap_mutex);
 
 	if (!retvi)
@@ -754,7 +757,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 	ret = 0;
 
 	for (;;) {
-		vi = find_next_vma_info(&try_list, uprobe->offset, mapping, is_register);
+		vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
 		if (!vi)
 			break;
 
@@ -784,9 +787,9 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 		}
 
 		if (is_register)
-			ret = install_breakpoint(mm, uprobe, vma, vi->vaddr);
+			ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
 		else
-			remove_breakpoint(mm, uprobe, vi->vaddr);
+			remove_breakpoint(uprobe, mm, vi->vaddr);
 
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -823,25 +826,25 @@ static void __uprobe_unregister(struct uprobe *uprobe)
  * uprobe_register - register a probe
  * @inode: the file in which the probe has to be placed.
  * @offset: offset from the start of the file.
- * @consumer: information on howto handle the probe..
+ * @uc: information on howto handle the probe..
  *
  * Apart from the access refcount, uprobe_register() takes a creation
  * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
  * inserted into the rbtree (i.e first consumer for a @inode:@offset
  * tuple).  Creation refcount stops uprobe_unregister from freeing the
  * @uprobe even before the register operation is complete. Creation
- * refcount is released when the last @consumer for the @uprobe
+ * refcount is released when the last @uc for the @uprobe
  * unregisters.
  *
  * Return errno if it cannot successully install probes
  * else return 0 (success)
  */
-int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
 	struct uprobe *uprobe;
 	int ret;
 
-	if (!inode || !consumer || consumer->next)
+	if (!inode || !uc || uc->next)
 		return -EINVAL;
 
 	if (offset > i_size_read(inode))
@@ -851,7 +854,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 	mutex_lock(uprobes_hash(inode));
 	uprobe = alloc_uprobe(inode, offset);
 
-	if (uprobe && !consumer_add(uprobe, consumer)) {
+	if (uprobe && !consumer_add(uprobe, uc)) {
 		ret = __uprobe_register(uprobe);
 		if (ret) {
 			uprobe->consumers = NULL;
@@ -871,13 +874,13 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
  * uprobe_unregister - unregister a already registered probe.
  * @inode: the file in which the probe has to be removed.
  * @offset: offset from the start of the file.
- * @consumer: identify which probe if multiple probes are colocated.
+ * @uc: identify which probe if multiple probes are colocated.
  */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer)
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
 	struct uprobe *uprobe;
 
-	if (!inode || !consumer)
+	if (!inode || !uc)
 		return;
 
 	uprobe = find_uprobe(inode, offset);
@@ -886,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 
 	mutex_lock(uprobes_hash(inode));
 
-	if (consumer_del(uprobe, consumer)) {
+	if (consumer_del(uprobe, uc)) {
 		if (!uprobe->consumers) {
 			__uprobe_unregister(uprobe);
 			uprobe->flags &= ~UPROBE_RUN_HANDLER;
@@ -993,7 +996,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 		if (!ret) {
 			vaddr = vma_address(vma, uprobe->offset);
 			if (vaddr >= vma->vm_start && vaddr < vma->vm_end) {
-				ret = install_breakpoint(vma->vm_mm, uprobe, vma, vaddr);
+				ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
 				/* Ignore double add: */
 				if (ret == -EEXIST)
 					ret = 0;
-- 
cgit v1.2.3


From 5cb4ac3a583d4ee18c8682ab857e093c4a0d0895 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Mon, 12 Mar 2012 14:55:45 +0530
Subject: uprobes/core: Rename bkpt to swbp

bkpt doesnt seem to be a correct abbrevation for breakpoint.
Choice was between bp and breakpoint. Since bp can refer to
things other than breakpoint, use swbp to refer to breakpoints.

This is pure cleanup, no functional change intended.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120312092545.5379.91251.sendpatchset@srdronam.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uprobes.h |  4 ++--
 include/linux/uprobes.h        |  4 ++--
 kernel/events/uprobes.c        | 34 +++++++++++++++++-----------------
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 384f1bebf884..0500391f57d0 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -28,8 +28,8 @@ typedef u8 uprobe_opcode_t;
 #define MAX_UINSN_BYTES			  16
 #define UPROBE_XOL_SLOT_BYTES		 128	/* to keep it cache aligned */
 
-#define UPROBE_BKPT_INSN		0xcc
-#define UPROBE_BKPT_INSN_SIZE		   1
+#define UPROBE_SWBP_INSN		0xcc
+#define UPROBE_SWBP_INSN_SIZE		   1
 
 struct arch_uprobe {
 	u16				fixups;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 58699182e9a7..eac525f41b94 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -52,9 +52,9 @@ struct uprobe_consumer {
 };
 
 #ifdef CONFIG_UPROBES
-extern int __weak set_bkpt(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
+extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm,  unsigned long vaddr, bool verify);
-extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn);
+extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 9c5ddff1c8da..e56e56aa7535 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -170,14 +170,14 @@ out:
 }
 
 /**
- * is_bkpt_insn - check if instruction is breakpoint instruction.
+ * is_swbp_insn - check if instruction is breakpoint instruction.
  * @insn: instruction to be checked.
- * Default implementation of is_bkpt_insn
+ * Default implementation of is_swbp_insn
  * Returns true if @insn is a breakpoint instruction.
  */
-bool __weak is_bkpt_insn(uprobe_opcode_t *insn)
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 {
-	return *insn == UPROBE_BKPT_INSN;
+	return *insn == UPROBE_SWBP_INSN;
 }
 
 /*
@@ -227,7 +227,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	 * adding probes in write mapped pages since the breakpoints
 	 * might end up in the file copy.
 	 */
-	if (!valid_vma(vma, is_bkpt_insn(&opcode)))
+	if (!valid_vma(vma, is_swbp_insn(&opcode)))
 		goto put_out;
 
 	uprobe = container_of(auprobe, struct uprobe, arch);
@@ -259,8 +259,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 
 	/* poke the new insn in, ASSUMES we don't cross page boundary */
 	vaddr &= ~PAGE_MASK;
-	BUG_ON(vaddr + UPROBE_BKPT_INSN_SIZE > PAGE_SIZE);
-	memcpy(vaddr_new + vaddr, &opcode, UPROBE_BKPT_INSN_SIZE);
+	BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+	memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	kunmap_atomic(vaddr_new);
 	kunmap_atomic(vaddr_old);
@@ -308,7 +308,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
 	lock_page(page);
 	vaddr_new = kmap_atomic(page);
 	vaddr &= ~PAGE_MASK;
-	memcpy(opcode, vaddr_new + vaddr, UPROBE_BKPT_INSN_SIZE);
+	memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
 	kunmap_atomic(vaddr_new);
 	unlock_page(page);
 
@@ -317,7 +317,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
 	return 0;
 }
 
-static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
 {
 	uprobe_opcode_t opcode;
 	int result;
@@ -326,14 +326,14 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	if (result)
 		return result;
 
-	if (is_bkpt_insn(&opcode))
+	if (is_swbp_insn(&opcode))
 		return 1;
 
 	return 0;
 }
 
 /**
- * set_bkpt - store breakpoint at a given address.
+ * set_swbp - store breakpoint at a given address.
  * @auprobe: arch specific probepoint information.
  * @mm: the probed process address space.
  * @vaddr: the virtual address to insert the opcode.
@@ -341,18 +341,18 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr)
  * For mm @mm, store the breakpoint instruction at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-int __weak set_bkpt(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
 	int result;
 
-	result = is_bkpt_at_addr(mm, vaddr);
+	result = is_swbp_at_addr(mm, vaddr);
 	if (result == 1)
 		return -EEXIST;
 
 	if (result)
 		return result;
 
-	return write_opcode(auprobe, mm, vaddr, UPROBE_BKPT_INSN);
+	return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
 }
 
 /**
@@ -371,7 +371,7 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
 	if (verify) {
 		int result;
 
-		result = is_bkpt_at_addr(mm, vaddr);
+		result = is_swbp_at_addr(mm, vaddr);
 		if (!result)
 			return -EINVAL;
 
@@ -642,7 +642,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 		if (ret)
 			return ret;
 
-		if (is_bkpt_insn((uprobe_opcode_t *)uprobe->arch.insn))
+		if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
 			return -EEXIST;
 
 		ret = arch_uprobes_analyze_insn(&uprobe->arch, mm);
@@ -651,7 +651,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 
 		uprobe->flags |= UPROBE_COPY_INSN;
 	}
-	ret = set_bkpt(&uprobe->arch, mm, addr);
+	ret = set_swbp(&uprobe->arch, mm, addr);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 0326f5a94ddea33fa331b2519f4172f4fb387baa Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 13 Mar 2012 23:30:11 +0530
Subject: uprobes/core: Handle breakpoint and singlestep exceptions

Uprobes uses exception notifiers to get to know if a thread hit
a breakpoint or a singlestep exception.

When a thread hits a uprobe or is singlestepping post a uprobe
hit, the uprobe exception notifier sets its TIF_UPROBE bit,
which will then be checked on its return to userspace path
(do_notify_resume() ->uprobe_notify_resume()), where the
consumers handlers are run (in task context) based on the
defined filters.

Uprobe hits are thread specific and hence we need to maintain
information about if a task hit a uprobe, what uprobe was hit,
the slot where the original instruction was copied for xol so
that it can be singlestepped with appropriate fixups.

In some cases, special care is needed for instructions that are
executed out of line (xol). These are architecture specific
artefacts, such as handling RIP relative instructions on x86_64.

Since the instruction at which the uprobe was inserted is
executed out of line, architecture specific fixups are added so
that the thread continues normal execution in the presence of a
uprobe.

Postpone the signals until we execute the probed insn.
post_xol() path does a recalc_sigpending() before return to
user-mode, this ensures the signal can't be lost.

Uprobes relies on DIE_DEBUG notification to notify if a
singlestep is complete.

Adds x86 specific uprobe exception notifiers and appropriate
hooks needed to determine a uprobe hit and subsequent post
processing.

Add requisite x86 fixups for xol for uprobes. Specific cases
needing fixups include relative jumps (x86_64), calls, etc.

Where possible, we check and skip singlestepping the
breakpointed instructions. For now we skip single byte as well
as few multibyte nop instructions. However this can be extended
to other instructions too.

Credits to Oleg Nesterov for suggestions/patches related to
signal, breakpoint, singlestep handling code.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120313180011.29771.89027.sendpatchset@srdronam.in.ibm.com
[ Performed various cleanliness edits ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/thread_info.h |   2 +
 arch/x86/include/asm/uprobes.h     |  16 +-
 arch/x86/kernel/signal.c           |   6 +
 arch/x86/kernel/uprobes.c          | 265 +++++++++++++++++++++++++++++-
 include/linux/sched.h              |   4 +
 include/linux/uprobes.h            |  55 ++++++-
 kernel/events/uprobes.c            | 323 ++++++++++++++++++++++++++++++++++++-
 kernel/fork.c                      |   4 +
 kernel/signal.c                    |   4 +
 9 files changed, 664 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6df8ccd715..0710c11305d4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -85,6 +85,7 @@ struct thread_info {
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
+#define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* IA32 compatibility process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -109,6 +110,7 @@ struct thread_info {
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
+#define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 0500391f57d0..1e9bed14f7ae 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -23,6 +23,8 @@
  *	Jim Keniston
  */
 
+#include <linux/notifier.h>
+
 typedef u8 uprobe_opcode_t;
 
 #define MAX_UINSN_BYTES			  16
@@ -39,5 +41,17 @@ struct arch_uprobe {
 #endif
 };
 
-extern int arch_uprobes_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm);
+struct arch_uprobe_task {
+	unsigned long			saved_trap_nr;
+#ifdef CONFIG_X86_64
+	unsigned long			saved_scratch_register;
+#endif
+};
+
+extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm);
+extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
+extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
+extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
+extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
+extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 #endif	/* _ASM_UPROBES_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9c73acc1c860..b3cd6913ceea 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -18,6 +18,7 @@
 #include <linux/personality.h>
 #include <linux/uaccess.h>
 #include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -823,6 +824,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		mce_notify_process();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
+	if (thread_info_flags & _TIF_UPROBE) {
+		clear_thread_flag(TIF_UPROBE);
+		uprobe_notify_resume(regs);
+	}
+
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 851a11b0d38c..dc4e910a7d96 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -24,22 +24,28 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <linux/uprobes.h>
+#include <linux/uaccess.h>
 
 #include <linux/kdebug.h>
+#include <asm/processor.h>
 #include <asm/insn.h>
 
 /* Post-execution fixups. */
 
 /* No fixup needed */
-#define UPROBE_FIX_NONE	0x0
+#define UPROBE_FIX_NONE		0x0
+
 /* Adjust IP back to vicinity of actual insn */
 #define UPROBE_FIX_IP		0x1
+
 /* Adjust the return address of a call insn */
 #define UPROBE_FIX_CALL	0x2
 
 #define UPROBE_FIX_RIP_AX	0x8000
 #define UPROBE_FIX_RIP_CX	0x4000
 
+#define	UPROBE_TRAP_NR		UINT_MAX
+
 /* Adaptations for mhiramat x86 decoder v14. */
 #define OPCODE1(insn)		((insn)->opcode.bytes[0])
 #define OPCODE2(insn)		((insn)->opcode.bytes[1])
@@ -221,10 +227,9 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
 }
 
 /*
- * Figure out which fixups post_xol() will need to perform, and annotate
- * arch_uprobe->fixups accordingly.  To start with,
- * arch_uprobe->fixups is either zero or it reflects rip-related
- * fixups.
+ * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
+ * annotate arch_uprobe->fixups accordingly.  To start with,
+ * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
  */
 static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -401,12 +406,12 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
 #endif /* CONFIG_X86_64 */
 
 /**
- * arch_uprobes_analyze_insn - instruction analysis including validity and fixups.
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
  * @arch_uprobe: the probepoint information.
  * Return 0 on success or a -ve number on error.
  */
-int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
 {
 	int ret;
 	struct insn insn;
@@ -421,3 +426,249 @@ int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
 
 	return 0;
 }
+
+#ifdef CONFIG_X86_64
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+				struct arch_uprobe_task *autask)
+{
+	if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
+		autask->saved_scratch_register = regs->ax;
+		regs->ax = current->utask->vaddr;
+		regs->ax += auprobe->rip_rela_target_address;
+	} else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
+		autask->saved_scratch_register = regs->cx;
+		regs->cx = current->utask->vaddr;
+		regs->cx += auprobe->rip_rela_target_address;
+	}
+}
+#else
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+				struct arch_uprobe_task *autask)
+{
+	/* No RIP-relative addressing on 32-bit */
+}
+#endif
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct arch_uprobe_task *autask;
+
+	autask = &current->utask->autask;
+	autask->saved_trap_nr = current->thread.trap_nr;
+	current->thread.trap_nr = UPROBE_TRAP_NR;
+	regs->ip = current->utask->xol_vaddr;
+	pre_xol_rip_insn(auprobe, regs, autask);
+
+	return 0;
+}
+
+/*
+ * This function is called by arch_uprobe_post_xol() to adjust the return
+ * address pushed by a call instruction executed out of line.
+ */
+static int adjust_ret_addr(unsigned long sp, long correction)
+{
+	int rasize, ncopied;
+	long ra = 0;
+
+	if (is_ia32_task())
+		rasize = 4;
+	else
+		rasize = 8;
+
+	ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
+	if (unlikely(ncopied))
+		return -EFAULT;
+
+	ra += correction;
+	ncopied = copy_to_user((void __user *)sp, &ra, rasize);
+	if (unlikely(ncopied))
+		return -EFAULT;
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+static bool is_riprel_insn(struct arch_uprobe *auprobe)
+{
+	return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
+}
+
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+	if (is_riprel_insn(auprobe)) {
+		struct arch_uprobe_task *autask;
+
+		autask = &current->utask->autask;
+		if (auprobe->fixups & UPROBE_FIX_RIP_AX)
+			regs->ax = autask->saved_scratch_register;
+		else
+			regs->cx = autask->saved_scratch_register;
+
+		/*
+		 * The original instruction includes a displacement, and so
+		 * is 4 bytes longer than what we've just single-stepped.
+		 * Fall through to handle stuff like "jmpq *...(%rip)" and
+		 * "callq *...(%rip)".
+		 */
+		if (correction)
+			*correction += 4;
+	}
+}
+#else
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+	/* No RIP-relative addressing on 32-bit */
+}
+#endif
+
+/*
+ * If xol insn itself traps and generates a signal(Say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+	if (t->thread.trap_nr != UPROBE_TRAP_NR)
+		return true;
+
+	return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction.  We need
+ * to make it relative to the original instruction (FIX_IP).  Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction.  We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
+ * We need to restore the contents of the scratch register and adjust
+ * the ip, keeping in mind that the instruction we executed is 4 bytes
+ * shorter than the original instruction (since we squeezed out the offset
+ * field).  (FIX_RIP_AX or FIX_RIP_CX)
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask;
+	long correction;
+	int result = 0;
+
+	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+
+	utask = current->utask;
+	current->thread.trap_nr = utask->autask.saved_trap_nr;
+	correction = (long)(utask->vaddr - utask->xol_vaddr);
+	handle_riprel_post_xol(auprobe, regs, &correction);
+	if (auprobe->fixups & UPROBE_FIX_IP)
+		regs->ip += correction;
+
+	if (auprobe->fixups & UPROBE_FIX_CALL)
+		result = adjust_ret_addr(regs->sp, correction);
+
+	return result;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+	struct die_args *args = data;
+	struct pt_regs *regs = args->regs;
+	int ret = NOTIFY_DONE;
+
+	/* We are only interested in userspace traps */
+	if (regs && !user_mode_vm(regs))
+		return NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_INT3:
+		if (uprobe_pre_sstep_notifier(regs))
+			ret = NOTIFY_STOP;
+
+		break;
+
+	case DIE_DEBUG:
+		if (uprobe_post_sstep_notifier(regs))
+			ret = NOTIFY_STOP;
+
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal, so reset the instruction pointer to its
+ * probed address.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	current->thread.trap_nr = utask->autask.saved_trap_nr;
+	handle_riprel_post_xol(auprobe, regs, NULL);
+	instruction_pointer_set(regs, utask->vaddr);
+}
+
+/*
+ * Skip these instructions as per the currently known x86 ISA.
+ * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
+ */
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	int i;
+
+	for (i = 0; i < MAX_UINSN_BYTES; i++) {
+		if ((auprobe->insn[i] == 0x66))
+			continue;
+
+		if (auprobe->insn[i] == 0x90)
+			return true;
+
+		if (i == (MAX_UINSN_BYTES - 1))
+			break;
+
+		if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f))
+			return true;
+
+		if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19))
+			return true;
+
+		if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0))
+			return true;
+
+		break;
+	}
+	return false;
+}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d379a6bfd88..8379e3771690 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1590,6 +1590,10 @@ struct task_struct {
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	atomic_t ptrace_bp_refcnt;
 #endif
+#ifdef CONFIG_UPROBES
+	struct uprobe_task *utask;
+	int uprobe_srcu_id;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index eac525f41b94..5ec778fdce6f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -28,8 +28,9 @@
 #include <linux/rbtree.h>
 
 struct vm_area_struct;
+
 #ifdef CONFIG_ARCH_SUPPORTS_UPROBES
-#include <asm/uprobes.h>
+# include <asm/uprobes.h>
 #endif
 
 /* flags that denote/change uprobes behaviour */
@@ -39,6 +40,8 @@ struct vm_area_struct;
 
 /* Dont run handlers when first register/ last unregister in progress*/
 #define UPROBE_RUN_HANDLER	0x2
+/* Can skip singlestep */
+#define UPROBE_SKIP_SSTEP	0x4
 
 struct uprobe_consumer {
 	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
@@ -52,13 +55,42 @@ struct uprobe_consumer {
 };
 
 #ifdef CONFIG_UPROBES
+enum uprobe_task_state {
+	UTASK_RUNNING,
+	UTASK_BP_HIT,
+	UTASK_SSTEP,
+	UTASK_SSTEP_ACK,
+	UTASK_SSTEP_TRAPPED,
+};
+
+/*
+ * uprobe_task: Metadata of a task while it singlesteps.
+ */
+struct uprobe_task {
+	enum uprobe_task_state		state;
+	struct arch_uprobe_task		autask;
+
+	struct uprobe			*active_uprobe;
+
+	unsigned long			xol_vaddr;
+	unsigned long			vaddr;
+};
+
 extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm,  unsigned long vaddr, bool verify);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
-#else /* CONFIG_UPROBES is not defined */
+extern void uprobe_free_utask(struct task_struct *t);
+extern void uprobe_copy_process(struct task_struct *t);
+extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
+extern int uprobe_post_sstep_notifier(struct pt_regs *regs);
+extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
+extern void uprobe_notify_resume(struct pt_regs *regs);
+extern bool uprobe_deny_signal(void);
+extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
+#else /* !CONFIG_UPROBES */
 static inline int
 uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 {
@@ -72,5 +104,22 @@ static inline int uprobe_mmap(struct vm_area_struct *vma)
 {
 	return 0;
 }
-#endif /* CONFIG_UPROBES */
+static inline void uprobe_notify_resume(struct pt_regs *regs)
+{
+}
+static inline bool uprobe_deny_signal(void)
+{
+	return false;
+}
+static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void uprobe_free_utask(struct task_struct *t)
+{
+}
+static inline void uprobe_copy_process(struct task_struct *t)
+{
+}
+#endif /* !CONFIG_UPROBES */
 #endif	/* _LINUX_UPROBES_H */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e56e56aa7535..b807d1566b64 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -30,9 +30,12 @@
 #include <linux/rmap.h>		/* anon_vma_prepare */
 #include <linux/mmu_notifier.h>	/* set_pte_at_notify */
 #include <linux/swap.h>		/* try_to_free_swap */
+#include <linux/ptrace.h>	/* user_enable_single_step */
+#include <linux/kdebug.h>	/* notifier mechanism */
 
 #include <linux/uprobes.h>
 
+static struct srcu_struct uprobes_srcu;
 static struct rb_root uprobes_tree = RB_ROOT;
 
 static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
@@ -486,6 +489,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 	u = __insert_uprobe(uprobe);
 	spin_unlock_irqrestore(&uprobes_treelock, flags);
 
+	/* For now assume that the instruction need not be single-stepped */
+	uprobe->flags |= UPROBE_SKIP_SSTEP;
+
 	return u;
 }
 
@@ -523,6 +529,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 	return uprobe;
 }
 
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	struct uprobe_consumer *uc;
+
+	if (!(uprobe->flags & UPROBE_RUN_HANDLER))
+		return;
+
+	down_read(&uprobe->consumer_rwsem);
+	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		if (!uc->filter || uc->filter(uc, current))
+			uc->handler(uc, regs);
+	}
+	up_read(&uprobe->consumer_rwsem);
+}
+
 /* Returns the previous consumer */
 static struct uprobe_consumer *
 consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
@@ -645,7 +666,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 		if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
 			return -EEXIST;
 
-		ret = arch_uprobes_analyze_insn(&uprobe->arch, mm);
+		ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
 		if (ret)
 			return ret;
 
@@ -662,10 +683,21 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
 	set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true);
 }
 
+/*
+ * There could be threads that have hit the breakpoint and are entering the
+ * notifier code and trying to acquire the uprobes_treelock. The thread
+ * calling delete_uprobe() that is removing the uprobe from the rb_tree can
+ * race with these threads and might acquire the uprobes_treelock compared
+ * to some of the breakpoint hit threads. In such a case, the breakpoint
+ * hit threads will not find the uprobe. The current unregistering thread
+ * waits till all other threads have hit a breakpoint, to acquire the
+ * uprobes_treelock before the uprobe is removed from the rbtree.
+ */
 static void delete_uprobe(struct uprobe *uprobe)
 {
 	unsigned long flags;
 
+	synchronize_srcu(&uprobes_srcu);
 	spin_lock_irqsave(&uprobes_treelock, flags);
 	rb_erase(&uprobe->rb_node, &uprobes_tree);
 	spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -1010,6 +1042,288 @@ int uprobe_mmap(struct vm_area_struct *vma)
 	return ret;
 }
 
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+	struct uprobe_task *utask = t->utask;
+
+	if (t->uprobe_srcu_id != -1)
+		srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
+
+	if (!utask)
+		return;
+
+	if (utask->active_uprobe)
+		put_uprobe(utask->active_uprobe);
+
+	kfree(utask);
+	t->utask = NULL;
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t)
+{
+	t->utask = NULL;
+	t->uprobe_srcu_id = -1;
+}
+
+/*
+ * Allocate a uprobe_task object for the task.
+ * Called when the thread hits a breakpoint for the first time.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *add_utask(void)
+{
+	struct uprobe_task *utask;
+
+	utask = kzalloc(sizeof *utask, GFP_KERNEL);
+	if (unlikely(!utask))
+		return NULL;
+
+	utask->active_uprobe = NULL;
+	current->utask = utask;
+	return utask;
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+{
+	return -EFAULT;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep.  When xol insn itself
+ * triggers the signal,  restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip).  This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+	struct task_struct *t = current;
+	struct uprobe_task *utask = t->utask;
+
+	if (likely(!utask || !utask->active_uprobe))
+		return false;
+
+	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+	if (signal_pending(t)) {
+		spin_lock_irq(&t->sighand->siglock);
+		clear_tsk_thread_flag(t, TIF_SIGPENDING);
+		spin_unlock_irq(&t->sighand->siglock);
+
+		if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+			utask->state = UTASK_SSTEP_TRAPPED;
+			set_tsk_thread_flag(t, TIF_UPROBE);
+			set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Avoid singlestepping the original instruction if the original instruction
+ * is a NOP or can be emulated.
+ */
+static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+		return true;
+
+	uprobe->flags &= ~UPROBE_SKIP_SSTEP;
+	return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+	struct vm_area_struct *vma;
+	struct uprobe_task *utask;
+	struct uprobe *uprobe;
+	struct mm_struct *mm;
+	unsigned long bp_vaddr;
+
+	uprobe = NULL;
+	bp_vaddr = uprobe_get_swbp_addr(regs);
+	mm = current->mm;
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, bp_vaddr);
+
+	if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
+		struct inode *inode;
+		loff_t offset;
+
+		inode = vma->vm_file->f_mapping->host;
+		offset = bp_vaddr - vma->vm_start;
+		offset += (vma->vm_pgoff << PAGE_SHIFT);
+		uprobe = find_uprobe(inode, offset);
+	}
+
+	srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
+	current->uprobe_srcu_id = -1;
+	up_read(&mm->mmap_sem);
+
+	if (!uprobe) {
+		/* No matching uprobe; signal SIGTRAP. */
+		send_sig(SIGTRAP, current, 0);
+		return;
+	}
+
+	utask = current->utask;
+	if (!utask) {
+		utask = add_utask();
+		/* Cannot allocate; re-execute the instruction. */
+		if (!utask)
+			goto cleanup_ret;
+	}
+	utask->active_uprobe = uprobe;
+	handler_chain(uprobe, regs);
+	if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
+		goto cleanup_ret;
+
+	utask->state = UTASK_SSTEP;
+	if (!pre_ssout(uprobe, regs, bp_vaddr)) {
+		user_enable_single_step(current);
+		return;
+	}
+
+cleanup_ret:
+	if (utask) {
+		utask->active_uprobe = NULL;
+		utask->state = UTASK_RUNNING;
+	}
+	if (uprobe) {
+		if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+
+			/*
+			 * cannot singlestep; cannot skip instruction;
+			 * re-execute the instruction.
+			 */
+			instruction_pointer_set(regs, bp_vaddr);
+
+		put_uprobe(uprobe);
+	}
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+	struct uprobe *uprobe;
+
+	uprobe = utask->active_uprobe;
+	if (utask->state == UTASK_SSTEP_ACK)
+		arch_uprobe_post_xol(&uprobe->arch, regs);
+	else if (utask->state == UTASK_SSTEP_TRAPPED)
+		arch_uprobe_abort_xol(&uprobe->arch, regs);
+	else
+		WARN_ON_ONCE(1);
+
+	put_uprobe(uprobe);
+	utask->active_uprobe = NULL;
+	utask->state = UTASK_RUNNING;
+	user_disable_single_step(current);
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* see uprobe_deny_signal() */
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag.  (and on
+ * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
+ * allows the thread to return from interrupt.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
+ * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
+ * interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+	struct uprobe_task *utask;
+
+	utask = current->utask;
+	if (!utask || utask->state == UTASK_BP_HIT)
+		handle_swbp(regs);
+	else
+		handle_singlestep(utask, regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+	struct uprobe_task *utask;
+
+	if (!current->mm)
+		return 0;
+
+	utask = current->utask;
+	if (utask)
+		utask->state = UTASK_BP_HIT;
+
+	set_thread_flag(TIF_UPROBE);
+	current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
+
+	return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	if (!current->mm || !utask || !utask->active_uprobe)
+		/* task is currently not uprobed */
+		return 0;
+
+	utask->state = UTASK_SSTEP_ACK;
+	set_thread_flag(TIF_UPROBE);
+	return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+	.notifier_call		= arch_uprobe_exception_notify,
+	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
+};
+
 static int __init init_uprobes(void)
 {
 	int i;
@@ -1018,12 +1332,13 @@ static int __init init_uprobes(void)
 		mutex_init(&uprobes_mutex[i]);
 		mutex_init(&uprobes_mmap_mutex[i]);
 	}
-	return 0;
+	init_srcu_struct(&uprobes_srcu);
+
+	return register_die_notifier(&uprobe_exception_nb);
 }
+module_init(init_uprobes);
 
 static void __exit exit_uprobes(void)
 {
 }
-
-module_init(init_uprobes);
 module_exit(exit_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index e2cd3e2a5ae8..eb7b63334009 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -67,6 +67,7 @@
 #include <linux/oom.h>
 #include <linux/khugepaged.h>
 #include <linux/signalfd.h>
+#include <linux/uprobes.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -701,6 +702,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		exit_pi_state_list(tsk);
 #endif
 
+	uprobe_free_utask(tsk);
+
 	/* Get rid of any cached register state */
 	deactivate_mm(tsk, mm);
 
@@ -1295,6 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->pi_state_list);
 	p->pi_state_cache = NULL;
 #endif
+	uprobe_copy_process(p);
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/signal.c b/kernel/signal.c
index 8511e39813c7..e93ff0a719a0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -29,6 +29,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
+#include <linux/uprobes.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -2192,6 +2193,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 	struct signal_struct *signal = current->signal;
 	int signr;
 
+	if (unlikely(uprobe_deny_signal()))
+		return 0;
+
 relock:
 	/*
 	 * We'll jump back here after any time we were stopped in TASK_STOPPED.
-- 
cgit v1.2.3


From a6fca40f1d7f3e232c9de27c1cebbb9f787fbc4f Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 22 Mar 2012 17:01:25 -0700
Subject: x86, tlb: Switch cr3 in leave_mm() only when needed

Currently leave_mm() unconditionally switches the cr3 to swapper_pg_dir.
But there is no need to change the cr3, if we already left that mm.

intel_idle() for example calls leave_mm() on every deep c-state entry where
the CPU flushes the TLB for us. Similarly flush_tlb_all() was also calling
leave_mm() whenever the TLB is in LAZY state. Both these paths will be
improved with this change.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1332460885.16101.147.camel@sbsiddha-desk.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/tlb.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d6c0418c3e47..125bcad1b757 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -61,11 +61,13 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
  */
 void leave_mm(int cpu)
 {
+	struct mm_struct *active_mm = percpu_read(cpu_tlbstate.active_mm);
 	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
 		BUG();
-	cpumask_clear_cpu(cpu,
-			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
-	load_cr3(swapper_pg_dir);
+	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+		load_cr3(swapper_pg_dir);
+	}
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
-- 
cgit v1.2.3


From 2e064b1e131eba262c0ba4268cb79dbc72edeece Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Fri, 23 Mar 2012 09:35:04 -0700
Subject: x86, efi: Fix issue of overlapping .reloc section for EFI_STUB

Previously the .reloc section was embedded in the .text
section.

No relocations are required during the PE/COFF loading phase
for the kernel using the EFI_STUB UEFI loader. To fix the
issue of overlapping sections, create a .reloc section with a
zero length.

The .reloc section header must exist to make sure the image
will be loaded by the UEFI firmware, but a zero-length
section header seems to be sufficient.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Link: http://lkml.kernel.org/r/1332520506-6472-2-git-send-email-jordan.l.justen@intel.com
Acked-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/header.S | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index f1bbeeb09148..4e9124b148c2 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -217,18 +217,17 @@ section_table:
 
 	#
 	# The EFI application loader requires a relocation section
-	# because EFI applications are relocatable and not having
-	# this section seems to confuse it. But since we don't need
-	# the loader to fixup any relocs for us just fill it with a
-	# single dummy reloc.
+	# because EFI applications must be relocatable. But since
+	# we don't need the loader to fixup any relocs for us, we
+	# just create an empty (zero-length) .reloc section header.
 	#
 	.ascii	".reloc"
 	.byte	0
 	.byte	0
-	.long	reloc_end - reloc_start
-	.long	reloc_start
-	.long	reloc_end - reloc_start		# SizeOfRawData
-	.long	reloc_start			# PointerToRawData
+	.long	0
+	.long	0
+	.long	0				# SizeOfRawData
+	.long	0				# PointerToRawData
 	.long	0				# PointerToRelocations
 	.long	0				# PointerToLineNumbers
 	.word	0				# NumberOfRelocations
@@ -469,10 +468,3 @@ setup_corrupt:
 
 	.data
 dummy:	.long	0
-
-	.section .reloc
-reloc_start:
-	.long	dummy - reloc_start
-	.long	10
-	.word	0
-reloc_end:
-- 
cgit v1.2.3


From e31be363df3092821bf179cf4baa076f501b8ae6 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Mar 2012 09:35:05 -0700
Subject: x86, efi: Fix .text section overlapping image header for EFI_STUB

This change modifes the PE .text section to start after
the first sector of the kernel image.

The header may be modified by the UEFI secure boot signing,
so it is not appropriate for it to be included in one of the
image sections. Since the sections are part of the secure
boot hash, this modification to the .text section contents
would invalidate the secure boot signed hash.

Note: UEFI secure boot does hash the image header, but
fields that are changed by the signing process are excluded
from the hash calculation.  This exclusion process is only
handled for the image header, and not image sections.

Luckily, we can still easily boot without the first sector
by initializing a few fields in arch/x86/boot/compressed/eboot.c.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1332520506-6472-3-git-send-email-jordan.l.justen@intel.com
[jordan.l.justen@intel.com: set .text vma & file offset]
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/eboot.c | 14 +++++++++++---
 arch/x86/boot/header.S           |  2 +-
 arch/x86/boot/tools/build.c      | 25 ++++++++++++++++++++++---
 3 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index fec216f4fbc3..01cbb8707a31 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -904,11 +904,19 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 
 	memset(boot_params, 0x0, 0x4000);
 
-	/* Copy first two sectors to boot_params */
-	memcpy(boot_params, image->image_base, 1024);
-
 	hdr = &boot_params->hdr;
 
+	/* Copy the second sector to boot_params */
+	memcpy(&hdr->jump, image->image_base + 512, 512);
+
+	/*
+	 * Fill out some of the header fields ourselves because the
+	 * EFI firmware loader doesn't load the first sector.
+	 */
+	hdr->root_flags = 1;
+	hdr->vid_mode = 0xffff;
+	hdr->boot_flag = 0xAA55;
+
 	/*
 	 * The EFI firmware loader could have placed the kernel image
 	 * anywhere in memory, but the kernel has various restrictions
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 4e9124b148c2..4ceb56e9a4ce 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -147,7 +147,7 @@ optional_header:
 	# Filled in by build.c
 	.long	0x0000				# AddressOfEntryPoint
 
-	.long	0x0000				# BaseOfCode
+	.long	0x0200				# BaseOfCode
 #ifdef CONFIG_X86_32
 	.long	0				# data
 #endif
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 4e9bd6bcafa6..2aeab3dc9e5f 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -202,12 +202,19 @@ int main(int argc, char ** argv)
 
 	pe_header = *(unsigned int *)&buf[0x3c];
 
-	/* Size of code */
-	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
-
 	/* Size of image */
 	*(unsigned int *)&buf[pe_header + 0x50] = file_sz;
 
+	/*
+	 * Subtract the size of the first section (512 bytes) which
+	 * includes the header and .reloc section. The remaining size
+	 * is that of the .text section.
+	 */
+	file_sz -= 512;
+
+	/* Size of code */
+	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+
 #ifdef CONFIG_X86_32
 	/* Address of entry point */
 	*(unsigned int *)&buf[pe_header + 0x28] = i;
@@ -215,8 +222,14 @@ int main(int argc, char ** argv)
 	/* .text size */
 	*(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
 
+	/* .text vma */
+	*(unsigned int *)&buf[pe_header + 0xb4] = 0x200;
+
 	/* .text size of initialised data */
 	*(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+
+	/* .text file offset */
+	*(unsigned int *)&buf[pe_header + 0xbc] = 0x200;
 #else
 	/*
 	 * Address of entry point. startup_32 is at the beginning and
@@ -228,8 +241,14 @@ int main(int argc, char ** argv)
 	/* .text size */
 	*(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
 
+	/* .text vma */
+	*(unsigned int *)&buf[pe_header + 0xc4] = 0x200;
+
 	/* .text size of initialised data */
 	*(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+
+	/* .text file offset */
+	*(unsigned int *)&buf[pe_header + 0xcc] = 0x200;
 #endif /* CONFIG_X86_32 */
 #endif /* CONFIG_EFI_STUB */
 
-- 
cgit v1.2.3


From e47bb0bda46bf50f81671db502d0c903e0a32604 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Mar 2012 09:35:06 -0700
Subject: x86, efi: Fix NumberOfRvaAndSizes field in PE32 header for EFI_STUB

We've actually got six data directories in the header, not one. Even
though the firmware loader doesn't seem to mind, when we come to sign
the kernel image the signing tool thinks that there is no Certificate
Table data directory, even though we've allocated space for one.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1332520506-6472-4-git-send-email-jordan.l.justen@intel.com
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/header.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 4ceb56e9a4ce..8bbea6aa40d9 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -189,7 +189,7 @@ extra_header_fields:
 	.quad	0				# SizeOfHeapCommit
 #endif
 	.long	0				# LoaderFlags
-	.long	0x1				# NumberOfRvaAndSizes
+	.long	0x6				# NumberOfRvaAndSizes
 
 	.quad	0				# ExportTable
 	.quad	0				# ImportTable
-- 
cgit v1.2.3


From 35372a7d45291140a97518a8d1c8cb0e31ee2bb7 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Fri, 30 Mar 2012 01:38:03 +0200
Subject: x86: spinlock.h: Remove REG_PTR_MODE

REG_PTR_MODE has no users at all.

Signed-off-by: Richard Weinberger <richard@nod.at>
Link: http://lkml.kernel.org/r/1333064283-3109-1-git-send-email-richard@nod.at
Acked-by: Acked-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/spinlock.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 76bfa2cf301d..b315a33867f2 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -20,10 +20,8 @@
 
 #ifdef CONFIG_X86_32
 # define LOCK_PTR_REG "a"
-# define REG_PTR_MODE "k"
 #else
 # define LOCK_PTR_REG "D"
-# define REG_PTR_MODE "q"
 #endif
 
 #if defined(CONFIG_X86_32) && \
-- 
cgit v1.2.3


From 3f3aaea29ff7ee2d43b430338427f30ba7f60ff9 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 30 Mar 2012 11:45:01 -0400
Subject: xen/p2m: Move code around to allow for better re-usage.

We are going to be using the early_alloc_p2m (and
early_alloc_p2m_middle) code in follow up patches which
are not related to setting identity pages.

Hence lets move the code out in its own function and
rename them as appropiate.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 62 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 1b267e75158d..3cc3afeb09a1 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }
 
-static bool __init __early_alloc_p2m(unsigned long pfn)
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
 {
 	unsigned topidx, mididx, idx;
 
@@ -541,6 +541,36 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
 	}
 	return idx != 0;
 }
+
+static bool __init early_alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx = p2m_top_index(pfn);
+	unsigned long *mid_mfn_p;
+	unsigned long **mid;
+
+	mid = p2m_top[topidx];
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	if (mid == p2m_mid_missing) {
+		mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+		p2m_mid_init(mid);
+
+		p2m_top[topidx] = mid;
+
+		BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+	}
+	/* And the save/restore P2M tables.. */
+	if (mid_mfn_p == p2m_mid_missing_mfn) {
+		mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+		p2m_mid_mfn_init(mid_mfn_p);
+
+		p2m_top_mfn_p[topidx] = mid_mfn_p;
+		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+		/* Note: we don't set mid_mfn_p[midix] here,
+		 * look in early_alloc_p2m_middle */
+	}
+	return true;
+}
 unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 				      unsigned long pfn_e)
 {
@@ -559,35 +589,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 		pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
 		pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
 	{
-		unsigned topidx = p2m_top_index(pfn);
-		unsigned long *mid_mfn_p;
-		unsigned long **mid;
-
-		mid = p2m_top[topidx];
-		mid_mfn_p = p2m_top_mfn_p[topidx];
-		if (mid == p2m_mid_missing) {
-			mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
-
-			p2m_mid_init(mid);
-
-			p2m_top[topidx] = mid;
-
-			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
-		}
-		/* And the save/restore P2M tables.. */
-		if (mid_mfn_p == p2m_mid_missing_mfn) {
-			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
-			p2m_mid_mfn_init(mid_mfn_p);
-
-			p2m_top_mfn_p[topidx] = mid_mfn_p;
-			p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
-			/* Note: we don't set mid_mfn_p[midix] here,
-		 	 * look in __early_alloc_p2m */
-		}
+		WARN_ON(!early_alloc_p2m(pfn));
 	}
 
-	__early_alloc_p2m(pfn_s);
-	__early_alloc_p2m(pfn_e);
+	early_alloc_p2m_middle(pfn_s);
+	early_alloc_p2m_middle(pfn_e);
 
 	for (pfn = pfn_s; pfn < pfn_e; pfn++)
 		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
-- 
cgit v1.2.3


From cef4cca551d652b7f69c9d76337c5fae24e069dc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 30 Mar 2012 14:15:14 -0400
Subject: xen/p2m: Allow alloc_p2m_middle to call reserve_brk depending on
 argument

For identity cases we want to call reserve_brk only on the boundary
conditions of the middle P2M (so P2M[x][y][0] = extend_brk). This is
to work around identify regions (PCI spaces, gaps in E820) which are not
aligned on 2MB regions.

However for the case were we want to allocate P2M middle leafs at the
early bootup stage, irregardless of this alignment check we need some
means of doing that.  For that we provide the new argument.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 3cc3afeb09a1..8b3a3958d120 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }
 
-static bool __init early_alloc_p2m_middle(unsigned long pfn)
+static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
 {
 	unsigned topidx, mididx, idx;
 
@@ -508,7 +508,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn)
 	idx = p2m_index(pfn);
 
 	/* Pfff.. No boundary cross-over, lets get out. */
-	if (!idx)
+	if (!idx && check_boundary)
 		return false;
 
 	WARN(p2m_top[topidx][mididx] == p2m_identity,
@@ -531,7 +531,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn)
 		p2m_top[topidx][mididx] = p2m;
 
 		/* For save/restore we need to MFN of the P2M saved */
-		
+
 		mid_mfn_p = p2m_top_mfn_p[topidx];
 		WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
 			"P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
@@ -592,8 +592,8 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 		WARN_ON(!early_alloc_p2m(pfn));
 	}
 
-	early_alloc_p2m_middle(pfn_s);
-	early_alloc_p2m_middle(pfn_e);
+	early_alloc_p2m_middle(pfn_s, true);
+	early_alloc_p2m_middle(pfn_e, true);
 
 	for (pfn = pfn_s; pfn < pfn_e; pfn++)
 		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
-- 
cgit v1.2.3


From d5096850b47424fb0f1c6a75b8f7184f7169319a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 30 Mar 2012 14:16:49 -0400
Subject: xen/p2m: Collapse early_alloc_p2m_middle redundant checks.

At the start of the function we were checking for idx != 0
and bailing out. And later calling extend_brk if idx != 0.

That is unnecessary so remove that checks.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b3a3958d120..952edefcedb3 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -502,6 +502,8 @@ static bool alloc_p2m(unsigned long pfn)
 static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
 {
 	unsigned topidx, mididx, idx;
+	unsigned long *p2m;
+	unsigned long *mid_mfn_p;
 
 	topidx = p2m_top_index(pfn);
 	mididx = p2m_mid_index(pfn);
@@ -522,24 +524,21 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
 		return false;
 
 	/* Boundary cross-over for the edges: */
-	if (idx) {
-		unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
-		unsigned long *mid_mfn_p;
+	p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
 
-		p2m_init(p2m);
+	p2m_init(p2m);
 
-		p2m_top[topidx][mididx] = p2m;
+	p2m_top[topidx][mididx] = p2m;
 
-		/* For save/restore we need to MFN of the P2M saved */
+	/* For save/restore we need to MFN of the P2M saved */
 
-		mid_mfn_p = p2m_top_mfn_p[topidx];
-		WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
-			"P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
-			topidx, mididx);
-		mid_mfn_p[mididx] = virt_to_mfn(p2m);
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
+		"P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
+		topidx, mididx);
+	mid_mfn_p[mididx] = virt_to_mfn(p2m);
 
-	}
-	return idx != 0;
+	return true;
 }
 
 static bool __init early_alloc_p2m(unsigned long pfn)
-- 
cgit v1.2.3


From 940713bb2ce3033f468a220094a07250a2f69bdd Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 30 Mar 2012 14:33:14 -0400
Subject: xen/p2m: An early bootup variant of set_phys_to_machine

During early bootup we can't use alloc_page, so to allocate
leaf pages in the P2M we need to use extend_brk. For that
we are utilizing the early_alloc_p2m and early_alloc_p2m_middle
functions to do the job for us. This function follows the
same logic as set_phys_to_machine.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/xen/page.h |  1 +
 arch/x86/xen/p2m.c              | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c34f96c2f7a0..93971e841dd5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -44,6 +44,7 @@ extern unsigned long  machine_to_phys_nr;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern unsigned long set_phys_range_identity(unsigned long pfn_s,
 					     unsigned long pfn_e);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 952edefcedb3..ffd08c414e91 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -570,6 +570,21 @@ static bool __init early_alloc_p2m(unsigned long pfn)
 	}
 	return true;
 }
+bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
+		if (!early_alloc_p2m(pfn))
+			return false;
+
+		if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
+			return false;
+
+		if (!__set_phys_to_machine(pfn, mfn))
+			return false;
+	}
+
+	return true;
+}
 unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 				      unsigned long pfn_e)
 {
-- 
cgit v1.2.3


From 83c529151ab0d4a813e3f6a3e293fff75d468519 Mon Sep 17 00:00:00 2001
From: "Liu, Jinsong" <jinsong.liu@intel.com>
Date: Tue, 28 Feb 2012 05:15:46 +0000
Subject: KVM: x86: expose Intel cpu new features (HLE, RTM) to guest

Intel recently release 2 new features, HLE and RTM.
Refer to http://software.intel.com/file/41417.
This patch expose them to guest.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad6..c2134b881033 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(BMI2) | F(ERMS) | F(RTM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
-- 
cgit v1.2.3


From 675acb758ab2381c72fe3ceb5c091cbd0879d4dd Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 8 Mar 2012 18:07:56 +0800
Subject: KVM: SVM: count all irq windows exit

Also count the exits of fast-path.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c75..f3167208562e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3240,6 +3240,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	mark_dirty(svm->vmcb, VMCB_INTR);
+	++svm->vcpu.stat.irq_window_exits;
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
@@ -3247,7 +3248,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
 	    kvm_run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
-		++svm->vcpu.stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
 	}
-- 
cgit v1.2.3


From b6d33834bd4e8bdf4a199812e31b3e36da53c794 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <c.dall@virtualopensystems.com>
Date: Thu, 8 Mar 2012 16:44:24 -0500
Subject: KVM: Factor out kvm_vcpu_kick to arch-generic code

The kvm_vcpu_kick function performs roughly the same funcitonality on
most all architectures, so we shouldn't have separate copies.

PowerPC keeps a pointer to interchanging waitqueues on the vcpu_arch
structure and to accomodate this special need a
__KVM_HAVE_ARCH_VCPU_GET_WQ define and accompanying function
kvm_arch_vcpu_wq have been defined. For all other architectures this
is a generic inline that just returns &vcpu->wq;

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h    |  1 +
 arch/ia64/kvm/kvm-ia64.c            | 20 +++++---------------
 arch/powerpc/include/asm/kvm_host.h |  6 ++++++
 arch/powerpc/kvm/powerpc.c          | 21 ++++++---------------
 arch/s390/kvm/kvm-s390.c            |  8 ++++++++
 arch/x86/kvm/x86.c                  | 16 ++--------------
 include/linux/kvm_host.h            |  9 +++++++++
 virt/kvm/kvm_main.c                 | 22 ++++++++++++++++++++++
 8 files changed, 59 insertions(+), 44 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index e35b3a84a40b..c4b4bac3d09e 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -365,6 +365,7 @@ struct thash_cb {
 };
 
 struct kvm_vcpu_stat {
+	u32 halt_wakeup;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f5104b7c52cd..9d80ff8d9eff 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1872,21 +1872,6 @@ void kvm_arch_hardware_unsetup(void)
 {
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int me;
-	int cpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq))
-		wake_up_interruptible(&vcpu->wq);
-
-	me = get_cpu();
-	if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
-			smp_send_reschedule(cpu);
-	put_cpu();
-}
-
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
 	return __apic_accept_irq(vcpu, irq->vector);
@@ -1956,6 +1941,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		(kvm_highest_pending_irq(vcpu) != -1);
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests));
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 52eb9c1f4fe0..889383735e73 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -498,4 +498,10 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_QPR	0x0040
 #define KVM_MMIO_REG_FQPR	0x0060
 
+#define __KVM_HAVE_ARCH_VCPU_GET_WQ 1
+static inline wait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.wqp;
+}
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00d7e345b3fe..b5e9046462fd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -43,6 +43,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	       v->requests;
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return 1;
+}
+
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
 	int nr = kvmppc_get_gpr(vcpu, 11);
@@ -588,21 +593,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	return r;
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int me;
-	int cpu = vcpu->cpu;
-
-	me = get_cpu();
-	if (waitqueue_active(vcpu->arch.wqp)) {
-		wake_up_interruptible(vcpu->arch.wqp);
-		vcpu->stat.halt_wakeup++;
-	} else if (cpu != me && cpu != -1) {
-		smp_send_reschedule(vcpu->cpu);
-	}
-	put_cpu();
-}
-
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
 	if (irq->irq == KVM_INTERRUPT_UNSET) {
@@ -611,6 +601,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	}
 
 	kvmppc_core_queue_external(vcpu, irq);
+
 	kvm_vcpu_kick(vcpu);
 
 	return 0;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 217ce44395a4..d30c8350b949 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -423,6 +423,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	/* kvm common code refers to this, but never calls it */
+	BUG();
+	return 0;
+}
+
+
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
 	kvm_s390_vcpu_initial_reset(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4044ce0bf7c1..511031dcb9cc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6403,21 +6403,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		 kvm_cpu_has_interrupt(vcpu));
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
-	int me;
-	int cpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-
-	me = get_cpu();
-	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
-			smp_send_reschedule(cpu);
-	put_cpu();
+	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
 }
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3a2cea616283..5b624e1ff814 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -439,6 +439,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			     gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
@@ -507,6 +508,7 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
@@ -522,6 +524,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
+#ifndef __KVM_HAVE_ARCH_VCPU_GET_WQ
+static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->wq;
+}
+#endif
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a9565e240636..7149a2e65524 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1514,6 +1514,28 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 }
 
+/*
+ * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+	wait_queue_head_t *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (waitqueue_active(wqp)) {
+		wake_up_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	me = get_cpu();
+	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+		if (kvm_arch_vcpu_should_kick(vcpu))
+			smp_send_reschedule(cpu);
+	put_cpu();
+}
+
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
 	if (!need_resched())
-- 
cgit v1.2.3


From eae3ee7d8a7c59cf63441dedf28674889f5fc477 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:25 -0500
Subject: x86: pvclock: Add flag to indicate that a vm was stopped by the host

This flag will be used to check if the vm was stopped by the host when a soft
lockup was detected.  The host will set the flag when it stops the guest.  On
resume, the guest will check this flag if a soft lockup is detected and skip
issuing the warning.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/pvclock-abi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d1948ada..6167fd798188 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -40,5 +40,6 @@ struct pvclock_wall_clock {
 } __attribute__((__packed__));
 
 #define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
+#define PVCLOCK_GUEST_STOPPED	(1 << 1)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
-- 
cgit v1.2.3


From 3b5d56b9317fa7b5407dff1aa7b115bf6cdbd494 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:26 -0500
Subject: kvmclock: Add functions to check if the host has stopped the vm

When a host stops or suspends a VM it will set a flag to show this.  The
watchdog will use these functions to determine if a softlockup is real, or the
result of a suspended VM.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
asm-generic changes Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/alpha/include/asm/kvm_para.h      |  1 +
 arch/arm/include/asm/kvm_para.h        |  1 +
 arch/avr32/include/asm/kvm_para.h      |  1 +
 arch/blackfin/include/asm/kvm_para.h   |  1 +
 arch/c6x/include/asm/kvm_para.h        |  1 +
 arch/frv/include/asm/kvm_para.h        |  1 +
 arch/h8300/include/asm/kvm_para.h      |  1 +
 arch/hexagon/include/asm/kvm_para.h    |  1 +
 arch/ia64/include/asm/kvm_para.h       |  5 +++++
 arch/m68k/include/asm/kvm_para.h       |  1 +
 arch/microblaze/include/asm/kvm_para.h |  1 +
 arch/mips/include/asm/kvm_para.h       |  1 +
 arch/mn10300/include/asm/kvm_para.h    |  1 +
 arch/openrisc/include/asm/kvm_para.h   |  1 +
 arch/parisc/include/asm/kvm_para.h     |  1 +
 arch/powerpc/include/asm/kvm_para.h    |  5 +++++
 arch/s390/include/asm/kvm_para.h       |  5 +++++
 arch/score/include/asm/kvm_para.h      |  1 +
 arch/sh/include/asm/kvm_para.h         |  1 +
 arch/sparc/include/asm/kvm_para.h      |  1 +
 arch/tile/include/asm/kvm_para.h       |  1 +
 arch/um/include/asm/kvm_para.h         |  1 +
 arch/unicore32/include/asm/kvm_para.h  |  1 +
 arch/x86/include/asm/kvm_para.h        |  8 ++++++++
 arch/x86/kernel/kvmclock.c             | 21 +++++++++++++++++++++
 arch/xtensa/include/asm/kvm_para.h     |  1 +
 include/asm-generic/kvm_para.h         | 14 ++++++++++++++
 27 files changed, 79 insertions(+)
 create mode 100644 arch/alpha/include/asm/kvm_para.h
 create mode 100644 arch/arm/include/asm/kvm_para.h
 create mode 100644 arch/avr32/include/asm/kvm_para.h
 create mode 100644 arch/blackfin/include/asm/kvm_para.h
 create mode 100644 arch/c6x/include/asm/kvm_para.h
 create mode 100644 arch/frv/include/asm/kvm_para.h
 create mode 100644 arch/h8300/include/asm/kvm_para.h
 create mode 100644 arch/hexagon/include/asm/kvm_para.h
 create mode 100644 arch/m68k/include/asm/kvm_para.h
 create mode 100644 arch/microblaze/include/asm/kvm_para.h
 create mode 100644 arch/mips/include/asm/kvm_para.h
 create mode 100644 arch/mn10300/include/asm/kvm_para.h
 create mode 100644 arch/openrisc/include/asm/kvm_para.h
 create mode 100644 arch/parisc/include/asm/kvm_para.h
 create mode 100644 arch/score/include/asm/kvm_para.h
 create mode 100644 arch/sh/include/asm/kvm_para.h
 create mode 100644 arch/sparc/include/asm/kvm_para.h
 create mode 100644 arch/tile/include/asm/kvm_para.h
 create mode 100644 arch/um/include/asm/kvm_para.h
 create mode 100644 arch/unicore32/include/asm/kvm_para.h
 create mode 100644 arch/xtensa/include/asm/kvm_para.h
 create mode 100644 include/asm-generic/kvm_para.h

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/kvm_para.h b/arch/alpha/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/alpha/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/arm/include/asm/kvm_para.h b/arch/arm/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/arm/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/avr32/include/asm/kvm_para.h b/arch/avr32/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/avr32/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/blackfin/include/asm/kvm_para.h b/arch/blackfin/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/blackfin/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/c6x/include/asm/kvm_para.h b/arch/c6x/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/c6x/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/frv/include/asm/kvm_para.h b/arch/frv/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/frv/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/h8300/include/asm/kvm_para.h b/arch/h8300/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/h8300/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/hexagon/include/asm/kvm_para.h b/arch/hexagon/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/hexagon/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
index 1588aee781a2..2019cb99335e 100644
--- a/arch/ia64/include/asm/kvm_para.h
+++ b/arch/ia64/include/asm/kvm_para.h
@@ -26,6 +26,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return 0;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif
 
 #endif
diff --git a/arch/m68k/include/asm/kvm_para.h b/arch/m68k/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/m68k/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/microblaze/include/asm/kvm_para.h b/arch/microblaze/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/microblaze/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/mips/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/mn10300/include/asm/kvm_para.h b/arch/mn10300/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/mn10300/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/openrisc/include/asm/kvm_para.h b/arch/openrisc/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/openrisc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/parisc/include/asm/kvm_para.h b/arch/parisc/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/parisc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 7b754e743003..c18916bff689 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -206,6 +206,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return r;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 6964db226f83..a98832961035 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return 0;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif
 
 #endif /* __S390_KVM_PARA_H */
diff --git a/arch/score/include/asm/kvm_para.h b/arch/score/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/score/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/sh/include/asm/kvm_para.h b/arch/sh/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/sh/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/sparc/include/asm/kvm_para.h b/arch/sparc/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/sparc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/um/include/asm/kvm_para.h b/arch/um/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/um/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/unicore32/include/asm/kvm_para.h b/arch/unicore32/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/unicore32/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 734c3767cfac..99c4bbe0cca2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -95,6 +95,14 @@ struct kvm_vcpu_pv_apf_data {
 extern void kvmclock_init(void);
 extern int kvm_register_clock(char *txt);
 
+#ifdef CONFIG_KVM_CLOCK
+bool kvm_check_and_clear_guest_paused(void);
+#else
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+#endif /* CONFIG_KVMCLOCK */
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da65bfc..4ba090ca689d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -114,6 +115,26 @@ static void kvm_get_preset_lpj(void)
 	preset_lpj = lpj;
 }
 
+bool kvm_check_and_clear_guest_paused(void)
+{
+	bool ret = false;
+	struct pvclock_vcpu_time_info *src;
+
+	/*
+	 * per_cpu() is safe here because this function is only called from
+	 * timer functions where preemption is already disabled.
+	 */
+	WARN_ON(!in_atomic());
+	src = &__get_cpu_var(hv_clock);
+	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		ret = true;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused);
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_get_cycles,
diff --git a/arch/xtensa/include/asm/kvm_para.h b/arch/xtensa/include/asm/kvm_para.h
new file mode 100644
index 000000000000..14fab8f0b957
--- /dev/null
+++ b/arch/xtensa/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
new file mode 100644
index 000000000000..05ef7e705939
--- /dev/null
+++ b/include/asm-generic/kvm_para.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_GENERIC_KVM_PARA_H
+#define _ASM_GENERIC_KVM_PARA_H
+
+
+/*
+ * This function is used by architectures that support kvm to avoid issuing
+ * false soft lockup messages.
+ */
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
+#endif
-- 
cgit v1.2.3


From 1c0b28c2a46d98cd258d96b8c222144b22876c46 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:27 -0500
Subject: KVM: x86: Add ioctl for KVM_KVMCLOCK_CTRL

Now that we have a flag that will tell the guest it was suspended, create an
interface for that communication using a KVM ioctl.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 20 ++++++++++++++++++++
 Documentation/virtual/kvm/msr.txt |  4 ++++
 arch/x86/kvm/x86.c                | 22 ++++++++++++++++++++++
 include/linux/kvm.h               |  3 +++
 4 files changed, 49 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6386f8c0482e..81ff39f6248d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1669,6 +1669,26 @@ at the memory location pointed to by "addr".
 The list of registers accessible using this interface is identical to the
 list in 4.64.
 
+4.70 KVM_KVMCLOCK_CTRL
+
+Capability: KVM_CAP_KVMCLOCK_CTRL
+Architectures: Any that implement pvclocks (currently x86 only)
+Type: vcpu ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+This signals to the host kernel that the specified guest is being paused by
+userspace.  The host will set a flag in the pvclock structure that is checked
+from the soft lockup watchdog.  The flag is part of the pvclock structure that
+is shared between guest and host, specifically the second bit of the flags
+field of the pvclock_vcpu_time_info structure.  It will be set exclusively by
+the host and read/cleared exclusively by the guest.  The guest operation of
+checking and clearing the flag must an atomic operation so
+load-link/store-conditional, or equivalent must be used.  There are two cases
+where the guest will clear the flag: when the soft lockup watchdog timer resets
+itself or when a soft lockup is detected.  This ioctl can be called any time
+after pausing the vcpu, but before it is resumed.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 50317809113d..96b41bd97523 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -108,6 +108,10 @@ MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
 			    |	           | time measures taken across
 		     0      |	   24      | multiple cpus are guaranteed to
 			    |		   | be monotonic
+		-------------------------------------------------------------
+			    |		   | guest vcpu has been paused by
+		     1	    |	  N/A	   | the host
+			    |		   | See 4.70 in api.txt
 		-------------------------------------------------------------
 
 	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 511031dcb9cc..99b738028fc0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_PCI_2_3:
+	case KVM_CAP_KVMCLOCK_CTRL:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor.  This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+	struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
+	if (!vcpu->arch.time_page)
+		return -EINVAL;
+	src->flags |= PVCLOCK_GUEST_STOPPED;
+	mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 	}
+	case KVM_KVMCLOCK_CTRL: {
+		r = kvm_set_guest_paused(vcpu);
+		goto out;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6c322a90b92f..7a9dd4b3dede 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -589,6 +589,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_S390_UCONTROL 73
 #define KVM_CAP_SYNC_REGS 74
 #define KVM_CAP_PCI_2_3 75
+#define KVM_CAP_KVMCLOCK_CTRL 76
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_ONE_REG */
 #define KVM_GET_ONE_REG		  _IOW(KVMIO,  0xab, struct kvm_one_reg)
 #define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
+/* VM is being stopped by host */
+#define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-- 
cgit v1.2.3


From 248997095d652576f1213028a95ca5fff85d089f Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Thu, 15 Mar 2012 18:16:49 -0400
Subject: kvmclock: remove unneeded EXPORT macro

check_and_clear_guest_paused does not need to be exported as it isn't used
by any modules, remove the export.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4ba090ca689d..086eb58c6e80 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -133,7 +133,6 @@ bool kvm_check_and_clear_guest_paused(void)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused);
 
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
-- 
cgit v1.2.3


From a0ed46073c14f66dbf0707aaa7588b78da83d7c6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 1 Mar 2012 19:31:22 +0900
Subject: KVM: MMU: Split the main body of rmap_write_protect() off from others

We will use this in the following patch to implement another function
which needs to write protect pages using the rmap information.

Note that there is a small change in debug printing for large pages:
we do not differentiate them from others to avoid duplicating code.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb164268846..c8b5694d1a48 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot)
+static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	unsigned long *rmapp;
-	u64 *spte;
-	int i, write_protected = 0;
+	u64 *spte = NULL;
+	int write_protected = 0;
 
-	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
+	while ((spte = rmap_next(rmapp, spte))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-		if (is_writable_pte(*spte)) {
+
+		if (!is_writable_pte(*spte))
+			continue;
+
+		if (level == PT_PAGE_TABLE_LEVEL) {
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
-			write_protected = 1;
+		} else {
+			BUG_ON(!is_large_pte(*spte));
+			drop_spte(kvm, spte);
+			--kvm->stat.lpages;
+			spte = NULL;
 		}
-		spte = rmap_next(rmapp, spte);
+
+		write_protected = 1;
 	}
 
-	/* check for huge page mappings */
-	for (i = PT_DIRECTORY_LEVEL;
+	return write_protected;
+}
+
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
+{
+	unsigned long *rmapp;
+	int i, write_protected = 0;
+
+	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		spte = rmap_next(rmapp, NULL);
-		while (spte) {
-			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON(!is_large_pte(*spte));
-			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
-			if (is_writable_pte(*spte)) {
-				drop_spte(kvm, spte);
-				--kvm->stat.lpages;
-				spte = NULL;
-				write_protected = 1;
-			}
-			spte = rmap_next(rmapp, spte);
-		}
+		write_protected |= __rmap_write_protect(kvm, rmapp, i);
 	}
 
 	return write_protected;
-- 
cgit v1.2.3


From 5dc99b2380d59b8aeafa98791f92b96400ed3187 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 1 Mar 2012 19:32:16 +0900
Subject: KVM: Avoid checking huge page mappings in get_dirty_log()

Dropped such mappings when we enabled dirty logging and we will never
create new ones until we stop the logging.

For this we introduce a new function which can be used to write protect
a range of PT level pages: although we do not need to care about a range
of pages at this point, the following patch will need this feature to
optimize the write protection of many pages.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++--
 arch/x86/kvm/mmu.c              | 40 ++++++++++++++++++++++++++++++----------
 arch/x86/kvm/x86.c              |  8 +++-----
 3 files changed, 36 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e216ba066e79..f624ca72ea24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c8b5694d1a48..dc5f2459db6c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1037,27 +1037,47 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
 	return write_protected;
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot)
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask)
 {
 	unsigned long *rmapp;
-	int i, write_protected = 0;
 
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
-	}
+	while (mask) {
+		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
 
-	return write_protected;
+		/* clear the first set bit */
+		mask &= mask - 1;
+	}
 }
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	int i;
+	int write_protected = 0;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+	}
+
+	return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 99b738028fc0..813ebf1e55a0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3095,13 +3095,11 @@ static void write_protect_slot(struct kvm *kvm,
 
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-		unsigned long gfn_offset;
+		gfn_t offset;
 
-		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
-			unsigned long gfn = memslot->base_gfn + gfn_offset;
+		for_each_set_bit(offset, dirty_bitmap, memslot->npages)
+			kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1);
 
-			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-		}
 		kvm_flush_remote_tlbs(kvm);
 	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-- 
cgit v1.2.3


From 60c34612b70711fb14a8dcbc6a79509902450d2e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <takuya.yoshikawa@gmail.com>
Date: Sat, 3 Mar 2012 14:21:48 +0900
Subject: KVM: Switch to srcu-less get_dirty_log()

We have seen some problems of the current implementation of
get_dirty_log() which uses synchronize_srcu_expedited() for updating
dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms
order of latency when we use VGA displays.

Furthermore the recent discussion on the following thread
    "srcu: Implement call_srcu()"
    http://lkml.org/lkml/2012/1/31/211
also motivated us to implement get_dirty_log() without SRCU.

This patch achieves this goal without sacrificing the performance of
both VGA and live migration: in practice the new code is much faster
than the old one unless we have too many dirty pages.

Implementation:

The key part of the implementation is the use of xchg() operation for
clearing dirty bits atomically.  Since this allows us to update only
BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap
until every dirty bit is cleared again for the next call.

Although some people may worry about the problem of using the atomic
memory instruction many times to the concurrently accessible bitmap,
it is usually accessed with mmu_lock held and we rarely see concurrent
accesses: so what we need to care about is the pure xchg() overheads.

Another point to note is that we do not use for_each_set_bit() to check
which ones in each BITS_PER_LONG pages are actually dirty.  Instead we
simply use __ffs() in a loop.  This is much faster than repeatedly call
find_next_bit().

Performance:

The dirty-log-perf unit test showed nice improvements, some times faster
than before, except for some extreme cases; for such cases the speed of
getting dirty page information is much faster than we process it in the
userspace.

For real workloads, both VGA and live migration, we have observed pure
improvements: when the guest was reading a file during live migration,
we originally saw a few ms of latency, but with the new method the
latency was less than 200us.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 116 ++++++++++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 73 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 813ebf1e55a0..0d9a57875f0b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3067,55 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 }
 
 /**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
  *
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- *    checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing data, we keep the following order for
+ * each bit:
  *
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
  *
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem.  That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
  */
-static void write_protect_slot(struct kvm *kvm,
-			       struct kvm_memory_slot *memslot,
-			       unsigned long *dirty_bitmap,
-			       unsigned long nr_dirty_pages)
-{
-	spin_lock(&kvm->mmu_lock);
-
-	/* Not many dirty pages compared to # of shadow pages. */
-	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-		gfn_t offset;
-
-		for_each_set_bit(offset, dirty_bitmap, memslot->npages)
-			kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1);
-
-		kvm_flush_remote_tlbs(kvm);
-	} else
-		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-
-	spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-				      struct kvm_dirty_log *log)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n, nr_dirty_pages;
+	unsigned long n, i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+	bool is_dirty = false;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3124,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		goto out;
 
 	memslot = id_to_memslot(kvm->memslots, log->slot);
+
+	dirty_bitmap = memslot->dirty_bitmap;
 	r = -ENOENT;
-	if (!memslot->dirty_bitmap)
+	if (!dirty_bitmap)
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
-	nr_dirty_pages = memslot->nr_dirty_pages;
 
-	/* If nothing is dirty, don't bother messing with page tables. */
-	if (nr_dirty_pages) {
-		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap, *dirty_bitmap_head;
+	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	memset(dirty_bitmap_buffer, 0, n);
 
-		dirty_bitmap = memslot->dirty_bitmap;
-		dirty_bitmap_head = memslot->dirty_bitmap_head;
-		if (dirty_bitmap == dirty_bitmap_head)
-			dirty_bitmap_head += n / sizeof(long);
-		memset(dirty_bitmap_head, 0, n);
+	spin_lock(&kvm->mmu_lock);
 
-		r = -ENOMEM;
-		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
-		if (!slots)
-			goto out;
+	for (i = 0; i < n / sizeof(long); i++) {
+		unsigned long mask;
+		gfn_t offset;
 
-		memslot = id_to_memslot(slots, log->slot);
-		memslot->nr_dirty_pages = 0;
-		memslot->dirty_bitmap = dirty_bitmap_head;
-		update_memslots(slots, NULL);
+		if (!dirty_bitmap[i])
+			continue;
 
-		old_slots = kvm->memslots;
-		rcu_assign_pointer(kvm->memslots, slots);
-		synchronize_srcu_expedited(&kvm->srcu);
-		kfree(old_slots);
+		is_dirty = true;
 
-		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
+		mask = xchg(&dirty_bitmap[i], 0);
+		dirty_bitmap_buffer[i] = mask;
 
-		r = -EFAULT;
-		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-			goto out;
-	} else {
-		r = -EFAULT;
-		if (clear_user(log->dirty_bitmap, n))
-			goto out;
+		offset = i * BITS_PER_LONG;
+		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
 	}
+	if (is_dirty)
+		kvm_flush_remote_tlbs(kvm);
+
+	spin_unlock(&kvm->mmu_lock);
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		goto out;
 
 	r = 0;
 out:
-- 
cgit v1.2.3


From e9bda3b3d0ce775afe15eaf71922d342cc74991c Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Tue, 20 Mar 2012 23:33:51 -0700
Subject: KVM: VMX: Auto-load on CPUs with VMX

Enable x86 feature-based autoloading for the kvm-intel module on CPUs
with X86_FEATURE_VMX.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-By: Kay Sievers <kay@vrfy.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad85adfef843..52f685635766 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
@@ -51,6 +52,12 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static const struct x86_cpu_id vmx_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_VMX),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-- 
cgit v1.2.3


From c36fc04ef558c95cff46a8c89d2f804f217335f5 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Thu, 8 Mar 2012 12:45:54 +0100
Subject: KVM: x86: add paging gcc optimization

Since most guests will have paging enabled for memory management, add likely() optimization
around CR0.PG checks.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd8..3d1134ddb885 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
 
 static inline int is_paging(struct kvm_vcpu *vcpu)
 {
-	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+	return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
 }
 
 static inline u32 bit(int bitno)
-- 
cgit v1.2.3


From 220f773a0013bf6fe2eefd9718ac7471f368fd8e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 21 Mar 2012 23:49:39 +0900
Subject: KVM: MMU: Make pte_list_desc fit cache lines well

We have PTE_LIST_EXT + 1 pointers in this structure and these 40/20
bytes do not fit cache lines well.  Furthermore, some allocators may
use 64/32-byte objects for the pte_list_desc cache.

This patch solves this problem by changing PTE_LIST_EXT from 4 to 3.

For shadow paging, the new size is still large enough to hold both the
kernel and process mappings for usual anonymous pages.  For file
mappings, there may be a slight change in the cache usage.

Note: with EPT/NPT we almost always have a single spte in each reverse
mapping and we will not see any change by this.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc5f2459db6c..3213348e3a93 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 			| PT64_NX_MASK)
 
-#define PTE_LIST_EXT 4
-
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 #define ACC_USER_MASK    PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
 struct pte_list_desc {
 	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
-- 
cgit v1.2.3


From 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 21 Mar 2012 23:50:34 +0900
Subject: KVM: MMU: Improve iteration through sptes from rmap

Iteration using rmap_next(), the actual body is pte_list_next(), is
inefficient: every time we call it we start from checking whether rmap
holds a single spte or points to a descriptor which links more sptes.

In the case of shadow paging, this quadratic total iteration cost is a
problem.  Even for two dimensional paging, with EPT/NPT on, in which we
almost always have a single mapping, the extra checks at the end of the
iteration should be eliminated.

This patch fixes this by introducing rmap_iterator which keeps the
iteration context for the next search.  Furthermore the implementation
of rmap_next() is splitted into two functions, rmap_get_first() and
rmap_get_next(), to avoid repeatedly checking whether the rmap being
iterated on has only one spte.

Although there seemed to be only a slight change for EPT/NPT, the actual
improvement was significant: we observed that GET_DIRTY_LOG for 1GB
dirty memory became 15% faster than before.  This is probably because
the new code is easy to make branch predictions.

Note: we just remove pte_list_next() because we can think of parent_ptes
as a reverse mapping.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 196 ++++++++++++++++++++++++++++-------------------
 arch/x86/kvm/mmu_audit.c |  10 +--
 2 files changed, 124 insertions(+), 82 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3213348e3a93..29ad6f9c58a5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return count;
 }
 
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
-	struct pte_list_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*pte_list)
-		return NULL;
-	else if (!(*pte_list & 1)) {
-		if (!spte)
-			return (u64 *)*pte_list;
-		return NULL;
-	}
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static void
 pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
 			   int i, struct pte_list_desc *prev_desc)
@@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
-	return pte_list_next(rmapp, spte);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_mmu_page *sp;
@@ -1005,6 +974,67 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap.  All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+	/* private fields */
+	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
+	int pos;			/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+	if (!rmap)
+		return NULL;
+
+	if (!(rmap & 1)) {
+		iter->desc = NULL;
+		return (u64 *)rmap;
+	}
+
+	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+	iter->pos = 0;
+	return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+	if (iter->desc) {
+		if (iter->pos < PTE_LIST_EXT - 1) {
+			u64 *sptep;
+
+			++iter->pos;
+			sptep = iter->desc->sptes[iter->pos];
+			if (sptep)
+				return sptep;
+		}
+
+		iter->desc = iter->desc->more;
+
+		if (iter->desc) {
+			iter->pos = 0;
+			/* desc->sptes[0] cannot be NULL */
+			return iter->desc->sptes[iter->pos];
+		}
+	}
+
+	return NULL;
+}
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
@@ -1013,23 +1043,27 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 
 static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	u64 *spte = NULL;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int write_protected = 0;
 
-	while ((spte = rmap_next(rmapp, spte))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
-		if (!is_writable_pte(*spte))
+		if (!is_writable_pte(*sptep)) {
+			sptep = rmap_get_next(&iter);
 			continue;
+		}
 
 		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
+			sptep = rmap_get_next(&iter);
 		} else {
-			BUG_ON(!is_large_pte(*spte));
-			drop_spte(kvm, spte);
+			BUG_ON(!is_large_pte(*sptep));
+			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
-			spte = NULL;
+			sptep = rmap_get_first(*rmapp, &iter);
 		}
 
 		write_protected = 1;
@@ -1084,48 +1118,57 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(rmapp, NULL))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte);
+	while ((sptep = rmap_get_first(*rmapp, &iter))) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+
+		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
+
 	return need_tlb_flush;
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!is_shadow_present_pte(*spte));
-		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!is_shadow_present_pte(*sptep));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+
 		need_flush = 1;
+
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte);
-			spte = rmap_next(rmapp, NULL);
+			drop_spte(kvm, sptep);
+			sptep = rmap_get_first(*rmapp, &iter);
 		} else {
-			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			mmu_spte_clear_track_bits(spte);
-			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(rmapp, spte);
+
+			mmu_spte_clear_track_bits(sptep);
+			mmu_spte_set(sptep, new_spte);
+			sptep = rmap_get_next(&iter);
 		}
 	}
+
 	if (need_flush)
 		kvm_flush_remote_tlbs(kvm);
 
@@ -1184,7 +1227,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1197,25 +1241,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		int _young;
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		_young = _spte & PT_ACCESSED_MASK;
-		if (_young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
 		}
-		spte = rmap_next(rmapp, spte);
 	}
+
 	return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			      unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1226,16 +1269,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		young = _spte & PT_ACCESSED_MASK;
-		if (young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
 			break;
 		}
-		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1887,10 +1928,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	u64 *parent_pte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
-	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
-		drop_parent_pte(sp, parent_pte);
+	while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+		drop_parent_pte(sp, sptep);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5b..7d7d0b9e23eb 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
 	if (sp->role.direct || sp->unsync || sp->role.invalid)
 		return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		if (is_writable_pte(*spte))
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		if (is_writable_pte(*sptep))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(rmapp, spte);
 	}
 }
 
-- 
cgit v1.2.3


From 4692d77fc3c8978a36406a3cf9e8b899f86f68f1 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Wed, 4 Apr 2012 19:39:58 +0200
Subject: x86-32: Introduce CONFIG_X86_DEV_DMA_OPS

32-bit x86 systems may need their own DMA operations, so add
a new config option, which is turned on for 64-bit systems. This
patch has no functional effect but it paves the way for supporting
the STA2x11 I/O Hub and possibly other chips.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Link: http://lkml.kernel.org/r/f79fcc1a2e17ef942e1b798b92aac43a80202532.1333560789.git.rubini@gnudd.com
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                   | 5 +++++
 arch/x86/include/asm/device.h      | 4 ++--
 arch/x86/include/asm/dma-mapping.h | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..07b412aed38b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -12,6 +12,7 @@ config X86_32
 
 config X86_64
 	def_bool 64BIT
+	select X86_DEV_DMA_OPS
 
 ### Arch settings
 config X86
@@ -2215,6 +2216,10 @@ config HAVE_TEXT_POKE_SMP
 	bool
 	select STOP_MACHINE if SMP
 
+config X86_DEV_DMA_OPS
+	bool
+	depends on X86_64
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 63a2a03d7d51..93e1c55f14ab 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -5,8 +5,8 @@ struct dev_archdata {
 #ifdef CONFIG_ACPI
 	void	*acpi_handle;
 #endif
-#ifdef CONFIG_X86_64
-struct dma_map_ops *dma_ops;
+#ifdef CONFIG_X86_DEV_DMA_OPS
+	struct dma_map_ops *dma_ops;
 #endif
 #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
 	void *iommu; /* hook for IOMMU specific extension */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 4b4331d71935..09aa473e2917 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,7 +30,7 @@ extern struct dma_map_ops *dma_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-#ifdef CONFIG_X86_32
+#ifndef CONFIG_X86_DEV_DMA_OPS
 	return dma_ops;
 #else
 	if (unlikely(!dev) || !dev->archdata.dma_ops)
-- 
cgit v1.2.3


From f7219a5300ba753b0c762d631763bd878b8bb00c Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Wed, 4 Apr 2012 19:40:10 +0200
Subject: x86: Introduce CONFIG_X86_DMA_REMAP

The default functions phys_to_dma, dma_to_phys implement identity
mapping as fast inline functions.  Some systems, however, may need a
custom function to implement its own mapping between CPU addresses and
device addresses. This new configuration option allows the functions
to be external when needed (such as for the ConneXt device)

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Link: http://lkml.kernel.org/r/6e4329b772df675f1c442f68e59e844e4dd8c965.1333560789.git.rubini@gnudd.com
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                   | 3 +++
 arch/x86/include/asm/dma-mapping.h | 7 +++++++
 2 files changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 07b412aed38b..95ca56036030 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2220,6 +2220,9 @@ config X86_DEV_DMA_OPS
 	bool
 	depends on X86_64
 
+config X86_DMA_REMAP
+	bool
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 09aa473e2917..61c0bd25845a 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -62,6 +62,12 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
 					struct dma_attrs *attrs);
 
+#ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */
+extern bool dma_capable(struct device *dev, dma_addr_t addr, size_t size);
+extern dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
+extern phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
+#else
+
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	if (!dev->dma_mask)
@@ -79,6 +85,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
 	return daddr;
 }
+#endif /* CONFIG_X86_DMA_REMAP */
 
 static inline void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-- 
cgit v1.2.3


From 83125a3a189ec34fb22a04e8efad69ae6d52674a Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Wed, 4 Apr 2012 19:40:21 +0200
Subject: x86, platform: Initial support for sta2x11 I/O hub

The "ConneXt" sta2x11 I/O Hub is a bridge from PCIe to AMBA, and is
used as main chipset in some Atom boards.  The set of peripherals it
exports live in an AMBA bus internal to the chip, so a custom
remapping of addresses is needed. This is implemented by fixup calls
for the PCI deivices, based on CONFIG_X86_DEV_DMA_OPS and
CONFIG_X86_DMA_REMAP .

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Link: http://lkml.kernel.org/r/ddca670ca8180e52d49b3fe642742ddd23ab2cb2.1333560789.git.rubini@gnudd.com
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig             |  28 +++-
 arch/x86/pci/Makefile        |   2 +
 arch/x86/pci/sta2x11-fixup.c | 366 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 391 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/pci/sta2x11-fixup.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 95ca56036030..f9ed801abaf9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -329,6 +329,7 @@ config X86_EXTENDED_PLATFORM
 		NUMAQ (IBM/Sequent)
 		RDC R-321x SoC
 		SGI 320/540 (Visual Workstation)
+		STA2X11-based (e.g. Northville)
 		Summit/EXA (IBM x440)
 		Unisys ES7000 IA32 series
 		Moorestown MID devices
@@ -461,10 +462,10 @@ config X86_32_NON_STANDARD
 	depends on X86_32 && SMP
 	depends on X86_EXTENDED_PLATFORM
 	---help---
-	  This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
-	  subarchitectures.  It is intended for a generic binary kernel.
-	  if you select them all, kernel will probe it one by one. and will
-	  fallback to default.
+	  This option compiles in the NUMAQ, Summit, bigsmp, ES7000,
+	  STA2X11, default subarchitectures.  It is intended for a generic
+	  binary kernel. If you select them all, kernel will probe it
+	  one by one and will fallback to default.
 
 # Alphabetically sorted list of Non standard 32 bit platforms
 
@@ -504,6 +505,22 @@ config X86_VISWS
 	  A kernel compiled for the Visual Workstation will run on general
 	  PCs as well. See <file:Documentation/sgi-visws.txt> for details.
 
+config STA2X11
+	bool "STA2X11 Companion Chip Support"
+	depends on X86_32_NON_STANDARD && PCI
+	select X86_DEV_DMA_OPS
+	select X86_DMA_REMAP
+	select SWIOTLB
+	select MFD_STA2X11
+	select ARCH_REQUIRE_GPIOLIB
+	default n
+	---help---
+	  This adds support for boards based on the STA2X11 IO-Hub,
+	  a.k.a. "ConneXt". The chip is used in place of the standard
+	  PC chipset, so all "standard" peripherals are missing. If this
+	  option is selected the kernel will still be able to boot on
+	  standard PC machines.
+
 config X86_SUMMIT
 	bool "Summit/EXA (IBM x440)"
 	depends on X86_32_NON_STANDARD
@@ -2218,10 +2235,11 @@ config HAVE_TEXT_POKE_SMP
 
 config X86_DEV_DMA_OPS
 	bool
-	depends on X86_64
+	depends on X86_64 || STA2X11
 
 config X86_DMA_REMAP
 	bool
+	depends on STA2X11
 
 source "net/Kconfig"
 
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e76e18c94a3c..3af5a1e79c9c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -11,6 +11,8 @@ obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
 obj-$(CONFIG_ACPI)		+= acpi.o
 obj-y				+= legacy.o irq.o
 
+obj-$(CONFIG_STA2X11)           += sta2x11-fixup.o
+
 obj-$(CONFIG_X86_VISWS)		+= visws.o
 
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
new file mode 100644
index 000000000000..9d8a509c9730
--- /dev/null
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -0,0 +1,366 @@
+/*
+ * arch/x86/pci/sta2x11-fixup.c
+ * glue code for lib/swiotlb.c and DMA translation between STA2x11
+ * AMBA memory mapping and the X86 memory mapping
+ *
+ * ST Microelectronics ConneXt (STA2X11/STA2X10)
+ *
+ * Copyright (c) 2010-2011 Wind River Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/export.h>
+#include <linux/list.h>
+
+#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
+extern int swiotlb_late_init_with_default_size(size_t default_size);
+
+/*
+ * We build a list of bus numbers that are under the ConneXt. The
+ * main bridge hosts 4 busses, which are the 4 endpoints, in order.
+ */
+#define STA2X11_NR_EP		4	/* 0..3 included */
+#define STA2X11_NR_FUNCS	8	/* 0..7 included */
+#define STA2X11_AMBA_SIZE	(512 << 20)
+
+struct sta2x11_ahb_regs { /* saved during suspend */
+	u32 base, pexlbase, pexhbase, crw;
+};
+
+struct sta2x11_mapping {
+	u32 amba_base;
+	int is_suspended;
+	struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS];
+};
+
+struct sta2x11_instance {
+	struct list_head list;
+	int bus0;
+	struct sta2x11_mapping map[STA2X11_NR_EP];
+};
+
+static LIST_HEAD(sta2x11_instance_list);
+
+/* At probe time, record new instances of this bridge (likely one only) */
+static void sta2x11_new_instance(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+
+	instance = kzalloc(sizeof(*instance), GFP_ATOMIC);
+	if (!instance)
+		return;
+	/* This has a subordinate bridge, with 4 more-subordinate ones */
+	instance->bus0 = pdev->subordinate->number + 1;
+
+	if (list_empty(&sta2x11_instance_list)) {
+		int size = STA2X11_SWIOTLB_SIZE;
+		/* First instance: register your own swiotlb area */
+		dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
+		if (swiotlb_late_init_with_default_size(size))
+			dev_emerg(&pdev->dev, "init swiotlb failed\n");
+	}
+	list_add(&instance->list, &sta2x11_instance_list);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance);
+
+/*
+ * Utility functions used in this file from below
+ */
+static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+	int ep;
+
+	list_for_each_entry(instance, &sta2x11_instance_list, list) {
+		ep = pdev->bus->number - instance->bus0;
+		if (ep >= 0 && ep < STA2X11_NR_EP)
+			return instance;
+	}
+	return NULL;
+}
+
+static int sta2x11_pdev_to_ep(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+
+	instance = sta2x11_pdev_to_instance(pdev);
+	if (!instance)
+		return -1;
+
+	return pdev->bus->number - instance->bus0;
+}
+
+static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+	int ep;
+
+	instance = sta2x11_pdev_to_instance(pdev);
+	if (!instance)
+		return NULL;
+	ep = sta2x11_pdev_to_ep(pdev);
+	return instance->map + ep;
+}
+
+/* This is exported, as some devices need to access the MFD registers */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev)
+{
+	return sta2x11_pdev_to_instance(pdev);
+}
+EXPORT_SYMBOL(sta2x11_get_instance);
+
+
+/**
+ * p2a - Translate physical address to STA2x11 AMBA address,
+ *       used for DMA transfers to STA2x11
+ * @p: Physical address
+ * @pdev: PCI device (must be hosted within the connext)
+ */
+static dma_addr_t p2a(dma_addr_t p, struct pci_dev *pdev)
+{
+	struct sta2x11_mapping *map;
+	dma_addr_t a;
+
+	map = sta2x11_pdev_to_mapping(pdev);
+	a = p + map->amba_base;
+	return a;
+}
+
+/**
+ * a2p - Translate STA2x11 AMBA address to physical address
+ *       used for DMA transfers from STA2x11
+ * @a: STA2x11 AMBA address
+ * @pdev: PCI device (must be hosted within the connext)
+ */
+static dma_addr_t a2p(dma_addr_t a, struct pci_dev *pdev)
+{
+	struct sta2x11_mapping *map;
+	dma_addr_t p;
+
+	map = sta2x11_pdev_to_mapping(pdev);
+	p = a - map->amba_base;
+	return p;
+}
+
+/**
+ * sta2x11_swiotlb_alloc_coherent - Allocate swiotlb bounce buffers
+ *     returns virtual address. This is the only "special" function here.
+ * @dev: PCI device
+ * @size: Size of the buffer
+ * @dma_handle: DMA address
+ * @flags: memory flags
+ */
+static void *sta2x11_swiotlb_alloc_coherent(struct device *dev,
+					    size_t size,
+					    dma_addr_t *dma_handle,
+					    gfp_t flags,
+					    struct dma_attrs *attrs)
+{
+	void *vaddr;
+
+	vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs);
+	if (!vaddr)
+		vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags);
+	*dma_handle = p2a(*dma_handle, to_pci_dev(dev));
+	return vaddr;
+}
+
+/* We have our own dma_ops: the same as swiotlb but from alloc (above) */
+static struct dma_map_ops sta2x11_dma_ops = {
+	.alloc = sta2x11_swiotlb_alloc_coherent,
+	.free = swiotlb_free_coherent,
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.mapping_error = swiotlb_dma_mapping_error,
+	.dma_supported = NULL, /* FIXME: we should use this instead! */
+};
+
+/* At setup time, we use our own ops if the device is a ConneXt one */
+static void sta2x11_setup_pdev(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev);
+
+	if (!instance) /* either a sta2x11 bridge or another ST device */
+		return;
+	pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
+	pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
+	pdev->dev.archdata.dma_ops = &sta2x11_dma_ops;
+
+	/* We must enable all devices as master, for audio DMA to work */
+	pci_set_master(pdev);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev);
+
+/*
+ * The following three functions are exported (used in swiotlb: FIXME)
+ */
+/**
+ * dma_capable - Check if device can manage DMA transfers (FIXME: kill it)
+ * @dev: device for a PCI device
+ * @addr: DMA address
+ * @size: DMA size
+ */
+bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+	struct sta2x11_mapping *map;
+
+	if (dev->archdata.dma_ops != &sta2x11_dma_ops) {
+		if (!dev->dma_mask)
+			return false;
+		return addr + size - 1 <= *dev->dma_mask;
+	}
+
+	map = sta2x11_pdev_to_mapping(to_pci_dev(dev));
+
+	if (!map || (addr < map->amba_base))
+		return false;
+	if (addr + size >= map->amba_base + STA2X11_AMBA_SIZE) {
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * phys_to_dma - Return the DMA AMBA address used for this STA2x11 device
+ * @dev: device for a PCI device
+ * @paddr: Physical address
+ */
+dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+	if (dev->archdata.dma_ops != &sta2x11_dma_ops)
+		return paddr;
+	return p2a(paddr, to_pci_dev(dev));
+}
+
+/**
+ * dma_to_phys - Return the physical address used for this STA2x11 DMA address
+ * @dev: device for a PCI device
+ * @daddr: STA2x11 AMBA DMA address
+ */
+phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+	if (dev->archdata.dma_ops != &sta2x11_dma_ops)
+		return daddr;
+	return a2p(daddr, to_pci_dev(dev));
+}
+
+
+/*
+ * At boot we must set up the mappings for the pcie-to-amba bridge.
+ * It involves device access, and the same happens at suspend/resume time
+ */
+
+#define AHB_MAPB		0xCA4
+#define AHB_CRW(i)		(AHB_MAPB + 0  + (i) * 0x10)
+#define AHB_CRW_SZMASK			0xfffffc00UL
+#define AHB_CRW_ENABLE			(1 << 0)
+#define AHB_CRW_WTYPE_MEM		(2 << 1)
+#define AHB_CRW_ROE			(1UL << 3)	/* Relax Order Ena */
+#define AHB_CRW_NSE			(1UL << 4)	/* No Snoop Enable */
+#define AHB_BASE(i)		(AHB_MAPB + 4  + (i) * 0x10)
+#define AHB_PEXLBASE(i)		(AHB_MAPB + 8  + (i) * 0x10)
+#define AHB_PEXHBASE(i)		(AHB_MAPB + 12 + (i) * 0x10)
+
+/* At probe time, enable mapping for each endpoint, using the pdev */
+static void sta2x11_map_ep(struct pci_dev *pdev)
+{
+	struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+	int i;
+
+	if (!map)
+		return;
+	pci_read_config_dword(pdev, AHB_BASE(0), &map->amba_base);
+
+	/* Configure AHB mapping */
+	pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
+	pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0);
+	pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE |
+			       AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE);
+
+	/* Disable all the other windows */
+	for (i = 1; i < STA2X11_NR_FUNCS; i++)
+		pci_write_config_dword(pdev, AHB_CRW(i), 0);
+
+	dev_info(&pdev->dev,
+		 "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n",
+		 sta2x11_pdev_to_ep(pdev),  map->amba_base,
+		 map->amba_base + STA2X11_AMBA_SIZE - 1);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep);
+
+#ifdef CONFIG_PM /* Some register values must be saved and restored */
+
+static void suspend_mapping(struct pci_dev *pdev)
+{
+	struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+	int i;
+
+	if (!map)
+		return;
+
+	if (map->is_suspended)
+		return;
+	map->is_suspended = 1;
+
+	/* Save all window configs */
+	for (i = 0; i < STA2X11_NR_FUNCS; i++) {
+		struct sta2x11_ahb_regs *regs = map->regs + i;
+
+		pci_read_config_dword(pdev, AHB_BASE(i), &regs->base);
+		pci_read_config_dword(pdev, AHB_PEXLBASE(i), &regs->pexlbase);
+		pci_read_config_dword(pdev, AHB_PEXHBASE(i), &regs->pexhbase);
+		pci_read_config_dword(pdev, AHB_CRW(i), &regs->crw);
+	}
+}
+DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping);
+
+static void resume_mapping(struct pci_dev *pdev)
+{
+	struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+	int i;
+
+	if (!map)
+		return;
+
+
+	if (!map->is_suspended)
+		goto out;
+	map->is_suspended = 0;
+
+	/* Restore all window configs */
+	for (i = 0; i < STA2X11_NR_FUNCS; i++) {
+		struct sta2x11_ahb_regs *regs = map->regs + i;
+
+		pci_write_config_dword(pdev, AHB_BASE(i), regs->base);
+		pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase);
+		pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase);
+		pci_write_config_dword(pdev, AHB_CRW(i), regs->crw);
+	}
+out:
+	pci_set_master(pdev); /* Like at boot, enable master on all devices */
+}
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping);
+
+#endif /* CONFIG_PM */
-- 
cgit v1.2.3


From b7456536cf9466b402b540c5588d79a4177c723a Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Thu, 12 Apr 2012 16:47:56 -0500
Subject: arch/x86: add syscall_get_arch to syscall.h

Add syscall_get_arch() to export the current AUDIT_ARCH_* based on system call
entry path.

Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Eric Paris <eparis@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>

v18: - update comment about x32 tasks
     - rebase to v3.4-rc2
v17: rebase and reviewed-by
v14: rebase/nochanges
v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 arch/x86/include/asm/syscall.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 386b78686c4d..1ace47b62592 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -13,9 +13,11 @@
 #ifndef _ASM_X86_SYSCALL_H
 #define _ASM_X86_SYSCALL_H
 
+#include <linux/audit.h>
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <asm/asm-offsets.h>	/* For NR_syscalls */
+#include <asm/thread_info.h>	/* for TS_COMPAT */
 #include <asm/unistd.h>
 
 extern const unsigned long sys_call_table[];
@@ -88,6 +90,12 @@ static inline void syscall_set_arguments(struct task_struct *task,
 	memcpy(&regs->bx + i, args, n * sizeof(args[0]));
 }
 
+static inline int syscall_get_arch(struct task_struct *task,
+				   struct pt_regs *regs)
+{
+	return AUDIT_ARCH_I386;
+}
+
 #else	 /* CONFIG_X86_64 */
 
 static inline void syscall_get_arguments(struct task_struct *task,
@@ -212,6 +220,25 @@ static inline void syscall_set_arguments(struct task_struct *task,
 		}
 }
 
+static inline int syscall_get_arch(struct task_struct *task,
+				   struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+	/*
+	 * TS_COMPAT is set for 32-bit syscall entry and then
+	 * remains set until we return to user mode.
+	 *
+	 * TIF_IA32 tasks should always have TS_COMPAT set at
+	 * system call time.
+	 *
+	 * x32 tasks should be considered AUDIT_ARCH_X86_64.
+	 */
+	if (task_thread_info(task)->status & TS_COMPAT)
+		return AUDIT_ARCH_I386;
+#endif
+	/* Both x32 and x86_64 are considered "64-bit". */
+	return AUDIT_ARCH_X86_64;
+}
 #endif	/* CONFIG_X86_32 */
 
 #endif	/* _ASM_X86_SYSCALL_H */
-- 
cgit v1.2.3


From a0727e8ce513fe6890416da960181ceb10fbfae6 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Thu, 12 Apr 2012 16:48:00 -0500
Subject: signal, x86: add SIGSYS info and make it synchronous.

This change enables SIGSYS, defines _sigfields._sigsys, and adds
x86 (compat) arch support.  _sigsys defines fields which allow
a signal handler to receive the triggering system call number,
the relevant AUDIT_ARCH_* value for that number, and the address
of the callsite.

SIGSYS is added to the SYNCHRONOUS_MASK because it is desirable for it
to have setup_frame() called for it. The goal is to ensure that
ucontext_t reflects the machine state from the time-of-syscall and not
from another signal handler.

The first consumer of SIGSYS would be seccomp filter.  In particular,
a filter program could specify a new return value, SECCOMP_RET_TRAP,
which would result in the system call being denied and the calling
thread signaled.  This also means that implementing arch-specific
support can be dependent upon HAVE_ARCH_SECCOMP_FILTER.

Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Eric Paris <eparis@redhat.com>

v18: - added acked by, rebase
v17: - rebase and reviewed-by addition
v14: - rebase/nochanges
v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
v12: - reworded changelog (oleg@redhat.com)
v11: - fix dropped words in the change description
     - added fallback copy_siginfo support.
     - added __ARCH_SIGSYS define to allow stepped arch support.
v10: - first version based on suggestion
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 arch/x86/ia32/ia32_signal.c   |  4 ++++
 arch/x86/include/asm/ia32.h   |  6 ++++++
 include/asm-generic/siginfo.h | 22 ++++++++++++++++++++++
 kernel/signal.c               |  9 ++++++++-
 4 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index a69245ba27e3..0b3f2354f6aa 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -67,6 +67,10 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 			switch (from->si_code >> 16) {
 			case __SI_FAULT >> 16:
 				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
 			case __SI_CHLD >> 16:
 				if (ia32) {
 					put_user_ex(from->si_utime, &to->si_utime);
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index ee52760549f0..b04cbdb138cd 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -144,6 +144,12 @@ typedef struct compat_siginfo {
 			int _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
 			int _fd;
 		} _sigpoll;
+
+		struct {
+			unsigned int _call_addr; /* calling insn */
+			int _syscall;	/* triggering system call number */
+			unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
+		} _sigsys;
 	} _sifields;
 } compat_siginfo_t;
 
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 0dd4e87f6fba..31306f55eb02 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -90,9 +90,18 @@ typedef struct siginfo {
 			__ARCH_SI_BAND_T _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
 			int _fd;
 		} _sigpoll;
+
+		/* SIGSYS */
+		struct {
+			void __user *_call_addr; /* calling insn */
+			int _syscall;	/* triggering system call number */
+			unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
+		} _sigsys;
 	} _sifields;
 } siginfo_t;
 
+/* If the arch shares siginfo, then it has SIGSYS. */
+#define __ARCH_SIGSYS
 #endif
 
 /*
@@ -116,6 +125,11 @@ typedef struct siginfo {
 #define si_addr_lsb	_sifields._sigfault._addr_lsb
 #define si_band		_sifields._sigpoll._band
 #define si_fd		_sifields._sigpoll._fd
+#ifdef __ARCH_SIGSYS
+#define si_call_addr	_sifields._sigsys._call_addr
+#define si_syscall	_sifields._sigsys._syscall
+#define si_arch		_sifields._sigsys._arch
+#endif
 
 #ifdef __KERNEL__
 #define __SI_MASK	0xffff0000u
@@ -126,6 +140,7 @@ typedef struct siginfo {
 #define __SI_CHLD	(4 << 16)
 #define __SI_RT		(5 << 16)
 #define __SI_MESGQ	(6 << 16)
+#define __SI_SYS	(7 << 16)
 #define __SI_CODE(T,N)	((T) | ((N) & 0xffff))
 #else
 #define __SI_KILL	0
@@ -135,6 +150,7 @@ typedef struct siginfo {
 #define __SI_CHLD	0
 #define __SI_RT		0
 #define __SI_MESGQ	0
+#define __SI_SYS	0
 #define __SI_CODE(T,N)	(N)
 #endif
 
@@ -231,6 +247,12 @@ typedef struct siginfo {
 #define POLL_HUP	(__SI_POLL|6)	/* device disconnected */
 #define NSIGPOLL	6
 
+/*
+ * SIGSYS si_codes
+ */
+#define SYS_SECCOMP		(__SI_SYS|1)	/* seccomp triggered */
+#define NSIGSYS	1
+
 /*
  * sigevent definitions
  * 
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..1a006b5d9d9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -160,7 +160,7 @@ void recalc_sigpending(void)
 
 #define SYNCHRONOUS_MASK \
 	(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
-	 sigmask(SIGTRAP) | sigmask(SIGFPE))
+	 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
 
 int next_signal(struct sigpending *pending, sigset_t *mask)
 {
@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_ptr, &to->si_ptr);
 		break;
+#ifdef __ARCH_SIGSYS
+	case __SI_SYS:
+		err |= __put_user(from->si_call_addr, &to->si_call_addr);
+		err |= __put_user(from->si_syscall, &to->si_syscall);
+		err |= __put_user(from->si_arch, &to->si_arch);
+		break;
+#endif
 	default: /* this is just in case for now ... */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
-- 
cgit v1.2.3


From c6cfbeb4029610c8c330c312dcf4d514cc067554 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Thu, 12 Apr 2012 16:48:03 -0500
Subject: x86: Enable HAVE_ARCH_SECCOMP_FILTER

Enable support for seccomp filter on x86:
- syscall_get_arch()
- syscall_get_arguments()
- syscall_rollback()
- syscall_set_return_value()
- SIGSYS siginfo_t support
- secure_computing is called from a ptrace_event()-safe context
- secure_computing return value is checked (see below).

SECCOMP_RET_TRACE and SECCOMP_RET_TRAP may result in seccomp needing to
skip a system call without killing the process.  This is done by
returning a non-zero (-1) value from secure_computing.  This change
makes x86 respect that return value.

To ensure that minimal kernel code is exposed, a non-zero return value
results in an immediate return to user space (with an invalid syscall
number).

Signed-off-by: Will Drewry <wad@chromium.org>
Reviewed-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Eric Paris <eparis@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>

v18: rebase and tweaked change description, acked-by
v17: added reviewed by and rebased
v..: all rebases since original introduction.
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 arch/x86/Kconfig         | 1 +
 arch/x86/kernel/ptrace.c | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..3a41c4424a0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
+	select HAVE_ARCH_SECCOMP_FILTER
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 685845cf16e0..13b1990c7c58 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs)
 		regs->flags |= X86_EFLAGS_TF;
 
 	/* do the secure computing check first */
-	secure_computing(regs->orig_ax);
+	if (secure_computing(regs->orig_ax)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		ret = -1L;
+		goto out;
+	}
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
 		ret = -1L;
@@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs)
 				    regs->dx, regs->r10);
 #endif
 
+out:
 	return ret ?: regs->orig_ax;
 }
 
-- 
cgit v1.2.3


From 302616911da8e868d3f1a00dce517ca30b0e065d Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Fri, 6 Apr 2012 14:47:35 +0200
Subject: x86: Drop obsolete ARCH_BOOTMEM support

x86 unconditionally uses NO_BOOTMEM so there is no use
of the HAVE_ARCH_BOOTMEM support as mm/bootmem.c is the
only file referencing this symbol.

bootmem_arch_preferred_node() is the function referred
in the mm/bootmem.c code and can thuis be dropped too.

x86 was the sole user of HAVE_ARCH_BOOTMEM - so there is
an opportunity to clean up a little in mm/bootmem.c too
if we do not expect other users to emerge.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20120406124735.GA6920@merkur.ravnborg.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                 | 4 ----
 arch/x86/include/asm/mmzone_32.h | 6 ------
 2 files changed, 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..a105ee75bd85 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1255,10 +1255,6 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_BOOTMEM
-	def_bool y
-	depends on X86_32 && NUMA
-
 config HAVE_ARCH_ALLOC_REMAP
 	def_bool y
 	depends on X86_32 && NUMA
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 55728e121473..eb05fb3b02fb 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -61,10 +61,4 @@ static inline int pfn_valid(int pfn)
 
 #endif /* CONFIG_DISCONTIGMEM */
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-/* always use node 0 for bootmem on this numa platform */
-#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit)	\
-	(NODE_DATA(0)->bdata)
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
-
 #endif /* _ASM_X86_MMZONE_32_H */
-- 
cgit v1.2.3


From a7e0e4e99fabe0094fcb0b011b741558b8b28fa7 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Tue, 20 Mar 2012 16:46:14 -0700
Subject: x86: Fix typo in MODULE_DEVICE_TABLE example: s/x86_cpu/x86cpu/

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/kernel/cpu/match.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 5502b289341b..36565373af87 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -23,7 +23,7 @@
  * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
  *
  * Arrays used to match for this should also be declared using
- * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
  *
  * This always matches against the boot cpu, assuming models and features are
  * consistent over all CPUs.
-- 
cgit v1.2.3


From ae75954457eee0a608072368c5b477e40f378d7b Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Wed, 28 Mar 2012 11:32:28 -0700
Subject: KVM: SVM: Auto-load on CPUs with SVM

Enable x86 feature-based autoloading for the kvm-amd module on CPUs
with X86_FEATURE_SVM.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f3167208562e..f75af406b268 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
 #include "x86.h"
 
 #include <linux/module.h>
+#include <linux/mod_devicetable.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -42,6 +43,12 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static const struct x86_cpu_id svm_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_SVM),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-- 
cgit v1.2.3


From 1c11b37669a5209bd11fb857a103634afef971e8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:39:59 +0300
Subject: KVM: x86 emulator: add support for vector alignment

x86 defines three classes of vector instructions: explicitly
aligned (#GP(0) if unaligned, explicitly unaligned, and default
(which depends on the encoding: AVX is unaligned, SSE is aligned).

Add support for marking an instruction as explicitly aligned or
unaligned, and mark MOVDQU as unaligned.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8aa..6302e5c74341 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,9 @@
 #define Src2FS      (OpFS << Src2Shift)
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
+#define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -557,6 +560,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
 	ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
 }
 
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check.
+ */
+static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+	if (likely(size < 16))
+		return false;
+
+	if (ctxt->d & Aligned)
+		return true;
+	else if (ctxt->d & Unaligned)
+		return false;
+	else if (ctxt->d & Avx)
+		return false;
+	else
+		return true;
+}
+
 static int __linearize(struct x86_emulate_ctxt *ctxt,
 		     struct segmented_address addr,
 		     unsigned size, bool write, bool fetch,
@@ -621,6 +647,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	}
 	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
+	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+		return emulate_gp(ctxt, 0);
 	*linear = la;
 	return X86EMUL_CONTINUE;
 bad:
@@ -3415,7 +3443,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, N, N, I(Sse, em_movdqu),
+	N, N, N, I(Sse | Unaligned, em_movdqu),
 };
 
 static struct opcode opcode_table[256] = {
-- 
cgit v1.2.3


From 49597d8116ad70aabb598e606b218ddd9315b0af Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 9 Apr 2012 18:40:00 +0300
Subject: KVM: x86: emulate movdqa

An Ubuntu 9.10 Karmic Koala guest is unable to boot or install due to
missing movdqa emulation:

kvm_exit: reason EXCEPTION_NMI rip 0x7fef3e025a7b info 7fef3e799000 80000b0e
kvm_page_fault: address 7fef3e799000 error_code f
kvm_emulate_insn: 0:7fef3e025a7b: 66 0f 7f 07 (prot64)

movdqa %xmm0,(%rdi)

[avi: mark it explicitly aligned]

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6302e5c74341..b160fb1fc68b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2818,7 +2818,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->dst.val = ctxt->src.val;
+	memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
@@ -2898,12 +2898,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
-static int em_movdqu(struct x86_emulate_ctxt *ctxt)
-{
-	memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
-	return X86EMUL_CONTINUE;
-}
-
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
@@ -3443,7 +3437,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, N, N, I(Sse | Unaligned, em_movdqu),
+	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
 static struct opcode opcode_table[256] = {
-- 
cgit v1.2.3


From 3e114eb4db3a33141b8c91bb53dae9ba6b015a32 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:01 +0300
Subject: KVM: x86 emulator: implement movntps

Used to write to framebuffers (by at least Icaros).

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b160fb1fc68b..fb39e0b32ed1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3440,6 +3440,10 @@ static struct gprefix pfx_0f_6f_0f_7f = {
 	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
+static struct gprefix pfx_vmovntpx = {
+	I(0, em_mov), N, N, N,
+};
+
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	I6ALU(Lock, em_add),
@@ -3571,7 +3575,8 @@ static struct opcode twobyte_table[256] = {
 	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
 	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
-	N, N, N, N, N, N, N, N,
+	N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+	N, N, N, N,
 	/* 0x30 - 0x3F */
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
-- 
cgit v1.2.3


From cbe2c9d30aa69b0551247ddb0fb450b6e8080ec4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:02 +0300
Subject: KVM: x86 emulator: MMX support

General support for the MMX instruction set.  Special care is taken
to trap pending x87 exceptions so that they are properly reflected
to the guest.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   4 +-
 arch/x86/kvm/emulate.c             | 103 +++++++++++++++++++++++++++++++++++--
 2 files changed, 102 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c222e1a1b12a..1ac46c22dd50 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -200,7 +200,7 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
-	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
+	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
 	unsigned int bytes;
 	union {
 		unsigned long orig_val;
@@ -213,12 +213,14 @@ struct operand {
 			unsigned seg;
 		} mem;
 		unsigned xmm;
+		unsigned mm;
 	} addr;
 	union {
 		unsigned long val;
 		u64 val64;
 		char valptr[sizeof(unsigned long) + 2];
 		sse128_t vec_val;
+		u64 mm_val;
 	};
 };
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb39e0b32ed1..0011b4ad44b5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,7 @@
 #define Src2FS      (OpFS << Src2Shift)
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
+#define Mmx         ((u64)1 << 40)  /* MMX Vector instruction */
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
 #define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
@@ -887,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 	ctxt->ops->put_fpu(ctxt);
 }
 
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+	case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+	case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+	case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+	case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+	case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+	case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+	case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+	case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+	case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+	case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+	case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+	case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op)
 {
@@ -903,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 		read_sse_reg(ctxt, &op->vec_val, reg);
 		return;
 	}
+	if (ctxt->d & Mmx) {
+		reg &= 7;
+		op->type = OP_MM;
+		op->bytes = 8;
+		op->addr.mm = reg;
+		return;
+	}
 
 	op->type = OP_REG;
 	if (ctxt->d & ByteOp) {
@@ -948,6 +990,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
+		if (ctxt->d & Mmx) {
+			op->type = OP_MM;
+			op->bytes = 8;
+			op->addr.xmm = ctxt->modrm_rm & 7;
+			return rc;
+		}
 		fetch_register_operand(op);
 		return rc;
 	}
@@ -1415,6 +1463,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 	case OP_XMM:
 		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
+	case OP_MM:
+		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+		break;
 	case OP_NONE:
 		/* no writeback */
 		break;
@@ -3987,6 +4038,8 @@ done_prefixes:
 
 	if (ctxt->d & Sse)
 		ctxt->op_bytes = 16;
+	else if (ctxt->d & Mmx)
+		ctxt->op_bytes = 8;
 
 	/* ModRM and SIB bytes. */
 	if (ctxt->d & ModRM) {
@@ -4057,6 +4110,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	return false;
 }
 
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+	bool fault = false;
+
+	ctxt->ops->get_fpu(ctxt);
+	asm volatile("1: fwait \n\t"
+		     "2: \n\t"
+		     ".pushsection .fixup,\"ax\" \n\t"
+		     "3: \n\t"
+		     "movb $1, %[fault] \n\t"
+		     "jmp 2b \n\t"
+		     ".popsection \n\t"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [fault]"+rm"(fault));
+	ctxt->ops->put_fpu(ctxt);
+
+	if (unlikely(fault))
+		return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+	return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+				       struct operand *op)
+{
+	if (op->type == OP_MM)
+		read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -4081,18 +4163,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		goto done;
 	}
 
-	if ((ctxt->d & Sse)
-	    && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
-		|| !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+	if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+	    || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+	if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
 		rc = emulate_nm(ctxt);
 		goto done;
 	}
 
+	if (ctxt->d & Mmx) {
+		rc = flush_pending_x87_faults(ctxt);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		/*
+		 * Now that we know the fpu is exception safe, we can fetch
+		 * operands from it.
+		 */
+		fetch_possible_mmx_operand(ctxt, &ctxt->src);
+		fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+		if (!(ctxt->d & Mov))
+			fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+	}
+
 	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
-- 
cgit v1.2.3


From e59717550e5cf0e7159c5b7af1d1ead35fef49dd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:03 +0300
Subject: KVM: x86 emulator: implement MMX MOVQ (opcodes 0f 6f, 0f 7f)

Needed by some framebuffer drivers.  See

https://bugzilla.kernel.org/show_bug.cgi?id=42779

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0011b4ad44b5..d5729a91d08d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3488,7 +3488,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+	I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
 static struct gprefix pfx_vmovntpx = {
-- 
cgit v1.2.3


From a0c9a822bf37e6282eb6006b407ec5aec22e08fb Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 11 Apr 2012 18:49:55 +0300
Subject: KVM: dont clear TMR on EOI

Intel spec says that TMR needs to be set/cleared
when IRR is set, but kvm also clears it on  EOI.

I did some tests on a real (AMD based) system,
and I see same TMR values both before
and after EOI, so I think it's a minor bug in kvm.

This patch fixes TMR to be set/cleared on IRR set
only as per spec.

And now that we don't clear TMR, we can save
an atomic read of TMR on EOI that's not propagated
to ioapic, by checking whether ioapic needs
a specific vector first and calculating
the mode afterwards.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 19 +++++++++++++------
 virt/kvm/ioapic.c    | 10 +++++++---
 virt/kvm/ioapic.h    |  1 +
 3 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab6..992b4eaae684 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
 	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
 	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_clear_vector(vector, apic->regs + APIC_ISR);
 	apic_update_ppr(apic);
 
-	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		trigger_mode = IOAPIC_LEVEL_TRIG;
-	else
-		trigger_mode = IOAPIC_EDGE_TRIG;
-	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+		int trigger_mode;
+		if (apic_test_vector(vector, apic->regs + APIC_TMR))
+			trigger_mode = IOAPIC_LEVEL_TRIG;
+		else
+			trigger_mode = IOAPIC_EDGE_TRIG;
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	}
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 }
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index dcaf272c26c0..26fd54dc459e 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -254,13 +254,17 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 	}
 }
 
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	smp_rmb();
+	return test_bit(vector, ioapic->handled_vectors);
+}
+
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
-	smp_rmb();
-	if (!test_bit(vector, ioapic->handled_vectors))
-		return;
 	spin_lock(&ioapic->lock);
 	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
 	spin_unlock(&ioapic->lock);
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 0b190c34ccc3..32872a09b63f 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,6 +71,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-- 
cgit v1.2.3


From 9fe2a7015393dc0203ac39242ae9c89038994f3c Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 23 Mar 2012 13:36:28 +0530
Subject: debugfs: Add support to print u32 array in debugfs

Move the code from Xen to debugfs to make the code common
for other users as well.

Accked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Suzuki Poulose <suzuki@in.ibm.com>
[v1: Fixed rebase issues]
[v2: Fixed PPC compile issues]
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/debugfs.c  | 104 ---------------------------------------
 arch/x86/xen/debugfs.h  |   4 --
 arch/x86/xen/spinlock.c |  12 ++---
 fs/debugfs/file.c       | 128 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h |  11 +++++
 5 files changed, 145 insertions(+), 114 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index ef1db1900d86..c8377fb26cdf 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void)
 	return d_xen_debug;
 }
 
-struct array_data
-{
-	void *array;
-	unsigned elements;
-};
-
-static int u32_array_open(struct inode *inode, struct file *file)
-{
-	file->private_data = NULL;
-	return nonseekable_open(inode, file);
-}
-
-static size_t format_array(char *buf, size_t bufsize, const char *fmt,
-			   u32 *array, unsigned array_size)
-{
-	size_t ret = 0;
-	unsigned i;
-
-	for(i = 0; i < array_size; i++) {
-		size_t len;
-
-		len = snprintf(buf, bufsize, fmt, array[i]);
-		len++;	/* ' ' or '\n' */
-		ret += len;
-
-		if (buf) {
-			buf += len;
-			bufsize -= len;
-			buf[-1] = (i == array_size-1) ? '\n' : ' ';
-		}
-	}
-
-	ret++;		/* \0 */
-	if (buf)
-		*buf = '\0';
-
-	return ret;
-}
-
-static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
-{
-	size_t len = format_array(NULL, 0, fmt, array, array_size);
-	char *ret;
-
-	ret = kmalloc(len, GFP_KERNEL);
-	if (ret == NULL)
-		return NULL;
-
-	format_array(ret, len, fmt, array, array_size);
-	return ret;
-}
-
-static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
-			      loff_t *ppos)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct array_data *data = inode->i_private;
-	size_t size;
-
-	if (*ppos == 0) {
-		if (file->private_data) {
-			kfree(file->private_data);
-			file->private_data = NULL;
-		}
-
-		file->private_data = format_array_alloc("%u", data->array, data->elements);
-	}
-
-	size = 0;
-	if (file->private_data)
-		size = strlen(file->private_data);
-
-	return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
-}
-
-static int xen_array_release(struct inode *inode, struct file *file)
-{
-	kfree(file->private_data);
-
-	return 0;
-}
-
-static const struct file_operations u32_array_fops = {
-	.owner	= THIS_MODULE,
-	.open	= u32_array_open,
-	.release= xen_array_release,
-	.read	= u32_array_read,
-	.llseek = no_llseek,
-};
-
-struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
-					    struct dentry *parent,
-					    u32 *array, unsigned elements)
-{
-	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
-
-	if (data == NULL)
-		return NULL;
-
-	data->array = array;
-	data->elements = elements;
-
-	return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
-}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index 78d25499be5b..12ebf3325c7b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,8 +3,4 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
-					    struct dentry *parent,
-					    u32 *array, unsigned elements);
-
 #endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index d69cc6c3f808..83e866d714ce 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void)
 	debugfs_create_u64("time_total", 0444, d_spin_debug,
 			   &spinlock_stats.time_total);
 
-	xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
-				     spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
-	xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
-				     spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
-	xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
-				     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+	debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
+				spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
+	debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
+				spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
+	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+				spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
 
 	return 0;
 }
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 5dfafdd1dbd3..2340f6978d6e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -20,6 +20,7 @@
 #include <linux/namei.h>
 #include <linux/debugfs.h>
 #include <linux/io.h>
+#include <linux/slab.h>
 
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_blob);
 
+struct array_data {
+	void *array;
+	u32 elements;
+};
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return nonseekable_open(inode, file);
+}
+
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+			   u32 *array, u32 array_size)
+{
+	size_t ret = 0;
+	u32 i;
+
+	for (i = 0; i < array_size; i++) {
+		size_t len;
+
+		len = snprintf(buf, bufsize, fmt, array[i]);
+		len++;	/* ' ' or '\n' */
+		ret += len;
+
+		if (buf) {
+			buf += len;
+			bufsize -= len;
+			buf[-1] = (i == array_size-1) ? '\n' : ' ';
+		}
+	}
+
+	ret++;		/* \0 */
+	if (buf)
+		*buf = '\0';
+
+	return ret;
+}
+
+static char *format_array_alloc(const char *fmt, u32 *array,
+						u32 array_size)
+{
+	size_t len = format_array(NULL, 0, fmt, array, array_size);
+	char *ret;
+
+	ret = kmalloc(len, GFP_KERNEL);
+	if (ret == NULL)
+		return NULL;
+
+	format_array(ret, len, fmt, array, array_size);
+	return ret;
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+			      loff_t *ppos)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct array_data *data = inode->i_private;
+	size_t size;
+
+	if (*ppos == 0) {
+		if (file->private_data) {
+			kfree(file->private_data);
+			file->private_data = NULL;
+		}
+
+		file->private_data = format_array_alloc("%u", data->array,
+							      data->elements);
+	}
+
+	size = 0;
+	if (file->private_data)
+		size = strlen(file->private_data);
+
+	return simple_read_from_buffer(buf, len, ppos,
+					file->private_data, size);
+}
+
+static int u32_array_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+
+	return 0;
+}
+
+static const struct file_operations u32_array_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = u32_array_open,
+	.release = u32_array_release,
+	.read	 = u32_array_read,
+	.llseek  = no_llseek,
+};
+
+/**
+ * debugfs_create_u32_array - create a debugfs file that is used to read u32
+ * array.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @array: u32 array that provides data.
+ * @elements: total number of elements in the array.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @array as data. If the @mode variable is so set it can be read from.
+ * Writing is not supported. Seek within the file is also not supported.
+ * Once array is created its size can not be changed.
+ *
+ * The function returns a pointer to dentry on success. If debugfs is not
+ * enabled in the kernel, the value -%ENODEV will be returned.
+ */
+struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+					    struct dentry *parent,
+					    u32 *array, u32 elements)
+{
+	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data == NULL)
+		return NULL;
+
+	data->array = array;
+	data->elements = elements;
+
+	return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
+
 #ifdef CONFIG_HAS_IOMEM
 
 /*
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index ae36b72c22f3..66c434f5dd1e 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -93,6 +93,10 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
 int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
 			 int nregs, void __iomem *base, char *prefix);
 
+struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+					struct dentry *parent,
+					u32 *array, u32 elements);
+
 bool debugfs_initialized(void);
 
 #else
@@ -219,6 +223,13 @@ static inline bool debugfs_initialized(void)
 	return false;
 }
 
+static inline struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+					struct dentry *parent,
+					u32 *array, u32 elements)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 #endif
 
 #endif
-- 
cgit v1.2.3


From f71fa31f9f7ac33cba12b8897983f950ad2c7a5b Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Wed, 18 Apr 2012 12:24:29 +0200
Subject: KVM: MMU: use page table level macro

Its much cleaner to use PT_PAGE_TABLE_LEVEL than its numeric value.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 2 +-
 arch/x86/kvm/paging_tmpl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29ad6f9c58a5..07424cf60434 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3618,7 +3618,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
 	 * Skip write-flooding detected for the sp whose level is 1, because
 	 * it can become unsync, then the guest page is not write-protected.
 	 */
-	if (sp->role.level == 1)
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
 		return false;
 
 	return ++sp->write_flooding_count >= 3;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a70311be8..34f970937ef1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 {
 	int offset = 0;
 
-	WARN_ON(sp->role.level != 1);
+	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-- 
cgit v1.2.3


From 95022b8cf6ed7f3292b60c8e85fe59a12bfb1c9e Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 18 Apr 2012 15:19:40 -0700
Subject: x86/mce: Avoid reading every machine check bank register twice.

Reading machine check bank registers is slow. There is a trend of
increasing the number of banks, and the number of cores. The main section
of do_machine_check() is a serialized section where each cpu in turn
checks every bank. Even on a little two socket SandyBridge-EP system
that multiplies out as:

	2 sockets * 8 cores * 2 hyperthreads * 20 banks = 640 MSRs

We already scan the banks in parallel in mce_no_way_out() to see if there
is a fatal error anywhere in the system. If we build a cache of VALID
bits during this scan, we can avoid uselessly re-reading banks that have
no data. Note that this cache is only a hint. If the valid bit is set in a
shared bank, all cpus that share that bank will see it during the parallel
scan, but the first to find it in the sequential scan will (usually) clear
the bank.

Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087d..66e1c51be084 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -641,16 +641,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
  * Do a quick check if any of the events requires a panic.
  * This decides if we keep the events around or clear them.
  */
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
 {
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+		if (m->status & MCI_STATUS_VAL)
+			__set_bit(i, validp);
 		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
-			return 1;
+			ret = 1;
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -1011,6 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 */
 	int kill_it = 0;
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
 
 	atomic_inc(&mce_entry);
@@ -1025,7 +1028,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	final = &__get_cpu_var(mces_seen);
 	*final = m;
 
-	no_way_out = mce_no_way_out(&m, &msg);
+	memset(valid_banks, 0, sizeof(valid_banks));
+	no_way_out = mce_no_way_out(&m, &msg, valid_banks);
 
 	barrier();
 
@@ -1045,6 +1049,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	order = mce_start(&no_way_out);
 	for (i = 0; i < banks; i++) {
 		__clear_bit(i, toclear);
+		if (!test_bit(i, valid_banks))
+			continue;
 		if (!mce_banks[i].ctl)
 			continue;
 
-- 
cgit v1.2.3


From d405c60128a1973648058fa950a8960ec1f27e38 Mon Sep 17 00:00:00 2001
From: David Daney <david.daney@cavium.com>
Date: Thu, 19 Apr 2012 14:59:59 -0700
Subject: x86: Select BUILDTIME_EXTABLE_SORT

We can sort the exeception table at build time for x86, so let's do
it.

Signed-off-by: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/1334872799-14589-6-git-send-email-ddaney.cavm@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..2f925ccb3e5b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
+	select BUILDTIME_EXTABLE_SORT
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
-- 
cgit v1.2.3


From 46326013e34eb5c178a91f06c1f2e99e79eed924 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 18 Apr 2012 17:16:46 -0700
Subject: x86, nop: Make the ASM_NOP* macros work from assembly

Make the ASM_NOP* macros work in actual assembly files.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1334794610-5546-2-git-send-email-hpa@zytor.com
---
 arch/x86/include/asm/nops.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 405b4032a60b..aff2b3356101 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -87,7 +87,11 @@
 #define P6_NOP8	0x0f,0x1f,0x84,0x00,0,0,0,0
 #define P6_NOP5_ATOMIC P6_NOP5
 
+#ifdef __ASSEMBLY__
+#define _ASM_MK_NOP(x) .byte x
+#else
 #define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
+#endif
 
 #if defined(CONFIG_MK7)
 #define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
-- 
cgit v1.2.3


From 84f4fc524eed040660bd4ebc8cba259d8afe8461 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 18 Apr 2012 17:16:47 -0700
Subject: x86: Add symbolic constant for exceptions with error code

Add a symbolic constant for the bitmask which states which exceptions
carry an error code.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1334794610-5546-3-git-send-email-hpa@zytor.com
---
 arch/x86/include/asm/segment.h | 2 ++
 arch/x86/kernel/head_64.S      | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 165466233ab0..58c1e6cd91b6 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -205,6 +205,8 @@
 
 #define IDT_ENTRIES 256
 #define NUM_EXCEPTION_VECTORS 32
+/* Bitmask of exception vectors which push an error code on the stack */
+#define EXCEPTION_ERRCODE_MASK  0x00027d00
 #define GDT_SIZE (GDT_ENTRIES * 8)
 #define GDT_ENTRY_TLS_ENTRIES 3
 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 40f4eb3766d1..adf52e85d551 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -295,7 +295,7 @@ ENTRY(early_idt_handler)
 	ja 0f
 	movl $1,%eax
 	salq %cl,%rax
-	testl $0x27d00,%eax
+	testl $EXCEPTION_ERRCODE_MASK,%eax
 	je 0f
 	popq %r8		# get error code
 0:	movq 0(%rsp),%rcx	# get ip
-- 
cgit v1.2.3


From ffc4bc9c6fa4eaf935d96d139bfa7443cac0b88e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 18 Apr 2012 17:16:48 -0700
Subject: x86, paravirt: Replace GET_CR2_INTO_RCX with GET_CR2_INTO_RAX

GET_CR2_INTO_RCX is asinine: it is only used in one place, the actual
paravirt call returns the value in %rax, not %rcx; and the one place
that wants it wants the result in %r9.  We actually generate as a
result of this call:

       call ...
       movq %rax, %rcx
       xorq %rax, %rax		/* this value isn't even used... */
       movq %rcx, %r9

At least make the macro do what the paravirt call does, which is put
the value into %rax.

Nevermind the fact that the macro clobbers all the volatile registers.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1334794610-5546-4-git-send-email-hpa@zytor.com
Cc: Glauber de Oliveira Costa <glommer@parallels.com>
---
 arch/x86/include/asm/paravirt.h | 6 ++----
 arch/x86/kernel/head_64.S       | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index aa0f91308367..6cbbabf52707 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1023,10 +1023,8 @@ extern void default_banner(void);
 		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)		\
 		 )
 
-#define GET_CR2_INTO_RCX				\
-	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);	\
-	movq %rax, %rcx;				\
-	xorq %rax, %rax;
+#define GET_CR2_INTO_RAX				\
+	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
 
 #define PARAVIRT_ADJUST_EXCEPTION_FRAME					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index adf52e85d551..d1e112c8b577 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -23,8 +23,9 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#define GET_CR2_INTO(reg) movq %cr2, reg
 #endif
 
 /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -286,8 +287,7 @@ ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
 	incl early_recursion_flag(%rip)
-	GET_CR2_INTO_RCX
-	movq %rcx,%r9
+	GET_CR2_INTO(%r9)
 	xorl %r8d,%r8d		# zero for error code
 	movl %esi,%ecx		# get vector number
 	# Test %ecx against mask of vectors that push error code.
-- 
cgit v1.2.3


From 6a1ea279c210e7dc05de86dc29c0d4f577f484fb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Apr 2012 15:24:20 -0700
Subject: x86, extable: Add early_fixup_exception()

Add a restricted version of fixup_exception() to be used during early
boot only.  In particular, this doesn't support the try..catch variant
since we may not have a thread_info set up yet.

This relies on the exception table being sorted already at build time.

Link: http://lkml.kernel.org/r/1334794610-5546-1-git-send-email-hpa@zytor.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/extable.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 1fb85dbe390a..5555675dadb6 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,3 +35,20 @@ int fixup_exception(struct pt_regs *regs)
 
 	return 0;
 }
+
+/* Restricted version used during very early boot */
+int __init early_fixup_exception(unsigned long *ip)
+{
+	const struct exception_table_entry *fixup;
+
+	fixup = search_exception_tables(*ip);
+	if (fixup) {
+		if (fixup->fixup < 16)
+			return 0; /* Not supported during early boot */
+
+		*ip = fixup->fixup;
+		return 1;
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 9900aa2f95844eb81428c1d3d202c01b7f3ac77a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 18 Apr 2012 17:16:49 -0700
Subject: x86-64: Handle exception table entries during early boot

If we get an exception during early boot, walk the exception table to
see if we should intercept it.  The main use case for this is to allow
rdmsr_safe()/wrmsr_safe() during CPU initialization.

Since the exception table is currently sorted at runtime, and fairly
late in startup, this code walks the exception table linearly.  We
obviously don't need to worry about modules, however: none have been
loaded at this point.

[ v2: Use early_fixup_exception() instead of linear search ]

Link: http://lkml.kernel.org/r/1334794610-5546-5-git-send-email-hpa@zytor.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/segment.h |  2 +-
 arch/x86/kernel/head_64.S      | 76 +++++++++++++++++++++++++++++++-----------
 2 files changed, 58 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 58c1e6cd91b6..c48a95035a77 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -213,7 +213,7 @@
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
 
 /*
  * Load a segment. Fall back on loading the zero
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d1e112c8b577..7a40f2447321 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
 #include <asm/cache.h>
 #include <asm/processor-flags.h>
 #include <asm/percpu.h>
+#include <asm/nops.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
@@ -26,6 +27,7 @@
 #define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
 #define GET_CR2_INTO(reg) movq %cr2, reg
+#define INTERRUPT_RETURN iretq
 #endif
 
 /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -271,35 +273,56 @@ bad_address:
 	jmp bad_address
 
 	.section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
 	.globl early_idt_handlers
 early_idt_handlers:
+	# 104(%rsp) %rflags
+	#  96(%rsp) %cs
+	#  88(%rsp) %rip
+	#  80(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	movl $i, %esi
+	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
+	ASM_NOP2
+	.else
+	pushq $0		# Dummy error code, to make stack frame uniform
+	.endif
+	pushq $i		# 72(%rsp) Vector number
 	jmp early_idt_handler
 	i = i + 1
 	.endr
-#endif
 
 ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
+	cld
+
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
 	incl early_recursion_flag(%rip)
-	GET_CR2_INTO(%r9)
-	xorl %r8d,%r8d		# zero for error code
-	movl %esi,%ecx		# get vector number
-	# Test %ecx against mask of vectors that push error code.
-	cmpl $31,%ecx
-	ja 0f
-	movl $1,%eax
-	salq %cl,%rax
-	testl $EXCEPTION_ERRCODE_MASK,%eax
-	je 0f
-	popq %r8		# get error code
-0:	movq 0(%rsp),%rcx	# get ip
-	movq 8(%rsp),%rdx	# get cs
+
+	pushq %rax		# 64(%rsp)
+	pushq %rcx		# 56(%rsp)
+	pushq %rdx		# 48(%rsp)
+	pushq %rsi		# 40(%rsp)
+	pushq %rdi		# 32(%rsp)
+	pushq %r8		# 24(%rsp)
+	pushq %r9		# 16(%rsp)
+	pushq %r10		#  8(%rsp)
+	pushq %r11		#  0(%rsp)
+
+	cmpl $__KERNEL_CS,96(%rsp)
+	jne 10f
+
+	leaq 88(%rsp),%rdi	# Pointer to %rip
+	call early_fixup_exception
+	andl %eax,%eax
+	jnz 20f			# Found an exception entry
+
+10:
+#ifdef CONFIG_EARLY_PRINTK
+	GET_CR2_INTO(%r9)	# can clobber any volatile register if pv
+	movl 80(%rsp),%r8d	# error code
+	movl 72(%rsp),%esi	# vector number
+	movl 96(%rsp),%edx	# %cs
+	movq 88(%rsp),%rcx	# %rip
 	xorl %eax,%eax
 	leaq early_idt_msg(%rip),%rdi
 	call early_printk
@@ -308,17 +331,32 @@ ENTRY(early_idt_handler)
 	call dump_stack
 #ifdef CONFIG_KALLSYMS	
 	leaq early_idt_ripmsg(%rip),%rdi
-	movq 0(%rsp),%rsi	# get rip again
+	movq 40(%rsp),%rsi	# %rip again
 	call __print_symbol
 #endif
 #endif /* EARLY_PRINTK */
 1:	hlt
 	jmp 1b
 
-#ifdef CONFIG_EARLY_PRINTK
+20:	# Exception table entry found
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rdi
+	popq %rsi
+	popq %rdx
+	popq %rcx
+	popq %rax
+	addq $16,%rsp		# drop vector number and error code
+	decl early_recursion_flag(%rip)
+	INTERRUPT_RETURN
+
+	.balign 4
 early_recursion_flag:
 	.long 0
 
+#ifdef CONFIG_EARLY_PRINTK
 early_idt_msg:
 	.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
 early_idt_ripmsg:
-- 
cgit v1.2.3


From f78146b0f9230765c6315b2e14f56112513389ad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Apr 2012 19:22:47 +0300
Subject: KVM: Fix page-crossing MMIO

MMIO that are split across a page boundary are currently broken - the
code does not expect to be aborted by the exit to userspace for the
first MMIO fragment.

This patch fixes the problem by generalizing the current code for handling
16-byte MMIOs to handle a number of "fragments", and changes the MMIO
code to create those fragments.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |   2 +
 arch/ia64/kvm/kvm-ia64.c         |  10 ++--
 arch/x86/kvm/x86.c               | 114 +++++++++++++++++++++++++++------------
 include/linux/kvm_host.h         |  31 +++++++++--
 4 files changed, 115 insertions(+), 42 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index c4b4bac3d09e..6d6a5ac48d85 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -449,6 +449,8 @@ struct kvm_vcpu_arch {
 	char log_buf[VMM_LOG_LEN];
 	union context host;
 	union context guest;
+
+	char mmio_data[8];
 };
 
 struct kvm_vm_stat {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 9d80ff8d9eff..882ab21a8dcd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -232,12 +232,12 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if ((p->addr & PAGE_MASK) == IOAPIC_DEFAULT_BASE_ADDRESS)
 		goto mmio;
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = kvm_run->mmio.phys_addr = p->addr;
-	vcpu->mmio_size = kvm_run->mmio.len = p->size;
+	vcpu->mmio_fragments[0].gpa = kvm_run->mmio.phys_addr = p->addr;
+	vcpu->mmio_fragments[0].len = kvm_run->mmio.len = p->size;
 	vcpu->mmio_is_write = kvm_run->mmio.is_write = !p->dir;
 
 	if (vcpu->mmio_is_write)
-		memcpy(vcpu->mmio_data, &p->data, p->size);
+		memcpy(vcpu->arch.mmio_data, &p->data, p->size);
 	memcpy(kvm_run->mmio.data, &p->data, p->size);
 	kvm_run->exit_reason = KVM_EXIT_MMIO;
 	return 0;
@@ -719,7 +719,7 @@ static void kvm_set_mmio_data(struct kvm_vcpu *vcpu)
 	struct kvm_mmio_req *p = kvm_get_vcpu_ioreq(vcpu);
 
 	if (!vcpu->mmio_is_write)
-		memcpy(&p->data, vcpu->mmio_data, 8);
+		memcpy(&p->data, vcpu->arch.mmio_data, 8);
 	p->state = STATE_IORESP_READY;
 }
 
@@ -739,7 +739,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+		memcpy(vcpu->arch.mmio_data, kvm_run->mmio.data, 8);
 		kvm_set_mmio_data(vcpu);
 		vcpu->mmio_read_completed = 1;
 		vcpu->mmio_needed = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d9a57875f0b..4de705cdcafd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3718,9 +3718,8 @@ struct read_write_emulator_ops {
 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
 	if (vcpu->mmio_read_completed) {
-		memcpy(val, vcpu->mmio_data, bytes);
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-			       vcpu->mmio_phys_addr, *(u64 *)val);
+			       vcpu->mmio_fragments[0].gpa, *(u64 *)val);
 		vcpu->mmio_read_completed = 0;
 		return 1;
 	}
@@ -3756,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 			   void *val, int bytes)
 {
-	memcpy(vcpu->mmio_data, val, bytes);
-	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+	memcpy(vcpu->run->mmio.data, frag->data, frag->len);
 	return X86EMUL_CONTINUE;
 }
 
@@ -3784,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
 	gpa_t gpa;
 	int handled, ret;
 	bool write = ops->write;
-
-	if (ops->read_write_prepare &&
-		  ops->read_write_prepare(vcpu, val, bytes))
-		return X86EMUL_CONTINUE;
+	struct kvm_mmio_fragment *frag;
 
 	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
@@ -3813,15 +3810,19 @@ mmio:
 	bytes -= handled;
 	val += handled;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
-	vcpu->mmio_index = 0;
+	while (bytes) {
+		unsigned now = min(bytes, 8U);
 
-	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+		frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+		frag->gpa = gpa;
+		frag->data = val;
+		frag->len = now;
+
+		gpa += now;
+		val += now;
+		bytes -= now;
+	}
+	return X86EMUL_CONTINUE;
 }
 
 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3830,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
 			struct read_write_emulator_ops *ops)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	gpa_t gpa;
+	int rc;
+
+	if (ops->read_write_prepare &&
+		  ops->read_write_prepare(vcpu, val, bytes))
+		return X86EMUL_CONTINUE;
+
+	vcpu->mmio_nr_fragments = 0;
 
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-		int rc, now;
+		int now;
 
 		now = -addr & ~PAGE_MASK;
 		rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3846,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
 		bytes -= now;
 	}
 
-	return emulator_read_write_onepage(addr, val, bytes, exception,
-					   vcpu, ops);
+	rc = emulator_read_write_onepage(addr, val, bytes, exception,
+					 vcpu, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (!vcpu->mmio_nr_fragments)
+		return rc;
+
+	gpa = vcpu->mmio_fragments[0].gpa;
+
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_cur_fragment = 0;
+
+	vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = gpa;
+
+	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -5446,33 +5472,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ *   for each fragment
+ *     write gpa, len
+ *     exit
+ *     copy data
+ *   execute insn
+ *
+ * write:
+ *   for each fragment
+ *      write gpa, len
+ *      copy data
+ *      exit
+ */
 static int complete_mmio(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
+	struct kvm_mmio_fragment *frag;
 	int r;
 
 	if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
 		return 1;
 
 	if (vcpu->mmio_needed) {
-		vcpu->mmio_needed = 0;
+		/* Complete previous fragment */
+		frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
 		if (!vcpu->mmio_is_write)
-			memcpy(vcpu->mmio_data + vcpu->mmio_index,
-			       run->mmio.data, 8);
-		vcpu->mmio_index += 8;
-		if (vcpu->mmio_index < vcpu->mmio_size) {
-			run->exit_reason = KVM_EXIT_MMIO;
-			run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
-			memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
-			run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
-			run->mmio.is_write = vcpu->mmio_is_write;
-			vcpu->mmio_needed = 1;
-			return 0;
+			memcpy(frag->data, run->mmio.data, frag->len);
+		if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+			vcpu->mmio_needed = 0;
+			if (vcpu->mmio_is_write)
+				return 1;
+			vcpu->mmio_read_completed = 1;
+			goto done;
 		}
+		/* Initiate next fragment */
+		++frag;
+		run->exit_reason = KVM_EXIT_MMIO;
+		run->mmio.phys_addr = frag->gpa;
 		if (vcpu->mmio_is_write)
-			return 1;
-		vcpu->mmio_read_completed = 1;
+			memcpy(run->mmio.data, frag->data, frag->len);
+		run->mmio.len = frag->len;
+		run->mmio.is_write = vcpu->mmio_is_write;
+		return 0;
+
 	}
+done:
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a2d00b1bbf54..186ffab0b9f0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,20 @@
 #define KVM_MMIO_SIZE 8
 #endif
 
+/*
+ * If we support unaligned MMIO, at most one fragment will be split into two:
+ */
+#ifdef KVM_UNALIGNED_MMIO
+#  define KVM_EXTRA_MMIO_FRAGMENTS 1
+#else
+#  define KVM_EXTRA_MMIO_FRAGMENTS 0
+#endif
+
+#define KVM_USER_MMIO_SIZE 8
+
+#define KVM_MAX_MMIO_FRAGMENTS \
+	(KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+
 /*
  * vcpu->requests bit members
  */
@@ -117,6 +131,16 @@ enum {
 	EXITING_GUEST_MODE
 };
 
+/*
+ * Sometimes a large or cross-page mmio needs to be broken up into separate
+ * exits for userspace servicing.
+ */
+struct kvm_mmio_fragment {
+	gpa_t gpa;
+	void *data;
+	unsigned len;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -144,10 +168,9 @@ struct kvm_vcpu {
 	int mmio_needed;
 	int mmio_read_completed;
 	int mmio_is_write;
-	int mmio_size;
-	int mmio_index;
-	unsigned char mmio_data[KVM_MMIO_SIZE];
-	gpa_t mmio_phys_addr;
+	int mmio_cur_fragment;
+	int mmio_nr_fragments;
+	struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
 #endif
 
 #ifdef CONFIG_KVM_ASYNC_PF
-- 
cgit v1.2.3


From 4c5023a3fa2ec12b7ed313b276b157917575745b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 18 Apr 2012 17:16:50 -0700
Subject: x86-32: Handle exception table entries during early boot

If we get an exception during early boot, walk the exception table to
see if we should intercept it.  The main use case for this is to allow
rdmsr_safe()/wrmsr_safe() during CPU initialization.

Since the exception table is currently sorted at runtime, and fairly
late in startup, this code walks the exception table linearly.  We
obviously don't need to worry about modules, however: none have been
loaded at this point.

This patch changes the early IDT setup to look a lot more like x86-64:
we now install handlers for all 32 exception vectors.  The output of
the early exception handler has changed somewhat as it directly
reflects the stack frame of the exception handler, and the stack frame
has been somewhat restructured.

Finally, centralize the code that can and should be run only once.

[ v2: Use early_fixup_exception() instead of linear search ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1334794610-5546-6-git-send-email-hpa@zytor.com
---
 arch/x86/kernel/head_32.S | 223 +++++++++++++++++++++++++++-------------------
 1 file changed, 129 insertions(+), 94 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce0be7cd085e..463c9797ca6a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,6 +21,7 @@
 #include <asm/msr-index.h>
 #include <asm/cpufeature.h>
 #include <asm/percpu.h>
+#include <asm/nops.h>
 
 /* Physical address */
 #define pa(X) ((X) - __PAGE_OFFSET)
@@ -363,28 +364,23 @@ default_entry:
 	pushl $0
 	popfl
 
-#ifdef CONFIG_SMP
-	cmpb $0, ready
-	jnz checkCPUtype
-#endif /* CONFIG_SMP */
-
 /*
  * start system 32-bit setup. We need to re-do some of the things done
  * in 16-bit mode for the "real" operations.
  */
-	call setup_idt
-
-checkCPUtype:
-
-	movl $-1,X86_CPUID		#  -1 for no CPUID initially
-
+	movl setup_once_ref,%eax
+	andl %eax,%eax
+	jz 1f				# Did we do this already?
+	call *%eax
+1:
+	
 /* check if it is 486 or 386. */
 /*
  * XXX - this does a lot of unnecessary setup.  Alignment checks don't
  * apply at our cpl of 0 and the stack ought to be aligned already, and
  * we don't need to preserve eflags.
  */
-
+	movl $-1,X86_CPUID	# -1 for no CPUID initially
 	movb $3,X86		# at least 386
 	pushfl			# push EFLAGS
 	popl %eax		# get EFLAGS
@@ -450,21 +446,6 @@ is386:	movl $2,%ecx		# set MP
 	movl $(__KERNEL_PERCPU), %eax
 	movl %eax,%fs			# set this cpu's percpu
 
-#ifdef CONFIG_CC_STACKPROTECTOR
-	/*
-	 * The linker can't handle this by relocation.  Manually set
-	 * base address in stack canary segment descriptor.
-	 */
-	cmpb $0,ready
-	jne 1f
-	movl $gdt_page,%eax
-	movl $stack_canary,%ecx
-	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
-	shrl $16, %ecx
-	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
-	movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
 	movl $(__KERNEL_STACK_CANARY),%eax
 	movl %eax,%gs
 
@@ -473,7 +454,6 @@ is386:	movl $2,%ecx		# set MP
 
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
-	movb $1, ready
 	jmp *(initial_code)
 
 /*
@@ -495,81 +475,122 @@ check_x87:
 	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
 	ret
 
+	
+#include "verify_cpu.S"
+
 /*
- *  setup_idt
+ *  setup_once
  *
- *  sets up a idt with 256 entries pointing to
- *  ignore_int, interrupt gates. It doesn't actually load
- *  idt - that can be done only after paging has been enabled
- *  and the kernel moved to PAGE_OFFSET. Interrupts
- *  are enabled elsewhere, when we can be relatively
- *  sure everything is ok.
+ *  The setup work we only want to run on the BSP.
  *
  *  Warning: %esi is live across this function.
  */
-setup_idt:
-	lea ignore_int,%edx
-	movl $(__KERNEL_CS << 16),%eax
-	movw %dx,%ax		/* selector = 0x0010 = cs */
-	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
+__INIT
+setup_once:
+	/*
+	 * Set up a idt with 256 entries pointing to ignore_int,
+	 * interrupt gates. It doesn't actually load idt - that needs
+	 * to be done on each CPU. Interrupts are enabled elsewhere,
+	 * when we can be relatively sure everything is ok.
+	 */
 
-	lea idt_table,%edi
-	mov $256,%ecx
-rp_sidt:
+	movl $idt_table,%edi
+	movl $early_idt_handlers,%eax
+	movl $NUM_EXCEPTION_VECTORS,%ecx
+1:
 	movl %eax,(%edi)
-	movl %edx,4(%edi)
+	movl %eax,4(%edi)
+	/* interrupt gate, dpl=0, present */
+	movl $(0x8E000000 + __KERNEL_CS),2(%edi)
+	addl $9,%eax
 	addl $8,%edi
-	dec %ecx
-	jne rp_sidt
+	loop 1b
 
-.macro	set_early_handler handler,trapno
-	lea \handler,%edx
+	movl $256 - NUM_EXCEPTION_VECTORS,%ecx
+	movl $ignore_int,%edx
 	movl $(__KERNEL_CS << 16),%eax
-	movw %dx,%ax
+	movw %dx,%ax		/* selector = 0x0010 = cs */
 	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
-	lea idt_table,%edi
-	movl %eax,8*\trapno(%edi)
-	movl %edx,8*\trapno+4(%edi)
-.endm
+2:
+	movl %eax,(%edi)
+	movl %edx,4(%edi)
+	addl $8,%edi
+	loop 2b
 
-	set_early_handler handler=early_divide_err,trapno=0
-	set_early_handler handler=early_illegal_opcode,trapno=6
-	set_early_handler handler=early_protection_fault,trapno=13
-	set_early_handler handler=early_page_fault,trapno=14
+#ifdef CONFIG_CC_STACKPROTECTOR
+	/*
+	 * Configure the stack canary. The linker can't handle this by
+	 * relocation.  Manually set base address in stack canary
+	 * segment descriptor.
+	 */
+	movl $gdt_page,%eax
+	movl $stack_canary,%ecx
+	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+	shrl $16, %ecx
+	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+	movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
 
+	andl $0,setup_once_ref	/* Once is enough, thanks */
 	ret
 
-early_divide_err:
-	xor %edx,%edx
-	pushl $0	/* fake errcode */
-	jmp early_fault
+ENTRY(early_idt_handlers)
+	# 36(%esp) %eflags
+	# 32(%esp) %cs
+	# 28(%esp) %eip
+	# 24(%rsp) error code
+	i = 0
+	.rept NUM_EXCEPTION_VECTORS
+	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
+	ASM_NOP2
+	.else
+	pushl $0		# Dummy error code, to make stack frame uniform
+	.endif
+	pushl $i		# 20(%esp) Vector number
+	jmp early_idt_handler
+	i = i + 1
+	.endr
+ENDPROC(early_idt_handlers)
+	
+	/* This is global to keep gas from relaxing the jumps */
+ENTRY(early_idt_handler)
+	cld
+	cmpl $2,%ss:early_recursion_flag
+	je hlt_loop
+	incl %ss:early_recursion_flag
 
-early_illegal_opcode:
-	movl $6,%edx
-	pushl $0	/* fake errcode */
-	jmp early_fault
+	push %eax		# 16(%esp)
+	push %ecx		# 12(%esp)
+	push %edx		#  8(%esp)
+	push %ds		#  4(%esp)
+	push %es		#  0(%esp)
+	movl $(__KERNEL_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
 
-early_protection_fault:
-	movl $13,%edx
-	jmp early_fault
+	cmpl $(__KERNEL_CS),32(%esp)
+	jne 10f
 
-early_page_fault:
-	movl $14,%edx
-	jmp early_fault
+	leal 28(%esp),%eax	# Pointer to %eip
+	call early_fixup_exception
+	andl %eax,%eax
+	jnz ex_entry		/* found an exception entry */
 
-early_fault:
-	cld
+10:
 #ifdef CONFIG_PRINTK
-	pusha
-	movl $(__KERNEL_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	cmpl $2,early_recursion_flag
-	je hlt_loop
-	incl early_recursion_flag
+	xorl %eax,%eax
+	movw %ax,2(%esp)	/* clean up the segment values on some cpus */
+	movw %ax,6(%esp)
+	movw %ax,34(%esp)
+	leal  40(%esp),%eax
+	pushl %eax		/* %esp before the exception */
+	pushl %ebx
+	pushl %ebp
+	pushl %esi
+	pushl %edi
 	movl %cr2,%eax
 	pushl %eax
-	pushl %edx		/* trapno */
+	pushl (20+6*4)(%esp)	/* trapno */
 	pushl $fault_msg
 	call printk
 #endif
@@ -578,6 +599,17 @@ hlt_loop:
 	hlt
 	jmp hlt_loop
 
+ex_entry:
+	pop %es
+	pop %ds
+	pop %edx
+	pop %ecx
+	pop %eax
+	addl $8,%esp		/* drop vector number and error code */
+	decl %ss:early_recursion_flag
+	iret
+ENDPROC(early_idt_handler)
+
 /* This is the default interrupt "handler" :-) */
 	ALIGN
 ignore_int:
@@ -611,13 +643,18 @@ ignore_int:
 	popl %eax
 #endif
 	iret
+ENDPROC(ignore_int)
+__INITDATA
+	.align 4
+early_recursion_flag:
+	.long 0
 
-#include "verify_cpu.S"
-
-	__REFDATA
-.align 4
+__REFDATA
+	.align 4
 ENTRY(initial_code)
 	.long i386_start_kernel
+ENTRY(setup_once_ref)
+	.long setup_once
 
 /*
  * BSS section
@@ -670,22 +707,19 @@ ENTRY(initial_page_table)
 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
 
-early_recursion_flag:
-	.long 0
-
-ready:	.byte 0
-
+__INITRODATA
 int_msg:
 	.asciz "Unknown interrupt or fault at: %p %p %p\n"
 
 fault_msg:
 /* fault info: */
 	.ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
-	.ascii "     EDI %p  ESI %p  EBP %p  ESP %p\n"
-	.ascii "     EBX %p  EDX %p  ECX %p  EAX %p\n"
+/* regs pushed in early_idt_handler: */
+	.ascii "     EDI %p  ESI %p  EBP %p  EBX %p\n"
+	.ascii "     ESP %p   ES %p   DS %p\n"
+	.ascii "     EDX %p  ECX %p  EAX %p\n"
 /* fault frame: */
-	.ascii "     err %p  EIP %p   CS %p  flg %p\n"
+	.ascii "     vec %p  err %p  EIP %p   CS %p  flg %p\n"
 	.ascii "Stack: %p %p %p %p %p %p %p %p\n"
 	.ascii "       %p %p %p %p %p %p %p %p\n"
 	.asciz "       %p %p %p %p %p %p %p %p\n"
@@ -699,6 +733,7 @@ fault_msg:
  * segment size, and 32-bit linear address value:
  */
 
+	.data
 .globl boot_gdt_descr
 .globl idt_descr
 
-- 
cgit v1.2.3


From 060feb650010c261fcfbae9de9348b46cedcd3cd Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Apr 2012 17:07:34 -0700
Subject: x86, doc: Revert "x86: Document rdmsr_safe restrictions"

This reverts commit ce37defc0f6673f5ca2c92ed5cfcaf290ae7dd16
"x86: Document rdmsr_safe restrictions", as these restrictions no longer apply.

Reported-by: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/20120419171609.GH3221@aftab.osrc.amd.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/msr.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 95203d40ffdd..084ef95274cd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -169,14 +169,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
 	return native_write_msr_safe(msr, low, high);
 }
 
-/*
- * rdmsr with exception handling.
- *
- * Please note that the exception handling works only after we've
- * switched to the "smart" #GP handler in trap_init() which knows about
- * exception tables - using this macro earlier than that causes machine
- * hangs on boxes which do not implement the @msr in the first argument.
- */
+/* rdmsr with exception handling */
 #define rdmsr_safe(msr, p1, p2)					\
 ({								\
 	int __err;						\
-- 
cgit v1.2.3


From d4541805e812abb5110d5de83246488fa0aa9a8e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:12:27 -0700
Subject: x86, extable: Use .pushsection ... .popsection for _ASM_EXTABLE()

Instead of using .section ... .previous, use .pushsection
... .popsection; this is (hopefully) a bit more robust, especially in
complex assembly code.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/asm.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 9412d6558c88..ff3f6bffcbf9 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -42,17 +42,17 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)	    \
-	__ASM_EX_SEC ;		    \
-	_ASM_ALIGN ;		    \
-	_ASM_PTR from , to ;	    \
-	.previous
+# define _ASM_EXTABLE(from,to)			\
+	.pushsection "__ex_table","a" ;		\
+	_ASM_ALIGN ;				\
+	_ASM_PTR from , to ;			\
+	.popsection
 #else
-# define _ASM_EXTABLE(from,to) \
-	__ASM_EX_SEC	\
-	_ASM_ALIGN "\n" \
-	_ASM_PTR #from "," #to "\n" \
-	" .previous\n"
+# define _ASM_EXTABLE(from,to)			\
+	" .pushsection \"__ex_table\",\"a\"\n"	\
+	_ASM_ALIGN "\n" 			\
+	_ASM_PTR #from "," #to "\n" 		\
+	" .popsection\n"
 #endif
 
 #endif /* _ASM_X86_ASM_H */
-- 
cgit v1.2.3


From 1ce6f86815a392acce2b45512106b525dc994cc0 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/ia32/ia32entry.S

Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/ia32/ia32entry.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e3e734005e19..eb48edd0cad2 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -13,6 +13,7 @@
 #include <asm/thread_info.h>	
 #include <asm/segment.h>
 #include <asm/irqflags.h>
+#include <asm/asm.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -146,9 +147,7 @@ ENTRY(ia32_sysenter_target)
  	/* no need to do an access_ok check here because rbp has been
  	   32bit zero extended */ 
 1:	movl	(%rbp),%ebp
- 	.section __ex_table,"a"
- 	.quad 1b,ia32_badarg
- 	.previous	
+	_ASM_EXTABLE(1b,ia32_badarg)
 	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	CFI_REMEMBER_STATE
-- 
cgit v1.2.3


From 6837a54dd6127f055dcb26d00fee0df05c07a674 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/kernel/entry_32.S

Remove open-coded exception table entries in arch/x86/kernel/entry_32.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/kernel/entry_32.S | 47 ++++++++++++++--------------------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b784f4ef1e4..01ccf9b71473 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -56,6 +56,7 @@
 #include <asm/irq_vectors.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
+#include <asm/asm.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -151,10 +152,8 @@
 .pushsection .fixup, "ax"
 99:	movl $0, (%esp)
 	jmp 98b
-.section __ex_table, "a"
-	.align 4
-	.long 98b, 99b
 .popsection
+	_ASM_EXTABLE(98b,99b)
 .endm
 
 .macro PTGS_TO_GS
@@ -164,10 +163,8 @@
 .pushsection .fixup, "ax"
 99:	movl $0, PT_GS(%esp)
 	jmp 98b
-.section __ex_table, "a"
-	.align 4
-	.long 98b, 99b
 .popsection
+	_ASM_EXTABLE(98b,99b)
 .endm
 
 .macro GS_TO_REG reg
@@ -249,12 +246,10 @@
 	jmp 2b
 6:	movl $0, (%esp)
 	jmp 3b
-.section __ex_table, "a"
-	.align 4
-	.long 1b, 4b
-	.long 2b, 5b
-	.long 3b, 6b
 .popsection
+	_ASM_EXTABLE(1b,4b)
+	_ASM_EXTABLE(2b,5b)
+	_ASM_EXTABLE(3b,6b)
 	POP_GS_EX
 .endm
 
@@ -415,10 +410,7 @@ sysenter_past_esp:
 	jae syscall_fault
 1:	movl (%ebp),%ebp
 	movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
-	.align 4
-	.long 1b,syscall_fault
-.previous
+	_ASM_EXTABLE(1b,syscall_fault)
 
 	GET_THREAD_INFO(%ebp)
 
@@ -485,10 +477,8 @@ sysexit_audit:
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
 	jmp 1b
-.section __ex_table,"a"
-	.align 4
-	.long 1b,2b
 .popsection
+	_ASM_EXTABLE(1b,2b)
 	PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
@@ -543,10 +533,7 @@ ENTRY(iret_exc)
 	pushl $do_iret_error
 	jmp error_code
 .previous
-.section __ex_table,"a"
-	.align 4
-	.long irq_return,iret_exc
-.previous
+	_ASM_EXTABLE(irq_return,iret_exc)
 
 	CFI_RESTORE_STATE
 ldt_ss:
@@ -901,10 +888,7 @@ END(device_not_available)
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iret
-.section __ex_table,"a"
-	.align 4
-	.long native_iret, iret_exc
-.previous
+	_ASM_EXTABLE(native_iret, iret_exc)
 END(native_iret)
 
 ENTRY(native_irq_enable_sysexit)
@@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback)
 	movl %eax,16(%esp)
 	jmp 4b
 .previous
-.section __ex_table,"a"
-	.align 4
-	.long 1b,6b
-	.long 2b,7b
-	.long 3b,8b
-	.long 4b,9b
-.previous
+	_ASM_EXTABLE(1b,6b)
+	_ASM_EXTABLE(2b,7b)
+	_ASM_EXTABLE(3b,8b)
+	_ASM_EXTABLE(4b,9b)
 ENDPROC(xen_failsafe_callback)
 
 BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
-- 
cgit v1.2.3


From d7abc0fa997972ddb6d3c403e03a6eefda0c0881 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/kernel/entry_64.S

Remove open-coded exception table entries in arch/x86/kernel/entry_64.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/kernel/entry_64.S | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cdc79b5cfcd9..320852d02026 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <asm/asm.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -900,18 +901,12 @@ restore_args:
 
 irq_return:
 	INTERRUPT_RETURN
-
-	.section __ex_table, "a"
-	.quad irq_return, bad_iret
-	.previous
+	_ASM_EXTABLE(irq_return, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
-
-	.section __ex_table,"a"
-	.quad native_iret, bad_iret
-	.previous
+	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
 	.section .fixup,"ax"
@@ -1181,10 +1176,7 @@ gs_change:
 	CFI_ENDPROC
 END(native_load_gs_index)
 
-	.section __ex_table,"a"
-	.align 8
-	.quad gs_change,bad_gs
-	.previous
+	_ASM_EXTABLE(gs_change,bad_gs)
 	.section .fixup,"ax"
 	/* running with kernelgs */
 bad_gs:
-- 
cgit v1.2.3


From 5d6f8d77ede50417dcca4c31a74f0d40a1ee537a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/kernel/test_rodata.c

Remove open-coded exception table entries in arch/x86/kernel/test_rodata.c,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/kernel/test_rodata.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792af..b79133abda48 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
+#include <asm/asm.h>
 
 int rodata_test(void)
 {
@@ -42,14 +43,7 @@ int rodata_test(void)
 		".section .fixup,\"ax\"\n"
 		"2:	jmp 1b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"       .align 16\n"
-#ifdef CONFIG_X86_32
-		"	.long 0b,2b\n"
-#else
-		"	.quad 0b,2b\n"
-#endif
-		".previous"
+		_ASM_EXTABLE(0b,2b)
 		: [rslt] "=r" (result)
 		: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
 	);
-- 
cgit v1.2.3


From 5f2e8a84f07bb43f9c0ce317d7e0c5e541db00e3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/checksum_32.S

Remove open-coded exception table entries in arch/x86/lib/checksum_32.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/checksum_32.S | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 78d16a554db0..2af5df3ade7c 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -28,6 +28,7 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/errno.h>
+#include <asm/asm.h>
 				
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
@@ -282,15 +283,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
 
 #define SRC(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6001f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6001f)
 
 #define DST(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6002f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6002f)
 
 #ifndef CONFIG_X86_USE_PPRO_CHECKSUM
 
-- 
cgit v1.2.3


From 9732da8ca860053515431298ec969e1f3e6bc64a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:51 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/copy_user_64.S

Remove open-coded exception table entries in arch/x86/lib/copy_user_64.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/copy_user_64.S | 63 +++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 024840266ba0..5b2995f4557a 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -16,6 +16,7 @@
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
+#include <asm/asm.h>
 
 /*
  * By placing feature2 after feature1 in altinstructions section, we logically
@@ -63,11 +64,8 @@
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 100b,103b
-	.quad 101b,103b
-	.previous
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
 #endif
 	.endm
 
@@ -191,29 +189,26 @@ ENTRY(copy_user_generic_unrolled)
 60:	jmp copy_user_handle_tail /* ecx is zerorest also */
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 1b,30b
-	.quad 2b,30b
-	.quad 3b,30b
-	.quad 4b,30b
-	.quad 5b,30b
-	.quad 6b,30b
-	.quad 7b,30b
-	.quad 8b,30b
-	.quad 9b,30b
-	.quad 10b,30b
-	.quad 11b,30b
-	.quad 12b,30b
-	.quad 13b,30b
-	.quad 14b,30b
-	.quad 15b,30b
-	.quad 16b,30b
-	.quad 18b,40b
-	.quad 19b,40b
-	.quad 21b,50b
-	.quad 22b,50b
-	.previous
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_unrolled)
 
@@ -259,11 +254,8 @@ ENTRY(copy_user_generic_string)
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 1b,11b
-	.quad 3b,12b
-	.previous
+	_ASM_EXTABLE(1b,11b)
+	_ASM_EXTABLE(3b,12b)
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
 
@@ -294,9 +286,6 @@ ENTRY(copy_user_enhanced_fast_string)
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 1b,12b
-	.previous
+	_ASM_EXTABLE(1b,12b)
 	CFI_ENDPROC
 ENDPROC(copy_user_enhanced_fast_string)
-- 
cgit v1.2.3


From 0d8559feafbc9dc5a2c17ba42aea7de824b18308 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:51 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/copy_user_nocache_64.S

Remove open-coded exception table entries in arch/x86/lib/copy_user_nocache_64.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/copy_user_nocache_64.S | 50 +++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index cb0c112386fb..cacddc7163eb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -14,6 +14,7 @@
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
+#include <asm/asm.h>
 
 	.macro ALIGN_DESTINATION
 #ifdef FIX_ALIGNMENT
@@ -36,11 +37,8 @@
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 100b,103b
-	.quad 101b,103b
-	.previous
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
 #endif
 	.endm
 
@@ -111,27 +109,25 @@ ENTRY(__copy_user_nocache)
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.quad 1b,30b
-	.quad 2b,30b
-	.quad 3b,30b
-	.quad 4b,30b
-	.quad 5b,30b
-	.quad 6b,30b
-	.quad 7b,30b
-	.quad 8b,30b
-	.quad 9b,30b
-	.quad 10b,30b
-	.quad 11b,30b
-	.quad 12b,30b
-	.quad 13b,30b
-	.quad 14b,30b
-	.quad 15b,30b
-	.quad 16b,30b
-	.quad 18b,40b
-	.quad 19b,40b
-	.quad 21b,50b
-	.quad 22b,50b
-	.previous
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
 	CFI_ENDPROC
 ENDPROC(__copy_user_nocache)
-- 
cgit v1.2.3


From 015e6f11a9737684469feef9d523373b1746159d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:51 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/csum-copy_64.S

Remove open-coded exception table entries in arch/x86/lib/csum-copy_64.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/csum-copy_64.S | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index fb903b758da8..2419d5fefae3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -8,6 +8,7 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/errno.h>
+#include <asm/asm.h>
 
 /*
  * Checksum copy with exception handling.
@@ -31,26 +32,17 @@
 
 	.macro source
 10:
-	.section __ex_table, "a"
-	.align 8
-	.quad 10b, .Lbad_source
-	.previous
+	_ASM_EXTABLE(10b, .Lbad_source)
 	.endm
 
 	.macro dest
 20:
-	.section __ex_table, "a"
-	.align 8
-	.quad 20b, .Lbad_dest
-	.previous
+	_ASM_EXTABLE(20b, .Lbad_dest)
 	.endm
 
 	.macro ignore L=.Lignore
 30:
-	.section __ex_table, "a"
-	.align 8
-	.quad 30b, \L
-	.previous
+	_ASM_EXTABLE(30b, \L)
 	.endm
 
 
-- 
cgit v1.2.3


From 1a27bc0d99aabea6b628cb994a21a1c79b569fc9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:51 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/getuser.S

Remove open-coded exception table entries in arch/x86/lib/getuser.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/getuser.S | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 51f1504cddd9..b33b1fb1e6d4 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -95,10 +95,9 @@ bad_get_user:
 	CFI_ENDPROC
 END(bad_get_user)
 
-.section __ex_table,"a"
-	_ASM_PTR 1b,bad_get_user
-	_ASM_PTR 2b,bad_get_user
-	_ASM_PTR 3b,bad_get_user
+	_ASM_EXTABLE(1b,bad_get_user)
+	_ASM_EXTABLE(2b,bad_get_user)
+	_ASM_EXTABLE(3b,bad_get_user)
 #ifdef CONFIG_X86_64
-	_ASM_PTR 4b,bad_get_user
+	_ASM_EXTABLE(4b,bad_get_user)
 #endif
-- 
cgit v1.2.3


From a53a96e5413d3639ed75d202bbfe68aa0a56c091 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:52 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/putuser.S

Remove open-coded exception table entries in arch/x86/lib/putuser.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/putuser.S | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 36b0d15ae6e9..7f951c8f76c4 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -86,12 +86,10 @@ bad_put_user:
 	EXIT
 END(bad_put_user)
 
-.section __ex_table,"a"
-	_ASM_PTR 1b,bad_put_user
-	_ASM_PTR 2b,bad_put_user
-	_ASM_PTR 3b,bad_put_user
-	_ASM_PTR 4b,bad_put_user
+	_ASM_EXTABLE(1b,bad_put_user)
+	_ASM_EXTABLE(2b,bad_put_user)
+	_ASM_EXTABLE(3b,bad_put_user)
+	_ASM_EXTABLE(4b,bad_put_user)
 #ifdef CONFIG_X86_32
-	_ASM_PTR 5b,bad_put_user
+	_ASM_EXTABLE(5b,bad_put_user)
 #endif
-.previous
-- 
cgit v1.2.3


From 9c6751280b6206e2a96f9600938003a29968e4fa Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:52 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/lib/usercopy_32.c

Remove open-coded exception table entries in arch/x86/lib/usercopy_32.c,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/lib/usercopy_32.c | 232 +++++++++++++++++++++------------------------
 1 file changed, 106 insertions(+), 126 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index ef2a6a5d78e3..883b216c60b2 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -13,6 +13,7 @@
 #include <linux/interrupt.h>
 #include <asm/uaccess.h>
 #include <asm/mmx.h>
+#include <asm/asm.h>
 
 #ifdef CONFIG_X86_INTEL_USERCOPY
 /*
@@ -127,10 +128,7 @@ long strnlen_user(const char __user *s, long n)
 		"3:	movb $1,%%al\n"
 		"	jmp 1b\n"
 		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 0b,2b\n"
-		".previous"
+		_ASM_EXTABLE(0b,2b)
 		:"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
 		:"0" (n), "1" (s), "2" (0), "3" (mask)
 		:"cc");
@@ -199,47 +197,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
 		       "101:   lea 0(%%eax,%0,4),%0\n"
 		       "       jmp 100b\n"
 		       ".previous\n"
-		       ".section __ex_table,\"a\"\n"
-		       "       .align 4\n"
-		       "       .long 1b,100b\n"
-		       "       .long 2b,100b\n"
-		       "       .long 3b,100b\n"
-		       "       .long 4b,100b\n"
-		       "       .long 5b,100b\n"
-		       "       .long 6b,100b\n"
-		       "       .long 7b,100b\n"
-		       "       .long 8b,100b\n"
-		       "       .long 9b,100b\n"
-		       "       .long 10b,100b\n"
-		       "       .long 11b,100b\n"
-		       "       .long 12b,100b\n"
-		       "       .long 13b,100b\n"
-		       "       .long 14b,100b\n"
-		       "       .long 15b,100b\n"
-		       "       .long 16b,100b\n"
-		       "       .long 17b,100b\n"
-		       "       .long 18b,100b\n"
-		       "       .long 19b,100b\n"
-		       "       .long 20b,100b\n"
-		       "       .long 21b,100b\n"
-		       "       .long 22b,100b\n"
-		       "       .long 23b,100b\n"
-		       "       .long 24b,100b\n"
-		       "       .long 25b,100b\n"
-		       "       .long 26b,100b\n"
-		       "       .long 27b,100b\n"
-		       "       .long 28b,100b\n"
-		       "       .long 29b,100b\n"
-		       "       .long 30b,100b\n"
-		       "       .long 31b,100b\n"
-		       "       .long 32b,100b\n"
-		       "       .long 33b,100b\n"
-		       "       .long 34b,100b\n"
-		       "       .long 35b,100b\n"
-		       "       .long 36b,100b\n"
-		       "       .long 37b,100b\n"
-		       "       .long 99b,101b\n"
-		       ".previous"
+		       _ASM_EXTABLE(1b,100b)
+		       _ASM_EXTABLE(2b,100b)
+		       _ASM_EXTABLE(3b,100b)
+		       _ASM_EXTABLE(4b,100b)
+		       _ASM_EXTABLE(5b,100b)
+		       _ASM_EXTABLE(6b,100b)
+		       _ASM_EXTABLE(7b,100b)
+		       _ASM_EXTABLE(8b,100b)
+		       _ASM_EXTABLE(9b,100b)
+		       _ASM_EXTABLE(10b,100b)
+		       _ASM_EXTABLE(11b,100b)
+		       _ASM_EXTABLE(12b,100b)
+		       _ASM_EXTABLE(13b,100b)
+		       _ASM_EXTABLE(14b,100b)
+		       _ASM_EXTABLE(15b,100b)
+		       _ASM_EXTABLE(16b,100b)
+		       _ASM_EXTABLE(17b,100b)
+		       _ASM_EXTABLE(18b,100b)
+		       _ASM_EXTABLE(19b,100b)
+		       _ASM_EXTABLE(20b,100b)
+		       _ASM_EXTABLE(21b,100b)
+		       _ASM_EXTABLE(22b,100b)
+		       _ASM_EXTABLE(23b,100b)
+		       _ASM_EXTABLE(24b,100b)
+		       _ASM_EXTABLE(25b,100b)
+		       _ASM_EXTABLE(26b,100b)
+		       _ASM_EXTABLE(27b,100b)
+		       _ASM_EXTABLE(28b,100b)
+		       _ASM_EXTABLE(29b,100b)
+		       _ASM_EXTABLE(30b,100b)
+		       _ASM_EXTABLE(31b,100b)
+		       _ASM_EXTABLE(32b,100b)
+		       _ASM_EXTABLE(33b,100b)
+		       _ASM_EXTABLE(34b,100b)
+		       _ASM_EXTABLE(35b,100b)
+		       _ASM_EXTABLE(36b,100b)
+		       _ASM_EXTABLE(37b,100b)
+		       _ASM_EXTABLE(99b,101b)
 		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 		       :  "1"(to), "2"(from), "0"(size)
 		       : "eax", "edx", "memory");
@@ -312,29 +307,26 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
 		       "        popl %0\n"
 		       "        jmp 8b\n"
 		       ".previous\n"
-		       ".section __ex_table,\"a\"\n"
-		       "	.align 4\n"
-		       "	.long 0b,16b\n"
-		       "	.long 1b,16b\n"
-		       "	.long 2b,16b\n"
-		       "	.long 21b,16b\n"
-		       "	.long 3b,16b\n"
-		       "	.long 31b,16b\n"
-		       "	.long 4b,16b\n"
-		       "	.long 41b,16b\n"
-		       "	.long 10b,16b\n"
-		       "	.long 51b,16b\n"
-		       "	.long 11b,16b\n"
-		       "	.long 61b,16b\n"
-		       "	.long 12b,16b\n"
-		       "	.long 71b,16b\n"
-		       "	.long 13b,16b\n"
-		       "	.long 81b,16b\n"
-		       "	.long 14b,16b\n"
-		       "	.long 91b,16b\n"
-		       "	.long 6b,9b\n"
-		       "        .long 7b,16b\n"
-		       ".previous"
+		       _ASM_EXTABLE(0b,16b)
+		       _ASM_EXTABLE(1b,16b)
+		       _ASM_EXTABLE(2b,16b)
+		       _ASM_EXTABLE(21b,16b)
+		       _ASM_EXTABLE(3b,16b)
+		       _ASM_EXTABLE(31b,16b)
+		       _ASM_EXTABLE(4b,16b)
+		       _ASM_EXTABLE(41b,16b)
+		       _ASM_EXTABLE(10b,16b)
+		       _ASM_EXTABLE(51b,16b)
+		       _ASM_EXTABLE(11b,16b)
+		       _ASM_EXTABLE(61b,16b)
+		       _ASM_EXTABLE(12b,16b)
+		       _ASM_EXTABLE(71b,16b)
+		       _ASM_EXTABLE(13b,16b)
+		       _ASM_EXTABLE(81b,16b)
+		       _ASM_EXTABLE(14b,16b)
+		       _ASM_EXTABLE(91b,16b)
+		       _ASM_EXTABLE(6b,9b)
+		       _ASM_EXTABLE(7b,16b)
 		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 		       :  "1"(to), "2"(from), "0"(size)
 		       : "eax", "edx", "memory");
@@ -414,29 +406,26 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to,
 	       "        popl %0\n"
 	       "        jmp 8b\n"
 	       ".previous\n"
-	       ".section __ex_table,\"a\"\n"
-	       "	.align 4\n"
-	       "	.long 0b,16b\n"
-	       "	.long 1b,16b\n"
-	       "	.long 2b,16b\n"
-	       "	.long 21b,16b\n"
-	       "	.long 3b,16b\n"
-	       "	.long 31b,16b\n"
-	       "	.long 4b,16b\n"
-	       "	.long 41b,16b\n"
-	       "	.long 10b,16b\n"
-	       "	.long 51b,16b\n"
-	       "	.long 11b,16b\n"
-	       "	.long 61b,16b\n"
-	       "	.long 12b,16b\n"
-	       "	.long 71b,16b\n"
-	       "	.long 13b,16b\n"
-	       "	.long 81b,16b\n"
-	       "	.long 14b,16b\n"
-	       "	.long 91b,16b\n"
-	       "	.long 6b,9b\n"
-	       "        .long 7b,16b\n"
-	       ".previous"
+	       _ASM_EXTABLE(0b,16b)
+	       _ASM_EXTABLE(1b,16b)
+	       _ASM_EXTABLE(2b,16b)
+	       _ASM_EXTABLE(21b,16b)
+	       _ASM_EXTABLE(3b,16b)
+	       _ASM_EXTABLE(31b,16b)
+	       _ASM_EXTABLE(4b,16b)
+	       _ASM_EXTABLE(41b,16b)
+	       _ASM_EXTABLE(10b,16b)
+	       _ASM_EXTABLE(51b,16b)
+	       _ASM_EXTABLE(11b,16b)
+	       _ASM_EXTABLE(61b,16b)
+	       _ASM_EXTABLE(12b,16b)
+	       _ASM_EXTABLE(71b,16b)
+	       _ASM_EXTABLE(13b,16b)
+	       _ASM_EXTABLE(81b,16b)
+	       _ASM_EXTABLE(14b,16b)
+	       _ASM_EXTABLE(91b,16b)
+	       _ASM_EXTABLE(6b,9b)
+	       _ASM_EXTABLE(7b,16b)
 	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 	       :  "1"(to), "2"(from), "0"(size)
 	       : "eax", "edx", "memory");
@@ -505,29 +494,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
 	       "9:      lea 0(%%eax,%0,4),%0\n"
 	       "16:     jmp 8b\n"
 	       ".previous\n"
-	       ".section __ex_table,\"a\"\n"
-	       "	.align 4\n"
-	       "	.long 0b,16b\n"
-	       "	.long 1b,16b\n"
-	       "	.long 2b,16b\n"
-	       "	.long 21b,16b\n"
-	       "	.long 3b,16b\n"
-	       "	.long 31b,16b\n"
-	       "	.long 4b,16b\n"
-	       "	.long 41b,16b\n"
-	       "	.long 10b,16b\n"
-	       "	.long 51b,16b\n"
-	       "	.long 11b,16b\n"
-	       "	.long 61b,16b\n"
-	       "	.long 12b,16b\n"
-	       "	.long 71b,16b\n"
-	       "	.long 13b,16b\n"
-	       "	.long 81b,16b\n"
-	       "	.long 14b,16b\n"
-	       "	.long 91b,16b\n"
-	       "	.long 6b,9b\n"
-	       "        .long 7b,16b\n"
-	       ".previous"
+	       _ASM_EXTABLE(0b,16b)
+	       _ASM_EXTABLE(1b,16b)
+	       _ASM_EXTABLE(2b,16b)
+	       _ASM_EXTABLE(21b,16b)
+	       _ASM_EXTABLE(3b,16b)
+	       _ASM_EXTABLE(31b,16b)
+	       _ASM_EXTABLE(4b,16b)
+	       _ASM_EXTABLE(41b,16b)
+	       _ASM_EXTABLE(10b,16b)
+	       _ASM_EXTABLE(51b,16b)
+	       _ASM_EXTABLE(11b,16b)
+	       _ASM_EXTABLE(61b,16b)
+	       _ASM_EXTABLE(12b,16b)
+	       _ASM_EXTABLE(71b,16b)
+	       _ASM_EXTABLE(13b,16b)
+	       _ASM_EXTABLE(81b,16b)
+	       _ASM_EXTABLE(14b,16b)
+	       _ASM_EXTABLE(91b,16b)
+	       _ASM_EXTABLE(6b,9b)
+	       _ASM_EXTABLE(7b,16b)
 	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 	       :  "1"(to), "2"(from), "0"(size)
 	       : "eax", "edx", "memory");
@@ -574,12 +560,9 @@ do {									\
 		"3:	lea 0(%3,%0,4),%0\n"				\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 4b,5b\n"					\
-		"	.long 0b,3b\n"					\
-		"	.long 1b,2b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(4b,5b)					\
+		_ASM_EXTABLE(0b,3b)					\
+		_ASM_EXTABLE(1b,2b)					\
 		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
 		: "3"(size), "0"(size), "1"(to), "2"(from)		\
 		: "memory");						\
@@ -616,12 +599,9 @@ do {									\
 		"	popl %0\n"					\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 4b,5b\n"					\
-		"	.long 0b,3b\n"					\
-		"	.long 1b,6b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(4b,5b)					\
+		_ASM_EXTABLE(0b,3b)					\
+		_ASM_EXTABLE(1b,6b)					\
 		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
 		: "3"(size), "0"(size), "1"(to), "2"(from)		\
 		: "memory");						\
-- 
cgit v1.2.3


From f542c5d6e57ea32daae3708a71911d9f5c883c5a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:52 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/um/checksum_32.S

Remove open-coded exception table entries in arch/x86/um/checksum_32.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/um/checksum_32.S | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index f058d2f82e18..8d0c420465cc 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -26,6 +26,7 @@
  */
 
 #include <asm/errno.h>
+#include <asm/asm.h>
 				
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
@@ -232,15 +233,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
 
 #define SRC(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6001f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6001f)
 
 #define DST(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6002f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6002f)
 
 .align 4
 
-- 
cgit v1.2.3


From 8f6380b9ec1cc4bed9b38144f739b87dd2cddb1d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:19:52 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/xen/xen-asm_32.S

Remove open-coded exception table entries in arch/x86/xen/xen-asm_32.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/xen/xen-asm_32.S | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index b040b0e518ca..f9643fc50de5 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -14,6 +14,7 @@
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm.h>
 
 #include <xen/interface/xen.h>
 
@@ -137,10 +138,7 @@ iret_restore_end:
 
 1:	iret
 xen_iret_end_crit:
-.section __ex_table, "a"
-	.align 4
-	.long 1b, iret_exc
-.previous
+	_ASM_EXTABLE(1b, iret_exc)
 
 hyper_iret:
 	/* put this out of line since its very rarely used */
-- 
cgit v1.2.3


From 447657e31235c692f579c639250317c7f565cd0d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 12:20:30 -0700
Subject: x86, extable: Remove the now-unused __ASM_EX_SEC macros

Nothing should use them anymore; only _ASM_EXTABLE() should ever be
used.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/asm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index ff3f6bffcbf9..53dce41f2517 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -4,11 +4,9 @@
 #ifdef __ASSEMBLY__
 # define __ASM_FORM(x)	x
 # define __ASM_FORM_COMMA(x) x,
-# define __ASM_EX_SEC	.section __ex_table, "a"
 #else
 # define __ASM_FORM(x)	" " #x " "
 # define __ASM_FORM_COMMA(x) " " #x ","
-# define __ASM_EX_SEC	" .section __ex_table,\"a\"\n"
 #endif
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 3ee89722cfb165295cc8eb498018c0bdafc57062 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 13:41:59 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/include/asm/kvm_host.h

Remove open-coded exception table entries in arch/x86/include/asm/kvm_host.h,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/kvm_host.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e216ba066e79..e5b97be12d2a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -27,6 +27,7 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
+#include <asm/asm.h>
 
 #define KVM_MAX_VCPUS 254
 #define KVM_SOFT_MAX_VCPUS 160
@@ -921,9 +922,7 @@ extern bool kvm_rebooting;
 	__ASM_SIZE(push) " $666b \n\t"	      \
 	"call kvm_spurious_fault \n\t"	      \
 	".popsection \n\t" \
-	".pushsection __ex_table, \"a\" \n\t" \
-	_ASM_PTR " 666b, 667b \n\t" \
-	".popsection"
+	_ASM_EXTABLE(666b, 667b)
 
 #define __kvm_handle_fault_on_reboot(insn)		\
 	____kvm_handle_fault_on_reboot(insn, "")
-- 
cgit v1.2.3


From 7a040a4384c7c4973deb4d58a76e1b0ee3c8aa39 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 13:42:25 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/include/asm/xsave.h

Remove open-coded exception table entries in arch/x86/include/asm/xsave.h,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/xsave.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index c6ce2452f10c..8a1b6f9b594a 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -80,10 +80,7 @@ static inline int xsave_user(struct xsave_struct __user *buf)
 			     "3:  movl $-1,%[err]\n"
 			     "    jmp  2b\n"
 			     ".previous\n"
-			     ".section __ex_table,\"a\"\n"
-			     _ASM_ALIGN "\n"
-			     _ASM_PTR "1b,3b\n"
-			     ".previous"
+			     _ASM_EXTABLE(1b,3b)
 			     : [err] "=r" (err)
 			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)
 			     : "memory");
@@ -106,10 +103,7 @@ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
 			     "3:  movl $-1,%[err]\n"
 			     "    jmp  2b\n"
 			     ".previous\n"
-			     ".section __ex_table,\"a\"\n"
-			     _ASM_ALIGN "\n"
-			     _ASM_PTR "1b,3b\n"
-			     ".previous"
+			     _ASM_EXTABLE(1b,3b)
 			     : [err] "=r" (err)
 			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
 			     : "memory");	/* memory required? */
-- 
cgit v1.2.3


From 8571723a698dcc0ee16c1c63908aa99dd940ce5c Mon Sep 17 00:00:00 2001
From: Chen Gong <gong.chen@linux.intel.com>
Date: Fri, 20 Apr 2012 16:02:05 -0700
Subject: x86/mce Add validation check before GHES error is recorded

When GHES error record is logged into mcelog kernel buffer, a validation
check for physical address is necessary, which prevents reporting an
invalid physical address.

[Since physical address is the only useful element in this error record,
we drop generating the record completely if we don't have a valid address]

Signed-off-by: Chen Gong <gong.chen@linux.intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e2..cd8b166a1735 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
 	struct mce m;
 
 	/* Only corrected MC is reported */
-	if (!corrected)
+	if (!corrected || !(mem_err->validation_bits &
+				CPER_MEM_VALID_PHYSICAL_ADDRESS))
 		return;
 
 	mce_setup(&m);
-- 
cgit v1.2.3


From a3e859fed1244b72253718e076a724ffe13a9584 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 16:51:50 -0700
Subject: x86, extable: Remove open-coded exception table entries in
 arch/x86/ia32/ia32entry.S

Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S,
and replace them with _ASM_EXTABLE() macros; this will allow us to
change the format and type of the exception table entries.

This one was missed from the previous patch to this file.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/ia32/ia32entry.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index eb48edd0cad2..20e5f7ba0e6b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -302,9 +302,7 @@ ENTRY(ia32_cstar_target)
 	   32bit zero extended */ 
 	/* hardware stack frame is complete now */	
 1:	movl	(%r8),%r9d
-	.section __ex_table,"a"
-	.quad 1b,ia32_badarg
-	.previous	
+	_ASM_EXTABLE(1b,ia32_badarg)
 	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	CFI_REMEMBER_STATE
-- 
cgit v1.2.3


From 535c0c34698061544f81a51c65fc51f4eeeebff6 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 16:57:35 -0700
Subject: x86, extable: Add _ASM_EXTABLE_EX() macro

Add _ASM_EXTABLE_EX() to generate the special extable entries that are
associated with uaccess_err.  This allows us to change the protocol
associated with these special entries.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/asm.h     | 28 ++++++++++++++++++++--------
 arch/x86/include/asm/uaccess.h |  8 ++++----
 2 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 53dce41f2517..0f15e8a4f565 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -40,16 +40,28 @@
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)			\
-	.pushsection "__ex_table","a" ;		\
-	_ASM_ALIGN ;				\
-	_ASM_PTR from , to ;			\
+# define _ASM_EXTABLE(from,to)					\
+	.pushsection "__ex_table","a" ;				\
+	_ASM_ALIGN ;						\
+	_ASM_PTR from , to ;					\
+	.popsection
+
+# define _ASM_EXTABLE_EX(from,to)				\
+	.pushsection "__ex_table","a" ;				\
+	_ASM_ALIGN ;						\
+	_ASM_PTR from , (to) - (from) ;				\
 	.popsection
 #else
-# define _ASM_EXTABLE(from,to)			\
-	" .pushsection \"__ex_table\",\"a\"\n"	\
-	_ASM_ALIGN "\n" 			\
-	_ASM_PTR #from "," #to "\n" 		\
+# define _ASM_EXTABLE(from,to)					\
+	" .pushsection \"__ex_table\",\"a\"\n"			\
+	_ASM_ALIGN "\n" 					\
+	_ASM_PTR #from "," #to "\n" 				\
+	" .popsection\n"
+
+# define _ASM_EXTABLE_EX(from,to)				\
+	" .pushsection \"__ex_table\",\"a\"\n"			\
+	_ASM_ALIGN "\n" 					\
+	_ASM_PTR #from ",(" #to ")-(" #from ")\n" 		\
 	" .popsection\n"
 #endif
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index e0544597cfe7..4ee59dd66f5d 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -202,8 +202,8 @@ extern int __get_user_bad(void);
 	asm volatile("1:	movl %%eax,0(%1)\n"			\
 		     "2:	movl %%edx,4(%1)\n"			\
 		     "3:\n"						\
-		     _ASM_EXTABLE(1b, 2b - 1b)				\
-		     _ASM_EXTABLE(2b, 3b - 2b)				\
+		     _ASM_EXTABLE_EX(1b, 2b)				\
+		     _ASM_EXTABLE_EX(2b, 3b)				\
 		     : : "A" (x), "r" (addr))
 
 #define __put_user_x8(x, ptr, __ret_pu)				\
@@ -408,7 +408,7 @@ do {									\
 #define __get_user_asm_ex(x, addr, itype, rtype, ltype)			\
 	asm volatile("1:	mov"itype" %1,%"rtype"0\n"		\
 		     "2:\n"						\
-		     _ASM_EXTABLE(1b, 2b - 1b)				\
+		     _ASM_EXTABLE_EX(1b, 2b)				\
 		     : ltype(x) : "m" (__m(addr)))
 
 #define __put_user_nocheck(x, ptr, size)			\
@@ -450,7 +450,7 @@ struct __large_struct { unsigned long buf[100]; };
 #define __put_user_asm_ex(x, addr, itype, rtype, ltype)			\
 	asm volatile("1:	mov"itype" %"rtype"0,%1\n"		\
 		     "2:\n"						\
-		     _ASM_EXTABLE(1b, 2b - 1b)				\
+		     _ASM_EXTABLE_EX(1b, 2b)				\
 		     : : ltype(x), "m" (__m(addr)))
 
 /*
-- 
cgit v1.2.3


From fa574a48a1e9706bba38188d3bf61ecb66546a77 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 17:11:17 -0700
Subject: x86, extable: Disable presorted exception table for now

Disable presorting the exception table in preparation for changing the
format.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2f925ccb3e5b..1d14cc6b79ad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,7 +82,6 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
-	select BUILDTIME_EXTABLE_SORT
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
-- 
cgit v1.2.3


From 706276543b699d80f546e45f8b12574e7b18d952 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 20 Apr 2012 17:12:48 -0700
Subject: x86, extable: Switch to relative exception table entries

Switch to using relative exception table entries on x86.  On i386,
this has the advantage that the exception table entries don't need to
be relocated; on x86-64 this means the exception table entries take up
only half the space.

In either case, a 32-bit delta is sufficient, as the range of kernel
code addresses is limited.

Since part of the goal is to avoid needing to adjust the entries when
the kernel is relocated, the old trick of using addresses in the NULL
pointer range to indicate uaccess_err no longer works (and unlike RISC
architectures we can't use a flag bit); instead use an delta just
below +2G to indicate these special entries.  The reach is still
limited to a single instruction.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: David Daney <david.daney@cavium.com>
Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com
---
 arch/x86/include/asm/asm.h     |  20 ++++---
 arch/x86/include/asm/uaccess.h |  17 ++++--
 arch/x86/mm/extable.c          | 131 ++++++++++++++++++++++++++++++++++++++---
 3 files changed, 146 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0f15e8a4f565..1c2d247f65ce 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -42,26 +42,30 @@
 #ifdef __ASSEMBLY__
 # define _ASM_EXTABLE(from,to)					\
 	.pushsection "__ex_table","a" ;				\
-	_ASM_ALIGN ;						\
-	_ASM_PTR from , to ;					\
+	.balign 8 ;						\
+	.long (from) - . ;					\
+	.long (to) - . ;					\
 	.popsection
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	.pushsection "__ex_table","a" ;				\
-	_ASM_ALIGN ;						\
-	_ASM_PTR from , (to) - (from) ;				\
+	.balign 8 ;						\
+	.long (from) - . ;					\
+	.long (to) - . + 0x7ffffff0 ;				\
 	.popsection
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	_ASM_ALIGN "\n" 					\
-	_ASM_PTR #from "," #to "\n" 				\
+	" .balign 8\n"						\
+	" .long (" #from ") - .\n"				\
+	" .long (" #to ") - .\n"				\
 	" .popsection\n"
 
 # define _ASM_EXTABLE_EX(from,to)				\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
-	_ASM_ALIGN "\n" 					\
-	_ASM_PTR #from ",(" #to ")-(" #from ")\n" 		\
+	" .balign 8\n"						\
+	" .long (" #from ") - .\n"				\
+	" .long (" #to ") - . + 0x7ffffff0\n"			\
 	" .popsection\n"
 #endif
 
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 4ee59dd66f5d..851fe0dc13bc 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -79,11 +79,12 @@
 #define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
 
 /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of pairs of addresses relative to the
+ * exception table enty itself: the first is the address of an
+ * instruction that is allowed to fault, and the second is the address
+ * at which the program should continue.  No registers are modified,
+ * so it is entirely up to the continuation code to figure out what to
+ * do.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -92,10 +93,14 @@
  */
 
 struct exception_table_entry {
-	unsigned long insn, fixup;
+	int insn, fixup;
 };
+/* This is not the generic standard exception_table_entry format */
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
 
 extern int fixup_exception(struct pt_regs *regs);
+extern int early_fixup_exception(unsigned long *ip);
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 5555675dadb6..903ec1e9c326 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,11 +1,23 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/sort.h>
 #include <asm/uaccess.h>
 
+static inline unsigned long
+ex_insn_addr(const struct exception_table_entry *x)
+{
+	return (unsigned long)&x->insn + x->insn;
+}
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+	return (unsigned long)&x->fixup + x->fixup;
+}
 
 int fixup_exception(struct pt_regs *regs)
 {
 	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
 
 #ifdef CONFIG_PNPBIOS
 	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -23,13 +35,14 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
-		/* If fixup is less than 16, it means uaccess error */
-		if (fixup->fixup < 16) {
+		new_ip = ex_fixup_addr(fixup);
+
+		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+			/* Special hack for uaccess_err */
 			current_thread_info()->uaccess_err = 1;
-			regs->ip += fixup->fixup;
-			return 1;
+			new_ip -= 0x7ffffff0;
 		}
-		regs->ip = fixup->fixup;
+		regs->ip = new_ip;
 		return 1;
 	}
 
@@ -40,15 +53,117 @@ int fixup_exception(struct pt_regs *regs)
 int __init early_fixup_exception(unsigned long *ip)
 {
 	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
 
 	fixup = search_exception_tables(*ip);
 	if (fixup) {
-		if (fixup->fixup < 16)
-			return 0; /* Not supported during early boot */
+		new_ip = ex_fixup_addr(fixup);
+
+		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+			/* uaccess handling not supported during early boot */
+			return 0;
+		}
 
-		*ip = fixup->fixup;
+		*ip = new_ip;
 		return 1;
 	}
 
 	return 0;
 }
+
+/*
+ * Search one exception table for an entry corresponding to the
+ * given instruction address, and return the address of the entry,
+ * or NULL if none is found.
+ * We use a binary search, and thus we assume that the table is
+ * already sorted.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+	       const struct exception_table_entry *last,
+	       unsigned long value)
+{
+	while (first <= last) {
+		const struct exception_table_entry *mid;
+		unsigned long addr;
+
+		mid = ((last - first) >> 1) + first;
+		addr = ex_insn_addr(mid);
+		if (addr < value)
+			first = mid + 1;
+		else if (addr > value)
+			last = mid - 1;
+		else
+			return mid;
+        }
+        return NULL;
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+	const struct exception_table_entry *x = a, *y = b;
+
+	/*
+	 * This value will always end up fittin in an int, because on
+	 * both i386 and x86-64 the kernel symbol-reachable address
+	 * space is < 2 GiB.
+	 *
+	 * This compare is only valid after normalization.
+	 */
+	return x->insn - y->insn;
+}
+
+void sort_extable(struct exception_table_entry *start,
+		  struct exception_table_entry *finish)
+{
+	struct exception_table_entry *p;
+	int i;
+
+	/* Convert all entries to being relative to the start of the section */
+	i = 0;
+	for (p = start; p < finish; p++) {
+		p->insn += i;
+		i += 4;
+		p->fixup += i;
+		i += 4;
+	}
+
+	sort(start, finish - start, sizeof(struct exception_table_entry),
+	     cmp_ex, NULL);
+
+	/* Denormalize all entries */
+	i = 0;
+	for (p = start; p < finish; p++) {
+		p->insn -= i;
+		i += 4;
+		p->fixup -= i;
+		i += 4;
+	}
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+	/*trim the beginning*/
+	while (m->num_exentries &&
+	       within_module_init(ex_insn_addr(&m->extable[0]), m)) {
+		m->extable++;
+		m->num_exentries--;
+	}
+	/*trim the end*/
+	while (m->num_exentries &&
+	       within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
+		m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
-- 
cgit v1.2.3


From 88674088d10ca2538b2efd2559f6620ade8ec373 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Mon, 16 Apr 2012 16:26:04 -0400
Subject: x86: Use vga_default_device() when determining whether an fb is
 primary

IORESOURCE_ROM_SHADOW is not necessarily an indication that the hardware
is the primary device. Add support for using the vgaarb functions and
fall back if nothing's set them.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Cc: mingo@redhat.com
Acked-by: hpa@zytor.com
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/video/fbdev.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index c5ffb6ac8707..d5644bbe8cba 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -9,24 +9,34 @@
 #include <linux/fb.h>
 #include <linux/pci.h>
 #include <linux/module.h>
+#include <linux/vgaarb.h>
 
 int fb_is_primary_device(struct fb_info *info)
 {
 	struct device *device = info->device;
 	struct pci_dev *pci_dev = NULL;
+	struct pci_dev *default_device = vga_default_device();
 	struct resource *res = NULL;
-	int retval = 0;
 
 	if (device)
 		pci_dev = to_pci_dev(device);
 
-	if (pci_dev)
-		res = &pci_dev->resource[PCI_ROM_RESOURCE];
+	if (!pci_dev)
+		return 0;
+
+	if (default_device) {
+		if (pci_dev == default_device)
+			return 1;
+		else
+			return 0;
+	}
+
+	res = &pci_dev->resource[PCI_ROM_RESOURCE];
 
 	if (res && res->flags & IORESOURCE_ROM_SHADOW)
-		retval = 1;
+		return 1;
 
-	return retval;
+	return 0;
 }
 EXPORT_SYMBOL(fb_is_primary_device);
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From b4aa0163056b6c70029b6e8619ce07c274351f42 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Mon, 16 Apr 2012 16:26:05 -0400
Subject: efifb: Implement vga_default_device() (v2)

EFI doesn't typically make use of the legacy VGA ROM, but it may still be
configured to pass that through to a given video device. This may lead to
an inaccurate choice of default video device. Add support to efifb to pick
out the correct active video device.

v2: fix if->ifdef

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Acked-by: hpa@zytor.com
Cc: matt.fleming@intel.com
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/include/asm/vga.h |  6 ++++
 drivers/video/efifb.c      | 77 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 63 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..44282fbf7bf9 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -17,4 +17,10 @@
 #define vga_readb(x) (*(x))
 #define vga_writeb(x, y) (*(y) = (x))
 
+#ifdef CONFIG_FB_EFI
+#define __ARCH_HAS_VGA_DEFAULT_DEVICE
+extern struct pci_dev *vga_default_device(void);
+extern void vga_set_default_device(struct pci_dev *pdev);
+#endif
+
 #endif /* _ASM_X86_VGA_H */
diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c
index 784139aed079..66ed991ed8ba 100644
--- a/drivers/video/efifb.c
+++ b/drivers/video/efifb.c
@@ -18,6 +18,8 @@
 
 static bool request_mem_succeeded = false;
 
+static struct pci_dev *default_vga;
+
 static struct fb_var_screeninfo efifb_defined __devinitdata = {
 	.activate		= FB_ACTIVATE_NOW,
 	.height			= -1,
@@ -298,35 +300,70 @@ static struct fb_ops efifb_ops = {
 	.fb_imageblit	= cfb_imageblit,
 };
 
+struct pci_dev *vga_default_device(void)
+{
+	return default_vga;
+}
+
+void vga_set_default_device(struct pci_dev *pdev)
+{
+	default_vga = pdev;
+}
+
 static int __init efifb_setup(char *options)
 {
 	char *this_opt;
 	int i;
+	struct pci_dev *dev = NULL;
+
+	if (options && *options) {
+		while ((this_opt = strsep(&options, ",")) != NULL) {
+			if (!*this_opt) continue;
+
+			for (i = 0; i < M_UNKNOWN; i++) {
+				if (!strcmp(this_opt, dmi_list[i].optname) &&
+				    dmi_list[i].base != 0) {
+					screen_info.lfb_base = dmi_list[i].base;
+					screen_info.lfb_linelength = dmi_list[i].stride;
+					screen_info.lfb_width = dmi_list[i].width;
+					screen_info.lfb_height = dmi_list[i].height;
+				}
+			}
+			if (!strncmp(this_opt, "base:", 5))
+				screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0);
+			else if (!strncmp(this_opt, "stride:", 7))
+				screen_info.lfb_linelength = simple_strtoul(this_opt+7, NULL, 0) * 4;
+			else if (!strncmp(this_opt, "height:", 7))
+				screen_info.lfb_height = simple_strtoul(this_opt+7, NULL, 0);
+			else if (!strncmp(this_opt, "width:", 6))
+				screen_info.lfb_width = simple_strtoul(this_opt+6, NULL, 0);
+		}
+	}
 
-	if (!options || !*options)
-		return 0;
+	for_each_pci_dev(dev) {
+		int i;
 
-	while ((this_opt = strsep(&options, ",")) != NULL) {
-		if (!*this_opt) continue;
+		if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+			continue;
 
-		for (i = 0; i < M_UNKNOWN; i++) {
-			if (!strcmp(this_opt, dmi_list[i].optname) &&
-					dmi_list[i].base != 0) {
-				screen_info.lfb_base = dmi_list[i].base;
-				screen_info.lfb_linelength = dmi_list[i].stride;
-				screen_info.lfb_width = dmi_list[i].width;
-				screen_info.lfb_height = dmi_list[i].height;
-			}
+		for (i=0; i < DEVICE_COUNT_RESOURCE; i++) {
+			resource_size_t start, end;
+
+			if (!(pci_resource_flags(dev, i) & IORESOURCE_MEM))
+				continue;
+
+			start = pci_resource_start(dev, i);
+			end  = pci_resource_end(dev, i);
+
+			if (!start || !end)
+				continue;
+
+			if (screen_info.lfb_base >= start &&
+			    (screen_info.lfb_base + screen_info.lfb_size) < end)
+				default_vga = dev;
 		}
-		if (!strncmp(this_opt, "base:", 5))
-			screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0);
-		else if (!strncmp(this_opt, "stride:", 7))
-			screen_info.lfb_linelength = simple_strtoul(this_opt+7, NULL, 0) * 4;
-		else if (!strncmp(this_opt, "height:", 7))
-			screen_info.lfb_height = simple_strtoul(this_opt+7, NULL, 0);
-		else if (!strncmp(this_opt, "width:", 6))
-			screen_info.lfb_width = simple_strtoul(this_opt+6, NULL, 0);
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 07975ad3b30579ca27d880491ad992326b930c63 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 29 Mar 2012 21:14:12 +0200
Subject: KVM: Introduce direct MSI message injection for in-kernel irqchips

Currently, MSI messages can only be injected to in-kernel irqchips by
defining a corresponding IRQ route for each message. This is not only
unhandy if the MSI messages are generated "on the fly" by user space,
IRQ routes are a limited resource that user space has to manage
carefully.

By providing a direct injection path, we can both avoid using up limited
resources and simplify the necessary steps for user land.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 21 +++++++++++++++++++++
 arch/x86/kvm/Kconfig              |  1 +
 include/linux/kvm.h               | 11 +++++++++++
 include/linux/kvm_host.h          |  2 ++
 virt/kvm/Kconfig                  |  3 +++
 virt/kvm/irq_comm.c               | 14 ++++++++++++++
 virt/kvm/kvm_main.c               | 14 ++++++++++++++
 7 files changed, 66 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 81ff39f6248d..a1552210b16d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1689,6 +1689,27 @@ where the guest will clear the flag: when the soft lockup watchdog timer resets
 itself or when a soft lockup is detected.  This ioctl can be called any time
 after pausing the vcpu, but before it is resumed.
 
+4.71 KVM_SIGNAL_MSI
+
+Capability: KVM_CAP_SIGNAL_MSI
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msi (in)
+Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
+
+Directly inject a MSI message. Only valid with in-kernel irqchip that handles
+MSI messages.
+
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
+No flags are defined so far. The corresponding field must be 0.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f375..a28f338843ea 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
 	select TASKSTATS
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
+	select HAVE_KVM_MSI
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 7a9dd4b3dede..225b452e1d1d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -590,6 +590,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SYNC_REGS 74
 #define KVM_CAP_PCI_2_3 75
 #define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -715,6 +716,14 @@ struct kvm_one_reg {
 	__u64 addr;
 };
 
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -789,6 +798,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PCI_2_3 */
 #define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
 				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 186ffab0b9f0..6f343307d72b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -802,6 +802,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			unsigned flags);
 void kvm_free_irq_routing(struct kvm *kvm);
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
 #else
 
 static inline void kvm_free_irq_routing(struct kvm *kvm) {}
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index f63ccb0a5982..28694f4a9139 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,3 +18,6 @@ config KVM_MMIO
 
 config KVM_ASYNC_PF
        bool
+
+config HAVE_KVM_MSI
+       bool
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9f614b4e365f..a6a0365475ed 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -138,6 +138,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	struct kvm_kernel_irq_routing_entry route;
+
+	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+		return -EINVAL;
+
+	route.msi.address_lo = msi->address_lo;
+	route.msi.address_hi = msi->address_hi;
+	route.msi.data = msi->data;
+
+	return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+}
+
 /*
  * Return value:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9eb7936e491d..1847c762d8d9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2059,6 +2059,17 @@ static long kvm_vm_ioctl(struct file *filp,
 			kvm->bsp_vcpu_id = arg;
 		mutex_unlock(&kvm->lock);
 		break;
+#endif
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_SIGNAL_MSI: {
+		struct kvm_msi msi;
+
+		r = -EFAULT;
+		if (copy_from_user(&msi, argp, sizeof msi))
+			goto out;
+		r = kvm_send_userspace_msi(kvm, &msi);
+		break;
+	}
 #endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
@@ -2188,6 +2199,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_CAP_SIGNAL_MSI:
+#endif
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
-- 
cgit v1.2.3


From 413837714232b3a4c0705e915d8af75ad521d083 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 19 Apr 2012 14:06:29 +0300
Subject: KVM: Introduce bitmask for apic attention reasons

The patch introduces a bitmap that will hold reasons apic should be
checked during vmexit. This is in a preparation for vp eoi patch
that will add one more check on vmexit. With the bitmap we can do
if(apic_attention) to check everything simultaneously which will
add zero overhead on the fast path.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 ++++
 arch/x86/kvm/lapic.c            | 12 +++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f624ca72ea24..69e39bc7e36f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -172,6 +172,9 @@ enum {
 #define DR7_FIXED_1	0x00000400
 #define DR7_VOLATILE	0xffff23ff
 
+/* apic attention bits */
+#define KVM_APIC_CHECK_VAPIC	0
+
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
  * enough memory for a single page fault in a cache.
@@ -337,6 +340,7 @@ struct kvm_vcpu_arch {
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 992b4eaae684..93c15743f1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1088,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_update_ppr(apic);
 
 	vcpu->arch.apic_arb_prio = 0;
+	vcpu->arch.apic_attention = 0;
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1287,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	u32 data;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1304,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	apic = vcpu->arch.apic;
@@ -1324,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
 	vcpu->arch.apic->vapic_addr = vapic_addr;
+	if (vapic_addr)
+		__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+	else
+		__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
 }
 
 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-- 
cgit v1.2.3


From 38e8a2ddc9ada5dd1f2def95bebb733bf619bbef Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 22 Apr 2012 15:12:50 +0300
Subject: KVM: x86 emulator: fix asm constraint in flush_pending_x87_faults

'bool' wants 8-bit registers.

Reported-by: Takuya Yoshikawa <takuya.yoshikawa@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d5729a91d08d..0d151e232480 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4123,7 +4123,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
 		     "jmp 2b \n\t"
 		     ".popsection \n\t"
 		     _ASM_EXTABLE(1b, 3b)
-		     : [fault]"+rm"(fault));
+		     : [fault]"+qm"(fault));
 	ctxt->ops->put_fpu(ctxt);
 
 	if (unlikely(fault))
-- 
cgit v1.2.3


From 8b5ad472991796b2347464922c72de2ca5a028f3 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Tue, 24 Apr 2012 11:23:15 -0700
Subject: Revert "x86, extable: Disable presorted exception table for now"

sortextable now works with relative entries, re-enable it.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Link: http://lkml.kernel.org/r/1335291795-26693-3-git-send-email-ddaney.cavm@gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..2f925ccb3e5b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
+	select BUILDTIME_EXTABLE_SORT
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
-- 
cgit v1.2.3


From 553222f3e81f18da31b2552e18dc519715198590 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 29 Mar 2012 16:11:16 -0400
Subject: x86/nmi: Add new NMI queues to deal with IO_CHK and SERR

In discussions with Thomas Mingarelli about hpwdt, he explained
to me some issues they were some when using their virtual NMI
button to test the hpwdt driver.

It turns out the virtual NMI button used on HP's machines do no
send unknown NMIs but instead send IO_CHK NMIs.  The way the
kernel code is written, the hpwdt driver can not register itself
against that type of NMI and therefore can not successfully
capture system information before panic'ing.

To solve this I created two new NMI queues to allow driver to
register against the IO_CHK and SERR NMIs.  Or in the hpwdt all
three (if you include unknown NMIs too).

The change is straightforward and just mimics what the unknown
NMI does.

Reported-and-tested-by: Thomas Mingarelli <thomas.mingarelli@hp.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1333051877-15755-3-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nmi.h |  2 ++
 arch/x86/kernel/nmi.c      | 18 ++++++++++++++++++
 drivers/watchdog/hpwdt.c   | 27 ++++++++++++++++++++-------
 3 files changed, 40 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index fd3f9f18cf3f..07162dfbff84 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -27,6 +27,8 @@ void arch_trigger_all_cpu_backtrace(void);
 enum {
 	NMI_LOCAL=0,
 	NMI_UNKNOWN,
+	NMI_SERR,
+	NMI_IO_CHECK,
 	NMI_MAX
 };
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf319165..ac9c1b76df96 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -54,6 +54,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
 		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[1].head),
 	},
+	{
+		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+		.head = LIST_HEAD_INIT(nmi_desc[2].head),
+	},
+	{
+		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+		.head = LIST_HEAD_INIT(nmi_desc[3].head),
+	},
 
 };
 
@@ -120,6 +128,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
 	 * to manage expectations
 	 */
 	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
 
 	/*
 	 * some handlers need to be executed first otherwise a fake
@@ -212,6 +222,10 @@ EXPORT_SYMBOL_GPL(unregister_nmi_handler);
 static notrace __kprobes void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
+	/* check to see if anyone registered against these types of errors */
+	if (nmi_handle(NMI_SERR, regs, false))
+		return;
+
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
 		 reason, smp_processor_id());
 
@@ -241,6 +255,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 {
 	unsigned long i;
 
+	/* check to see if anyone registered against these types of errors */
+	if (nmi_handle(NMI_IO_CHECK, regs, false))
+		return;
+
 	pr_emerg(
 	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
 		 reason, smp_processor_id());
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 4000b8038cac..6e414b501d58 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -725,19 +725,32 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev)
 	 * Only one function can register for NMI_UNKNOWN
 	 */
 	retval = register_nmi_handler(NMI_UNKNOWN, hpwdt_pretimeout, 0, "hpwdt");
-	if (retval != 0) {
-		dev_warn(&dev->dev,
-			"Unable to register a die notifier (err=%d).\n",
-			retval);
-		if (cru_rom_addr)
-			iounmap(cru_rom_addr);
-	}
+	if (retval)
+		goto error;
+	retval = register_nmi_handler(NMI_SERR, hpwdt_pretimeout, 0, "hpwdt");
+	if (retval)
+		goto error1;
+	retval = register_nmi_handler(NMI_IO_CHECK, hpwdt_pretimeout, 0, "hpwdt");
+	if (retval)
+		goto error2;
 
 	dev_info(&dev->dev,
 			"HP Watchdog Timer Driver: NMI decoding initialized"
 			", allow kernel dump: %s (default = 0/OFF)\n",
 			(allow_kdump == 0) ? "OFF" : "ON");
 	return 0;
+
+error2:
+	unregister_nmi_handler(NMI_SERR, "hpwdt");
+error1:
+	unregister_nmi_handler(NMI_UNKNOWN, "hpwdt");
+error:
+	dev_warn(&dev->dev,
+		"Unable to register a die notifier (err=%d).\n",
+		retval);
+	if (cru_rom_addr)
+		iounmap(cru_rom_addr);
+	return retval;
 }
 
 static void hpwdt_exit_nmi_decoding(void)
-- 
cgit v1.2.3


From 72b3fb24713755cf9740b403e95aa67ceedf3509 Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Thu, 29 Mar 2012 16:11:17 -0400
Subject: x86/nmi: Fix page faults by nmiaction if kmemcheck is enabled

This patch tries to fix the problem of page fault exception
caused by accessing nmiaction structure in nmi if kmemcheck
is enabled.

If kmemcheck is enabled, the memory allocated through slab are
in pages that are marked non-present, so that some checks could
be done in the page fault handling code ( e.g. whether the
memory is read before written to ).

As nmiaction is allocated in this way, so it resides in a
non-present page. Then there is a page fault while the nmi code
accessing the nmiaction structure, which would then cause a
warning by WARN_ON_ONCE(in_nmi()) in kmemcheck_fault(), called
by do_page_fault().

This significantly simplifies the code as well, as the whole
dynamic allocation dance goes away.

v2: as Peter suggested, changed the nmiaction to use static
    storage.

v3: as Peter suggested, use macro to shorten the codes. Also
    keep the original usage of register_nmi_handler, so users of
    this call doesn't need change.

Tested-by: Seiji Aguchi <seiji.aguchi@hds.com>
Fixes: https://lkml.org/lkml/2012/3/2/356
Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
[ simplified the wrappers ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: thomas.mingarelli@hp.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1333051877-15755-4-git-send-email-dzickus@redhat.com
[ tidied the patch a bit ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nmi.h | 20 ++++++++++++--
 arch/x86/kernel/nmi.c      | 65 +++++-----------------------------------------
 2 files changed, 24 insertions(+), 61 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 07162dfbff84..a1a836c8131c 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -37,8 +37,24 @@ enum {
 
 typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
 
-int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long,
-			 const char *);
+struct nmiaction {
+	struct list_head	list;
+	nmi_handler_t		handler;
+	unsigned int		flags;
+	const char		*name;
+};
+
+#define register_nmi_handler(t, fn, fg, n)		\
+({							\
+	static struct nmiaction fn##_na = {		\
+		.handler = (fn),			\
+		.name = (n),				\
+		.flags = (fg),				\
+	};						\
+	__register_nmi_handler((t), &fn##_na);	\
+})
+
+int __register_nmi_handler(unsigned int, struct nmiaction *);
 
 void unregister_nmi_handler(unsigned int, const char *);
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ac9c1b76df96..585be4bd71a5 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -31,14 +31,6 @@
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
 
-#define NMI_MAX_NAMELEN	16
-struct nmiaction {
-	struct list_head list;
-	nmi_handler_t handler;
-	unsigned int flags;
-	char *name;
-};
-
 struct nmi_desc {
 	spinlock_t lock;
 	struct list_head head;
@@ -115,11 +107,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
 	return handled;
 }
 
-static int __setup_nmi(unsigned int type, struct nmiaction *action)
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	unsigned long flags;
 
+	if (!action->handler)
+		return -EINVAL;
+
 	spin_lock_irqsave(&desc->lock, flags);
 
 	/*
@@ -143,8 +138,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
+EXPORT_SYMBOL(__register_nmi_handler);
 
-static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+void unregister_nmi_handler(unsigned int type, const char *name)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *n;
@@ -167,56 +163,7 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
 
 	spin_unlock_irqrestore(&desc->lock, flags);
 	synchronize_rcu();
-	return (n);
 }
-
-int register_nmi_handler(unsigned int type, nmi_handler_t handler,
-			unsigned long nmiflags, const char *devname)
-{
-	struct nmiaction *action;
-	int retval = -ENOMEM;
-
-	if (!handler)
-		return -EINVAL;
-
-	action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
-	if (!action)
-		goto fail_action;
-
-	action->handler = handler;
-	action->flags = nmiflags;
-	action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
-	if (!action->name)
-		goto fail_action_name;
-
-	retval = __setup_nmi(type, action);
-
-	if (retval)
-		goto fail_setup_nmi;
-
-	return retval;
-
-fail_setup_nmi:
-	kfree(action->name);
-fail_action_name:
-	kfree(action);
-fail_action:	
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(register_nmi_handler);
-
-void unregister_nmi_handler(unsigned int type, const char *name)
-{
-	struct nmiaction *a;
-
-	a = __free_nmi(type, name);
-	if (a) {
-		kfree(a->name);
-		kfree(a);
-	}
-}
-
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
 
 static notrace __kprobes void
-- 
cgit v1.2.3


From fab06992de6433af097c4a1d2d1b119812753ca7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 25 Apr 2012 12:55:22 +0200
Subject: perf/x86: Clean up register_nmi_handler() usage

A function name represents the pointer to it - no need to take the
address of it. (Fixing this helps us introduce some macro magic
around register_nmi_handler() in the future.)

Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 573d24873459..8ff74d439041 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -469,7 +469,7 @@ static __init int perf_event_ibs_init(void)
 
 	perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
 	perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-	register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs");
+	register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
 	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
 
 	return 0;
-- 
cgit v1.2.3


From 8239c25f47d2b318156993b15f33900a86ea5e17 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 20 Apr 2012 13:05:42 +0000
Subject: smp: Add task_struct argument to __cpu_up()

Preparatory patch to make the idle thread allocation for secondary
cpus generic.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: James E.J. Bottomley <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20120420124556.964170564@linutronix.de
---
 arch/alpha/kernel/smp.c         | 2 +-
 arch/arm/kernel/smp.c           | 2 +-
 arch/blackfin/mach-common/smp.c | 2 +-
 arch/cris/arch-v32/kernel/smp.c | 2 +-
 arch/hexagon/kernel/smp.c       | 2 +-
 arch/ia64/kernel/smpboot.c      | 2 +-
 arch/m32r/kernel/smpboot.c      | 2 +-
 arch/mips/kernel/smp.c          | 2 +-
 arch/mn10300/kernel/smp.c       | 2 +-
 arch/parisc/kernel/smp.c        | 2 +-
 arch/powerpc/kernel/smp.c       | 2 +-
 arch/s390/include/asm/smp.h     | 2 +-
 arch/s390/kernel/smp.c          | 2 +-
 arch/sh/kernel/smp.c            | 2 +-
 arch/sparc/kernel/smp_32.c      | 2 +-
 arch/sparc/kernel/smp_64.c      | 2 +-
 arch/tile/kernel/smpboot.c      | 2 +-
 arch/um/kernel/smp.c            | 2 +-
 arch/x86/include/asm/smp.h      | 4 +++-
 include/linux/smp.h             | 2 +-
 kernel/cpu.c                    | 2 +-
 21 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 50d438db1f6b..68d39470fb52 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -487,7 +487,7 @@ smp_prepare_boot_cpu(void)
 }
 
 int __cpuinit
-__cpu_up(unsigned int cpu)
+__cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	smp_boot_one_cpu(cpu);
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index addbbe8028c2..f0e2cbbd837d 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -60,7 +60,7 @@ enum ipi_msg_type {
 
 static DECLARE_COMPLETION(cpu_running);
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu);
 	struct task_struct *idle = ci->idle;
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index ac8f8a43158c..d0cddd95b0dd 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -340,7 +340,7 @@ void smp_send_stop(void)
 	return;
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret;
 	struct blackfin_cpudata *ci = &per_cpu(cpu_data, cpu);
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 0b99df72d2a4..125ee2d7bc87 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -207,7 +207,7 @@ int setup_profiling_timer(unsigned int multiplier)
  */
 unsigned long cache_decay_ticks = 1;
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	smp_boot_one_cpu(cpu);
 	return cpu_online(cpu) ? 0 : -ENOSYS;
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 1298141874a3..93e77e2b17a8 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -196,7 +196,7 @@ void __cpuinit start_secondary(void)
  * maintains control until "cpu_online(cpu)" is set.
  */
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct task_struct *idle;
 	struct thread_info *thread;
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 796f6a5b966a..03e4ef3893c9 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -793,7 +793,7 @@ set_cpu_sibling_map(int cpu)
 }
 
 int __cpuinit
-__cpu_up (unsigned int cpu)
+__cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret;
 	int sapicid;
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 31541c9b7eb6..a2cfc0abb05c 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -343,7 +343,7 @@ static void __init do_boot_cpu(int phys_id)
 	}
 }
 
-int __cpuinit __cpu_up(unsigned int cpu_id)
+int __cpuinit __cpu_up(unsigned int cpu_id, struct task_struct *tidle)
 {
 	int timeout;
 
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index ba9376bf52a1..41079b256092 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -209,7 +209,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct task_struct *idle;
 
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 910dddf65e44..c6b40dad0d0b 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -921,7 +921,7 @@ void initialize_secondary(void)
  * __cpu_up - Set smp_commenced_mask for the nominated CPU
  * @cpu: The target CPU.
  */
-int __devinit __cpu_up(unsigned int cpu)
+int __devinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int timeout;
 
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 0bb1d63907f8..eae8cd808f07 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -449,7 +449,7 @@ void smp_cpus_done(unsigned int cpu_max)
 }
 
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	if (cpu != 0 && cpu < parisc_max_cpus)
 		smp_boot_one_cpu(cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d9f94410fd7f..d38030fb3471 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -482,7 +482,7 @@ static int __cpuinit create_idle(unsigned int cpu)
 	return 0;
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc, c;
 
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index c77c6de6f6c0..0b6f586c1383 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -16,7 +16,7 @@
 extern struct mutex smp_cpu_state_mutex;
 extern struct save_area *zfcpdump_save_areas[NR_CPUS + 1];
 
-extern int __cpu_up(unsigned int cpu);
+extern int __cpu_up(unsigned int cpu, struct task_struct *tidle);
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 1f77227669e8..fc827aa8f9ca 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -738,7 +738,7 @@ static void __cpuinit smp_fork_idle(struct work_struct *work)
 }
 
 /* Upping and downing of CPUs */
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct create_idle c_idle;
 	struct pcpu *pcpu;
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index eaebdf6a5c77..ebb76e2a748b 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -220,7 +220,7 @@ extern struct {
 	void *thread_info;
 } stack_start;
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct task_struct *tsk;
 	unsigned long timeout;
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index f671e7fd6ddc..1f397ae11028 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -411,7 +411,7 @@ void __init smp_prepare_boot_cpu(void)
 	set_cpu_possible(cpuid, true);
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	extern int __cpuinit smp4m_boot_one_cpu(int);
 	extern int __cpuinit smp4d_boot_one_cpu(int);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 3b1bd7c50164..2f9948c4107c 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1227,7 +1227,7 @@ void __devinit smp_fill_in_sib_core_maps(void)
 	}
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret = smp_boot_one_cpu(cpu);
 
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 172aef7d3159..84873fbe8f27 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -222,7 +222,7 @@ void __cpuinit online_secondary(void)
 	cpu_idle();
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	/* Wait 5s total for all CPUs for them to come online */
 	static int timeout;
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 6f588e160fb0..a02b7e9e6b94 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -140,7 +140,7 @@ void smp_prepare_boot_cpu(void)
 	set_cpu_online(smp_processor_id(), true);
 }
 
-int __cpu_up(unsigned int cpu)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_online(cpu))
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0434c400287c..4eb3a74bc4b0 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -62,6 +62,8 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
 /* Static state in head.S used to set up a CPU */
 extern unsigned long stack_start; /* Initial stack pointer address */
 
+struct task_struct;
+
 struct smp_ops {
 	void (*smp_prepare_boot_cpu)(void);
 	void (*smp_prepare_cpus)(unsigned max_cpus);
@@ -113,7 +115,7 @@ static inline void smp_cpus_done(unsigned int max_cpus)
 	smp_ops.smp_cpus_done(max_cpus);
 }
 
-static inline int __cpu_up(unsigned int cpu)
+static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	return smp_ops.cpu_up(cpu);
 }
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 10530d92c04b..24360de6c968 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -61,7 +61,7 @@ extern void smp_prepare_cpus(unsigned int max_cpus);
 /*
  * Bring a CPU up
  */
-extern int __cpu_up(unsigned int cpunum);
+extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle);
 
 /*
  * Final polishing of CPUs
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..e711aef0fb3c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -309,7 +309,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Arch-specific enabling code. */
-	ret = __cpu_up(cpu);
+	ret = __cpu_up(cpu, NULL);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
-- 
cgit v1.2.3


From 5cdaf1834f43b0edc4a3aa683aa4ec98f6bfe8a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 20 Apr 2012 13:05:47 +0000
Subject: x86: Add task_struct argument to smp_ops.cpu_up

Preparatory patch to use the generic idle thread allocation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20120420124557.176604405@linutronix.de
---
 arch/x86/include/asm/smp.h | 6 +++---
 arch/x86/kernel/smpboot.c  | 2 +-
 arch/x86/xen/smp.c         | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4eb3a74bc4b0..f3ed33811c23 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -72,7 +72,7 @@ struct smp_ops {
 	void (*stop_other_cpus)(int wait);
 	void (*smp_send_reschedule)(int cpu);
 
-	int (*cpu_up)(unsigned cpu);
+	int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
 	int (*cpu_disable)(void);
 	void (*cpu_die)(unsigned int cpu);
 	void (*play_dead)(void);
@@ -117,7 +117,7 @@ static inline void smp_cpus_done(unsigned int max_cpus)
 
 static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
-	return smp_ops.cpu_up(cpu);
+	return smp_ops.cpu_up(cpu, tidle);
 }
 
 static inline int __cpu_disable(void)
@@ -154,7 +154,7 @@ void cpu_disable_common(void);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
-int native_cpu_up(unsigned int cpunum);
+int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..def235bf7594 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -818,7 +818,7 @@ do_rest:
 	return boot_error;
 }
 
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int apicid = apic->cpu_present_to_apicid(cpu);
 	unsigned long flags;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 5fac6919b957..64d3bbce0b36 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -331,7 +331,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	return 0;
 }
 
-static int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
@@ -547,10 +547,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
 	xen_init_lock_cpu(0);
 }
 
-static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc;
-	rc = native_cpu_up(cpu);
+	rc = native_cpu_up(cpu, tidle);
 	WARN_ON (xen_smp_intr_init(cpu));
 	return rc;
 }
-- 
cgit v1.2.3


From 7eb43a6d232bfa46464b501cd1987ec2d705d8cf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 20 Apr 2012 13:05:48 +0000
Subject: x86: Use generic idle thread allocation

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20120420124557.246929343@linutronix.de
---
 arch/x86/Kconfig           |  1 +
 arch/x86/include/asm/smp.h |  1 +
 arch/x86/kernel/smpboot.c  | 81 ++++++----------------------------------------
 arch/x86/xen/smp.c         | 15 ++-------
 4 files changed, 14 insertions(+), 84 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..046bf4bd2510 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
+	select GENERIC_SMP_IDLE_THREAD
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f3ed33811c23..f8cbc6f20e31 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -164,6 +164,7 @@ int wbinvd_on_all_cpus(void);
 
 void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
+void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
 
 void smp_store_cpu_info(int id);
 #define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index def235bf7594..3acaf51dfddb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -76,19 +76,7 @@
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
-
 /*
  * We need this for trampoline_base protection from concurrent accesses when
  * off- and onlining cores wildly.
@@ -97,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
 void cpu_hotplug_driver_lock(void)
 {
-        mutex_lock(&x86_cpu_hotplug_driver_mutex);
+	mutex_lock(&x86_cpu_hotplug_driver_mutex);
 }
 
 void cpu_hotplug_driver_unlock(void)
 {
-        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+	mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
 
 ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
 ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x)      (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
 #endif
 
 /* Number of siblings per CPU package */
@@ -618,22 +602,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-struct create_idle {
-	struct work_struct work;
-	struct task_struct *idle;
-	struct completion done;
-	int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
-	struct create_idle *c_idle =
-		container_of(work, struct create_idle, work);
-
-	c_idle->idle = fork_idle(c_idle->cpu);
-	complete(&c_idle->done);
-}
-
 /* reduce the number of lines printed when booting a large cpu count system */
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
@@ -660,58 +628,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
  * Returns zero if CPU booted OK, else error code from
  * ->wakeup_secondary_cpu.
  */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 {
 	unsigned long boot_error = 0;
 	unsigned long start_ip;
 	int timeout;
-	struct create_idle c_idle = {
-		.cpu	= cpu,
-		.done	= COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
-	};
-
-	INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
 
 	alternatives_smp_switch(1);
 
-	c_idle.idle = get_idle_for_cpu(cpu);
-
-	/*
-	 * We can't use kernel_thread since we must avoid to
-	 * reschedule the child.
-	 */
-	if (c_idle.idle) {
-		c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
-			(THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
-		init_idle(c_idle.idle, cpu);
-		goto do_rest;
-	}
+	idle->thread.sp = (unsigned long) (((struct pt_regs *)
+			  (THREAD_SIZE +  task_stack_page(idle))) - 1);
+	per_cpu(current_task, cpu) = idle;
 
-	schedule_work(&c_idle.work);
-	wait_for_completion(&c_idle.done);
-
-	if (IS_ERR(c_idle.idle)) {
-		printk("failed fork for CPU %d\n", cpu);
-		destroy_work_on_stack(&c_idle.work);
-		return PTR_ERR(c_idle.idle);
-	}
-
-	set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
-	per_cpu(current_task, cpu) = c_idle.idle;
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
 #else
-	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
 	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(c_idle.idle) -
+		(unsigned long)task_stack_page(idle) -
 		KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
-	stack_start  = c_idle.idle->thread.sp;
+	stack_start  = idle->thread.sp;
 
 	/* start_ip had better be page-aligned! */
 	start_ip = trampoline_address();
@@ -813,8 +754,6 @@ do_rest:
 		 */
 		smpboot_restore_warm_reset_vector();
 	}
-
-	destroy_work_on_stack(&c_idle.work);
 	return boot_error;
 }
 
@@ -851,7 +790,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
-	err = do_boot_cpu(apicid, cpu);
+	err = do_boot_cpu(apicid, cpu, tidle);
 	if (err) {
 		pr_debug("do_boot_cpu failed %d\n", err);
 		return -EIO;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 64d3bbce0b36..8f44cc1a9291 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -250,18 +250,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 		set_cpu_possible(cpu, false);
 	}
 
-	for_each_possible_cpu (cpu) {
-		struct task_struct *idle;
-
-		if (cpu == 0)
-			continue;
-
-		idle = fork_idle(cpu);
-		if (IS_ERR(idle))
-			panic("failed fork for CPU %d", cpu);
-
+	for_each_possible_cpu(cpu)
 		set_cpu_present(cpu, true);
-	}
 }
 
 static int __cpuinit
@@ -331,9 +321,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	return 0;
 }
 
-static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *tidle)
+static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
-	struct task_struct *idle = idle_task(cpu);
 	int rc;
 
 	per_cpu(current_task, cpu) = idle;
-- 
cgit v1.2.3


From 10c250234c98928d1e15c4cea1c44b9a25354ccf Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 5 Apr 2012 18:24:41 +0200
Subject: perf: Trivial cleanup of duplicate code

Removing duplicate code.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333643084-26776-2-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bb8e03407e18..e33e9cf160eb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)
 
 	/* mark unused */
 	event->hw.extra_reg.idx = EXTRA_REG_NONE;
-
-	/* mark not used */
-	event->hw.extra_reg.idx = EXTRA_REG_NONE;
 	event->hw.branch_reg.idx = EXTRA_REG_NONE;
 
 	return x86_pmu.hw_config(event);
-- 
cgit v1.2.3


From 5f09fc688936705b2020ca247df39ee27283668a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 5 Apr 2012 18:24:42 +0200
Subject: perf/x86: Fix cmpxchg() usage in amd_put_event_constraints()

Now the return value of cmpxchg() is used to match an event. The
change removes the duplicate event comparison and traverses the list
until an event was removed. This also fixes the following warning:

 arch/x86/kernel/cpu/perf_event_amd.c:170: warning: value computed is not used

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333643084-26776-3-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 95e7fe1c5f0b..589286f28877 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -205,10 +205,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * when we come here
 	 */
 	for (i = 0; i < x86_pmu.num_counters; i++) {
-		if (nb->owners[i] == event) {
-			cmpxchg(nb->owners+i, event, NULL);
+		if (cmpxchg(nb->owners + i, event, NULL) == event)
 			break;
-		}
 	}
 }
 
-- 
cgit v1.2.3


From b6ddf05ff68d81a7c1736717faf492b70e9bf4f9 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 24 Apr 2012 16:40:17 +0200
Subject: KVM: x86: Run PIT work in own kthread

We can't run PIT IRQ injection work in the interrupt context of the host
timer. This would allow the user to influence the handler complexity by
asking for a broadcast to a large number of VCPUs. Therefore, this work
was pushed into workqueue context in 9d244caf2e. However, this prevents
prioritizing the PIT injection over other task as workqueues share
kernel threads.

This replaces the workqueue with a kthread worker and gives that thread
a name in the format "kvm-pit/<owner-process-pid>". That allows to
identify and adjust the kthread priority according to the VM process
parameters.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  8 ++++++++
 arch/x86/kvm/i8254.c              | 31 +++++++++++++++++++------------
 arch/x86/kvm/i8254.h              |  7 +++++--
 3 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a7cb93cb2154..eb62761b7683 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1810,6 +1810,14 @@ Valid flags are:
 
 #define KVM_PIT_SPEAKER_DUMMY     1 /* emulate speaker port stub */
 
+PIT timer interrupts may use a per-VM kernel thread for injection. If it
+exists, this thread will have a name of the following pattern:
+
+kvm-pit/<owner-process-pid>
+
+When running a guest with elevated priorities, the scheduling parameters of
+this thread may have to be adjusted accordingly.
+
 This IOCTL replaces the obsolete KVM_CREATE_PIT.
 
 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690c..adba28f88d1a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 
 #include "irq.h"
 #include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 		/* in this case, we had multiple outstanding pit interrupts
 		 * that we needed to inject.  Reinject
 		 */
-		queue_work(ps->pit->wq, &ps->pit->expired);
+		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
 	ps->irq_ack = 1;
 	spin_unlock(&ps->inject_lock);
 }
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 static void destroy_pit_timer(struct kvm_pit *pit)
 {
 	hrtimer_cancel(&pit->pit_state.pit_timer.timer);
-	cancel_work_sync(&pit->expired);
+	flush_kthread_work(&pit->expired);
 }
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
-static void pit_do_work(struct work_struct *work)
+static void pit_do_work(struct kthread_work *work)
 {
 	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
 	struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
-		queue_work(pt->wq, &pt->expired);
+		queue_kthread_work(&pt->worker, &pt->expired);
 	}
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	cancel_work_sync(&ps->pit->expired);
+	flush_kthread_work(&ps->pit->expired);
 	pt->period = interval;
 	ps->is_periodic = is_period;
 
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
+	struct pid *pid;
+	pid_t pid_nr;
 	int ret;
 
 	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	mutex_lock(&pit->pit_state.lock);
 	spin_lock_init(&pit->pit_state.inject_lock);
 
-	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
-	if (!pit->wq) {
+	pid = get_pid(task_tgid(current));
+	pid_nr = pid_vnr(pid);
+	put_pid(pid);
+
+	init_kthread_worker(&pit->worker);
+	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
+				       "kvm-pit/%d", pid_nr);
+	if (IS_ERR(pit->worker_task)) {
 		mutex_unlock(&pit->pit_state.lock);
 		kvm_free_irq_source_id(kvm, pit->irq_source_id);
 		kfree(pit);
 		return NULL;
 	}
-	INIT_WORK(&pit->expired, pit_do_work);
+	init_kthread_work(&pit->expired, pit_do_work);
 
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
 	kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 	kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	kvm_free_irq_source_id(kvm, pit->irq_source_id);
-	destroy_workqueue(pit->wq);
+	kthread_stop(pit->worker_task);
 	kfree(pit);
 	return NULL;
 }
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
-		cancel_work_sync(&kvm->arch.vpit->expired);
+		flush_kthread_work(&kvm->arch.vpit->expired);
+		kthread_stop(kvm->arch.vpit->worker_task);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-		destroy_workqueue(kvm->arch.vpit->wq);
 		kfree(kvm->arch.vpit);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e791..fdf40425ea1d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
 #ifndef __I8254_H
 #define __I8254_H
 
+#include <linux/kthread.h>
+
 #include "iodev.h"
 
 struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
-	struct workqueue_struct *wq;
-	struct work_struct expired;
+	struct kthread_worker worker;
+	struct task_struct *worker_task;
+	struct kthread_work expired;
 };
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
-- 
cgit v1.2.3


From 08d636b6d4fb80647fe8869ea1cd97b2c26a4751 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 16 Aug 2011 09:57:10 -0400
Subject: ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine

This method changes x86 to add a breakpoint to the mcount locations
instead of calling stop machine.

Now that iret can be handled by NMIs, we perform the following to
update code:

1) Add a breakpoint to all locations that will be modified

2) Sync all cores

3) Update all locations to be either a nop or call (except breakpoint
   op)

4) Sync all cores

5) Remove the breakpoint with the new code.

6) Sync all cores

[
  Added updates that Masami suggested:
   Use unlikely(modifying_ftrace_code) in int3 trap to keep kprobes efficient.
   Don't use NOTIFY_* in ftrace handler in int3 as it is not a notifier.
]

Cc: H. Peter Anvin <hpa@zytor.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/ftrace.h |   3 +
 arch/x86/kernel/ftrace.c      | 342 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c       |   8 +-
 include/linux/ftrace.h        |   6 +
 4 files changed, 358 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 268c783ab1c0..18d9005d9e4f 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -34,6 +34,7 @@
 
 #ifndef __ASSEMBLY__
 extern void mcount(void);
+extern int modifying_ftrace_code;
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
@@ -50,6 +51,8 @@ struct dyn_arch_ftrace {
 	/* No extra data needed for x86 */
 };
 
+int ftrace_int3_handler(struct pt_regs *regs);
+
 #endif /*  CONFIG_DYNAMIC_FTRACE */
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f272fd..80af34739a9a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/kprobes.h>
 
 #include <trace/syscall.h>
 
@@ -334,6 +335,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
+int modifying_ftrace_code __read_mostly;
+
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handle that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+	if (WARN_ON_ONCE(!regs))
+		return 0;
+
+	if (!ftrace_location(regs->ip - 1))
+		return 0;
+
+	regs->ip += MCOUNT_INSN_SIZE - 1;
+
+	return 1;
+}
+
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+	/*
+	 * On x86_64, kernel text mappings are mapped read-only with
+	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+	 * of the kernel text mapping to modify the kernel text.
+	 *
+	 * For 32bit kernels, these mappings are same and we can use
+	 * kernel identity mapping to modify code.
+	 */
+	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+		ip = (unsigned long)__va(__pa(ip));
+
+	return probe_kernel_write((void *)ip, val, size);
+}
+
+static int add_break(unsigned long ip, const char *old)
+{
+	unsigned char replaced[MCOUNT_INSN_SIZE];
+	unsigned char brk = BREAKPOINT_INSTRUCTION;
+
+	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure it is what we expect it to be */
+	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+		return -EINVAL;
+
+	if (ftrace_write(ip, &brk, 1))
+		return -EPERM;
+
+	return 0;
+}
+
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned const char *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_call_replace(ip, addr);
+
+	return add_break(rec->ip, old);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+	unsigned const char *old;
+
+	old = ftrace_nop_replace();
+
+	return add_break(rec->ip, old);
+}
+
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr;
+	int ret;
+
+	ret = ftrace_test_record(rec, enable);
+
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+
+	case FTRACE_UPDATE_MAKE_CALL:
+		/* converting nop to call */
+		return add_brk_on_nop(rec);
+
+	case FTRACE_UPDATE_MAKE_NOP:
+		/* converting a call to a nop */
+		return add_brk_on_call(rec, ftrace_addr);
+	}
+	return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done caefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If it matches a nop
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+	unsigned char ins[MCOUNT_INSN_SIZE];
+	unsigned char brk = BREAKPOINT_INSTRUCTION;
+	const unsigned char *nop;
+	unsigned long ftrace_addr;
+	unsigned long ip = rec->ip;
+
+	/* If we fail the read, just give up */
+	if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* If this does not have a breakpoint, we are done */
+	if (ins[0] != brk)
+		return -1;
+
+	nop = ftrace_nop_replace();
+
+	/*
+	 * If the last 4 bytes of the instruction do not match
+	 * a nop, then we assume that this is a call to ftrace_addr.
+	 */
+	if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+		/*
+		 * For extra paranoidism, we check if the breakpoint is on
+		 * a call that would actually jump to the ftrace_addr.
+		 * If not, don't touch the breakpoint, we make just create
+		 * a disaster.
+		 */
+		ftrace_addr = (unsigned long)FTRACE_ADDR;
+		nop = ftrace_call_replace(ip, ftrace_addr);
+
+		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+			return -EINVAL;
+	}
+
+	return probe_kernel_write((void *)ip, &nop[0], 1);
+}
+
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+	/* skip breakpoint */
+	ip++;
+	new++;
+	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
+		return -EPERM;
+	return 0;
+}
+
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *new;
+
+	new = ftrace_call_replace(ip, addr);
+	return add_update_code(ip, new);
+}
+
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *new;
+
+	new = ftrace_nop_replace();
+	return add_update_code(ip, new);
+}
+
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr;
+	int ret;
+
+	ret = ftrace_test_record(rec, enable);
+
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+
+	case FTRACE_UPDATE_MAKE_CALL:
+		/* converting nop to call */
+		return add_update_call(rec, ftrace_addr);
+
+	case FTRACE_UPDATE_MAKE_NOP:
+		/* converting a call to a nop */
+		return add_update_nop(rec);
+	}
+
+	return 0;
+}
+
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *new;
+
+	new = ftrace_call_replace(ip, addr);
+
+	if (ftrace_write(ip, new, 1))
+		return -EPERM;
+
+	return 0;
+}
+
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *new;
+
+	new = ftrace_nop_replace();
+
+	if (ftrace_write(ip, new, 1))
+		return -EPERM;
+	return 0;
+}
+
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr;
+	int ret;
+
+	ret = ftrace_update_record(rec, enable);
+
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+
+	case FTRACE_UPDATE_MAKE_CALL:
+		/* converting nop to call */
+		return finish_update_call(rec, ftrace_addr);
+
+	case FTRACE_UPDATE_MAKE_NOP:
+		/* converting a call to a nop */
+		return finish_update_nop(rec);
+	}
+
+	return 0;
+}
+
+static void do_sync_core(void *data)
+{
+	sync_core();
+}
+
+static void run_sync(void)
+{
+	int enable_irqs = irqs_disabled();
+
+	/* We may be called with interrupts disbled (on bootup). */
+	if (enable_irqs)
+		local_irq_enable();
+	on_each_cpu(do_sync_core, NULL, 1);
+	if (enable_irqs)
+		local_irq_disable();
+}
+
+static void ftrace_replace_code(int enable)
+{
+	struct ftrace_rec_iter *iter;
+	struct dyn_ftrace *rec;
+	const char *report = "adding breakpoints";
+	int count = 0;
+	int ret;
+
+	for_ftrace_rec_iter(iter) {
+		rec = ftrace_rec_iter_record(iter);
+
+		ret = add_breakpoints(rec, enable);
+		if (ret)
+			goto remove_breakpoints;
+		count++;
+	}
+
+	run_sync();
+
+	report = "updating code";
+
+	for_ftrace_rec_iter(iter) {
+		rec = ftrace_rec_iter_record(iter);
+
+		ret = add_update(rec, enable);
+		if (ret)
+			goto remove_breakpoints;
+	}
+
+	run_sync();
+
+	report = "removing breakpoints";
+
+	for_ftrace_rec_iter(iter) {
+		rec = ftrace_rec_iter_record(iter);
+
+		ret = finish_update(rec, enable);
+		if (ret)
+			goto remove_breakpoints;
+	}
+
+	run_sync();
+
+	return;
+
+ remove_breakpoints:
+	ftrace_bug(ret, rec ? rec->ip : 0);
+	printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
+	for_ftrace_rec_iter(iter) {
+		rec = ftrace_rec_iter_record(iter);
+		remove_breakpoint(rec);
+	}
+}
+
+void arch_ftrace_update_code(int command)
+{
+	modifying_ftrace_code++;
+
+	if (command & FTRACE_UPDATE_CALLS)
+		ftrace_replace_code(1);
+	else if (command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (command & FTRACE_START_FUNC_RET)
+		ftrace_enable_ftrace_graph_caller();
+	else if (command & FTRACE_STOP_FUNC_RET)
+		ftrace_disable_ftrace_graph_caller();
+
+	modifying_ftrace_code--;
+}
+
 int __init ftrace_dyn_arch_init(void *data)
 {
 	/* The return code is retured via data */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f16029..92d5756d85fc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,6 +50,7 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <linux/atomic.h>
+#include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
@@ -303,8 +304,13 @@ gp_in_kernel:
 }
 
 /* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
 {
+#ifdef CONFIG_DYNAMIC_FTRACE
+	/* ftrace must be first, everything else may cause a recursive crash */
+	if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+		return;
+#endif
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 72a6cabb4d5b..0b5590330bca 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -286,6 +286,12 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void);
 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
 
+#define for_ftrace_rec_iter(iter)		\
+	for (iter = ftrace_rec_iter_start();	\
+	     iter;				\
+	     iter = ftrace_rec_iter_next(iter))
+
+
 int ftrace_update_record(struct dyn_ftrace *rec, int enable);
 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
 void ftrace_run_stop_machine(int command);
-- 
cgit v1.2.3


From 4a6d70c9505fef1d8906b1d61db3de5d8ecf9454 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Apr 2012 16:31:07 -0400
Subject: ftrace/x86: Remove the complex ftrace NMI handling code

As ftrace function tracing would require modifying code that could
be executed in NMI context, which is not stopped with stop_machine(),
ftrace had to do a complex algorithm with various stages of setup
and memory barriers to make it work.

With the new breakpoint method, this is no longer required. The changes
to the code can be done without any problem in NMI context, as well as
without stop machine altogether. Remove the complex code as it is
no longer needed.

Also, a lot of the notrace annotations could be removed from the
NMI code as it is now safe to trace them. With the exception of
do_nmi itself, which does some special work to handle running in
the debug stack. The breakpoint method can cause NMIs to double
nest the debug stack if it's not setup properly, and that is done
in do_nmi(), thus that function must not be traced.

(Note the arch sh may want to do the same)

Cc: Paul Mundt <lethal@linux-sh.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/Kconfig         |   1 -
 arch/x86/kernel/ftrace.c | 169 +----------------------------------------------
 arch/x86/kernel/nmi.c    |  10 +--
 3 files changed, 6 insertions(+), 174 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..1324139612e1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -40,7 +40,6 @@ config X86
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_FP_TEST
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
-	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_KVM
 	select HAVE_ARCH_KGDB
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 80af34739a9a..cf2d03ec1793 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -27,38 +27,18 @@
 #include <asm/cacheflush.h>
 #include <asm/ftrace.h>
 #include <asm/nops.h>
-#include <asm/nmi.h>
-
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
-
 int ftrace_arch_code_modify_prepare(void)
 {
 	set_kernel_text_rw();
 	set_all_modules_text_rw();
-	modifying_code = 1;
 	return 0;
 }
 
 int ftrace_arch_code_modify_post_process(void)
 {
-	modifying_code = 0;
 	set_all_modules_text_ro();
 	set_kernel_text_ro();
 	return 0;
@@ -91,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 	return calc.code;
 }
 
-/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- *    and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- *    we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
- */
-
-#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status;		/* holds return value of text write */
-static void *mod_code_ip;		/* holds the IP to write to */
-static const void *mod_code_newcode;	/* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
-{
-	int r;
-
-	r = snprintf(buf, size, "%u %u",
-		     nmi_wait_count,
-		     atomic_read(&nmi_update_count));
-	return r;
-}
-
-static void clear_mod_flag(void)
-{
-	int old = atomic_read(&nmi_running);
-
-	for (;;) {
-		int new = old & ~MOD_CODE_WRITE_FLAG;
-
-		if (old == new)
-			break;
-
-		old = atomic_cmpxchg(&nmi_running, old, new);
-	}
-}
-
-static void ftrace_mod_code(void)
-{
-	/*
-	 * Yes, more than one CPU process can be writing to mod_code_status.
-	 *    (and the code itself)
-	 * But if one were to fail, then they all should, and if one were
-	 * to succeed, then they all should.
-	 */
-	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
-					     MCOUNT_INSN_SIZE);
-
-	/* if we fail, then kill any new writers */
-	if (mod_code_status)
-		clear_mod_flag();
-}
-
-void ftrace_nmi_enter(void)
-{
-	__this_cpu_write(save_modifying_code, modifying_code);
-
-	if (!__this_cpu_read(save_modifying_code))
-		return;
-
-	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
-		smp_rmb();
-		ftrace_mod_code();
-		atomic_inc(&nmi_update_count);
-	}
-	/* Must have previous changes seen before executions */
-	smp_mb();
-}
-
-void ftrace_nmi_exit(void)
-{
-	if (!__this_cpu_read(save_modifying_code))
-		return;
-
-	/* Finish all executions before clearing nmi_running */
-	smp_mb();
-	atomic_dec(&nmi_running);
-}
-
-static void wait_for_nmi_and_set_mod_flag(void)
-{
-	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
-		return;
-
-	do {
-		cpu_relax();
-	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
-
-	nmi_wait_count++;
-}
-
-static void wait_for_nmi(void)
-{
-	if (!atomic_read(&nmi_running))
-		return;
-
-	do {
-		cpu_relax();
-	} while (atomic_read(&nmi_running));
-
-	nmi_wait_count++;
-}
-
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -239,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
 	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
 		ip = (unsigned long)__va(__pa(ip));
 
-	mod_code_ip = (void *)ip;
-	mod_code_newcode = new_code;
-
-	/* The buffers need to be visible before we let NMIs write them */
-	smp_mb();
-
-	wait_for_nmi_and_set_mod_flag();
-
-	/* Make sure all running NMIs have finished before we write the code */
-	smp_mb();
-
-	ftrace_mod_code();
-
-	/* Make sure the write happens before clearing the bit */
-	smp_mb();
-
-	clear_mod_flag();
-	wait_for_nmi();
-
-	return mod_code_status;
+	return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
 }
 
 static const unsigned char *ftrace_nop_replace(void)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf319165..eb1539eac393 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
 
 #define nmi_to_desc(type) (&nmi_desc[type])
 
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
 
-static notrace __kprobes void
+static __kprobes void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
 	outb(reason, NMI_REASON_PORT);
 }
 
-static notrace __kprobes void
+static __kprobes void
 io_check_error(unsigned char reason, struct pt_regs *regs)
 {
 	unsigned long i;
@@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	outb(reason, NMI_REASON_PORT);
 }
 
-static notrace __kprobes void
+static __kprobes void
 unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 {
 	int handled;
@@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 static DEFINE_PER_CPU(bool, swallow_nmi);
 static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
 
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+static __kprobes void default_do_nmi(struct pt_regs *regs)
 {
 	unsigned char reason = 0;
 	int handled;
-- 
cgit v1.2.3


From 0749708352fddbe0fa81fc25f96e3b1f77c655f4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 28 Apr 2012 14:27:38 -0700
Subject: x86: make word-at-a-time strncpy_from_user clear bytes at the end

This makes the newly optimized x86 strncpy_from_user clear the final
bytes in the word past the final NUL character, rather than copy them as
the word they were in the source.

NOTE! Unlike the silly semantics of the libc 'strncpy()' function, the
kernel strncpy_from_user() has never cleared all of the end of the
destination buffer.  And neither does it do so now: it only clears the
bytes at the end of the last word it copied.

So why make this change at all? It doesn't really cost us anything extra
(we have to calculate the mask to get the length anyway), and it means
that *if* any user actually cares about zeroing the whole buffer, they
can do a "memset()" before the strncpy_from_user(), and we will no
longer write random bytes after the NUL character.

In particular, the buffer contents will now at no point contain random
source data from beyond the end of the string.

In other words, it makes behavior a bit more repeatable at no new cost,
so it's a small cleanup.  I've been carrying this as a patch for the
last few weeks or so in my tree (done at the same time the sign error
was fixed in commit 12e993b89464), I might as well commit it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/usercopy.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index d6ae30bbd7bb..2e4e4b02c37a 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -44,13 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
 
-static inline unsigned long count_bytes(unsigned long mask)
-{
-	mask = (mask - 1) & ~mask;
-	mask >>= 7;
-	return count_masked_bytes(mask);
-}
-
 /*
  * Do a strncpy, return length of string without final '\0'.
  * 'count' is the user-supplied count (return 'count' if we
@@ -69,16 +62,19 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
 		max = count;
 
 	while (max >= sizeof(unsigned long)) {
-		unsigned long c;
+		unsigned long c, mask;
 
 		/* Fall back to byte-at-a-time if we get a page fault */
 		if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
 			break;
-		/* This can write a few bytes past the NUL character, but that's ok */
+		mask = has_zero(c);
+		if (mask) {
+			mask = (mask - 1) & ~mask;
+			mask >>= 7;
+			*(unsigned long *)(dst+res) = c & mask;
+			return res + count_masked_bytes(mask);
+		}
 		*(unsigned long *)(dst+res) = c;
-		c = has_zero(c);
-		if (c)
-			return res + count_bytes(c);
 		res += sizeof(unsigned long);
 		max -= sizeof(unsigned long);
 	}
-- 
cgit v1.2.3


From 42a6bd2006c922143cee8d9ec7c4e27526d4d2a3 Mon Sep 17 00:00:00 2001
From: Kusanagi Kouichi <slash@ac.auone-net.jp>
Date: Sun, 1 Apr 2012 17:29:32 +0900
Subject: x86, relocs: Remove an unused variable

sh_symtab is set but not used.

Signed-off-by: Kusanagi Kouichi <slash@ac.auone-net.jp>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/boot/compressed/relocs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index d3c0b0277666..fb7117a4ade1 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -403,13 +403,11 @@ static void print_absolute_symbols(void)
 	for (i = 0; i < ehdr.e_shnum; i++) {
 		struct section *sec = &secs[i];
 		char *sym_strtab;
-		Elf32_Sym *sh_symtab;
 		int j;
 
 		if (sec->shdr.sh_type != SHT_SYMTAB) {
 			continue;
 		}
-		sh_symtab = sec->symtab;
 		sym_strtab = sec->link->strtab;
 		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
 			Elf32_Sym *sym;
-- 
cgit v1.2.3


From f227d4306cf30e1d5b6f231e8ef9006c34f3d186 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Mon, 16 Apr 2012 18:01:53 +0200
Subject: x86, MCE, AMD: Make APIC LVT thresholding interrupt optional

Currently, the APIC LVT interrupt for error thresholding is implicitly
enabled. However, there are models in the F15h range which do not enable
it. Make the code machinery which sets up the APIC interrupt support
an optional setting and add an ->interrupt_capable member to the bank
representation mirroring that capability and enable the interrupt offset
programming only if it is true.

Simplify code and fixup comment style while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 56 ++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 99b57179f912..2c1d178be46e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -51,6 +51,7 @@ struct threshold_block {
 	unsigned int		cpu;
 	u32			address;
 	u16			interrupt_enable;
+	bool			interrupt_capable;
 	u16			threshold_limit;
 	struct kobject		kobj;
 	struct list_head	miscj;
@@ -83,6 +84,21 @@ struct thresh_restart {
 	u16			old_limit;
 };
 
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+	/*
+	 * bank 4 supports APIC LVT interrupts implicitly since forever.
+	 */
+	if (bank == 4)
+		return true;
+
+	/*
+	 * IntP: interrupt present; if this bit is set, the thresholding
+	 * bank can generate APIC LVT interrupts
+	 */
+	return msr_high_bits & BIT(28);
+}
+
 static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
 {
 	int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
 	return 1;
 };
 
-/* must be called with correct cpu affinity */
-/* Called via smp_call_function_single() */
+/*
+ * Called via smp_call_function_single(), must be called with correct
+ * cpu affinity.
+ */
 static void threshold_restart_bank(void *_tr)
 {
 	struct thresh_restart *tr = _tr;
@@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr)
 		    (new_count & THRESHOLD_MAX);
 	}
 
+	/* clear IntType */
+	hi &= ~MASK_INT_TYPE_HI;
+
+	if (!tr->b->interrupt_capable)
+		goto done;
+
 	if (tr->set_lvt_off) {
 		if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
 			/* set new lvt offset */
@@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr)
 		}
 	}
 
-	tr->b->interrupt_enable ?
-	    (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-	    (hi &= ~MASK_INT_TYPE_HI);
+	if (tr->b->interrupt_enable)
+		hi |= INT_TYPE_APIC;
+
+ done:
 
 	hi |= MASK_COUNT_EN_HI;
 	wrmsr(tr->b->address, lo, hi);
@@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			if (shared_bank[bank] && c->cpu_core_id)
 				break;
 
-			offset = setup_APIC_mce(offset,
-						(high & MASK_LVTOFF_HI) >> 20);
-
 			memset(&b, 0, sizeof(b));
-			b.cpu		= cpu;
-			b.bank		= bank;
-			b.block		= block;
-			b.address	= address;
+			b.cpu			= cpu;
+			b.bank			= bank;
+			b.block			= block;
+			b.address		= address;
+			b.interrupt_capable	= lvt_interrupt_supported(bank, high);
+
+			if (b.interrupt_capable) {
+				int new = (high & MASK_LVTOFF_HI) >> 20;
+				offset  = setup_APIC_mce(offset, new);
+			}
 
 			mce_threshold_block_init(&b, offset);
 			mce_threshold_vector = amd_threshold_interrupt;
@@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
 	struct thresh_restart tr;
 	unsigned long new;
 
+	if (!b->interrupt_capable)
+		return -EINVAL;
+
 	if (strict_strtoul(buf, 0, &new) < 0)
 		return -EINVAL;
 
@@ -467,6 +498,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	b->cpu			= cpu;
 	b->address		= address;
 	b->interrupt_enable	= 0;
+	b->interrupt_capable	= lvt_interrupt_supported(bank, high);
 	b->threshold_limit	= THRESHOLD_MAX;
 
 	INIT_LIST_HEAD(&b->miscj);
-- 
cgit v1.2.3


From d26ecc4894464318dce51d709e19dd9d88916bee Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Mon, 16 Apr 2012 18:20:36 +0200
Subject: x86, MCE, AMD: Hide interrupt_enable sysfs node

Depending on whether the box supports the APIC LVT interrupt for
thresholding, we want to show the 'interrupt_enable' sysfs node or not.
Make that the case by adding it to the default sysfs attributes only if
it is supported.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 2c1d178be46e..f4873a64f46d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -421,10 +421,10 @@ RW_ATTR(threshold_limit);
 RW_ATTR(error_count);
 
 static struct attribute *default_attrs[] = {
-	&interrupt_enable.attr,
 	&threshold_limit.attr,
 	&error_count.attr,
-	NULL
+	NULL,	/* possibly interrupt_enable if supported, see below */
+	NULL,
 };
 
 #define to_block(k)	container_of(k, struct threshold_block, kobj)
@@ -501,6 +501,11 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	b->interrupt_capable	= lvt_interrupt_supported(bank, high);
 	b->threshold_limit	= THRESHOLD_MAX;
 
+	if (b->interrupt_capable)
+		threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+	else
+		threshold_ktype.default_attrs[2] = NULL;
+
 	INIT_LIST_HEAD(&b->miscj);
 
 	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
-- 
cgit v1.2.3


From 575203b4747c371698dd686b1fa6d0a3a0c47ac6 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 20 Apr 2012 18:01:34 +0200
Subject: x86, MCE, AMD: Disable error thresholding bank 4 on some models

Turn off MC4_MISC thresholding banks on models which have them but that
particular processor implementation does not supply applicable error
sources to be counted.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087d..888fbf9d0adf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1423,6 +1423,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 */
 		 if (c->x86 == 6 && banks > 0)
 			mce_banks[0].ctl = 0;
+
+		 /*
+		  * Turn off MC4_MISC thresholding banks on those models since
+		  * they're not supported there.
+		  */
+		 if (c->x86 == 0x15 &&
+		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+			 int i;
+			 u64 val, hwcr;
+			 bool need_toggle;
+			 u32 msrs[] = {
+				0x00000413, /* MC4_MISC0 */
+				0xc0000408, /* MC4_MISC1 */
+			 };
+
+			 rdmsrl(MSR_K7_HWCR, hwcr);
+
+			 /* McStatusWrEn has to be set */
+			 need_toggle = !(hwcr & BIT(18));
+
+			 if (need_toggle)
+				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+				 rdmsrl(msrs[i], val);
+
+				 /* CntP bit set? */
+				 if (val & BIT(62)) {
+					 val &= ~BIT(62);
+					 wrmsrl(msrs[i], val);
+				 }
+			 }
+
+			 /* restore old settings */
+			 if (need_toggle)
+				 wrmsrl(MSR_K7_HWCR, hwcr);
+		 }
 	}
 
 	if (c->x86_vendor == X86_VENDOR_INTEL) {
-- 
cgit v1.2.3


From baa495d9de2af97310128bfc0e365a813b63d5bb Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:53 -0700
Subject: x86/PCI: fix memleak with get_current_resources()

In pci_scan_acpi_root(), when pci_use_crs is set, get_current_resources()
is used to get pci_root_info, and it will allocate name and resource array.

Later if pci_create_root_bus() can not create bus (could be already
there...) it will only free bus res list, but the name and res array is not
freed.

Let get_current_resource() take info pointer instead of using local info.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c | 49 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index ed2835e148b5..a99b7d75f5ca 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -315,49 +315,55 @@ static void add_resources(struct pci_root_info *info)
 	}
 }
 
+static void free_pci_root_info(struct pci_root_info *info)
+{
+	kfree(info->name);
+	kfree(info->res);
+	memset(info, 0, sizeof(struct pci_root_info));
+}
+
 static void
-get_current_resources(struct acpi_device *device, int busnum,
+get_current_resources(struct pci_root_info *info,
+		      struct acpi_device *device, int busnum,
 		      int domain, struct list_head *resources)
 {
-	struct pci_root_info info;
 	size_t size;
 
-	info.bridge = device;
-	info.res_num = 0;
-	info.resources = resources;
+	info->bridge = device;
+	info->res_num = 0;
+	info->resources = resources;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
-				&info);
-	if (!info.res_num)
+				info);
+	if (!info->res_num)
 		return;
 
-	size = sizeof(*info.res) * info.res_num;
-	info.res = kmalloc(size, GFP_KERNEL);
-	if (!info.res)
+	size = sizeof(*info->res) * info->res_num;
+	info->res = kmalloc(size, GFP_KERNEL);
+	if (!info->res)
 		return;
 
-	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
-	if (!info.name)
+	info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
+	if (!info->name)
 		goto name_alloc_fail;
 
-	info.res_num = 0;
+	info->res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
-				&info);
+				info);
 
 	if (pci_use_crs) {
-		add_resources(&info);
+		add_resources(info);
 
 		return;
 	}
 
-	kfree(info.name);
-
 name_alloc_fail:
-	kfree(info.res);
+	free_pci_root_info(info);
 }
 
 struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 {
 	struct acpi_device *device = root->device;
+	struct pci_root_info info;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
 	LIST_HEAD(resources);
@@ -402,6 +408,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 
 	sd->domain = domain;
 	sd->node = node;
+	memset(&info, 0, sizeof(struct pci_root_info));
 	/*
 	 * Maybe the desired pci bus has been already scanned. In such case
 	 * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -415,7 +422,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(sd);
 	} else {
-		get_current_resources(device, busnum, domain, &resources);
+		get_current_resources(&info, device, busnum, domain,
+					&resources);
 
 		/*
 		 * _CRS with no apertures is normal, so only fall back to
@@ -429,6 +437,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 			bus->subordinate = pci_scan_child_bus(bus);
 		else
 			pci_free_resource_list(&resources);
+
+		if (!bus && pci_use_crs)
+			free_pci_root_info(&info);
 	}
 
 	/* After the PCI-E bus has been walked and all devices discovered,
-- 
cgit v1.2.3


From 9a03d28d9490b5a04f8b1d98fc08067c6f4f6189 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:53 -0700
Subject: x86/PCI: refactor get_current_resources()

Rename get_current_resources() to probe_pci_root_info.

1. Remove resource list head from pci_root_info
2. Make get_current_resources() not pass resources
3. Rename get_current_resources() to probe_pci_root_info()

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a99b7d75f5ca..a858c1d9af53 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,6 @@ struct pci_root_info {
 	char *name;
 	unsigned int res_num;
 	struct resource *res;
-	struct list_head *resources;
 	int busnum;
 };
 
@@ -287,7 +286,8 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type)
 	}
 }
 
-static void add_resources(struct pci_root_info *info)
+static void add_resources(struct pci_root_info *info,
+			  struct list_head *resources)
 {
 	int i;
 	struct resource *res, *root, *conflict;
@@ -311,7 +311,7 @@ static void add_resources(struct pci_root_info *info)
 				 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
 				 res, conflict->name, conflict);
 		else
-			pci_add_resource(info->resources, res);
+			pci_add_resource(resources, res);
 	}
 }
 
@@ -323,41 +323,30 @@ static void free_pci_root_info(struct pci_root_info *info)
 }
 
 static void
-get_current_resources(struct pci_root_info *info,
-		      struct acpi_device *device, int busnum,
-		      int domain, struct list_head *resources)
+probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
+		    int busnum, int domain)
 {
 	size_t size;
 
 	info->bridge = device;
 	info->res_num = 0;
-	info->resources = resources;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
 				info);
 	if (!info->res_num)
 		return;
 
 	size = sizeof(*info->res) * info->res_num;
+	info->res_num = 0;
 	info->res = kmalloc(size, GFP_KERNEL);
 	if (!info->res)
 		return;
 
 	info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info->name)
-		goto name_alloc_fail;
+		return;
 
-	info->res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
 				info);
-
-	if (pci_use_crs) {
-		add_resources(info);
-
-		return;
-	}
-
-name_alloc_fail:
-	free_pci_root_info(info);
 }
 
 struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -422,15 +411,18 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(sd);
 	} else {
-		get_current_resources(&info, device, busnum, domain,
-					&resources);
+		probe_pci_root_info(&info, device, busnum, domain);
 
 		/*
 		 * _CRS with no apertures is normal, so only fall back to
 		 * defaults or native bridge info if we're ignoring _CRS.
 		 */
-		if (!pci_use_crs)
+		if (pci_use_crs)
+			add_resources(&info, &resources);
+		else {
+			free_pci_root_info(&info);
 			x86_pci_root_bus_resources(busnum, &resources);
+		}
 		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
 					  &resources);
 		if (bus)
-- 
cgit v1.2.3


From fd3b0c1ea482e863d6a2556b6686e35bec7a4f1c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:53 -0700
Subject: x86/PCI: add host bridge resource release for _CRS path

1. Allocate pci_root_info instead of using stack.  We need to pass around
   info for release function.
2. Add release_pci_root_info
3. Set x86 host bridge release function to make sure root bridge
   related resources get freed during root bus removal.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a858c1d9af53..2b74a161d215 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -315,11 +315,40 @@ static void add_resources(struct pci_root_info *info,
 	}
 }
 
-static void free_pci_root_info(struct pci_root_info *info)
+static void free_pci_root_info_res(struct pci_root_info *info)
 {
 	kfree(info->name);
 	kfree(info->res);
-	memset(info, 0, sizeof(struct pci_root_info));
+	info->res = NULL;
+	info->res_num = 0;
+}
+
+static void __release_pci_root_info(struct pci_root_info *info)
+{
+	int i;
+	struct resource *res;
+
+	for (i = 0; i < info->res_num; i++) {
+		res = &info->res[i];
+
+		if (!res->parent)
+			continue;
+
+		if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
+			continue;
+
+		release_resource(res);
+	}
+
+	free_pci_root_info_res(info);
+
+	kfree(info);
+}
+static void release_pci_root_info(struct pci_host_bridge *bridge)
+{
+	struct pci_root_info *info = bridge->release_data;
+
+	__release_pci_root_info(info);
 }
 
 static void
@@ -352,7 +381,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
 struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 {
 	struct acpi_device *device = root->device;
-	struct pci_root_info info;
+	struct pci_root_info *info = NULL;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
 	LIST_HEAD(resources);
@@ -397,7 +426,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 
 	sd->domain = domain;
 	sd->node = node;
-	memset(&info, 0, sizeof(struct pci_root_info));
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		kfree(sd);
+		printk(KERN_WARNING "pci_bus %04x:%02x: "
+		       "ignored (out of memory)\n", domain, busnum);
+		return NULL;
+	}
 	/*
 	 * Maybe the desired pci bus has been already scanned. In such case
 	 * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -409,29 +444,33 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		 * be replaced by sd.
 		 */
 		memcpy(bus->sysdata, sd, sizeof(*sd));
+		kfree(info);
 		kfree(sd);
 	} else {
-		probe_pci_root_info(&info, device, busnum, domain);
+		probe_pci_root_info(info, device, busnum, domain);
 
 		/*
 		 * _CRS with no apertures is normal, so only fall back to
 		 * defaults or native bridge info if we're ignoring _CRS.
 		 */
 		if (pci_use_crs)
-			add_resources(&info, &resources);
+			add_resources(info, &resources);
 		else {
-			free_pci_root_info(&info);
+			free_pci_root_info_res(info);
 			x86_pci_root_bus_resources(busnum, &resources);
 		}
+
 		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
 					  &resources);
-		if (bus)
+		if (bus) {
 			bus->subordinate = pci_scan_child_bus(bus);
-		else
+			pci_set_host_bridge_release(
+				to_pci_host_bridge(bus->bridge),
+				release_pci_root_info, info);
+		} else {
 			pci_free_resource_list(&resources);
-
-		if (!bus && pci_use_crs)
-			free_pci_root_info(&info);
+			__release_pci_root_info(info);
+		}
 	}
 
 	/* After the PCI-E bus has been walked and all devices discovered,
-- 
cgit v1.2.3


From fe05725ff97530e26109a0c3d52cef7fff326e15 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:53 -0700
Subject: x86/PCI: embed name into pci_root_info struct

We now keep the pci_root_info struct for the entire lifetime of the
host bridge, so just embed the name in the struct rather than
allocating it separately.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2b74a161d215..23e7361b1747 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -9,7 +9,7 @@
 
 struct pci_root_info {
 	struct acpi_device *bridge;
-	char *name;
+	char name[16];
 	unsigned int res_num;
 	struct resource *res;
 	int busnum;
@@ -317,7 +317,6 @@ static void add_resources(struct pci_root_info *info,
 
 static void free_pci_root_info_res(struct pci_root_info *info)
 {
-	kfree(info->name);
 	kfree(info->res);
 	info->res = NULL;
 	info->res_num = 0;
@@ -370,9 +369,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
 	if (!info->res)
 		return;
 
-	info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
-	if (!info->name)
-		return;
+	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
 
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
 				info);
-- 
cgit v1.2.3


From 35cb05e5bdac209cfdfafbe50d89ee7069cb6237 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:53 -0700
Subject: x86/PCI: embed pci_sysdata into pci_root_info on ACPI path

Embed the x86 struct pci_sysdata in the struct pci_root_info so it
will be automatically freed in the remove path.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 23e7361b1747..8a17b23f8c84 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -13,6 +13,7 @@ struct pci_root_info {
 	unsigned int res_num;
 	struct resource *res;
 	int busnum;
+	struct pci_sysdata sd;
 };
 
 static bool pci_use_crs = true;
@@ -410,26 +411,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 	if (node != -1 && !node_online(node))
 		node = -1;
 
-	/* Allocate per-root-bus (not per bus) arch-specific data.
-	 * TODO: leak; this memory is never freed.
-	 * It's arguable whether it's worth the trouble to care.
-	 */
-	sd = kzalloc(sizeof(*sd), GFP_KERNEL);
-	if (!sd) {
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
 		printk(KERN_WARNING "pci_bus %04x:%02x: "
 		       "ignored (out of memory)\n", domain, busnum);
 		return NULL;
 	}
 
+	sd = &info->sd;
 	sd->domain = domain;
 	sd->node = node;
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info) {
-		kfree(sd);
-		printk(KERN_WARNING "pci_bus %04x:%02x: "
-		       "ignored (out of memory)\n", domain, busnum);
-		return NULL;
-	}
 	/*
 	 * Maybe the desired pci bus has been already scanned. In such case
 	 * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -442,7 +433,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		 */
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(info);
-		kfree(sd);
 	} else {
 		probe_pci_root_info(info, device, busnum, domain);
 
@@ -484,9 +474,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		}
 	}
 
-	if (!bus)
-		kfree(sd);
-
 	if (bus && node != -1) {
 #ifdef CONFIG_ACPI_NUMA
 		if (pxm >= 0)
-- 
cgit v1.2.3


From d28e5ac2a07e27638cf5ac061721b7969e17fe78 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:54 -0700
Subject: x86/PCI: dynamically allocate pci_root_info for native host bridge
 drivers

This dynamically allocates struct pci_root_info instead of using a
static array.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/amd_bus.c      | 76 ++++++++++++++++-----------------------------
 arch/x86/pci/broadcom_bus.c | 12 +++----
 arch/x86/pci/bus_numa.c     | 69 +++++++++++++++++++++++++++-------------
 arch/x86/pci/bus_numa.h     | 18 ++++++-----
 4 files changed, 88 insertions(+), 87 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 0567df3890e1..459a7316375c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -32,6 +32,18 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
 
 #define RANGE_NUM 16
 
+static struct pci_root_info __init *find_pci_root_info(int node, int link)
+{
+	struct pci_root_info *info;
+
+	/* find the position */
+	list_for_each_entry(info, &pci_root_infos, list)
+		if (info->node == node && info->link == link)
+			return info;
+
+	return NULL;
+}
+
 /**
  * early_fill_mp_bus_to_node()
  * called before pcibios_scan_root and pci_scan_bus
@@ -50,7 +62,6 @@ static int __init early_fill_mp_bus_info(void)
 	int def_link;
 	struct pci_root_info *info;
 	u32 reg;
-	struct resource *res;
 	u64 start;
 	u64 end;
 	struct range range[RANGE_NUM];
@@ -86,7 +97,6 @@ static int __init early_fill_mp_bus_info(void)
 	if (!found)
 		return 0;
 
-	pci_root_num = 0;
 	for (i = 0; i < 4; i++) {
 		int min_bus;
 		int max_bus;
@@ -105,13 +115,8 @@ static int __init early_fill_mp_bus_info(void)
 #endif
 		link = (reg >> 8) & 0x03;
 
-		info = &pci_root_info[pci_root_num];
-		info->bus_min = min_bus;
-		info->bus_max = max_bus;
-		info->node = node;
-		info->link = link;
+		info = alloc_pci_root_info(min_bus, max_bus, node, link);
 		sprintf(info->name, "PCI Bus #%02x", min_bus);
-		pci_root_num++;
 	}
 
 	/* get the default node and link for left over res */
@@ -134,16 +139,10 @@ static int __init early_fill_mp_bus_info(void)
 		link = (reg >> 4) & 0x03;
 		end = (reg & 0xfff000) | 0xfff;
 
-		/* find the position */
-		for (j = 0; j < pci_root_num; j++) {
-			info = &pci_root_info[j];
-			if (info->node == node && info->link == link)
-				break;
-		}
-		if (j == pci_root_num)
+		info = find_pci_root_info(node, link);
+		if (!info)
 			continue; /* not found */
 
-		info = &pci_root_info[j];
 		printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
 		       node, link, start, end);
 
@@ -155,13 +154,8 @@ static int __init early_fill_mp_bus_info(void)
 	}
 	/* add left over io port range to def node/link, [0, 0xffff] */
 	/* find the position */
-	for (j = 0; j < pci_root_num; j++) {
-		info = &pci_root_info[j];
-		if (info->node == def_node && info->link == def_link)
-			break;
-	}
-	if (j < pci_root_num) {
-		info = &pci_root_info[j];
+	info = find_pci_root_info(def_node, def_link);
+	if (info) {
 		for (i = 0; i < RANGE_NUM; i++) {
 			if (!range[i].end)
 				continue;
@@ -214,16 +208,10 @@ static int __init early_fill_mp_bus_info(void)
 		end <<= 8;
 		end |= 0xffff;
 
-		/* find the position */
-		for (j = 0; j < pci_root_num; j++) {
-			info = &pci_root_info[j];
-			if (info->node == node && info->link == link)
-				break;
-		}
-		if (j == pci_root_num)
-			continue; /* not found */
+		info = find_pci_root_info(node, link);
 
-		info = &pci_root_info[j];
+		if (!info)
+			continue;
 
 		printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
 		       node, link, start, end);
@@ -291,14 +279,8 @@ static int __init early_fill_mp_bus_info(void)
 	 * add left over mmio range to def node/link ?
 	 * that is tricky, just record range in from start_min to 4G
 	 */
-	for (j = 0; j < pci_root_num; j++) {
-		info = &pci_root_info[j];
-		if (info->node == def_node && info->link == def_link)
-			break;
-	}
-	if (j < pci_root_num) {
-		info = &pci_root_info[j];
-
+	info = find_pci_root_info(def_node, def_link);
+	if (info) {
 		for (i = 0; i < RANGE_NUM; i++) {
 			if (!range[i].end)
 				continue;
@@ -309,20 +291,16 @@ static int __init early_fill_mp_bus_info(void)
 		}
 	}
 
-	for (i = 0; i < pci_root_num; i++) {
-		int res_num;
+	list_for_each_entry(info, &pci_root_infos, list) {
 		int busnum;
+		struct pci_root_res *root_res;
 
-		info = &pci_root_info[i];
-		res_num = info->res_num;
 		busnum = info->bus_min;
 		printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
 		       info->bus_min, info->bus_max, info->node, info->link);
-		for (j = 0; j < res_num; j++) {
-			res = &info->res[j];
-			printk(KERN_DEBUG "bus: %02x index %x %pR\n",
-				       busnum, j, res);
-		}
+		list_for_each_entry(root_res, &info->resources, list)
+			printk(KERN_DEBUG "bus: %02x %pR\n",
+				       busnum, &root_res->res);
 	}
 
 	return 0;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index f3a7c569a403..614392ced7d6 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -22,19 +22,15 @@
 static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
 {
 	struct pci_root_info *info;
+	struct pci_root_res *root_res;
 	struct resource res;
 	u16 word1, word2;
 	u8 fbus, lbus;
-	int i;
-
-	info = &pci_root_info[pci_root_num];
-	pci_root_num++;
 
 	/* read the PCI bus numbers */
 	fbus = read_pci_config_byte(bus, slot, func, 0x44);
 	lbus = read_pci_config_byte(bus, slot, func, 0x45);
-	info->bus_min = fbus;
-	info->bus_max = lbus;
+	info = alloc_pci_root_info(fbus, lbus, 0, 0);
 
 	/*
 	 * Add the legacy IDE ports on bus 0
@@ -86,8 +82,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
 	res.flags = IORESOURCE_BUS;
 	printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
 
-	for (i = 0; i < info->res_num; i++)
-		printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
+	list_for_each_entry(root_res, &info->resources, list)
+		printk(KERN_INFO "host bridge window %pR\n", &root_res->res);
 }
 
 static int __init broadcom_postcore_init(void)
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index fd3f65510e9d..306579f7d0fd 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -4,35 +4,38 @@
 
 #include "bus_numa.h"
 
-int pci_root_num;
-struct pci_root_info pci_root_info[PCI_ROOT_NR];
+LIST_HEAD(pci_root_infos);
 
-void x86_pci_root_bus_resources(int bus, struct list_head *resources)
+static struct pci_root_info *x86_find_pci_root_info(int bus)
 {
-	int i;
-	int j;
 	struct pci_root_info *info;
 
-	if (!pci_root_num)
-		goto default_resources;
+	if (list_empty(&pci_root_infos))
+		return NULL;
 
-	for (i = 0; i < pci_root_num; i++) {
-		if (pci_root_info[i].bus_min == bus)
-			break;
-	}
+	list_for_each_entry(info, &pci_root_infos, list)
+		if (info->bus_min == bus)
+			return info;
+
+	return NULL;
+}
 
-	if (i == pci_root_num)
+void x86_pci_root_bus_resources(int bus, struct list_head *resources)
+{
+	struct pci_root_info *info = x86_find_pci_root_info(bus);
+	struct pci_root_res *root_res;
+
+	if (!info)
 		goto default_resources;
 
 	printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
 	       bus);
 
-	info = &pci_root_info[i];
-	for (j = 0; j < info->res_num; j++) {
+	list_for_each_entry(root_res, &info->resources, list) {
 		struct resource *res;
 		struct resource *root;
 
-		res = &info->res[j];
+		res = &root_res->res;
 		pci_add_resource(resources, res);
 		if (res->flags & IORESOURCE_IO)
 			root = &ioport_resource;
@@ -53,11 +56,32 @@ default_resources:
 	pci_add_resource(resources, &iomem_resource);
 }
 
+struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
+						 int node, int link)
+{
+	struct pci_root_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return info;
+
+	INIT_LIST_HEAD(&info->resources);
+	info->bus_min = bus_min;
+	info->bus_max = bus_max;
+	info->node = node;
+	info->link = link;
+
+	list_add_tail(&info->list, &pci_root_infos);
+
+	return info;
+}
+
 void __devinit update_res(struct pci_root_info *info, resource_size_t start,
 			  resource_size_t end, unsigned long flags, int merge)
 {
-	int i;
 	struct resource *res;
+	struct pci_root_res *root_res;
 
 	if (start > end)
 		return;
@@ -69,11 +93,11 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start,
 		goto addit;
 
 	/* try to merge it with old one */
-	for (i = 0; i < info->res_num; i++) {
+	list_for_each_entry(root_res, &info->resources, list) {
 		resource_size_t final_start, final_end;
 		resource_size_t common_start, common_end;
 
-		res = &info->res[i];
+		res = &root_res->res;
 		if (res->flags != flags)
 			continue;
 
@@ -93,14 +117,15 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start,
 addit:
 
 	/* need to add that */
-	if (info->res_num >= RES_NUM)
+	root_res = kzalloc(sizeof(*root_res), GFP_KERNEL);
+	if (!root_res)
 		return;
 
-	res = &info->res[info->res_num];
+	res = &root_res->res;
 	res->name = info->name;
 	res->flags = flags;
 	res->start = start;
 	res->end = end;
-	res->child = NULL;
-	info->res_num++;
+
+	list_add_tail(&root_res->list, &info->resources);
 }
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 804a4b40c31a..226a466b2b2b 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -4,22 +4,24 @@
  * sub bus (transparent) will use entres from 3 to store extra from
  * root, so need to make sure we have enough slot there.
  */
-#define RES_NUM 16
+struct pci_root_res {
+	struct list_head list;
+	struct resource res;
+};
+
 struct pci_root_info {
+	struct list_head list;
 	char name[12];
-	unsigned int res_num;
-	struct resource res[RES_NUM];
+	struct list_head resources;
 	int bus_min;
 	int bus_max;
 	int node;
 	int link;
 };
 
-/* 4 at this time, it may become to 32 */
-#define PCI_ROOT_NR 4
-extern int pci_root_num;
-extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
-
+extern struct list_head pci_root_infos;
+struct pci_root_info *alloc_pci_root_info(int bus_min, int bus_max,
+						int node, int link);
 extern void update_res(struct pci_root_info *info, resource_size_t start,
 		      resource_size_t end, unsigned long flags, int merge);
 #endif
-- 
cgit v1.2.3


From c57ca65a6ea3171370cbb3010e5a3aea7399a5e1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 2 Apr 2012 18:31:54 -0700
Subject: x86/PCI: merge pcibios_scan_root() and pci_scan_bus_on_node()

pcibios_scan_root() and pci_scan_bus_on_node() were almost identical,
so this patch merges them.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/common.c | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 323481e06ef8..8e04ec591543 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -430,9 +430,7 @@ void __init dmi_check_pciprobe(void)
 
 struct pci_bus * __devinit pcibios_scan_root(int busnum)
 {
-	LIST_HEAD(resources);
 	struct pci_bus *bus = NULL;
-	struct pci_sysdata *sd;
 
 	while ((bus = pci_find_next_bus(bus)) != NULL) {
 		if (bus->number == busnum) {
@@ -441,28 +439,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 		}
 	}
 
-	/* Allocate per-root-bus (not per bus) arch-specific data.
-	 * TODO: leak; this memory is never freed.
-	 * It's arguable whether it's worth the trouble to care.
-	 */
-	sd = kzalloc(sizeof(*sd), GFP_KERNEL);
-	if (!sd) {
-		printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
-		return NULL;
-	}
-
-	sd->node = get_mp_bus_to_node(busnum);
-
-	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
-	x86_pci_root_bus_resources(busnum, &resources);
-	bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
-	if (!bus) {
-		pci_free_resource_list(&resources);
-		kfree(sd);
-	}
-
-	return bus;
+	return pci_scan_bus_on_node(busnum, &pci_root_ops,
+					get_mp_bus_to_node(busnum));
 }
+
 void __init pcibios_set_cache_line_size(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -656,6 +636,7 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
 	}
 	sd->node = node;
 	x86_pci_root_bus_resources(busno, &resources);
+	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno);
 	bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
 	if (!bus) {
 		pci_free_resource_list(&resources);
-- 
cgit v1.2.3


From 284f5f9dbac170b054c1e386ef92cbf654e91bba Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 30 Apr 2012 15:21:02 -0600
Subject: PCI: work around Stratus ftServer broken PCIe hierarchy

A PCIe downstream port is a P2P bridge.  Its secondary interface is
a link that should lead only to device 0 (unless ARI is enabled)[1], so
we don't probe for non-zero device numbers.

Some Stratus ftServer systems have a PCIe downstream port (02:00.0) that
leads to both an upstream port (03:00.0) and a downstream port (03:01.0),
and 03:01.0 has important devices below it:

  [0000:02]-+-00.0-[03-3c]--+-00.0-[04-09]--...
                            \-01.0-[0a-0d]--+-[USB]
                                            +-[NIC]
                                            +-...

Previously, we didn't enumerate device 03:01.0, so USB and the network
didn't work.  This patch adds a DMI quirk to scan all device numbers,
not just 0, below a downstream port.

Based on a patch by Prarit Bhargava.

[1] PCIe spec r3.0, sec 7.3.1

CC: Myron Stowe <mstowe@redhat.com>
CC: Don Dutile <ddutile@redhat.com>
CC: James Paradis <james.paradis@stratus.com>
CC: Matthew Wilcox <matthew.r.wilcox@intel.com>
CC: Jesse Barnes <jbarnes@virtuousgeek.org>
CC: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 Documentation/kernel-parameters.txt |  3 +++
 arch/x86/pci/common.c               | 16 ++++++++++++++++
 drivers/pci/pci.c                   |  3 +++
 drivers/pci/probe.c                 |  8 ++++++--
 include/asm-generic/pci-bridge.h    |  6 ++++++
 5 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..f995195409fd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2161,6 +2161,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 				on: Turn realloc on
 		realloc		same as realloc=on
 		noari		do not use PCIe ARI.
+		pcie_scan_all	Scan all possible PCIe devices.  Otherwise we
+				only look for one device below a PCIe downstream
+				port.
 
 	pcie_aspm=	[PCIE] Forcibly enable or disable PCIe Active State Power
 			Management.
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 323481e06ef8..16c5d7835295 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -11,6 +11,7 @@
 #include <linux/dmi.h>
 #include <linux/slab.h>
 
+#include <asm-generic/pci-bridge.h>
 #include <asm/acpi.h>
 #include <asm/segment.h>
 #include <asm/io.h>
@@ -229,6 +230,14 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
 }
 #endif
 
+static int __devinit set_scan_all(const struct dmi_system_id *d)
+{
+	printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n",
+	       d->ident);
+	pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS);
+	return 0;
+}
+
 static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 #ifdef __i386__
 /*
@@ -420,6 +429,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
 		},
 	},
+	{
+		.callback = set_scan_all,
+		.ident = "Stratus/NEC ftServer",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ftServer"),
+		},
+	},
 	{}
 };
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 111569ccab43..8e6c38817036 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/device.h>
 #include <linux/pm_runtime.h>
+#include <asm-generic/pci-bridge.h>
 #include <asm/setup.h>
 #include "pci.h"
 
@@ -3893,6 +3894,8 @@ static int __init pci_setup(char *str)
 				pcie_bus_config = PCIE_BUS_PERFORMANCE;
 			} else if (!strncmp(str, "pcie_bus_peer2peer", 18)) {
 				pcie_bus_config = PCIE_BUS_PEER2PEER;
+			} else if (!strncmp(str, "pcie_scan_all", 13)) {
+				pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS);
 			} else {
 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
 						str);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 5e1ca3c58a7d..2dc8675eea1a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/cpumask.h>
 #include <linux/pci-aspm.h>
+#include <asm-generic/pci-bridge.h>
 #include "pci.h"
 
 #define CARDBUS_LATENCY_TIMER	176	/* secondary latency timer */
@@ -1395,10 +1396,13 @@ static unsigned no_next_fn(struct pci_dev *dev, unsigned fn)
 static int only_one_child(struct pci_bus *bus)
 {
 	struct pci_dev *parent = bus->self;
+
 	if (!parent || !pci_is_pcie(parent))
 		return 0;
-	if (parent->pcie_type == PCI_EXP_TYPE_ROOT_PORT ||
-	    parent->pcie_type == PCI_EXP_TYPE_DOWNSTREAM)
+	if (parent->pcie_type == PCI_EXP_TYPE_ROOT_PORT)
+		return 1;
+	if (parent->pcie_type == PCI_EXP_TYPE_DOWNSTREAM &&
+	    !pci_has_flag(PCI_SCAN_ALL_PCIE_DEVS))
 		return 1;
 	return 0;
 }
diff --git a/include/asm-generic/pci-bridge.h b/include/asm-generic/pci-bridge.h
index a5b5d5a89a4f..20db2e5a0a69 100644
--- a/include/asm-generic/pci-bridge.h
+++ b/include/asm-generic/pci-bridge.h
@@ -30,6 +30,12 @@ enum {
 	PCI_ENABLE_PROC_DOMAINS	= 0x00000010,
 	/* ... except for domain 0 */
 	PCI_COMPAT_DOMAIN_0	= 0x00000020,
+
+	/* PCIe downstream ports are bridges that normally lead to only a
+	 * device 0, but if this is set, we scan all possible devices, not
+	 * just device 0.
+	 */
+	PCI_SCAN_ALL_PCIE_DEVS	= 0x00000040,
 };
 
 #ifdef CONFIG_PCI
-- 
cgit v1.2.3


From 4a8e2a3115e7aa4bd2deb4c6483d47c743e0fbb3 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 28 Mar 2012 12:37:36 -0400
Subject: x86/apic: Replace io_apic_ops with x86_io_apic_ops.

Which makes the code fit within the rest of the x86_ops functions.

Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
[v1: Changed x86_apic -> x86_ioapic per Yinghai Lu <yinghai@kernel.org> suggestion]
[v2: Rebased on tip/x86/urgent and redid to match Ingo's syntax style]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/io_apic.h  | 35 ++++++++++++++++++++-----------
 arch/x86/include/asm/x86_init.h |  9 +++++++-
 arch/x86/kernel/apic/io_apic.c  | 46 ++++-------------------------------------
 arch/x86/kernel/setup.c         |  2 +-
 arch/x86/kernel/x86_init.c      |  8 +++++++
 5 files changed, 44 insertions(+), 56 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 2c4943de5150..73d8c5398ea9 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -5,7 +5,7 @@
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/irq_vectors.h>
-
+#include <asm/x86_init.h>
 /*
  * Intel IO-APIC support for SMP and UP systems.
  *
@@ -21,15 +21,6 @@
 #define IO_APIC_REDIR_LEVEL_TRIGGER	(1 << 15)
 #define IO_APIC_REDIR_MASKED		(1 << 16)
 
-struct io_apic_ops {
-	void		(*init)  (void);
-	unsigned int	(*read)  (unsigned int apic, unsigned int reg);
-	void		(*write) (unsigned int apic, unsigned int reg, unsigned int value);
-	void		(*modify)(unsigned int apic, unsigned int reg, unsigned int value);
-};
-
-void __init set_io_apic_ops(const struct io_apic_ops *);
-
 /*
  * The structure of the IO-APIC:
  */
@@ -156,7 +147,6 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);
 
 int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
@@ -185,12 +175,29 @@ extern void mp_save_irq(struct mpc_intsrc *m);
 
 extern void disable_ioapic_support(void);
 
+extern void __init native_io_apic_init_mappings(void);
+extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
+extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
+extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+	return x86_io_apic_ops.read(apic, reg);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	x86_io_apic_ops.write(apic, reg, value);
+}
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	x86_io_apic_ops.modify(apic, reg, value);
+}
 #else  /* !CONFIG_X86_IO_APIC */
 
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
 static const int timer_through_8259 = 0;
-static inline void ioapic_and_gsi_init(void) { }
 static inline void ioapic_insert_resources(void) { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
@@ -212,6 +219,10 @@ static inline int restore_ioapic_entries(void)
 
 static inline void mp_save_irq(struct mpc_intsrc *m) { };
 static inline void disable_ioapic_support(void) { }
+#define native_io_apic_init_mappings	NULL
+#define native_io_apic_read		NULL
+#define native_io_apic_write		NULL
+#define native_io_apic_modify		NULL
 #endif
 
 #endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 764b66a4cf89..c090af10ac7d 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -188,11 +188,18 @@ struct x86_msi_ops {
 	void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
 };
 
+struct x86_io_apic_ops {
+	void		(*init)  (void);
+	unsigned int	(*read)  (unsigned int apic, unsigned int reg);
+	void		(*write) (unsigned int apic, unsigned int reg, unsigned int value);
+	void		(*modify)(unsigned int apic, unsigned int reg, unsigned int value);
+};
+
 extern struct x86_init_ops x86_init;
 extern struct x86_cpuinit_ops x86_cpuinit;
 extern struct x86_platform_ops x86_platform;
 extern struct x86_msi_ops x86_msi;
-
+extern struct x86_io_apic_ops x86_io_apic_ops;
 extern void x86_init_noop(void);
 extern void x86_init_uint_noop(unsigned int unused);
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80a..973539c128a4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,24 +68,6 @@
 #define for_each_irq_pin(entry, head) \
 	for (entry = head; entry; entry = entry->next)
 
-static void		__init __ioapic_init_mappings(void);
-
-static unsigned int	__io_apic_read  (unsigned int apic, unsigned int reg);
-static void		__io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
-static void		__io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
-
-static struct io_apic_ops io_apic_ops = {
-	.init	= __ioapic_init_mappings,
-	.read	= __io_apic_read,
-	.write	= __io_apic_write,
-	.modify = __io_apic_modify,
-};
-
-void __init set_io_apic_ops(const struct io_apic_ops *ops)
-{
-	io_apic_ops = *ops;
-}
-
 /*
  *      Is the SiS APIC rmw bug present ?
  *      -1 = don't know, 0 = no, 1 = yes
@@ -313,21 +295,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 	irq_free_desc(at);
 }
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-	return io_apic_ops.read(apic, reg);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	io_apic_ops.write(apic, reg, value);
-}
-
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	io_apic_ops.modify(apic, reg, value);
-}
-
 
 struct io_apic {
 	unsigned int index;
@@ -349,14 +316,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 	writel(vector, &io_apic->eoi);
 }
 
-static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	return readl(&io_apic->data);
 }
 
-static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -370,7 +337,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va
  *
  * Older SiS APIC requires we rewrite the index register
  */
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -3931,12 +3898,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 	return res;
 }
 
-void __init ioapic_and_gsi_init(void)
-{
-	io_apic_ops.init();
-}
-
-static void __init __ioapic_init_mappings(void)
+void __init native_io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..8526317c5f0b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1012,7 +1012,7 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 
 	init_apic_mappings();
-	ioapic_and_gsi_init();
+	x86_io_apic_ops.init();
 
 	kvm_guest_init();
 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9cf71d0b2d37..35c5e543f550 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
 #include <asm/e820.h>
 #include <asm/time.h>
 #include <asm/irq.h>
+#include <asm/io_apic.h>
 #include <asm/pat.h>
 #include <asm/tsc.h>
 #include <asm/iommu.h>
@@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = {
 	.teardown_msi_irqs = default_teardown_msi_irqs,
 	.restore_msi_irqs = default_restore_msi_irqs,
 };
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+	.init	= native_io_apic_init_mappings,
+	.read	= native_io_apic_read,
+	.write	= native_io_apic_write,
+	.modify	= native_io_apic_modify,
+};
-- 
cgit v1.2.3


From 31b3c9d723407b395564d1fff3624cc0083ae520 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 20 Mar 2012 18:53:10 -0400
Subject: xen/x86: Implement x86_apic_ops

Or rather just implement one different function as opposed
to the native one : the read function.

We synthesize the values.

Acked-by:  Suresh Siddha <suresh.b.siddha@intel.com>
[v1: Rebased on top of tip/x86/urgent]
[v2: Return 0xfd instead of 0xff in the default case]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/Makefile    |  2 +-
 arch/x86/xen/apic.c      | 17 +++++++++++++++++
 arch/x86/xen/enlighten.c |  2 ++
 arch/x86/xen/xen-ops.h   |  4 ++++
 4 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/xen/apic.c

(limited to 'arch/x86')

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index add2c2d729ce..96ab2c09cb68 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -20,5 +20,5 @@ obj-$(CONFIG_EVENT_TRACING) += trace.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
-obj-$(CONFIG_XEN_DOM0)		+= vga.o
+obj-$(CONFIG_XEN_DOM0)		+= apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 000000000000..73ade38caa32
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,17 @@
+#include <linux/init.h>
+#include <asm/x86_init.h>
+
+unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
+{
+	if (reg == 0x1)
+		return 0x00170020;
+	else if (reg == 0x0)
+		return apic << 24;
+
+	return 0xfd;
+}
+
+void __init xen_init_apic(void)
+{
+	x86_io_apic_ops.read = xen_io_apic_read;
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a8f8844b8d32..c2ea9e9f420d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1362,6 +1362,8 @@ asmlinkage void __init xen_start_kernel(void)
 		xen_start_info->console.domU.mfn = 0;
 		xen_start_info->console.domU.evtchn = 0;
 
+		xen_init_apic();
+
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
 	}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b095739ccd4c..45c0c0667bd9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -92,11 +92,15 @@ struct dom0_vga_console_info;
 
 #ifdef CONFIG_XEN_DOM0
 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+void __init xen_init_apic(void);
 #else
 static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
 				       size_t size)
 {
 }
+static inline void __init xen_init_apic(void)
+{
+}
 #endif
 
 /* Declare an asm function, along with symbols needed to make it
-- 
cgit v1.2.3


From 27abd14bd9f1117dc7bdeee81a2de0557e077b61 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 16 Apr 2012 13:53:40 -0400
Subject: Revert "xen/x86: Workaround 'x86/ioapic: Add register level checks to
 detect bogus io-apic entries'"

This reverts commit 2531d64b6fe2724dc432b67d8dc66bd45621da0b.

The two patches:
      x86/apic: Replace io_apic_ops with x86_io_apic_ops.
      xen/x86: Implement x86_apic_ops

take care of fixing it properly.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b8e279479a6b..988828b479ed 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1859,7 +1859,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 #endif	/* CONFIG_X86_64 */
 
 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
-static unsigned char fake_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
 
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 {
@@ -1900,7 +1899,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 		 * We just don't map the IO APIC - all access is via
 		 * hypercalls.  Keep the address in the pte for reference.
 		 */
-		pte = pfn_pte(PFN_DOWN(__pa(fake_ioapic_mapping)), PAGE_KERNEL);
+		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
 		break;
 #endif
 
@@ -2065,7 +2064,6 @@ void __init xen_init_mmu_ops(void)
 	pv_mmu_ops = xen_mmu_ops;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
-	memset(fake_ioapic_mapping, 0xfd, PAGE_SIZE);
 }
 
 /* Protected by xen_reservation_lock. */
-- 
cgit v1.2.3


From ab6ec39a191243b9968bb9ac7f26cc7ec30c618b Mon Sep 17 00:00:00 2001
From: Lin Ming <mlin@ss.pku.edu.cn>
Date: Tue, 1 May 2012 00:16:27 +0800
Subject: xen/apic: implement io apic read with hypercall

Implements xen_io_apic_read with hypercall, so it returns proper
IO-APIC information instead of fabricated one.

Fallback to return an emulated IO_APIC values if hypercall fails.

[v2: fallback to return an emulated IO_APIC values if hypercall fails]
Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/apic.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 73ade38caa32..1913bf2d2a9c 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -1,8 +1,21 @@
 #include <linux/init.h>
 #include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <xen/interface/physdev.h>
+#include <asm/xen/hypercall.h>
 
 unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 {
+	struct physdev_apic apic_op;
+	int ret;
+
+	apic_op.apic_physbase = mpc_ioapic_addr(apic);
+	apic_op.reg = reg;
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+	if (!ret)
+		return apic_op.value;
+
+	/* fallback to return an emulated IO_APIC values */
 	if (reg == 0x1)
 		return 0x00170020;
 	else if (reg == 0x0)
-- 
cgit v1.2.3


From 0f1103e40f9186bd2cdac4dde6c5bbd2f5273365 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 1 May 2012 17:25:18 -0600
Subject: x86/PCI: fix unused variable warning in amd_bus.c

Fix this warning:

  arch/x86/pci/amd_bus.c: In function 'early_fill_mp_bus_info':
  arch/x86/pci/amd_bus.c:56:6: warning: unused variable 'j' [-Wunused-variable]

introduced by commit d28e5ac2a07e ("x86/PCI: dynamically allocate
pci_root_info for native host bridge drivers").

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/amd_bus.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 459a7316375c..5aed49bff058 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -44,6 +44,15 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
 	return NULL;
 }
 
+static void __init set_mp_bus_range_to_node(int min_bus, int max_bus, int node)
+{
+#ifdef CONFIG_NUMA
+	int j;
+
+	for (j = min_bus; j <= max_bus; j++)
+		set_mp_bus_to_node(j, node);
+#endif
+}
 /**
  * early_fill_mp_bus_to_node()
  * called before pcibios_scan_root and pci_scan_bus
@@ -53,7 +62,6 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
 static int __init early_fill_mp_bus_info(void)
 {
 	int i;
-	int j;
 	unsigned bus;
 	unsigned slot;
 	int node;
@@ -109,10 +117,7 @@ static int __init early_fill_mp_bus_info(void)
 		min_bus = (reg >> 16) & 0xff;
 		max_bus = (reg >> 24) & 0xff;
 		node = (reg >> 4) & 0x07;
-#ifdef CONFIG_NUMA
-		for (j = min_bus; j <= max_bus; j++)
-			set_mp_bus_to_node(j, node);
-#endif
+		set_mp_bus_range_to_node(min_bus, max_bus, node);
 		link = (reg >> 8) & 0x03;
 
 		info = alloc_pci_root_info(min_bus, max_bus, node, link);
-- 
cgit v1.2.3


From 078de5f706ece36afd73bb4b8283314132d2dfdf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 8 Feb 2012 07:00:08 -0800
Subject: userns: Store uid and gid values in struct cred with kuid_t and
 kgid_t types

cred.h and a few trivial users of struct cred are changed.  The rest of the users
of struct cred are left for other patches as there are too many changes to make
in one go and leave the change reviewable.  If the user namespace is disabled and
CONFIG_UIDGID_STRICT_TYPE_CHECKS are disabled the code will contiue to compile
and behave correctly.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/mm/fault.c            |  2 +-
 fs/ioprio.c                    |  8 ++------
 include/linux/cred.h           | 16 ++++++++--------
 include/linux/user_namespace.h |  8 ++++----
 kernel/cred.c                  | 36 ++++++++++++++++++++++--------------
 kernel/signal.c                | 14 ++++++++------
 kernel/sys.c                   | 26 +++++++++-----------------
 kernel/user_namespace.c        |  4 ++--
 mm/oom_kill.c                  |  4 ++--
 security/commoncap.c           |  3 +--
 10 files changed, 59 insertions(+), 62 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3ecfd1aaf214..76dcd9d8e0bc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -582,7 +582,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pte_t *pte = lookup_address(address, &level);
 
 		if (pte && pte_present(*pte) && !pte_exec(*pte))
-			printk(nx_warning, current_uid());
+			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 	}
 
 	printk(KERN_ALERT "BUG: unable to handle kernel ");
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 8e35e964d9ed..2072e41785d2 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -123,9 +123,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (!uid_eq(tcred_uid, uid))
+				if (!uid_eq(task_uid(p), uid))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -220,9 +218,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (!uid_eq(tcred_uid, user->uid))
+				if (!uid_eq(task_uid(p), user->uid))
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 0ab3cda4a774..fac0579258fc 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -123,14 +123,14 @@ struct cred {
 #define CRED_MAGIC	0x43736564
 #define CRED_MAGIC_DEAD	0x44656144
 #endif
-	uid_t		uid;		/* real UID of the task */
-	gid_t		gid;		/* real GID of the task */
-	uid_t		suid;		/* saved UID of the task */
-	gid_t		sgid;		/* saved GID of the task */
-	uid_t		euid;		/* effective UID of the task */
-	gid_t		egid;		/* effective GID of the task */
-	uid_t		fsuid;		/* UID for VFS ops */
-	gid_t		fsgid;		/* GID for VFS ops */
+	kuid_t		uid;		/* real UID of the task */
+	kgid_t		gid;		/* real GID of the task */
+	kuid_t		suid;		/* saved UID of the task */
+	kgid_t		sgid;		/* saved GID of the task */
+	kuid_t		euid;		/* effective UID of the task */
+	kgid_t		egid;		/* effective GID of the task */
+	kuid_t		fsuid;		/* UID for VFS ops */
+	kgid_t		fsgid;		/* GID for VFS ops */
 	unsigned	securebits;	/* SUID-less security management */
 	kernel_cap_t	cap_inheritable; /* caps our children can inherit */
 	kernel_cap_t	cap_permitted;	/* caps we're permitted */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4c9846d90741..a2c61457cba1 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -70,15 +70,15 @@ static inline void put_user_ns(struct user_namespace *ns)
 #endif
 
 static inline uid_t user_ns_map_uid(struct user_namespace *to,
-	const struct cred *cred, uid_t uid)
+	const struct cred *cred, kuid_t uid)
 {
-	return from_kuid_munged(to, make_kuid(cred->user_ns, uid));
+	return from_kuid_munged(to, uid);
 }
 
 static inline gid_t user_ns_map_gid(struct user_namespace *to,
-	const struct cred *cred, gid_t gid)
+	const struct cred *cred, kgid_t gid)
 {
-	return from_kgid_munged(to, make_kgid(cred->user_ns, gid));
+	return from_kgid_munged(to, gid);
 }
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 7a0d80669886..eddc5e2e9587 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,6 +49,14 @@ struct cred init_cred = {
 	.subscribers		= ATOMIC_INIT(2),
 	.magic			= CRED_MAGIC,
 #endif
+	.uid			= GLOBAL_ROOT_UID,
+	.gid			= GLOBAL_ROOT_GID,
+	.suid			= GLOBAL_ROOT_UID,
+	.sgid			= GLOBAL_ROOT_GID,
+	.euid			= GLOBAL_ROOT_UID,
+	.egid			= GLOBAL_ROOT_GID,
+	.fsuid			= GLOBAL_ROOT_UID,
+	.fsgid			= GLOBAL_ROOT_GID,
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_EMPTY_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -488,10 +496,10 @@ int commit_creds(struct cred *new)
 	get_cred(new); /* we will require a ref for the subj creds too */
 
 	/* dumpability changes */
-	if (old->euid != new->euid ||
-	    old->egid != new->egid ||
-	    old->fsuid != new->fsuid ||
-	    old->fsgid != new->fsgid ||
+	if (!uid_eq(old->euid, new->euid) ||
+	    !gid_eq(old->egid, new->egid) ||
+	    !uid_eq(old->fsuid, new->fsuid) ||
+	    !gid_eq(old->fsgid, new->fsgid) ||
 	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
 		if (task->mm)
 			set_dumpable(task->mm, suid_dumpable);
@@ -500,9 +508,9 @@ int commit_creds(struct cred *new)
 	}
 
 	/* alter the thread keyring */
-	if (new->fsuid != old->fsuid)
+	if (!uid_eq(new->fsuid, old->fsuid))
 		key_fsuid_changed(task);
-	if (new->fsgid != old->fsgid)
+	if (!gid_eq(new->fsgid, old->fsgid))
 		key_fsgid_changed(task);
 
 	/* do it
@@ -519,16 +527,16 @@ int commit_creds(struct cred *new)
 	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
-	if (new->uid   != old->uid  ||
-	    new->euid  != old->euid ||
-	    new->suid  != old->suid ||
-	    new->fsuid != old->fsuid)
+	if (!uid_eq(new->uid,   old->uid)  ||
+	    !uid_eq(new->euid,  old->euid) ||
+	    !uid_eq(new->suid,  old->suid) ||
+	    !uid_eq(new->fsuid, old->fsuid))
 		proc_id_connector(task, PROC_EVENT_UID);
 
-	if (new->gid   != old->gid  ||
-	    new->egid  != old->egid ||
-	    new->sgid  != old->sgid ||
-	    new->fsgid != old->fsgid)
+	if (!gid_eq(new->gid,   old->gid)  ||
+	    !gid_eq(new->egid,  old->egid) ||
+	    !gid_eq(new->sgid,  old->sgid) ||
+	    !gid_eq(new->fsgid, old->fsgid))
 		proc_id_connector(task, PROC_EVENT_GID);
 
 	/* release the old obj and subj refs both */
diff --git a/kernel/signal.c b/kernel/signal.c
index e2c5d84f2dac..2734dc965f69 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1038,8 +1038,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
 	if (SI_FROMKERNEL(info))
 		return;
 
-	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
-					current_cred(), info->si_uid);
+	rcu_read_lock();
+	info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+					make_kuid(current_user_ns(), info->si_uid));
+	rcu_read_unlock();
 }
 #else
 static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1106,7 +1108,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
-			q->info.si_uid = current_uid();
+			q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1973,7 +1975,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 	info.si_signo = signr;
 	info.si_code = exit_code;
 	info.si_pid = task_pid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
 	ptrace_stop(exit_code, why, 1, &info);
@@ -2828,7 +2830,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 	info.si_errno = 0;
 	info.si_code = SI_USER;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	return kill_something_info(sig, &info, pid);
 }
@@ -2871,7 +2873,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	info.si_errno = 0;
 	info.si_code = SI_TKILL;
 	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	return do_send_specific(tgid, pid, sig, &info);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index f0c43b4b6657..39962818c008 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -175,7 +175,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 	const struct cred *cred = current_cred();
 	int error = -EINVAL;
 	struct pid *pgrp;
-	kuid_t cred_uid;
 	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
@@ -209,22 +208,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			cred_uid = make_kuid(cred->user_ns, cred->uid);
 			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				uid = cred_uid;
-			else if (!uid_eq(uid, cred_uid) &&
+				uid = cred->uid;
+			else if (!uid_eq(uid, cred->uid) &&
 				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (uid_eq(tcred_uid, uid))
+				if (uid_eq(task_uid(p), uid))
 					error = set_one_prio(p, niceval, error);
 			} while_each_thread(g, p);
-			if (!uid_eq(uid, cred_uid))
+			if (!uid_eq(uid, cred->uid))
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -248,7 +244,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 	const struct cred *cred = current_cred();
 	long niceval, retval = -ESRCH;
 	struct pid *pgrp;
-	kuid_t cred_uid;
 	kuid_t uid;
 
 	if (which > PRIO_USER || which < PRIO_PROCESS)
@@ -280,25 +275,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			cred_uid = make_kuid(cred->user_ns, cred->uid);
 			uid = make_kuid(cred->user_ns, who);
 			user = cred->user;
 			if (!who)
-				uid = cred_uid;
-			else if (!uid_eq(uid, cred_uid) &&
+				uid = cred->uid;
+			else if (!uid_eq(uid, cred->uid) &&
 				 !(user = find_user(uid)))
 				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p) {
-				const struct cred *tcred = __task_cred(p);
-				kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid);
-				if (uid_eq(tcred_uid, uid)) {
+				if (uid_eq(task_uid(p), uid)) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
 			} while_each_thread(g, p);
-			if (!uid_eq(uid, cred_uid))
+			if (!uid_eq(uid, cred->uid))
 				free_uid(user);		/* for find_user() */
 			break;
 	}
@@ -641,7 +633,7 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(make_kuid(new->user_ns, new->uid));
+	new_user = alloc_uid(new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7eff867bfac5..86602316422d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -36,8 +36,8 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
-	kuid_t owner = make_kuid(new->user_ns, new->euid);
-	kgid_t group = make_kgid(new->user_ns, new->egid);
+	kuid_t owner = new->euid;
+	kgid_t group = new->egid;
 
 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..9f09a1fde9f9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -410,8 +410,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
 		}
 
 		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n",
-			task->pid, task_uid(task), task->tgid,
-			task->mm->total_vm, get_mm_rss(task->mm),
+			task->pid, from_kuid(&init_user_ns, task_uid(task)),
+			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			task_cpu(task), task->signal->oom_adj,
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
diff --git a/security/commoncap.c b/security/commoncap.c
index f2399d8afbe0..dbd465a59286 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -77,8 +77,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 {
 	for (;;) {
 		/* The owner of the user namespace has all caps. */
-		if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner,
-						       make_kuid(cred->user_ns, cred->euid)))
+		if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid))
 			return 0;
 
 		/* Do we have the necessary capabilities? */
-- 
cgit v1.2.3


From 59a094c994a138049b41a44bc29cff9407d51c5b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 4 May 2012 09:26:16 -0400
Subject: ftrace/x86: Use asm/kprobes.h instead of linux/kprobes.h

If CONFIG_KPROBES is not set, then linux/kprobes.h will not include
asm/kprobes.h needed by x86/ftrace.c for the BREAKPOINT macro.

The x86/ftrace.c file should just include asm/kprobes.h as it does not
need the rest of kprobes.

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cf2d03ec1793..4243e8bbdcb1 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -20,11 +20,11 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/module.h>
-#include <linux/kprobes.h>
 
 #include <trace/syscall.h>
 
 #include <asm/cacheflush.h>
+#include <asm/kprobes.h>
 #include <asm/ftrace.h>
 #include <asm/nops.h>
 
-- 
cgit v1.2.3


From 45046892ef89c1e0caad66a03c8c1e14ad478d23 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 May 2012 09:03:01 +0000
Subject: x86: Use generic init_task

Same code. Use the generic version. The special Makefile treatment is
pointless anyway as init_task.o contains only data which is handled by
the linker script. So no point on being treated like head text.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120503085035.739963562@linutronix.de
Cc: x86@kernel.org
---
 arch/x86/Kconfig            |  1 +
 arch/x86/Makefile           |  1 -
 arch/x86/kernel/Makefile    |  2 +-
 arch/x86/kernel/init_task.c | 42 ------------------------------------------
 arch/x86/kernel/process.c   |  9 +++++++++
 5 files changed, 11 insertions(+), 44 deletions(-)
 delete mode 100644 arch/x86/kernel/init_task.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 046bf4bd2510..224695938400 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -83,6 +83,7 @@ config X86
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
 	select GENERIC_SMP_IDLE_THREAD
+	select HAVE_GENERIC_INIT_TASK
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..3e48b26f67d5 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -146,7 +146,6 @@ archheaders:
 head-y := arch/x86/kernel/head_$(BITS).o
 head-y += arch/x86/kernel/head$(BITS).o
 head-y += arch/x86/kernel/head.o
-head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 532d2e090e6f..56ebd1f98447 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y                := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf44947..000000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
-	{ INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..8aa532fa015d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -27,6 +27,15 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-- 
cgit v1.2.3


From a6359d1eec43d1fd6ffbac958149844873e0084f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 May 2012 09:03:02 +0000
Subject: init_task: Replace CONFIG_HAVE_GENERIC_INIT_TASK

Now that all archs except ia64 are converted, replace the config and
let the ia64 select CONFIG_ARCH_INIT_TASK

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120503085035.867948914@linutronix.de
---
 arch/Kconfig            | 3 ++-
 arch/alpha/Kconfig      | 1 -
 arch/arm/Kconfig        | 1 -
 arch/avr32/Kconfig      | 1 -
 arch/blackfin/Kconfig   | 1 -
 arch/c6x/Kconfig        | 1 -
 arch/cris/Kconfig       | 1 -
 arch/frv/Kconfig        | 1 -
 arch/h8300/Kconfig      | 1 -
 arch/hexagon/Kconfig    | 1 -
 arch/ia64/Kconfig       | 1 +
 arch/m32r/Kconfig       | 1 -
 arch/m68k/Kconfig       | 1 -
 arch/microblaze/Kconfig | 1 -
 arch/mips/Kconfig       | 1 -
 arch/mn10300/Kconfig    | 1 -
 arch/openrisc/Kconfig   | 1 -
 arch/parisc/Kconfig     | 1 -
 arch/powerpc/Kconfig    | 1 -
 arch/s390/Kconfig       | 1 -
 arch/score/Kconfig      | 1 -
 arch/sh/Kconfig         | 1 -
 arch/sparc/Kconfig      | 1 -
 arch/tile/Kconfig       | 1 -
 arch/um/Kconfig.common  | 1 -
 arch/unicore32/Kconfig  | 1 -
 arch/x86/Kconfig        | 1 -
 arch/xtensa/Kconfig     | 1 -
 init/Makefile           | 5 ++++-
 29 files changed, 7 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 2dd8fdd7ea9f..597b132b3902 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -148,7 +148,8 @@ config USE_GENERIC_SMP_HELPERS
 config GENERIC_SMP_IDLE_THREAD
        bool
 
-config HAVE_GENERIC_INIT_TASK
+# Select if arch init_task initializer is different to init/init_task.c
+config ARCH_INIT_TASK
        bool
 
 config HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 74d000480b69..991b8bbff4ff 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -16,7 +16,6 @@ config ALPHA
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 8b365353a10d..cb253ce218a0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -35,7 +35,6 @@ config ARM
 	select GENERIC_PCI_IOMAP
 	select HAVE_BPF_JIT if NET
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index f4289ca78b55..3dea7231f637 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -12,7 +12,6 @@ config AVR32
 	select HARDIRQS_SW_RESEND
 	select GENERIC_IRQ_SHOW
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 8570d6e21807..779b9c846fd7 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -38,7 +38,6 @@ config BLACKFIN
 	select IRQ_PER_CPU if SMP
 	select HAVE_NMI_WATCHDOG if NMI_WATCHDOG
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 4189fb52d519..1c3ccd416d50 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -10,7 +10,6 @@ config TMS320C6X
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_GENERIC_HARDIRQS
-	select HAVE_GENERIC_INIT_TASK
 	select HAVE_MEMBLOCK
 	select SPARSE_IRQ
 	select IRQ_DOMAIN
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 15e30a771a72..2995035812ec 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -50,7 +50,6 @@ config CRIS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IOMAP
 	select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32
-	select HAVE_GENERIC_INIT_TASK
 
 config HZ
 	int
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index ed6dbd290c42..a685910d2d5c 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -9,7 +9,6 @@ config FRV
 	select GENERIC_IRQ_SHOW
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_CPU_DEVICES
-	select HAVE_GENERIC_INIT_TASK
 
 config ZONE_DMA
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 5fac425aece4..56e890df5053 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -5,7 +5,6 @@ config H8300
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_CPU_DEVICES
-	select HAVE_GENERIC_INIT_TASK
 
 config SYMBOL_PREFIX
 	string
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 6ee5488ed305..d2e4a3330336 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -28,7 +28,6 @@ config HEXAGON
 	select NO_IOPORT
 	select GENERIC_IOMAP
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 	# mostly generic routines, with some accelerated ones
 	---help---
 	  Qualcomm Hexagon is a processor architecture designed for high
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 11975475516a..022ea3a9d1ab 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -34,6 +34,7 @@ config IA64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select GENERIC_SMP_IDLE_THREAD
+	select ARCH_INIT_TASK
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 8b8bd7fa148a..ef80a6546ff2 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -11,7 +11,6 @@ config M32R
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
 	select GENERIC_ATOMIC64
-	select HAVE_GENERIC_INIT_TASK
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 1891127c7db0..d318c606c888 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -8,7 +8,6 @@ config M68K
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
 	select GENERIC_CPU_DEVICES
 	select FPU if MMU
-	select HAVE_GENERIC_INIT_TASK
 
 config RWSEM_GENERIC_SPINLOCK
 	bool
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 21ccba6a05f9..ac22dc7f4cab 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -22,7 +22,6 @@ config MICROBLAZE
 	select GENERIC_PCI_IOMAP
 	select GENERIC_CPU_DEVICES
 	select GENERIC_ATOMIC64
-	select HAVE_GENERIC_INIT_TASK
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d6c78901e5f2..186fc8cf9ee0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -30,7 +30,6 @@ config MIPS
 	select HAVE_MEMBLOCK_NODE_MAP
 	select ARCH_DISCARD_MEMBLOCK
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 menu "Machine selection"
 
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index d28b6eb1b122..3aa3de017159 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -6,7 +6,6 @@ config MN10300
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_KGDB
 	select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER
-	select HAVE_GENERIC_INIT_TASK
 
 config AM33_2
 	def_bool n
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 6d921936f4ab..a4787197d8fe 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -17,7 +17,6 @@ config OPENRISC
 	select GENERIC_IOMAP
 	select GENERIC_CPU_DEVICES
 	select GENERIC_ATOMIC64
-	select HAVE_GENERIC_INIT_TASK
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 4c6ca0de90cc..ddb8b24b823d 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -18,7 +18,6 @@ config PARISC
 	select IRQ_PER_CPU
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 946e8816ecd3..c81553508366 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -145,7 +145,6 @@ config PPC
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6c0eb214ab27..15cab3ee44e8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -123,7 +123,6 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index c760bccfad40..4b285779ac05 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -9,7 +9,6 @@ config SCORE
        select HAVE_MEMBLOCK_NODE_MAP
        select ARCH_DISCARD_MEMBLOCK
        select GENERIC_CPU_DEVICES
-       select HAVE_GENERIC_INIT_TASK
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a0cd70be8656..244cfd0dbb7b 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -29,7 +29,6 @@ config SUPERH
 	select GENERIC_ATOMIC64
 	select GENERIC_IRQ_SHOW
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 99aad7cd0075..e417f35d5912 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -31,7 +31,6 @@ config SPARC
 	select GENERIC_PCI_IOMAP
 	select HAVE_NMI_WATCHDOG if SPARC64
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 4fa3ff5a7bc3..96033e2d6845 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -13,7 +13,6 @@ config TILE
 	select GENERIC_IRQ_SHOW
 	select SYS_HYPERVISOR
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select HAVE_GENERIC_INIT_TASK
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index f03473cf86df..20a49ba93cb9 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -10,7 +10,6 @@ config UML
 	select GENERIC_IRQ_SHOW
 	select GENERIC_CPU_DEVICES
 	select GENERIC_IO
-	select HAVE_GENERIC_INIT_TASK
 
 config MMU
 	bool
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index e24ca398120e..eeb8054c7cd8 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -13,7 +13,6 @@ config UNICORE32
 	select GENERIC_IRQ_SHOW
 	select ARCH_WANT_FRAME_POINTERS
 	select GENERIC_IOMAP
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  UniCore-32 is 32-bit Instruction Set Architecture,
 	  including a series of low-power-consumption RISC chip
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 224695938400..046bf4bd2510 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -83,7 +83,6 @@ config X86
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
 	select GENERIC_SMP_IDLE_THREAD
-	select HAVE_GENERIC_INIT_TASK
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index d0ab5bb0d582..8a3f8351f438 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -10,7 +10,6 @@ config XTENSA
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_CPU_DEVICES
-	select HAVE_GENERIC_INIT_TASK
 	help
 	  Xtensa processors are 32-bit RISC machines designed by Tensilica
 	  primarily for embedded systems.  These processors are both
diff --git a/init/Makefile b/init/Makefile
index c55eac955cdc..7bc47ee31c36 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -9,7 +9,10 @@ else
 obj-$(CONFIG_BLK_DEV_INITRD)   += initramfs.o
 endif
 obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
-obj-$(CONFIG_HAVE_GENERIC_INIT_TASK) += init_task.o
+
+ifneq ($(CONFIG_ARCH_INIT_TASK),y)
+obj-y                          += init_task.o
+endif
 
 mounts-y			:= do_mounts.o
 mounts-$(CONFIG_BLK_DEV_RAM)	+= do_mounts_rd.o
-- 
cgit v1.2.3


From 57c22e5f35aa4b9b2fe11f73f3e62bbf9ef36190 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 2 May 2012 17:55:56 +0300
Subject: KVM: fix cpuid eax for KVM leaf

cpuid eax should return the max leaf so that
guests can find out the valid range.
This matches Xen et al.
Update documentation to match.

Tested with -cpu host.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/cpuid.txt | 6 +++++-
 arch/x86/kvm/cpuid.c                | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 882068538c9c..83afe65d4966 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -10,11 +10,15 @@ a guest.
 KVM cpuid functions are:
 
 function: KVM_CPUID_SIGNATURE (0x40000000)
-returns : eax = 0,
+returns : eax = 0x40000001,
           ebx = 0x4b4d564b,
           ecx = 0x564b4d56,
           edx = 0x4d.
 Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+The value in eax corresponds to the maximum cpuid function present in this leaf,
+and will be updated if more functions are added in the future.
+Note also that old hosts set eax value to 0x0. This should
+be interpreted as if the value was 0x40000001.
 This function queries the presence of KVM cpuid leafs.
 
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c2134b881033..7df1c6d839fb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -398,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
+		entry->eax = KVM_CPUID_FEATURES;
 		entry->ebx = sigptr[0];
 		entry->ecx = sigptr[1];
 		entry->edx = sigptr[2];
-- 
cgit v1.2.3


From 9b72d3b07dd99ac8ab2b84de5004a295af460536 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Apr 2012 14:45:49 +0300
Subject: KVM guest: make kvm_para_available() check hypervisor bit reading
 cpuid leaf

This cpuid range does not exist on real HW and Intel spec says that
"Information returned for highest basic information leaf" will be
returned. Not very well defined.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 99c4bbe0cca2..a7a7a94b94ce 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -178,14 +178,16 @@ static inline int kvm_para_available(void)
 	unsigned int eax, ebx, ecx, edx;
 	char signature[13];
 
-	cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
-	memcpy(signature + 0, &ebx, 4);
-	memcpy(signature + 4, &ecx, 4);
-	memcpy(signature + 8, &edx, 4);
-	signature[12] = 0;
-
-	if (strcmp(signature, "KVMKVMKVM") == 0)
-		return 1;
+	if (cpu_has_hypervisor) {
+		cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+		memcpy(signature + 0, &ebx, 4);
+		memcpy(signature + 4, &ecx, 4);
+		memcpy(signature + 8, &edx, 4);
+		signature[12] = 0;
+
+		if (strcmp(signature, "KVMKVMKVM") == 0)
+			return 1;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1c2545be05f436523cabc54087c6a60ea10110d3 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 30 Apr 2012 17:46:31 +0900
Subject: KVM: x86 emulator: Move ModRM flags for groups to top level opcode
 tables

Needed for the following patch which simplifies ModRM fetching code.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 111 +++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 55 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0d151e232480..8d2c3d04cfec 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3359,8 +3359,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 		      .check_perm = (_p) }
 #define N    D(0)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define II(_f, _e, _i) \
 	{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3380,25 +3380,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 		I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
 static struct opcode group7_rm1[] = {
-	DI(SrcNone | ModRM | Priv, monitor),
-	DI(SrcNone | ModRM | Priv, mwait),
+	DI(SrcNone | Priv, monitor),
+	DI(SrcNone | Priv, mwait),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group7_rm3[] = {
-	DIP(SrcNone | ModRM | Prot | Priv, vmrun,   check_svme_pa),
-	II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
-	DIP(SrcNone | ModRM | Prot | Priv, vmload,  check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, vmsave,  check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, stgi,    check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, clgi,    check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, skinit,  check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+	DIP(SrcNone | Prot | Priv,		vmrun,		check_svme_pa),
+	II(SrcNone  | Prot | VendorSpecific,	em_vmmcall,	vmmcall),
+	DIP(SrcNone | Prot | Priv,		vmload,		check_svme_pa),
+	DIP(SrcNone | Prot | Priv,		vmsave,		check_svme_pa),
+	DIP(SrcNone | Prot | Priv,		stgi,		check_svme),
+	DIP(SrcNone | Prot | Priv,		clgi,		check_svme),
+	DIP(SrcNone | Prot | Priv,		skinit,		check_svme),
+	DIP(SrcNone | Prot | Priv,		invlpga,	check_svme),
 };
 
 static struct opcode group7_rm7[] = {
 	N,
-	DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+	DIP(SrcNone, rdtscp, check_rdtsc),
 	N, N, N, N, N, N,
 };
 
@@ -3414,76 +3414,77 @@ static struct opcode group1[] = {
 };
 
 static struct opcode group1A[] = {
-	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcNone | ModRM | Lock, em_not),
-	I(DstMem | SrcNone | ModRM | Lock, em_neg),
-	I(SrcMem | ModRM, em_mul_ex),
-	I(SrcMem | ModRM, em_imul_ex),
-	I(SrcMem | ModRM, em_div_ex),
-	I(SrcMem | ModRM, em_idiv_ex),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcNone | Lock, em_not),
+	I(DstMem | SrcNone | Lock, em_neg),
+	I(SrcMem, em_mul_ex),
+	I(SrcMem, em_imul_ex),
+	I(SrcMem, em_div_ex),
+	I(SrcMem, em_idiv_ex),
 };
 
 static struct opcode group4[] = {
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45), N,
+	I(DstMem | SrcNone | Lock,		em_grp45),
+	I(DstMem | SrcNone | Lock,		em_grp45),
+	I(SrcMem | Stack,			em_grp45),
+	I(SrcMemFAddr | ImplicitOps | Stack,	em_call_far),
+	I(SrcMem | Stack,			em_grp45),
+	I(SrcMemFAddr | ImplicitOps,		em_grp45),
+	I(SrcMem | Stack,			em_grp45), N,
 };
 
 static struct opcode group6[] = {
-	DI(ModRM | Prot,        sldt),
-	DI(ModRM | Prot,        str),
-	DI(ModRM | Prot | Priv, lldt),
-	DI(ModRM | Prot | Priv, ltr),
+	DI(Prot,	sldt),
+	DI(Prot,	str),
+	DI(Prot | Priv,	lldt),
+	DI(Prot | Priv,	ltr),
 	N, N, N, N,
 };
 
 static struct group_dual group7 = { {
-	DI(ModRM | Mov | DstMem | Priv, sgdt),
-	DI(ModRM | Mov | DstMem | Priv, sidt),
-	II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
-	II(ModRM | SrcMem | Priv, em_lidt, lidt),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
-	II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+	DI(Mov | DstMem | Priv,			sgdt),
+	DI(Mov | DstMem | Priv,			sidt),
+	II(SrcMem | Priv,			em_lgdt, lgdt),
+	II(SrcMem | Priv,			em_lidt, lidt),
+	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv,		em_lmsw, lmsw),
+	II(SrcMem | ByteOp | Priv | NoAccess,	em_invlpg, invlpg),
 }, {
-	I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+	I(SrcNone | Priv | VendorSpecific,	em_vmcall),
 	EXT(0, group7_rm1),
 	N, EXT(0, group7_rm3),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
+	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv,		em_lmsw, lmsw),
+	EXT(0, group7_rm7),
 } };
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	I(DstMem | SrcImmByte | ModRM, em_bt),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
-	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
+	I(DstMem | SrcImmByte,				em_bt),
+	I(DstMem | SrcImmByte | Lock | PageTable,	em_bts),
+	I(DstMem | SrcImmByte | Lock,			em_btr),
+	I(DstMem | SrcImmByte | Lock | PageTable,	em_btc),
 };
 
 static struct group_dual group9 = { {
-	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	I(DstMem | SrcImm | Mov | PageTable, em_mov),
 	X7(D(Undefined)),
 };
 
@@ -3541,10 +3542,10 @@ static struct opcode opcode_table[256] = {
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
-	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
-	G(DstMem | SrcImm | ModRM | Group, group1),
-	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
-	G(DstMem | SrcImmByte | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm, group1),
+	G(DstMem | SrcImm, group1),
+	G(ByteOp | DstMem | SrcImm | No64, group1),
+	G(DstMem | SrcImmByte, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
 	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
-- 
cgit v1.2.3


From 9f4260e73ac43aaa91eb5de95950e1de7002f467 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 30 Apr 2012 17:48:25 +0900
Subject: KVM: x86 emulator: Avoid pushing back ModRM byte fetched for group
 decoding

Although ModRM byte is fetched for group decoding, it is soon pushed
back to make decode_modrm() fetch it later again.

Now that ModRM flag can be found in the top level opcode tables, fetch
ModRM byte before group decoding to make the code simpler.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8d2c3d04cfec..7fd25763b0e0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -972,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	ctxt->modrm = insn_fetch(u8, ctxt);
 	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
 	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
 	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -3976,17 +3975,16 @@ done_prefixes:
 	}
 	ctxt->d = opcode.flags;
 
+	if (ctxt->d & ModRM)
+		ctxt->modrm = insn_fetch(u8, ctxt);
+
 	while (ctxt->d & GroupMask) {
 		switch (ctxt->d & GroupMask) {
 		case Group:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case GroupDual:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			if ((ctxt->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
-- 
cgit v1.2.3


From 8529f613b6945f4b5bd8c1b69e42aa1cc51b2eb6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 May 2012 18:02:40 -0700
Subject: vfs: don't force a big memset of stat data just to clear padding
 fields

Admittedly this is something that the compiler should be able to just do
for us, but gcc just isn't that smart.  And trying to use a structure
initializer (which would get us the right semantics) ends up resulting
in gcc allocating stack space for _two_ 'struct stat', and then copying
one into the other.

So do it by hand - just have a per-architecture macro that initializes
the padding fields.  And if the architecture doesn't provide one, fall
back to the old behavior of just doing the whole memset() first.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/stat.h | 21 +++++++++++++++++++++
 fs/stat.c                   | 12 ++++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h
index e0b1d9bbcbc6..7b3ddc348585 100644
--- a/arch/x86/include/asm/stat.h
+++ b/arch/x86/include/asm/stat.h
@@ -25,6 +25,12 @@ struct stat {
 	unsigned long  __unused5;
 };
 
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do {	\
+	st.__unused4 = 0;			\
+	st.__unused5 = 0;			\
+} while (0)
+
 #define STAT64_HAS_BROKEN_ST_INO	1
 
 /* This matches struct stat64 in glibc2.1, hence the absolutely
@@ -63,6 +69,12 @@ struct stat64 {
 	unsigned long long	st_ino;
 };
 
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT64_PADDING(st) do {		\
+	memset(&st.__pad0, 0, sizeof(st.__pad0));	\
+	memset(&st.__pad3, 0, sizeof(st.__pad3));	\
+} while (0)
+
 #else /* __i386__ */
 
 struct stat {
@@ -87,6 +99,15 @@ struct stat {
 	unsigned long   st_ctime_nsec;
 	long		__unused[3];
 };
+
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do {	\
+	st.__pad0 = 0;				\
+	st.__unused[0] = 0;			\
+	st.__unused[1] = 0;			\
+	st.__unused[2] = 0;			\
+} while (0)
+
 #endif
 
 /* for 32bit emulation and 32 bit kernels */
diff --git a/fs/stat.c b/fs/stat.c
index 2b5d55eb9d9a..b30ac60291e2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -199,6 +199,10 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 #define valid_dev(x)  choose_32_64(old_valid_dev,new_valid_dev)(x)
 #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
 
+#ifndef INIT_STRUCT_STAT_PADDING
+#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
 static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 {
 	struct stat tmp;
@@ -210,7 +214,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 		return -EOVERFLOW;
 #endif
 
-	memset(&tmp, 0, sizeof(tmp));
+	INIT_STRUCT_STAT_PADDING(tmp);
 	tmp.st_dev = encode_dev(stat->dev);
 	tmp.st_ino = stat->ino;
 	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
@@ -323,11 +327,15 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
 /* ---------- LFS-64 ----------- */
 #ifdef __ARCH_WANT_STAT64
 
+#ifndef INIT_STRUCT_STAT64_PADDING
+#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
+#endif
+
 static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 {
 	struct stat64 tmp;
 
-	memset(&tmp, 0, sizeof(struct stat64));
+	INIT_STRUCT_STAT64_PADDING(tmp);
 #ifdef CONFIG_MIPS
 	/* mips has weird padding, so we don't get 64 bits there */
 	if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
-- 
cgit v1.2.3


From 6ff968cca1dfebd4b6fcade87c11658dbfc96932 Mon Sep 17 00:00:00 2001
From: Betty Dall <betty.dall@hp.com>
Date: Fri, 27 Apr 2012 14:40:55 -0600
Subject: x86/nmi: Fix the type of the nmiaction.flags field

This patch changes the type of the struct nmiaction flags field
to unsigned long from unsigned int. All the usages of the flags
field are unsigned long already. There is only one flag used
currently, NMI_FLAG_FIRST, but having the wrong size could cause
a truncation bug in the future on 64 bit architectures.

Signed-off-by: Betty Dall <betty.dall@hp.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/1335559255-13454-1-git-send-email-betty.dall@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index a1a836c8131c..0e3793b821ef 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -40,7 +40,7 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
 struct nmiaction {
 	struct list_head	list;
 	nmi_handler_t		handler;
-	unsigned int		flags;
+	unsigned long		flags;
 	const char		*name;
 };
 
-- 
cgit v1.2.3


From 736baef4472d00574089f295bc759ac002b9558c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:00 -0700
Subject: iommu/vt-d: Make intr-remapping initialization generic

This patch introduces irq_remap_ops to hold implementation
specific function pointer to handle interrupt remapping. As
the first part the initialization functions for VT-d are
converted to these ops.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/ia64/include/asm/intr_remapping.h |  4 ++
 arch/x86/include/asm/intr_remapping.h  | 45 ++++++++++++++++++++
 arch/x86/kernel/apic/apic.c            | 14 ++++---
 arch/x86/kernel/apic/io_apic.c         |  1 +
 drivers/iommu/Makefile                 |  2 +-
 drivers/iommu/dmar.c                   |  1 +
 drivers/iommu/intel-iommu.c            |  1 +
 drivers/iommu/intel_intr_remapping.c   | 52 ++++++-----------------
 drivers/iommu/intr_remapping.c         | 76 ++++++++++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.h         | 46 ++++++++++++++++++++
 include/linux/dmar.h                   |  3 --
 11 files changed, 196 insertions(+), 49 deletions(-)
 create mode 100644 arch/ia64/include/asm/intr_remapping.h
 create mode 100644 arch/x86/include/asm/intr_remapping.h
 create mode 100644 drivers/iommu/intr_remapping.c
 create mode 100644 drivers/iommu/intr_remapping.h

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h
new file mode 100644
index 000000000000..095aa0d46c58
--- /dev/null
+++ b/arch/ia64/include/asm/intr_remapping.h
@@ -0,0 +1,4 @@
+#ifndef __IA64_INTR_REMAPPING_H
+#define __IA64_INTR_REMAPPING_H
+#define intr_remapping_enabled 0
+#endif
diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
new file mode 100644
index 000000000000..207c605dbdf5
--- /dev/null
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * This header file contains the interface of the interrupt remapping code to
+ * the x86 interrupt management code.
+ */
+
+#ifndef __X86_INTR_REMAPPING_H
+#define __X86_INTR_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+extern int intr_remapping_enabled;
+
+extern void setup_intr_remapping(void);
+extern int intr_remapping_supported(void);
+extern int intr_hardware_init(void);
+extern int intr_hardware_enable(void);
+
+#else  /* CONFIG_IRQ_REMAP */
+
+#define intr_remapping_enabled	0
+
+static inline void setup_intr_remapping(void) { }
+static inline int intr_remapping_supported(void) { return 0; }
+static inline int intr_hardware_init(void) { return -ENODEV; }
+static inline int intr_hardware_enable(void) { return -ENODEV; }
+
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index edc24480469f..1db6f63a22ff 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
+#include <asm/intr_remapping.h>
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
@@ -1528,7 +1529,7 @@ int __init enable_IR(void)
 		return -1;
 	}
 
-	return enable_intr_remapping();
+	return intr_hardware_enable();
 #endif
 	return -1;
 }
@@ -1537,10 +1538,13 @@ void __init enable_IR_x2apic(void)
 {
 	unsigned long flags;
 	int ret, x2apic_enabled = 0;
-	int dmar_table_init_ret;
+	int hardware_init_ret;
 
-	dmar_table_init_ret = dmar_table_init();
-	if (dmar_table_init_ret && !x2apic_supported())
+	/* Make sure irq_remap_ops are initialized */
+	setup_intr_remapping();
+
+	hardware_init_ret = intr_hardware_init();
+	if (hardware_init_ret && !x2apic_supported())
 		return;
 
 	ret = save_ioapic_entries();
@@ -1556,7 +1560,7 @@ void __init enable_IR_x2apic(void)
 	if (x2apic_preenabled && nox2apic)
 		disable_x2apic();
 
-	if (dmar_table_init_ret)
+	if (hardware_init_ret)
 		ret = -1;
 	else
 		ret = enable_IR();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80a..1151fdccaad6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -57,6 +57,7 @@
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
 #include <asm/setup.h>
+#include <asm/intr_remapping.h>
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 1533ebf1d68e..823e1cf8708f 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
 obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o
-obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o
+obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o intr_remapping.o
 obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
 obj-$(CONFIG_OMAP_IOVMM) += omap-iovmm.o
 obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 35c1e17fce1d..647e366403dc 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -36,6 +36,7 @@
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
+#include <asm/intr_remapping.h>
 #include <asm/iommu_table.h>
 
 #define PREFIX "DMAR: "
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f93d5ac8f81c..e1439808192c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -42,6 +42,7 @@
 #include <linux/dmi.h>
 #include <linux/pci-ats.h>
 #include <linux/memblock.h>
+#include <asm/intr_remapping.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 212fff0c24b5..9c742fb111b6 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -11,8 +11,11 @@
 #include <asm/cpu.h>
 #include <linux/intel-iommu.h>
 #include <acpi/acpi.h>
+#include <asm/intr_remapping.h>
 #include <asm/pci-direct.h>
 
+#include "intr_remapping.h"
+
 struct ioapic_scope {
 	struct intel_iommu *iommu;
 	unsigned int id;
@@ -32,42 +35,6 @@ struct hpet_scope {
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static struct hpet_scope ir_hpet[MAX_HPET_TBS];
 static int ir_ioapic_num, ir_hpet_num;
-int intr_remapping_enabled;
-
-static int disable_intremap;
-static int disable_sourceid_checking;
-static int no_x2apic_optout;
-
-static __init int setup_nointremap(char *str)
-{
-	disable_intremap = 1;
-	return 0;
-}
-early_param("nointremap", setup_nointremap);
-
-static __init int setup_intremap(char *str)
-{
-	if (!str)
-		return -EINVAL;
-
-	while (*str) {
-		if (!strncmp(str, "on", 2))
-			disable_intremap = 0;
-		else if (!strncmp(str, "off", 3))
-			disable_intremap = 1;
-		else if (!strncmp(str, "nosid", 5))
-			disable_sourceid_checking = 1;
-		else if (!strncmp(str, "no_x2apic_optout", 16))
-			no_x2apic_optout = 1;
-
-		str += strcspn(str, ",");
-		while (*str == ',')
-			str++;
-	}
-
-	return 0;
-}
-early_param("intremap", setup_intremap);
 
 static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
 
@@ -465,7 +432,7 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 }
 
 
-static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
+static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode)
 {
 	struct ir_table *ir_table;
 	struct page *pages;
@@ -534,7 +501,7 @@ static int __init dmar_x2apic_optout(void)
 	return dmar->flags & DMAR_X2APIC_OPT_OUT;
 }
 
-int __init intr_remapping_supported(void)
+static int __init intel_intr_remapping_supported(void)
 {
 	struct dmar_drhd_unit *drhd;
 
@@ -554,7 +521,7 @@ int __init intr_remapping_supported(void)
 	return 1;
 }
 
-int __init enable_intr_remapping(void)
+static int __init intel_enable_intr_remapping(void)
 {
 	struct dmar_drhd_unit *drhd;
 	int setup = 0;
@@ -638,7 +605,7 @@ int __init enable_intr_remapping(void)
 		if (!ecap_ir_support(iommu->ecap))
 			continue;
 
-		if (setup_intr_remapping(iommu, eim))
+		if (intel_setup_intr_remapping(iommu, eim))
 			goto error;
 
 		setup = 1;
@@ -847,3 +814,8 @@ error:
 	return -1;
 }
 
+struct irq_remap_ops intel_irq_remap_ops = {
+	.supported		= intel_intr_remapping_supported,
+	.hardware_init		= dmar_table_init,
+	.hardware_enable	= intel_enable_intr_remapping,
+};
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
new file mode 100644
index 000000000000..670c69a80afd
--- /dev/null
+++ b/drivers/iommu/intr_remapping.c
@@ -0,0 +1,76 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include "intr_remapping.h"
+
+int intr_remapping_enabled;
+
+int disable_intremap;
+int disable_sourceid_checking;
+int no_x2apic_optout;
+
+static struct irq_remap_ops *remap_ops;
+
+static __init int setup_nointremap(char *str)
+{
+	disable_intremap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
+static __init int setup_intremap(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "on", 2))
+			disable_intremap = 0;
+		else if (!strncmp(str, "off", 3))
+			disable_intremap = 1;
+		else if (!strncmp(str, "nosid", 5))
+			disable_sourceid_checking = 1;
+		else if (!strncmp(str, "no_x2apic_optout", 16))
+			no_x2apic_optout = 1;
+
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+
+	return 0;
+}
+early_param("intremap", setup_intremap);
+
+void __init setup_intr_remapping(void)
+{
+	remap_ops = &intel_irq_remap_ops;
+}
+
+int intr_remapping_supported(void)
+{
+	if (disable_intremap)
+		return 0;
+
+	if (!remap_ops || !remap_ops->supported)
+		return 0;
+
+	return remap_ops->supported();
+}
+
+int __init intr_hardware_init(void)
+{
+	if (!remap_ops || !remap_ops->hardware_init)
+		return -ENODEV;
+
+	return remap_ops->hardware_init();
+}
+
+int __init intr_hardware_enable(void)
+{
+	if (!remap_ops || !remap_ops->hardware_enable)
+		return -ENODEV;
+
+	return remap_ops->hardware_enable();
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
new file mode 100644
index 000000000000..d6df732e001f
--- /dev/null
+++ b/drivers/iommu/intr_remapping.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * This header file contains stuff that is shared between different interrupt
+ * remapping drivers but with no need to be visible outside of the IOMMU layer.
+ */
+
+#ifndef __INTR_REMAPPING_H
+#define __INTR_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+extern int disable_intremap;
+extern int disable_sourceid_checking;
+extern int no_x2apic_optout;
+
+struct irq_remap_ops {
+	/* Check whether Interrupt Remapping is supported */
+	int (*supported)(void);
+
+	/* Initializes hardware and makes it ready for remapping interrupts */
+	int  (*hardware_init)(void);
+
+	/* Enables the remapping hardware */
+	int  (*hardware_enable)(void);
+};
+
+extern struct irq_remap_ops intel_irq_remap_ops;
+
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __INTR_REMAPPING_H */
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 731a60975101..6d66c9c76e0a 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -115,9 +115,6 @@ struct irte {
 };
 
 #ifdef CONFIG_IRQ_REMAP
-extern int intr_remapping_enabled;
-extern int intr_remapping_supported(void);
-extern int enable_intr_remapping(void);
 extern void disable_intr_remapping(void);
 extern int reenable_intr_remapping(int);
 
-- 
cgit v1.2.3


From 4f3d8b67ad3090f9fb72f8235d21cde53cd24b79 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:01 -0700
Subject: iommu/vt-d: Convert missing apic.c intr-remapping call to remap_ops

Convert these calls too:

	* Disable of remapping hardware
	* Reenable of remapping hardware
	* Enable fault handling

With that all of arch/x86/kernel/apic/apic.c is converted to
use the generic intr-remapping interface.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/intr_remapping.h |  6 ++++++
 arch/x86/kernel/apic/apic.c           |  6 +++---
 drivers/iommu/intel_intr_remapping.c  |  7 +++++--
 drivers/iommu/intr_remapping.c        | 24 ++++++++++++++++++++++++
 drivers/iommu/intr_remapping.h        |  9 +++++++++
 include/linux/dmar.h                  | 18 ------------------
 6 files changed, 47 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index 207c605dbdf5..55aa892a53e3 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -30,6 +30,9 @@ extern void setup_intr_remapping(void);
 extern int intr_remapping_supported(void);
 extern int intr_hardware_init(void);
 extern int intr_hardware_enable(void);
+extern void intr_hardware_disable(void);
+extern int intr_hardware_reenable(int);
+extern int intr_enable_fault_handling(void);
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -39,6 +42,9 @@ static inline void setup_intr_remapping(void) { }
 static inline int intr_remapping_supported(void) { return 0; }
 static inline int intr_hardware_init(void) { return -ENODEV; }
 static inline int intr_hardware_enable(void) { return -ENODEV; }
+static inline void intr_hardware_disable(void) { }
+static inline int intr_hardware_reenable(int eim) { return -ENODEV; }
+static inline int intr_enable_fault_handling(void) { return -ENODEV; }
 
 #endif /* CONFIG_IRQ_REMAP */
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1db6f63a22ff..a2762687e2ee 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1443,7 +1443,7 @@ void __init bsp_end_local_APIC_setup(void)
 	 * handling for interrupt remapping.
 	 */
 	if (intr_remapping_enabled)
-		enable_drhd_fault_handling();
+		intr_enable_fault_handling();
 
 }
 
@@ -2181,7 +2181,7 @@ static int lapic_suspend(void)
 	disable_local_APIC();
 
 	if (intr_remapping_enabled)
-		disable_intr_remapping();
+		intr_hardware_disable();
 
 	local_irq_restore(flags);
 	return 0;
@@ -2250,7 +2250,7 @@ static void lapic_resume(void)
 	apic_read(APIC_ESR);
 
 	if (intr_remapping_enabled)
-		reenable_intr_remapping(x2apic_mode);
+		intr_hardware_reenable(x2apic_mode);
 
 	local_irq_restore(flags);
 }
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 9c742fb111b6..610b75b66c07 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -764,7 +764,7 @@ int __init ir_dev_scope_init(void)
 }
 rootfs_initcall(ir_dev_scope_init);
 
-void disable_intr_remapping(void)
+static void disable_intr_remapping(void)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu = NULL;
@@ -780,7 +780,7 @@ void disable_intr_remapping(void)
 	}
 }
 
-int reenable_intr_remapping(int eim)
+static int reenable_intr_remapping(int eim)
 {
 	struct dmar_drhd_unit *drhd;
 	int setup = 0;
@@ -818,4 +818,7 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.supported		= intel_intr_remapping_supported,
 	.hardware_init		= dmar_table_init,
 	.hardware_enable	= intel_enable_intr_remapping,
+	.hardware_disable	= disable_intr_remapping,
+	.hardware_reenable	= reenable_intr_remapping,
+	.enable_faulting	= enable_drhd_fault_handling,
 };
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 670c69a80afd..9aabed7c0320 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -74,3 +74,27 @@ int __init intr_hardware_enable(void)
 
 	return remap_ops->hardware_enable();
 }
+
+void intr_hardware_disable(void)
+{
+	if (!remap_ops || !remap_ops->hardware_disable)
+		return;
+
+	remap_ops->hardware_disable();
+}
+
+int intr_hardware_reenable(int mode)
+{
+	if (!remap_ops || !remap_ops->hardware_reenable)
+		return 0;
+
+	return remap_ops->hardware_reenable(mode);
+}
+
+int __init intr_enable_fault_handling(void)
+{
+	if (!remap_ops || !remap_ops->enable_faulting)
+		return -ENODEV;
+
+	return remap_ops->enable_faulting();
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index d6df732e001f..2744c9ae4aec 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -37,6 +37,15 @@ struct irq_remap_ops {
 
 	/* Enables the remapping hardware */
 	int  (*hardware_enable)(void);
+
+	/* Disables the remapping hardware */
+	void (*hardware_disable)(void);
+
+	/* Reenables the remapping hardware */
+	int  (*hardware_reenable)(int);
+
+	/* Enable fault handling */
+	int  (*enable_faulting)(void);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 6d66c9c76e0a..f2bd87f52a8d 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -115,9 +115,6 @@ struct irte {
 };
 
 #ifdef CONFIG_IRQ_REMAP
-extern void disable_intr_remapping(void);
-extern int reenable_intr_remapping(int);
-
 extern int get_irte(int irq, struct irte *entry);
 extern int modify_irte(int irq, struct irte *irte_modified);
 extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
@@ -179,21 +176,6 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 	return 0;
 }
 
-#define intr_remapping_enabled		(0)
-
-static inline int enable_intr_remapping(void)
-{
-	return -1;
-}
-
-static inline void disable_intr_remapping(void)
-{
-}
-
-static inline int reenable_intr_remapping(int eim)
-{
-	return 0;
-}
 #endif
 
 enum {
-- 
cgit v1.2.3


From 0c3f173a88c4ae3e4253427cf574a59ad5352918 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:02 -0700
Subject: iommu/vt-d: Convert IR ioapic-setup to use remap_ops

The IOAPIC setup routine for interrupt remapping is VT-d
specific. Move it to the irq_remap_ops and add a call helper
function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/intr_remapping.h | 15 +++++-
 arch/x86/kernel/apic/io_apic.c        | 68 +-------------------------
 drivers/iommu/intel_intr_remapping.c  | 89 +++++++++++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.c        | 12 +++++
 drivers/iommu/intr_remapping.h        |  8 ++++
 5 files changed, 125 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index 55aa892a53e3..a22e1f1ac7ec 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -24,6 +24,9 @@
 
 #ifdef CONFIG_IRQ_REMAP
 
+struct IO_APIC_route_entry;
+struct io_apic_irq_attr;
+
 extern int intr_remapping_enabled;
 
 extern void setup_intr_remapping(void);
@@ -33,6 +36,10 @@ extern int intr_hardware_enable(void);
 extern void intr_hardware_disable(void);
 extern int intr_hardware_reenable(int);
 extern int intr_enable_fault_handling(void);
+extern int intr_setup_ioapic_entry(int irq,
+				   struct IO_APIC_route_entry *entry,
+				   unsigned int destination, int vector,
+				   struct io_apic_irq_attr *attr);
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -45,7 +52,13 @@ static inline int intr_hardware_enable(void) { return -ENODEV; }
 static inline void intr_hardware_disable(void) { }
 static inline int intr_hardware_reenable(int eim) { return -ENODEV; }
 static inline int intr_enable_fault_handling(void) { return -ENODEV; }
-
+static inline int intr_setup_ioapic_entry(int irq,
+					  struct IO_APIC_route_entry *entry,
+					  unsigned int destination, int vector,
+					  struct io_apic_irq_attr *attr)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_IRQ_REMAP */
 
 #endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1151fdccaad6..e1ab625fb9ca 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1362,77 +1362,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
 				      fasteoi ? "fasteoi" : "edge");
 }
 
-
-static int setup_ir_ioapic_entry(int irq,
-			      struct IR_IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	int index;
-	struct irte irte;
-	int ioapic_id = mpc_ioapic_id(attr->ioapic);
-	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
-	if (!iommu) {
-		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
-		return -ENODEV;
-	}
-
-	index = alloc_irte(iommu, irq, 1);
-	if (index < 0) {
-		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
-		return -ENOMEM;
-	}
-
-	prepare_irte(&irte, vector, destination);
-
-	/* Set source-id of interrupt request */
-	set_ioapic_sid(&irte, ioapic_id);
-
-	modify_irte(irq, &irte);
-
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
-		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
-		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
-		"Avail:%X Vector:%02X Dest:%08X "
-		"SID:%04X SQ:%X SVT:%X)\n",
-		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
-		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
-		irte.avail, irte.vector, irte.dest_id,
-		irte.sid, irte.sq, irte.svt);
-
-	memset(entry, 0, sizeof(*entry));
-
-	entry->index2	= (index >> 15) & 0x1;
-	entry->zero	= 0;
-	entry->format	= 1;
-	entry->index	= (index & 0x7fff);
-	/*
-	 * IO-APIC RTE will be configured with virtual vector.
-	 * irq handler will do the explicit EOI to the io-apic.
-	 */
-	entry->vector	= attr->ioapic_pin;
-	entry->mask	= 0;			/* enable IRQ */
-	entry->trigger	= attr->trigger;
-	entry->polarity	= attr->polarity;
-
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
 static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
 			       unsigned int destination, int vector,
 			       struct io_apic_irq_attr *attr)
 {
 	if (intr_remapping_enabled)
-		return setup_ir_ioapic_entry(irq,
-			 (struct IR_IO_APIC_route_entry *)entry,
-			 destination, vector, attr);
+		return intr_setup_ioapic_entry(irq, entry, destination,
+					       vector, attr);
 
 	memset(entry, 0, sizeof(*entry));
 
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 610b75b66c07..f495eba4b6ab 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -31,6 +31,7 @@ struct hpet_scope {
 };
 
 #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
 
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static struct hpet_scope ir_hpet[MAX_HPET_TBS];
@@ -814,6 +815,93 @@ error:
 	return -1;
 }
 
+static void prepare_irte(struct irte *irte, int vector,
+			 unsigned int dest)
+{
+	memset(irte, 0, sizeof(*irte));
+
+	irte->present = 1;
+	irte->dst_mode = apic->irq_dest_mode;
+	/*
+	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+	 * actual level or edge trigger will be setup in the IO-APIC
+	 * RTE. This will help simplify level triggered irq migration.
+	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
+	 * irq migration in the presence of interrupt-remapping.
+	*/
+	irte->trigger_mode = 0;
+	irte->dlvry_mode = apic->irq_delivery_mode;
+	irte->vector = vector;
+	irte->dest_id = IRTE_DEST(dest);
+	irte->redir_hint = 1;
+}
+
+static int intel_setup_ioapic_entry(int irq,
+				    struct IO_APIC_route_entry *route_entry,
+				    unsigned int destination, int vector,
+				    struct io_apic_irq_attr *attr)
+{
+	int ioapic_id = mpc_ioapic_id(attr->ioapic);
+	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
+	struct IR_IO_APIC_route_entry *entry;
+	struct irte irte;
+	int index;
+
+	if (!iommu) {
+		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
+		return -ENODEV;
+	}
+
+	entry = (struct IR_IO_APIC_route_entry *)route_entry;
+
+	index = alloc_irte(iommu, irq, 1);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
+		return -ENOMEM;
+	}
+
+	prepare_irte(&irte, vector, destination);
+
+	/* Set source-id of interrupt request */
+	set_ioapic_sid(&irte, ioapic_id);
+
+	modify_irte(irq, &irte);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+		"Avail:%X Vector:%02X Dest:%08X "
+		"SID:%04X SQ:%X SVT:%X)\n",
+		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
+		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+		irte.avail, irte.vector, irte.dest_id,
+		irte.sid, irte.sq, irte.svt);
+
+	memset(entry, 0, sizeof(*entry));
+
+	entry->index2	= (index >> 15) & 0x1;
+	entry->zero	= 0;
+	entry->format	= 1;
+	entry->index	= (index & 0x7fff);
+	/*
+	 * IO-APIC RTE will be configured with virtual vector.
+	 * irq handler will do the explicit EOI to the io-apic.
+	 */
+	entry->vector	= attr->ioapic_pin;
+	entry->mask	= 0;			/* enable IRQ */
+	entry->trigger	= attr->trigger;
+	entry->polarity	= attr->polarity;
+
+	/* Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+	if (attr->trigger)
+		entry->mask = 1;
+
+	return 0;
+}
+
+
 struct irq_remap_ops intel_irq_remap_ops = {
 	.supported		= intel_intr_remapping_supported,
 	.hardware_init		= dmar_table_init,
@@ -821,4 +909,5 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.hardware_disable	= disable_intr_remapping,
 	.hardware_reenable	= reenable_intr_remapping,
 	.enable_faulting	= enable_drhd_fault_handling,
+	.setup_ioapic_entry	= intel_setup_ioapic_entry,
 };
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 9aabed7c0320..739148ab2538 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -98,3 +98,15 @@ int __init intr_enable_fault_handling(void)
 
 	return remap_ops->enable_faulting();
 }
+
+int intr_setup_ioapic_entry(int irq,
+			    struct IO_APIC_route_entry *entry,
+			    unsigned int destination, int vector,
+			    struct io_apic_irq_attr *attr)
+{
+	if (!remap_ops || !remap_ops->setup_ioapic_entry)
+		return -ENODEV;
+
+	return remap_ops->setup_ioapic_entry(irq, entry, destination,
+					     vector, attr);
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index 2744c9ae4aec..e8994f2b3bbe 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -24,6 +24,9 @@
 
 #ifdef CONFIG_IRQ_REMAP
 
+struct IO_APIC_route_entry;
+struct io_apic_irq_attr;
+
 extern int disable_intremap;
 extern int disable_sourceid_checking;
 extern int no_x2apic_optout;
@@ -46,6 +49,11 @@ struct irq_remap_ops {
 
 	/* Enable fault handling */
 	int  (*enable_faulting)(void);
+
+	/* IO-APIC setup routine */
+	int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
+				  unsigned int, int,
+				  struct io_apic_irq_attr *);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
-- 
cgit v1.2.3


From 4c1bad6a0af1e297c8d05365e65af89d8c7bf9d1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:03 -0700
Subject: iommu/vt-d: Convert IR set_affinity function to remap_ops

The function to set interrupt affinity with interrupt
remapping enabled is Intel specific too. So move it to the
irq_remap_ops too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/intr_remapping.h |  9 +++++
 arch/x86/kernel/apic/io_apic.c        | 69 +----------------------------------
 drivers/iommu/intel_intr_remapping.c  | 54 +++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.c        |  9 +++++
 drivers/iommu/intr_remapping.h        |  6 +++
 5 files changed, 80 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index a22e1f1ac7ec..ae933ecfd8f0 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -40,6 +40,9 @@ extern int intr_setup_ioapic_entry(int irq,
 				   struct IO_APIC_route_entry *entry,
 				   unsigned int destination, int vector,
 				   struct io_apic_irq_attr *attr);
+extern int intr_set_affinity(struct irq_data *data,
+			     const struct cpumask *mask,
+			     bool force);
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -59,6 +62,12 @@ static inline int intr_setup_ioapic_entry(int irq,
 {
 	return -ENODEV;
 }
+static inline int intr_set_affinity(struct irq_data *data,
+				    const struct cpumask *mask,
+				    bool force)
+{
+	return 0;
+}
 #endif /* CONFIG_IRQ_REMAP */
 
 #endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e1ab625fb9ca..a97c79aa25cf 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2327,71 +2327,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	return ret;
 }
 
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		       bool force)
-{
-	struct irq_cfg *cfg = data->chip_data;
-	unsigned int dest, irq = data->irq;
-	struct irte irte;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
-
-	if (get_irte(irq, &irte))
-		return -EBUSY;
-
-	if (assign_irq_vector(irq, cfg, mask))
-		return -EBUSY;
-
-	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
-	irte.vector = cfg->vector;
-	irte.dest_id = IRTE_DEST(dest);
-
-	/*
-	 * Atomically updates the IRTE with the new destination, vector
-	 * and flushes the interrupt entry cache.
-	 */
-	modify_irte(irq, &irte);
-
-	/*
-	 * After this point, all the interrupts will start arriving
-	 * at the new destination. So, time to cleanup the previous
-	 * vector allocation.
-	 */
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	cpumask_copy(data->affinity, mask);
-	return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		       bool force)
-{
-	return 0;
-}
-#endif
-
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
@@ -2636,7 +2571,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
 	chip->irq_eoi = ir_ack_apic_level;
 
 #ifdef CONFIG_SMP
-	chip->irq_set_affinity = ir_ioapic_set_affinity;
+	chip->irq_set_affinity = intr_set_affinity;
 #endif
 }
 #endif /* CONFIG_IRQ_REMAP */
@@ -3826,7 +3761,7 @@ void __init setup_ioapic_dest(void)
 			mask = apic->target_cpus();
 
 		if (intr_remapping_enabled)
-			ir_ioapic_set_affinity(idata, mask, false);
+			intr_set_affinity(idata, mask, false);
 		else
 			ioapic_set_affinity(idata, mask, false);
 	}
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index f495eba4b6ab..25372c1f3c8c 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -901,6 +901,59 @@ static int intel_setup_ioapic_entry(int irq,
 	return 0;
 }
 
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			  bool force)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	unsigned int dest, irq = data->irq;
+	struct irte irte;
+
+	if (!cpumask_intersects(mask, cpu_online_mask))
+		return -EINVAL;
+
+	if (get_irte(irq, &irte))
+		return -EBUSY;
+
+	if (assign_irq_vector(irq, cfg, mask))
+		return -EBUSY;
+
+	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	modify_irte(irq, &irte);
+
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+
+	cpumask_copy(data->affinity, mask);
+	return 0;
+}
 
 struct irq_remap_ops intel_irq_remap_ops = {
 	.supported		= intel_intr_remapping_supported,
@@ -910,4 +963,5 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.hardware_reenable	= reenable_intr_remapping,
 	.enable_faulting	= enable_drhd_fault_handling,
 	.setup_ioapic_entry	= intel_setup_ioapic_entry,
+	.set_affinity		= intel_ioapic_set_affinity,
 };
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 739148ab2538..2f4f27ffb861 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -110,3 +110,12 @@ int intr_setup_ioapic_entry(int irq,
 	return remap_ops->setup_ioapic_entry(irq, entry, destination,
 					     vector, attr);
 }
+
+int intr_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		      bool force)
+{
+	if (!remap_ops || !remap_ops->set_affinity)
+		return 0;
+
+	return remap_ops->set_affinity(data, mask, force);
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index e8994f2b3bbe..e0bc6e0ba1fb 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -26,6 +26,8 @@
 
 struct IO_APIC_route_entry;
 struct io_apic_irq_attr;
+struct irq_data;
+struct cpumask;
 
 extern int disable_intremap;
 extern int disable_sourceid_checking;
@@ -54,6 +56,10 @@ struct irq_remap_ops {
 	int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
 				  unsigned int, int,
 				  struct io_apic_irq_attr *);
+
+	/* Set the CPU affinity of a remapped interrupt */
+	int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
+			    bool force);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
-- 
cgit v1.2.3


From 9d619f65722236e0e0c35467d1528caed206e439 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:04 -0700
Subject: iommu/vt-d: Convert free_irte into a remap_ops callback

The operation for releasing a remapping entry is iommu
specific too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/intr_remapping.h | 2 ++
 arch/x86/kernel/apic/io_apic.c        | 2 +-
 drivers/iommu/intel_intr_remapping.c  | 3 ++-
 drivers/iommu/intr_remapping.c        | 8 ++++++++
 drivers/iommu/intr_remapping.h        | 3 +++
 include/linux/dmar.h                  | 5 -----
 6 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index ae933ecfd8f0..a195b7d6995c 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -43,6 +43,7 @@ extern int intr_setup_ioapic_entry(int irq,
 extern int intr_set_affinity(struct irq_data *data,
 			     const struct cpumask *mask,
 			     bool force);
+extern void intr_free_irq(int irq);
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -68,6 +69,7 @@ static inline int intr_set_affinity(struct irq_data *data,
 {
 	return 0;
 }
+static inline void intr_free_irq(int irq) { }
 #endif /* CONFIG_IRQ_REMAP */
 
 #endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a97c79aa25cf..5690469555fb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3041,7 +3041,7 @@ void destroy_irq(unsigned int irq)
 	irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
 
 	if (irq_remapped(cfg))
-		free_irte(irq);
+		intr_free_irq(irq);
 	raw_spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq, cfg);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 25372c1f3c8c..44a6e04a070b 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -253,7 +253,7 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
 	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 }
 
-int free_irte(int irq)
+static int free_irte(int irq)
 {
 	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
 	unsigned long flags;
@@ -964,4 +964,5 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.enable_faulting	= enable_drhd_fault_handling,
 	.setup_ioapic_entry	= intel_setup_ioapic_entry,
 	.set_affinity		= intel_ioapic_set_affinity,
+	.free_irq		= free_irte,
 };
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 2f4f27ffb861..a68d304f9729 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -119,3 +119,11 @@ int intr_set_affinity(struct irq_data *data, const struct cpumask *mask,
 
 	return remap_ops->set_affinity(data, mask, force);
 }
+
+void intr_free_irq(int irq)
+{
+	if (!remap_ops || !remap_ops->free_irq)
+		return;
+
+	remap_ops->free_irq(irq);
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index e0bc6e0ba1fb..57485539383d 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -60,6 +60,9 @@ struct irq_remap_ops {
 	/* Set the CPU affinity of a remapped interrupt */
 	int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
 			    bool force);
+
+	/* Free an IRQ */
+	int (*free_irq)(int);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index f2bd87f52a8d..7a207a39f879 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -121,7 +121,6 @@ extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
 extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
    			u16 sub_handle);
 extern int map_irq_to_irte_handle(int irq, u16 *sub_handle);
-extern int free_irte(int irq);
 
 extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
 extern struct intel_iommu *map_ioapic_to_ir(int apic);
@@ -138,10 +137,6 @@ static inline int modify_irte(int irq, struct irte *irte_modified)
 {
 	return -1;
 }
-static inline int free_irte(int irq)
-{
-	return -1;
-}
 static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle)
 {
 	return -1;
-- 
cgit v1.2.3


From 5e2b930b0784a30c98dee8e9d79c1f84c31f7209 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:05 -0700
Subject: iommu/vt-d: Convert MSI remapping setup to remap_ops

This patch introduces remapping-ops for setting ups MSI
interrupts.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/intr_remapping.h |  26 ++++++++
 arch/x86/include/asm/irq_remapping.h  |  23 -------
 arch/x86/kernel/apic/io_apic.c        | 119 ++++++++--------------------------
 drivers/iommu/intel_intr_remapping.c  |  97 +++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.c        |  35 ++++++++++
 drivers/iommu/intr_remapping.h        |  16 +++++
 6 files changed, 202 insertions(+), 114 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index a195b7d6995c..a6afd6efa6c6 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -26,6 +26,7 @@
 
 struct IO_APIC_route_entry;
 struct io_apic_irq_attr;
+struct pci_dev;
 
 extern int intr_remapping_enabled;
 
@@ -44,6 +45,13 @@ extern int intr_set_affinity(struct irq_data *data,
 			     const struct cpumask *mask,
 			     bool force);
 extern void intr_free_irq(int irq);
+extern void intr_compose_msi_msg(struct pci_dev *pdev,
+				 unsigned int irq, unsigned int dest,
+				 struct msi_msg *msg, u8 hpet_id);
+extern int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec);
+extern int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
+			      int index, int sub_handle);
+extern int intr_setup_hpet_msi(unsigned int irq, unsigned int id);
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -70,6 +78,24 @@ static inline int intr_set_affinity(struct irq_data *data,
 	return 0;
 }
 static inline void intr_free_irq(int irq) { }
+static inline void intr_compose_msi_msg(struct pci_dev *pdev,
+					unsigned int irq, unsigned int dest,
+					struct msi_msg *msg, u8 hpet_id)
+{
+}
+static inline int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
+{
+	return -ENODEV;
+}
+static inline int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
+				     int index, int sub_handle)
+{
+	return -ENODEV;
+}
+static inline int intr_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_IRQ_REMAP */
 
 #endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 47d99934580f..0ddfc0b90adb 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -5,34 +5,11 @@
 
 #ifdef CONFIG_IRQ_REMAP
 static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
-static inline void prepare_irte(struct irte *irte, int vector,
-			        unsigned int dest)
-{
-	memset(irte, 0, sizeof(*irte));
-
-	irte->present = 1;
-	irte->dst_mode = apic->irq_dest_mode;
-	/*
-	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
-	 * actual level or edge trigger will be setup in the IO-APIC
-	 * RTE. This will help simplify level triggered irq migration.
-	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
-	 * irq migration in the presence of interrupt-remapping.
-	*/
-	irte->trigger_mode = 0;
-	irte->dlvry_mode = apic->irq_delivery_mode;
-	irte->vector = vector;
-	irte->dest_id = IRTE_DEST(dest);
-	irte->redir_hint = 1;
-}
 static inline bool irq_remapped(struct irq_cfg *cfg)
 {
 	return cfg->irq_2_iommu.iommu != NULL;
 }
 #else
-static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
-{
-}
 static inline bool irq_remapped(struct irq_cfg *cfg)
 {
 	return false;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5690469555fb..3db693bda91d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3070,54 +3070,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
 	if (irq_remapped(cfg)) {
-		struct irte irte;
-		int ir_index;
-		u16 sub_handle;
-
-		ir_index = map_irq_to_irte_handle(irq, &sub_handle);
-		BUG_ON(ir_index == -1);
-
-		prepare_irte(&irte, cfg->vector, dest);
-
-		/* Set source-id of interrupt request */
-		if (pdev)
-			set_msi_sid(&irte, pdev);
-		else
-			set_hpet_sid(&irte, hpet_id);
-
-		modify_irte(irq, &irte);
+		intr_compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+		return err;
+	}
 
+	if (x2apic_enabled())
+		msg->address_hi = MSI_ADDR_BASE_HI |
+				  MSI_ADDR_EXT_DEST_ID(dest);
+	else
 		msg->address_hi = MSI_ADDR_BASE_HI;
-		msg->data = sub_handle;
-		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
-				  MSI_ADDR_IR_SHV |
-				  MSI_ADDR_IR_INDEX1(ir_index) |
-				  MSI_ADDR_IR_INDEX2(ir_index);
-	} else {
-		if (x2apic_enabled())
-			msg->address_hi = MSI_ADDR_BASE_HI |
-					  MSI_ADDR_EXT_DEST_ID(dest);
-		else
-			msg->address_hi = MSI_ADDR_BASE_HI;
 
-		msg->address_lo =
-			MSI_ADDR_BASE_LO |
-			((apic->irq_dest_mode == 0) ?
-				MSI_ADDR_DEST_MODE_PHYSICAL:
-				MSI_ADDR_DEST_MODE_LOGICAL) |
-			((apic->irq_delivery_mode != dest_LowestPrio) ?
-				MSI_ADDR_REDIRECTION_CPU:
-				MSI_ADDR_REDIRECTION_LOWPRI) |
-			MSI_ADDR_DEST_ID(dest);
+	msg->address_lo =
+		MSI_ADDR_BASE_LO |
+		((apic->irq_dest_mode == 0) ?
+			MSI_ADDR_DEST_MODE_PHYSICAL:
+			MSI_ADDR_DEST_MODE_LOGICAL) |
+		((apic->irq_delivery_mode != dest_LowestPrio) ?
+			MSI_ADDR_REDIRECTION_CPU:
+			MSI_ADDR_REDIRECTION_LOWPRI) |
+		MSI_ADDR_DEST_ID(dest);
+
+	msg->data =
+		MSI_DATA_TRIGGER_EDGE |
+		MSI_DATA_LEVEL_ASSERT |
+		((apic->irq_delivery_mode != dest_LowestPrio) ?
+			MSI_DATA_DELIVERY_FIXED:
+			MSI_DATA_DELIVERY_LOWPRI) |
+		MSI_DATA_VECTOR(cfg->vector);
 
-		msg->data =
-			MSI_DATA_TRIGGER_EDGE |
-			MSI_DATA_LEVEL_ASSERT |
-			((apic->irq_delivery_mode != dest_LowestPrio) ?
-				MSI_DATA_DELIVERY_FIXED:
-				MSI_DATA_DELIVERY_LOWPRI) |
-			MSI_DATA_VECTOR(cfg->vector);
-	}
 	return err;
 }
 
@@ -3160,33 +3140,6 @@ static struct irq_chip msi_chip = {
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
-	struct intel_iommu *iommu;
-	int index;
-
-	iommu = map_dev_to_ir(dev);
-	if (!iommu) {
-		printk(KERN_ERR
-		       "Unable to map PCI %s to iommu\n", pci_name(dev));
-		return -ENOENT;
-	}
-
-	index = alloc_irte(iommu, irq, nvec);
-	if (index < 0) {
-		printk(KERN_ERR
-		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
-		       pci_name(dev));
-		return -ENOSPC;
-	}
-	return index;
-}
-
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	struct irq_chip *chip = &msi_chip;
@@ -3217,7 +3170,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int node, ret, sub_handle, index = 0;
 	unsigned int irq, irq_want;
 	struct msi_desc *msidesc;
-	struct intel_iommu *iommu = NULL;
 
 	/* x86 doesn't support multiple MSI yet */
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3239,23 +3191,15 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			 * allocate the consecutive block of IRTE's
 			 * for 'nvec'
 			 */
-			index = msi_alloc_irte(dev, irq, nvec);
+			index = intr_msi_alloc_irq(dev, irq, nvec);
 			if (index < 0) {
 				ret = index;
 				goto error;
 			}
 		} else {
-			iommu = map_dev_to_ir(dev);
-			if (!iommu) {
-				ret = -ENOENT;
+			ret = intr_msi_setup_irq(dev, irq, index, sub_handle);
+			if (ret < 0)
 				goto error;
-			}
-			/*
-			 * setup the mapping between the irq and the IRTE
-			 * base index, the sub_handle pointing to the
-			 * appropriate interrupt remap table entry.
-			 */
-			set_irte_irq(irq, iommu, index, sub_handle);
 		}
 no_ir:
 		ret = setup_msi_irq(dev, msidesc, irq);
@@ -3374,14 +3318,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	int ret;
 
 	if (intr_remapping_enabled) {
-		struct intel_iommu *iommu = map_hpet_to_ir(id);
-		int index;
-
-		if (!iommu)
-			return -1;
-
-		index = alloc_irte(iommu, irq, 1);
-		if (index < 0)
+		if (!intr_setup_hpet_msi(irq, id))
 			return -1;
 	}
 
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 44a6e04a070b..a3bae67ec43c 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -13,6 +13,7 @@
 #include <acpi/acpi.h>
 #include <asm/intr_remapping.h>
 #include <asm/pci-direct.h>
+#include <asm/msidef.h>
 
 #include "intr_remapping.h"
 
@@ -955,6 +956,98 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	return 0;
 }
 
+static void intel_compose_msi_msg(struct pci_dev *pdev,
+				  unsigned int irq, unsigned int dest,
+				  struct msi_msg *msg, u8 hpet_id)
+{
+	struct irq_cfg *cfg;
+	struct irte irte;
+	u16 sub_handle;
+	int ir_index;
+
+	cfg = irq_get_chip_data(irq);
+
+	ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+	BUG_ON(ir_index == -1);
+
+	prepare_irte(&irte, cfg->vector, dest);
+
+	/* Set source-id of interrupt request */
+	if (pdev)
+		set_msi_sid(&irte, pdev);
+	else
+		set_hpet_sid(&irte, hpet_id);
+
+	modify_irte(irq, &irte);
+
+	msg->address_hi = MSI_ADDR_BASE_HI;
+	msg->data = sub_handle;
+	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+			  MSI_ADDR_IR_SHV |
+			  MSI_ADDR_IR_INDEX1(ir_index) |
+			  MSI_ADDR_IR_INDEX2(ir_index);
+}
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec)
+{
+	struct intel_iommu *iommu;
+	int index;
+
+	iommu = map_dev_to_ir(dev);
+	if (!iommu) {
+		printk(KERN_ERR
+		       "Unable to map PCI %s to iommu\n", pci_name(dev));
+		return -ENOENT;
+	}
+
+	index = alloc_irte(iommu, irq, nvec);
+	if (index < 0) {
+		printk(KERN_ERR
+		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
+		       pci_name(dev));
+		return -ENOSPC;
+	}
+	return index;
+}
+
+static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
+			       int index, int sub_handle)
+{
+	struct intel_iommu *iommu;
+
+	iommu = map_dev_to_ir(pdev);
+	if (!iommu)
+		return -ENOENT;
+	/*
+	 * setup the mapping between the irq and the IRTE
+	 * base index, the sub_handle pointing to the
+	 * appropriate interrupt remap table entry.
+	 */
+	set_irte_irq(irq, iommu, index, sub_handle);
+
+	return 0;
+}
+
+static int intel_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+	struct intel_iommu *iommu = map_hpet_to_ir(id);
+	int index;
+
+	if (!iommu)
+		return -1;
+
+	index = alloc_irte(iommu, irq, 1);
+	if (index < 0)
+		return -1;
+
+	return 0;
+}
+
 struct irq_remap_ops intel_irq_remap_ops = {
 	.supported		= intel_intr_remapping_supported,
 	.hardware_init		= dmar_table_init,
@@ -965,4 +1058,8 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.setup_ioapic_entry	= intel_setup_ioapic_entry,
 	.set_affinity		= intel_ioapic_set_affinity,
 	.free_irq		= free_irte,
+	.compose_msi_msg	= intel_compose_msi_msg,
+	.msi_alloc_irq		= intel_msi_alloc_irq,
+	.msi_setup_irq		= intel_msi_setup_irq,
+	.setup_hpet_msi		= intel_setup_hpet_msi,
 };
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index a68d304f9729..9dc179316ba1 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -127,3 +127,38 @@ void intr_free_irq(int irq)
 
 	remap_ops->free_irq(irq);
 }
+
+void intr_compose_msi_msg(struct pci_dev *pdev,
+			  unsigned int irq, unsigned int dest,
+			  struct msi_msg *msg, u8 hpet_id)
+{
+	if (!remap_ops || !remap_ops->compose_msi_msg)
+		return;
+
+	remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+}
+
+int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
+{
+	if (!remap_ops || !remap_ops->msi_alloc_irq)
+		return -ENODEV;
+
+	return remap_ops->msi_alloc_irq(pdev, irq, nvec);
+}
+
+int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
+		       int index, int sub_handle)
+{
+	if (!remap_ops || !remap_ops->msi_setup_irq)
+		return -ENODEV;
+
+	return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle);
+}
+
+int intr_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+	if (!remap_ops || !remap_ops->setup_hpet_msi)
+		return -ENODEV;
+
+	return remap_ops->setup_hpet_msi(irq, id);
+}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index 57485539383d..6f4ea0a387b1 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -28,6 +28,8 @@ struct IO_APIC_route_entry;
 struct io_apic_irq_attr;
 struct irq_data;
 struct cpumask;
+struct pci_dev;
+struct msi_msg;
 
 extern int disable_intremap;
 extern int disable_sourceid_checking;
@@ -63,6 +65,20 @@ struct irq_remap_ops {
 
 	/* Free an IRQ */
 	int (*free_irq)(int);
+
+	/* Create MSI msg to use for interrupt remapping */
+	void (*compose_msi_msg)(struct pci_dev *,
+				unsigned int, unsigned int,
+				struct msi_msg *, u8);
+
+	/* Allocate remapping resources for MSI */
+	int (*msi_alloc_irq)(struct pci_dev *, int, int);
+
+	/* Setup the remapped MSI irq */
+	int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);
+
+	/* Setup interrupt remapping for an HPET MSI */
+	int (*setup_hpet_msi)(unsigned int, unsigned int);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
-- 
cgit v1.2.3


From 263b5e8629c9ce21c9cd4c0e29c097afb1c10ef3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 30 Mar 2012 11:47:06 -0700
Subject: x86, iommu/vt-d: Clean up interfaces for interrupt remapping

Remove the Intel specific interfaces from dmar.h and remove
asm/irq_remapping.h which is only used for io_apic.c anyway.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/irq_remapping.h | 22 --------------
 arch/x86/kernel/apic/io_apic.c       | 17 ++++++++++-
 drivers/iommu/intel_intr_remapping.c | 20 ++++++------
 include/linux/dmar.h                 | 59 ------------------------------------
 4 files changed, 26 insertions(+), 92 deletions(-)
 delete mode 100644 arch/x86/include/asm/irq_remapping.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
deleted file mode 100644
index 0ddfc0b90adb..000000000000
--- a/arch/x86/include/asm/irq_remapping.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_IRQ_REMAPPING_H
-#define _ASM_X86_IRQ_REMAPPING_H
-
-#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
-
-#ifdef CONFIG_IRQ_REMAP
-static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
-static inline bool irq_remapped(struct irq_cfg *cfg)
-{
-	return cfg->irq_2_iommu.iommu != NULL;
-}
-#else
-static inline bool irq_remapped(struct irq_cfg *cfg)
-{
-	return false;
-}
-static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
-{
-}
-#endif
-
-#endif	/* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3db693bda91d..073edd1d3c66 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -58,7 +58,6 @@
 #include <asm/hypertransport.h>
 #include <asm/setup.h>
 #include <asm/intr_remapping.h>
-#include <asm/irq_remapping.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
 
@@ -87,6 +86,22 @@ void __init set_io_apic_ops(const struct io_apic_ops *ops)
 	io_apic_ops = *ops;
 }
 
+#ifdef CONFIG_IRQ_REMAP
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+	return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+	return false;
+}
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+}
+#endif
+
 /*
  *      Is the SiS APIC rmw bug present ?
  *      -1 = don't know, 0 = no, 1 = yes
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index a3bae67ec43c..7472634df350 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -64,7 +64,7 @@ int get_irte(int irq, struct irte *entry)
 	return 0;
 }
 
-int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 {
 	struct ir_table *table = iommu->ir_table;
 	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
@@ -136,7 +136,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
 	return qi_submit_sync(&desc, iommu);
 }
 
-int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+static int map_irq_to_irte_handle(int irq, u16 *sub_handle)
 {
 	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
 	unsigned long flags;
@@ -152,7 +152,7 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
 	return index;
 }
 
-int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
+static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 {
 	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
 	unsigned long flags;
@@ -172,7 +172,7 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 	return 0;
 }
 
-int modify_irte(int irq, struct irte *irte_modified)
+static int modify_irte(int irq, struct irte *irte_modified)
 {
 	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
 	struct intel_iommu *iommu;
@@ -200,7 +200,7 @@ int modify_irte(int irq, struct irte *irte_modified)
 	return rc;
 }
 
-struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
+static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
 {
 	int i;
 
@@ -210,7 +210,7 @@ struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
 	return NULL;
 }
 
-struct intel_iommu *map_ioapic_to_ir(int apic)
+static struct intel_iommu *map_ioapic_to_ir(int apic)
 {
 	int i;
 
@@ -220,7 +220,7 @@ struct intel_iommu *map_ioapic_to_ir(int apic)
 	return NULL;
 }
 
-struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
 {
 	struct dmar_drhd_unit *drhd;
 
@@ -312,7 +312,7 @@ static void set_irte_sid(struct irte *irte, unsigned int svt,
 	irte->sid = sid;
 }
 
-int set_ioapic_sid(struct irte *irte, int apic)
+static int set_ioapic_sid(struct irte *irte, int apic)
 {
 	int i;
 	u16 sid = 0;
@@ -337,7 +337,7 @@ int set_ioapic_sid(struct irte *irte, int apic)
 	return 0;
 }
 
-int set_hpet_sid(struct irte *irte, u8 id)
+static int set_hpet_sid(struct irte *irte, u8 id)
 {
 	int i;
 	u16 sid = 0;
@@ -367,7 +367,7 @@ int set_hpet_sid(struct irte *irte, u8 id)
 	return 0;
 }
 
-int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 {
 	struct pci_dev *bridge;
 
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 7a207a39f879..b029d1aa2d12 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -114,65 +114,6 @@ struct irte {
 	};
 };
 
-#ifdef CONFIG_IRQ_REMAP
-extern int get_irte(int irq, struct irte *entry);
-extern int modify_irte(int irq, struct irte *irte_modified);
-extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
-extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
-   			u16 sub_handle);
-extern int map_irq_to_irte_handle(int irq, u16 *sub_handle);
-
-extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
-extern struct intel_iommu *map_ioapic_to_ir(int apic);
-extern struct intel_iommu *map_hpet_to_ir(u8 id);
-extern int set_ioapic_sid(struct irte *irte, int apic);
-extern int set_hpet_sid(struct irte *irte, u8 id);
-extern int set_msi_sid(struct irte *irte, struct pci_dev *dev);
-#else
-static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
-{
-	return -1;
-}
-static inline int modify_irte(int irq, struct irte *irte_modified)
-{
-	return -1;
-}
-static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle)
-{
-	return -1;
-}
-static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
-			       u16 sub_handle)
-{
-	return -1;
-}
-static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
-{
-	return NULL;
-}
-static inline struct intel_iommu *map_ioapic_to_ir(int apic)
-{
-	return NULL;
-}
-static inline struct intel_iommu *map_hpet_to_ir(unsigned int hpet_id)
-{
-	return NULL;
-}
-static inline int set_ioapic_sid(struct irte *irte, int apic)
-{
-	return 0;
-}
-static inline int set_hpet_sid(struct irte *irte, u8 id)
-{
-	return -1;
-}
-static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
-{
-	return 0;
-}
-
-#endif
-
 enum {
 	IRQ_REMAP_XAPIC_MODE,
 	IRQ_REMAP_X2APIC_MODE,
-- 
cgit v1.2.3


From 95a02e976c39d63716b8c7c226bc530a2041536f Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 30 Mar 2012 11:47:07 -0700
Subject: iommu: rename intr_remapping references to irq_remapping

Make the code consistent with the naming conventions of irq subsystem.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/ia64/include/asm/intr_remapping.h |  2 +-
 arch/x86/include/asm/intr_remapping.h  | 94 +++++++++++++++++-----------------
 arch/x86/kernel/apic/apic.c            | 22 ++++----
 arch/x86/kernel/apic/io_apic.c         | 41 +++++++--------
 drivers/iommu/dmar.c                   |  8 +--
 drivers/iommu/intel-iommu.c            |  2 +-
 drivers/iommu/intel_intr_remapping.c   | 40 +++++++--------
 drivers/iommu/intr_remapping.c         | 74 +++++++++++++-------------
 drivers/iommu/intr_remapping.h         | 10 ++--
 9 files changed, 148 insertions(+), 145 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h
index 095aa0d46c58..a8687b1d8906 100644
--- a/arch/ia64/include/asm/intr_remapping.h
+++ b/arch/ia64/include/asm/intr_remapping.h
@@ -1,4 +1,4 @@
 #ifndef __IA64_INTR_REMAPPING_H
 #define __IA64_INTR_REMAPPING_H
-#define intr_remapping_enabled 0
+#define irq_remapping_enabled 0
 #endif
diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
index a6afd6efa6c6..f9cbbcb2956e 100644
--- a/arch/x86/include/asm/intr_remapping.h
+++ b/arch/x86/include/asm/intr_remapping.h
@@ -28,71 +28,73 @@ struct IO_APIC_route_entry;
 struct io_apic_irq_attr;
 struct pci_dev;
 
-extern int intr_remapping_enabled;
+extern int irq_remapping_enabled;
 
-extern void setup_intr_remapping(void);
-extern int intr_remapping_supported(void);
-extern int intr_hardware_init(void);
-extern int intr_hardware_enable(void);
-extern void intr_hardware_disable(void);
-extern int intr_hardware_reenable(int);
-extern int intr_enable_fault_handling(void);
-extern int intr_setup_ioapic_entry(int irq,
-				   struct IO_APIC_route_entry *entry,
-				   unsigned int destination, int vector,
-				   struct io_apic_irq_attr *attr);
-extern int intr_set_affinity(struct irq_data *data,
-			     const struct cpumask *mask,
-			     bool force);
-extern void intr_free_irq(int irq);
-extern void intr_compose_msi_msg(struct pci_dev *pdev,
-				 unsigned int irq, unsigned int dest,
-				 struct msi_msg *msg, u8 hpet_id);
-extern int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec);
-extern int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-			      int index, int sub_handle);
-extern int intr_setup_hpet_msi(unsigned int irq, unsigned int id);
+extern void setup_irq_remapping_ops(void);
+extern int irq_remapping_supported(void);
+extern int irq_remapping_prepare(void);
+extern int irq_remapping_enable(void);
+extern void irq_remapping_disable(void);
+extern int irq_remapping_reenable(int);
+extern int irq_remap_enable_fault_handling(void);
+extern int setup_ioapic_remapped_entry(int irq,
+				       struct IO_APIC_route_entry *entry,
+				       unsigned int destination,
+				       int vector,
+				       struct io_apic_irq_attr *attr);
+extern int set_remapped_irq_affinity(struct irq_data *data,
+				     const struct cpumask *mask,
+				     bool force);
+extern void free_remapped_irq(int irq);
+extern void compose_remapped_msi_msg(struct pci_dev *pdev,
+				     unsigned int irq, unsigned int dest,
+				     struct msi_msg *msg, u8 hpet_id);
+extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
+extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+				  int index, int sub_handle);
+extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
 
 #else  /* CONFIG_IRQ_REMAP */
 
-#define intr_remapping_enabled	0
+#define irq_remapping_enabled	0
 
-static inline void setup_intr_remapping(void) { }
-static inline int intr_remapping_supported(void) { return 0; }
-static inline int intr_hardware_init(void) { return -ENODEV; }
-static inline int intr_hardware_enable(void) { return -ENODEV; }
-static inline void intr_hardware_disable(void) { }
-static inline int intr_hardware_reenable(int eim) { return -ENODEV; }
-static inline int intr_enable_fault_handling(void) { return -ENODEV; }
-static inline int intr_setup_ioapic_entry(int irq,
-					  struct IO_APIC_route_entry *entry,
-					  unsigned int destination, int vector,
-					  struct io_apic_irq_attr *attr)
+static inline void setup_irq_remapping_ops(void) { }
+static inline int irq_remapping_supported(void) { return 0; }
+static inline int irq_remapping_prepare(void) { return -ENODEV; }
+static inline int irq_remapping_enable(void) { return -ENODEV; }
+static inline void irq_remapping_disable(void) { }
+static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
+static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
+static inline int setup_ioapic_remapped_entry(int irq,
+					      struct IO_APIC_route_entry *entry,
+					      unsigned int destination,
+					      int vector,
+					      struct io_apic_irq_attr *attr)
 {
 	return -ENODEV;
 }
-static inline int intr_set_affinity(struct irq_data *data,
-				    const struct cpumask *mask,
-				    bool force)
+static inline int set_remapped_irq_affinity(struct irq_data *data,
+					    const struct cpumask *mask,
+					    bool force)
 {
 	return 0;
 }
-static inline void intr_free_irq(int irq) { }
-static inline void intr_compose_msi_msg(struct pci_dev *pdev,
-					unsigned int irq, unsigned int dest,
-					struct msi_msg *msg, u8 hpet_id)
+static inline void free_remapped_irq(int irq) { }
+static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
+					    unsigned int irq, unsigned int dest,
+					    struct msi_msg *msg, u8 hpet_id)
 {
 }
-static inline int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
+static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
 {
 	return -ENODEV;
 }
-static inline int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-				     int index, int sub_handle)
+static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+					 int index, int sub_handle)
 {
 	return -ENODEV;
 }
-static inline int intr_setup_hpet_msi(unsigned int irq, unsigned int id)
+static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
 {
 	return -ENODEV;
 }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a2762687e2ee..c02c666c4628 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1442,8 +1442,8 @@ void __init bsp_end_local_APIC_setup(void)
 	 * Now that local APIC setup is completed for BP, configure the fault
 	 * handling for interrupt remapping.
 	 */
-	if (intr_remapping_enabled)
-		intr_enable_fault_handling();
+	if (irq_remapping_enabled)
+		irq_remap_enable_fault_handling();
 
 }
 
@@ -1518,7 +1518,7 @@ void enable_x2apic(void)
 int __init enable_IR(void)
 {
 #ifdef CONFIG_IRQ_REMAP
-	if (!intr_remapping_supported()) {
+	if (!irq_remapping_supported()) {
 		pr_debug("intr-remapping not supported\n");
 		return -1;
 	}
@@ -1529,7 +1529,7 @@ int __init enable_IR(void)
 		return -1;
 	}
 
-	return intr_hardware_enable();
+	return irq_remapping_enable();
 #endif
 	return -1;
 }
@@ -1541,9 +1541,9 @@ void __init enable_IR_x2apic(void)
 	int hardware_init_ret;
 
 	/* Make sure irq_remap_ops are initialized */
-	setup_intr_remapping();
+	setup_irq_remapping_ops();
 
-	hardware_init_ret = intr_hardware_init();
+	hardware_init_ret = irq_remapping_prepare();
 	if (hardware_init_ret && !x2apic_supported())
 		return;
 
@@ -2180,8 +2180,8 @@ static int lapic_suspend(void)
 	local_irq_save(flags);
 	disable_local_APIC();
 
-	if (intr_remapping_enabled)
-		intr_hardware_disable();
+	if (irq_remapping_enabled)
+		irq_remapping_disable();
 
 	local_irq_restore(flags);
 	return 0;
@@ -2197,7 +2197,7 @@ static void lapic_resume(void)
 		return;
 
 	local_irq_save(flags);
-	if (intr_remapping_enabled) {
+	if (irq_remapping_enabled) {
 		/*
 		 * IO-APIC and PIC have their own resume routines.
 		 * We just mask them here to make sure the interrupt
@@ -2249,8 +2249,8 @@ static void lapic_resume(void)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-	if (intr_remapping_enabled)
-		intr_hardware_reenable(x2apic_mode);
+	if (irq_remapping_enabled)
+		irq_remapping_reenable(x2apic_mode);
 
 	local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 073edd1d3c66..abbbcd4d1d71 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1381,9 +1381,9 @@ static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
 			       unsigned int destination, int vector,
 			       struct io_apic_irq_attr *attr)
 {
-	if (intr_remapping_enabled)
-		return intr_setup_ioapic_entry(irq, entry, destination,
-					       vector, attr);
+	if (irq_remapping_enabled)
+		return setup_ioapic_remapped_entry(irq, entry, destination,
+						   vector, attr);
 
 	memset(entry, 0, sizeof(*entry));
 
@@ -1540,7 +1540,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
 {
 	struct IO_APIC_route_entry entry;
 
-	if (intr_remapping_enabled)
+	if (irq_remapping_enabled)
 		return;
 
 	memset(&entry, 0, sizeof(entry));
@@ -1626,7 +1626,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
-	if (intr_remapping_enabled) {
+	if (irq_remapping_enabled) {
 		printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
 			" Pol Stat Indx2 Zero Vect:\n");
 	} else {
@@ -1635,7 +1635,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 	}
 
 	for (i = 0; i <= reg_01.bits.entries; i++) {
-		if (intr_remapping_enabled) {
+		if (irq_remapping_enabled) {
 			struct IO_APIC_route_entry entry;
 			struct IR_IO_APIC_route_entry *ir_entry;
 
@@ -2002,7 +2002,7 @@ void disable_IO_APIC(void)
 	 * IOAPIC RTE as well as interrupt-remapping table entry).
 	 * As this gets called during crash dump, keep this simple for now.
 	 */
-	if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+	if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
@@ -2026,7 +2026,7 @@ void disable_IO_APIC(void)
 	 * Use virtual wire A mode when interrupt remapping is enabled.
 	 */
 	if (cpu_has_apic || apic_from_smp_config())
-		disconnect_bsp_APIC(!intr_remapping_enabled &&
+		disconnect_bsp_APIC(!irq_remapping_enabled &&
 				ioapic_i8259.pin != -1);
 }
 
@@ -2586,7 +2586,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
 	chip->irq_eoi = ir_ack_apic_level;
 
 #ifdef CONFIG_SMP
-	chip->irq_set_affinity = intr_set_affinity;
+	chip->irq_set_affinity = set_remapped_irq_affinity;
 #endif
 }
 #endif /* CONFIG_IRQ_REMAP */
@@ -2799,7 +2799,7 @@ static inline void __init check_timer(void)
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
-		if (intr_remapping_enabled)
+		if (irq_remapping_enabled)
 			panic("BIOS bug: timer not connected to IO-APIC");
 		pin1 = pin2;
 		apic1 = apic2;
@@ -2832,7 +2832,7 @@ static inline void __init check_timer(void)
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
-		if (intr_remapping_enabled)
+		if (irq_remapping_enabled)
 			panic("timer doesn't work through Interrupt-remapped IO-APIC");
 		local_irq_disable();
 		clear_IO_APIC_pin(apic1, pin1);
@@ -3056,7 +3056,7 @@ void destroy_irq(unsigned int irq)
 	irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
 
 	if (irq_remapped(cfg))
-		intr_free_irq(irq);
+		free_remapped_irq(irq);
 	raw_spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq, cfg);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3085,7 +3085,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
 	if (irq_remapped(cfg)) {
-		intr_compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+		compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
 		return err;
 	}
 
@@ -3198,7 +3198,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
-		if (!intr_remapping_enabled)
+		if (!irq_remapping_enabled)
 			goto no_ir;
 
 		if (!sub_handle) {
@@ -3206,13 +3206,14 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			 * allocate the consecutive block of IRTE's
 			 * for 'nvec'
 			 */
-			index = intr_msi_alloc_irq(dev, irq, nvec);
+			index = msi_alloc_remapped_irq(dev, irq, nvec);
 			if (index < 0) {
 				ret = index;
 				goto error;
 			}
 		} else {
-			ret = intr_msi_setup_irq(dev, irq, index, sub_handle);
+			ret = msi_setup_remapped_irq(dev, irq, index,
+						     sub_handle);
 			if (ret < 0)
 				goto error;
 		}
@@ -3332,8 +3333,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	struct msi_msg msg;
 	int ret;
 
-	if (intr_remapping_enabled) {
-		if (!intr_setup_hpet_msi(irq, id))
+	if (irq_remapping_enabled) {
+		if (!setup_hpet_msi_remapped(irq, id))
 			return -1;
 	}
 
@@ -3712,8 +3713,8 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();
 
-		if (intr_remapping_enabled)
-			intr_set_affinity(idata, mask, false);
+		if (irq_remapping_enabled)
+			set_remapped_irq_affinity(idata, mask, false);
 		else
 			ioapic_set_affinity(idata, mask, false);
 	}
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 647e366403dc..ee74f698eef8 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -556,7 +556,7 @@ int __init detect_intel_iommu(void)
 
 		dmar = (struct acpi_table_dmar *) dmar_tbl;
 
-		if (ret && intr_remapping_enabled && cpu_has_x2apic &&
+		if (ret && irq_remapping_enabled && cpu_has_x2apic &&
 		    dmar->flags & 0x1)
 			printk(KERN_INFO
 			       "Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");
@@ -1042,7 +1042,7 @@ static const char *dma_remap_fault_reasons[] =
 	"non-zero reserved fields in PTE",
 };
 
-static const char *intr_remap_fault_reasons[] =
+static const char *irq_remap_fault_reasons[] =
 {
 	"Detected reserved fields in the decoded interrupt-remapped request",
 	"Interrupt index exceeded the interrupt-remapping table size",
@@ -1058,9 +1058,9 @@ static const char *intr_remap_fault_reasons[] =
 const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
 {
 	if (fault_reason >= 0x20 && (fault_reason <= 0x20 +
-				     ARRAY_SIZE(intr_remap_fault_reasons))) {
+				     ARRAY_SIZE(irq_remap_fault_reasons))) {
 		*fault_type = INTR_REMAP;
-		return intr_remap_fault_reasons[fault_reason - 0x20];
+		return irq_remap_fault_reasons[fault_reason - 0x20];
 	} else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
 		*fault_type = DMA_REMAP;
 		return dma_remap_fault_reasons[fault_reason];
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e1439808192c..cef5b8226f3d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4083,7 +4083,7 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
 		return dmar_domain->iommu_snooping;
 	if (cap == IOMMU_CAP_INTR_REMAP)
-		return intr_remapping_enabled;
+		return irq_remapping_enabled;
 
 	return 0;
 }
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
index 7472634df350..efeb601c782f 100644
--- a/drivers/iommu/intel_intr_remapping.c
+++ b/drivers/iommu/intel_intr_remapping.c
@@ -394,7 +394,7 @@ static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 	return 0;
 }
 
-static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
+static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
 {
 	u64 addr;
 	u32 sts;
@@ -434,7 +434,7 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 }
 
 
-static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode)
+static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode)
 {
 	struct ir_table *ir_table;
 	struct page *pages;
@@ -457,14 +457,14 @@ static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode)
 
 	ir_table->base = page_address(pages);
 
-	iommu_set_intr_remapping(iommu, mode);
+	iommu_set_irq_remapping(iommu, mode);
 	return 0;
 }
 
 /*
  * Disable Interrupt Remapping.
  */
-static void iommu_disable_intr_remapping(struct intel_iommu *iommu)
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
 {
 	unsigned long flags;
 	u32 sts;
@@ -503,11 +503,11 @@ static int __init dmar_x2apic_optout(void)
 	return dmar->flags & DMAR_X2APIC_OPT_OUT;
 }
 
-static int __init intel_intr_remapping_supported(void)
+static int __init intel_irq_remapping_supported(void)
 {
 	struct dmar_drhd_unit *drhd;
 
-	if (disable_intremap)
+	if (disable_irq_remap)
 		return 0;
 
 	if (!dmar_ir_support())
@@ -523,7 +523,7 @@ static int __init intel_intr_remapping_supported(void)
 	return 1;
 }
 
-static int __init intel_enable_intr_remapping(void)
+static int __init intel_enable_irq_remapping(void)
 {
 	struct dmar_drhd_unit *drhd;
 	int setup = 0;
@@ -561,7 +561,7 @@ static int __init intel_enable_intr_remapping(void)
 		 * Disable intr remapping and queued invalidation, if already
 		 * enabled prior to OS handover.
 		 */
-		iommu_disable_intr_remapping(iommu);
+		iommu_disable_irq_remapping(iommu);
 
 		dmar_disable_qi(iommu);
 	}
@@ -607,7 +607,7 @@ static int __init intel_enable_intr_remapping(void)
 		if (!ecap_ir_support(iommu->ecap))
 			continue;
 
-		if (intel_setup_intr_remapping(iommu, eim))
+		if (intel_setup_irq_remapping(iommu, eim))
 			goto error;
 
 		setup = 1;
@@ -616,7 +616,7 @@ static int __init intel_enable_intr_remapping(void)
 	if (!setup)
 		goto error;
 
-	intr_remapping_enabled = 1;
+	irq_remapping_enabled = 1;
 	pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");
 
 	return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
@@ -759,14 +759,14 @@ int __init parse_ioapics_under_ir(void)
 
 int __init ir_dev_scope_init(void)
 {
-	if (!intr_remapping_enabled)
+	if (!irq_remapping_enabled)
 		return 0;
 
 	return dmar_dev_scope_init();
 }
 rootfs_initcall(ir_dev_scope_init);
 
-static void disable_intr_remapping(void)
+static void disable_irq_remapping(void)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu = NULL;
@@ -778,11 +778,11 @@ static void disable_intr_remapping(void)
 		if (!ecap_ir_support(iommu->ecap))
 			continue;
 
-		iommu_disable_intr_remapping(iommu);
+		iommu_disable_irq_remapping(iommu);
 	}
 }
 
-static int reenable_intr_remapping(int eim)
+static int reenable_irq_remapping(int eim)
 {
 	struct dmar_drhd_unit *drhd;
 	int setup = 0;
@@ -800,7 +800,7 @@ static int reenable_intr_remapping(int eim)
 			continue;
 
 		/* Set up interrupt remapping for iommu.*/
-		iommu_set_intr_remapping(iommu, eim);
+		iommu_set_irq_remapping(iommu, eim);
 		setup = 1;
 	}
 
@@ -1049,11 +1049,11 @@ static int intel_setup_hpet_msi(unsigned int irq, unsigned int id)
 }
 
 struct irq_remap_ops intel_irq_remap_ops = {
-	.supported		= intel_intr_remapping_supported,
-	.hardware_init		= dmar_table_init,
-	.hardware_enable	= intel_enable_intr_remapping,
-	.hardware_disable	= disable_intr_remapping,
-	.hardware_reenable	= reenable_intr_remapping,
+	.supported		= intel_irq_remapping_supported,
+	.prepare		= dmar_table_init,
+	.enable			= intel_enable_irq_remapping,
+	.disable		= disable_irq_remapping,
+	.reenable		= reenable_irq_remapping,
 	.enable_faulting	= enable_drhd_fault_handling,
 	.setup_ioapic_entry	= intel_setup_ioapic_entry,
 	.set_affinity		= intel_ioapic_set_affinity,
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
index 9dc179316ba1..523a7b3a1205 100644
--- a/drivers/iommu/intr_remapping.c
+++ b/drivers/iommu/intr_remapping.c
@@ -4,9 +4,9 @@
 
 #include "intr_remapping.h"
 
-int intr_remapping_enabled;
+int irq_remapping_enabled;
 
-int disable_intremap;
+int disable_irq_remap;
 int disable_sourceid_checking;
 int no_x2apic_optout;
 
@@ -14,21 +14,21 @@ static struct irq_remap_ops *remap_ops;
 
 static __init int setup_nointremap(char *str)
 {
-	disable_intremap = 1;
+	disable_irq_remap = 1;
 	return 0;
 }
 early_param("nointremap", setup_nointremap);
 
-static __init int setup_intremap(char *str)
+static __init int setup_irqremap(char *str)
 {
 	if (!str)
 		return -EINVAL;
 
 	while (*str) {
 		if (!strncmp(str, "on", 2))
-			disable_intremap = 0;
+			disable_irq_remap = 0;
 		else if (!strncmp(str, "off", 3))
-			disable_intremap = 1;
+			disable_irq_remap = 1;
 		else if (!strncmp(str, "nosid", 5))
 			disable_sourceid_checking = 1;
 		else if (!strncmp(str, "no_x2apic_optout", 16))
@@ -41,16 +41,16 @@ static __init int setup_intremap(char *str)
 
 	return 0;
 }
-early_param("intremap", setup_intremap);
+early_param("intremap", setup_irqremap);
 
-void __init setup_intr_remapping(void)
+void __init setup_irq_remapping_ops(void)
 {
 	remap_ops = &intel_irq_remap_ops;
 }
 
-int intr_remapping_supported(void)
+int irq_remapping_supported(void)
 {
-	if (disable_intremap)
+	if (disable_irq_remap)
 		return 0;
 
 	if (!remap_ops || !remap_ops->supported)
@@ -59,39 +59,39 @@ int intr_remapping_supported(void)
 	return remap_ops->supported();
 }
 
-int __init intr_hardware_init(void)
+int __init irq_remapping_prepare(void)
 {
-	if (!remap_ops || !remap_ops->hardware_init)
+	if (!remap_ops || !remap_ops->prepare)
 		return -ENODEV;
 
-	return remap_ops->hardware_init();
+	return remap_ops->prepare();
 }
 
-int __init intr_hardware_enable(void)
+int __init irq_remapping_enable(void)
 {
-	if (!remap_ops || !remap_ops->hardware_enable)
+	if (!remap_ops || !remap_ops->enable)
 		return -ENODEV;
 
-	return remap_ops->hardware_enable();
+	return remap_ops->enable();
 }
 
-void intr_hardware_disable(void)
+void irq_remapping_disable(void)
 {
-	if (!remap_ops || !remap_ops->hardware_disable)
+	if (!remap_ops || !remap_ops->disable)
 		return;
 
-	remap_ops->hardware_disable();
+	remap_ops->disable();
 }
 
-int intr_hardware_reenable(int mode)
+int irq_remapping_reenable(int mode)
 {
-	if (!remap_ops || !remap_ops->hardware_reenable)
+	if (!remap_ops || !remap_ops->reenable)
 		return 0;
 
-	return remap_ops->hardware_reenable(mode);
+	return remap_ops->reenable(mode);
 }
 
-int __init intr_enable_fault_handling(void)
+int __init irq_remap_enable_fault_handling(void)
 {
 	if (!remap_ops || !remap_ops->enable_faulting)
 		return -ENODEV;
@@ -99,10 +99,10 @@ int __init intr_enable_fault_handling(void)
 	return remap_ops->enable_faulting();
 }
 
-int intr_setup_ioapic_entry(int irq,
-			    struct IO_APIC_route_entry *entry,
-			    unsigned int destination, int vector,
-			    struct io_apic_irq_attr *attr)
+int setup_ioapic_remapped_entry(int irq,
+				struct IO_APIC_route_entry *entry,
+				unsigned int destination, int vector,
+				struct io_apic_irq_attr *attr)
 {
 	if (!remap_ops || !remap_ops->setup_ioapic_entry)
 		return -ENODEV;
@@ -111,8 +111,8 @@ int intr_setup_ioapic_entry(int irq,
 					     vector, attr);
 }
 
-int intr_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      bool force)
+int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+			      bool force)
 {
 	if (!remap_ops || !remap_ops->set_affinity)
 		return 0;
@@ -120,7 +120,7 @@ int intr_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	return remap_ops->set_affinity(data, mask, force);
 }
 
-void intr_free_irq(int irq)
+void free_remapped_irq(int irq)
 {
 	if (!remap_ops || !remap_ops->free_irq)
 		return;
@@ -128,9 +128,9 @@ void intr_free_irq(int irq)
 	remap_ops->free_irq(irq);
 }
 
-void intr_compose_msi_msg(struct pci_dev *pdev,
-			  unsigned int irq, unsigned int dest,
-			  struct msi_msg *msg, u8 hpet_id)
+void compose_remapped_msi_msg(struct pci_dev *pdev,
+			      unsigned int irq, unsigned int dest,
+			      struct msi_msg *msg, u8 hpet_id)
 {
 	if (!remap_ops || !remap_ops->compose_msi_msg)
 		return;
@@ -138,7 +138,7 @@ void intr_compose_msi_msg(struct pci_dev *pdev,
 	remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
 }
 
-int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
+int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
 {
 	if (!remap_ops || !remap_ops->msi_alloc_irq)
 		return -ENODEV;
@@ -146,8 +146,8 @@ int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
 	return remap_ops->msi_alloc_irq(pdev, irq, nvec);
 }
 
-int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-		       int index, int sub_handle)
+int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+			   int index, int sub_handle)
 {
 	if (!remap_ops || !remap_ops->msi_setup_irq)
 		return -ENODEV;
@@ -155,7 +155,7 @@ int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
 	return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle);
 }
 
-int intr_setup_hpet_msi(unsigned int irq, unsigned int id)
+int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
 {
 	if (!remap_ops || !remap_ops->setup_hpet_msi)
 		return -ENODEV;
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
index 6f4ea0a387b1..bd5d98fec148 100644
--- a/drivers/iommu/intr_remapping.h
+++ b/drivers/iommu/intr_remapping.h
@@ -31,7 +31,7 @@ struct cpumask;
 struct pci_dev;
 struct msi_msg;
 
-extern int disable_intremap;
+extern int disable_irq_remap;
 extern int disable_sourceid_checking;
 extern int no_x2apic_optout;
 
@@ -40,16 +40,16 @@ struct irq_remap_ops {
 	int (*supported)(void);
 
 	/* Initializes hardware and makes it ready for remapping interrupts */
-	int  (*hardware_init)(void);
+	int  (*prepare)(void);
 
 	/* Enables the remapping hardware */
-	int  (*hardware_enable)(void);
+	int  (*enable)(void);
 
 	/* Disables the remapping hardware */
-	void (*hardware_disable)(void);
+	void (*disable)(void);
 
 	/* Reenables the remapping hardware */
-	int  (*hardware_reenable)(int);
+	int  (*reenable)(int);
 
 	/* Enable fault handling */
 	int  (*enable_faulting)(void);
-- 
cgit v1.2.3


From 8a8f422d3b4f2cde8e0e1d31638279a26a886a82 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 30 Mar 2012 11:47:08 -0700
Subject: iommu: rename intr_remapping.[ch] to irq_remapping.[ch]

Make the file names consistent with the naming conventions of irq subsystem.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/ia64/include/asm/intr_remapping.h |    4 -
 arch/ia64/include/asm/irq_remapping.h  |    4 +
 arch/x86/include/asm/intr_remapping.h  |  103 ---
 arch/x86/include/asm/irq_remapping.h   |  103 +++
 arch/x86/kernel/apic/apic.c            |    2 +-
 arch/x86/kernel/apic/io_apic.c         |    2 +-
 drivers/iommu/Makefile                 |    2 +-
 drivers/iommu/dmar.c                   |    2 +-
 drivers/iommu/intel-iommu.c            |    2 +-
 drivers/iommu/intel_intr_remapping.c   | 1065 --------------------------------
 drivers/iommu/intel_irq_remapping.c    | 1065 ++++++++++++++++++++++++++++++++
 drivers/iommu/intr_remapping.c         |  164 -----
 drivers/iommu/intr_remapping.h         |   88 ---
 drivers/iommu/irq_remapping.c          |  164 +++++
 drivers/iommu/irq_remapping.h          |   88 +++
 15 files changed, 1429 insertions(+), 1429 deletions(-)
 delete mode 100644 arch/ia64/include/asm/intr_remapping.h
 create mode 100644 arch/ia64/include/asm/irq_remapping.h
 delete mode 100644 arch/x86/include/asm/intr_remapping.h
 create mode 100644 arch/x86/include/asm/irq_remapping.h
 delete mode 100644 drivers/iommu/intel_intr_remapping.c
 create mode 100644 drivers/iommu/intel_irq_remapping.c
 delete mode 100644 drivers/iommu/intr_remapping.c
 delete mode 100644 drivers/iommu/intr_remapping.h
 create mode 100644 drivers/iommu/irq_remapping.c
 create mode 100644 drivers/iommu/irq_remapping.h

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h
deleted file mode 100644
index a8687b1d8906..000000000000
--- a/arch/ia64/include/asm/intr_remapping.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __IA64_INTR_REMAPPING_H
-#define __IA64_INTR_REMAPPING_H
-#define irq_remapping_enabled 0
-#endif
diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
new file mode 100644
index 000000000000..a8687b1d8906
--- /dev/null
+++ b/arch/ia64/include/asm/irq_remapping.h
@@ -0,0 +1,4 @@
+#ifndef __IA64_INTR_REMAPPING_H
+#define __IA64_INTR_REMAPPING_H
+#define irq_remapping_enabled 0
+#endif
diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h
deleted file mode 100644
index f9cbbcb2956e..000000000000
--- a/arch/x86/include/asm/intr_remapping.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2012 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * This header file contains the interface of the interrupt remapping code to
- * the x86 interrupt management code.
- */
-
-#ifndef __X86_INTR_REMAPPING_H
-#define __X86_INTR_REMAPPING_H
-
-#ifdef CONFIG_IRQ_REMAP
-
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct pci_dev;
-
-extern int irq_remapping_enabled;
-
-extern void setup_irq_remapping_ops(void);
-extern int irq_remapping_supported(void);
-extern int irq_remapping_prepare(void);
-extern int irq_remapping_enable(void);
-extern void irq_remapping_disable(void);
-extern int irq_remapping_reenable(int);
-extern int irq_remap_enable_fault_handling(void);
-extern int setup_ioapic_remapped_entry(int irq,
-				       struct IO_APIC_route_entry *entry,
-				       unsigned int destination,
-				       int vector,
-				       struct io_apic_irq_attr *attr);
-extern int set_remapped_irq_affinity(struct irq_data *data,
-				     const struct cpumask *mask,
-				     bool force);
-extern void free_remapped_irq(int irq);
-extern void compose_remapped_msi_msg(struct pci_dev *pdev,
-				     unsigned int irq, unsigned int dest,
-				     struct msi_msg *msg, u8 hpet_id);
-extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
-extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
-				  int index, int sub_handle);
-extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
-
-#else  /* CONFIG_IRQ_REMAP */
-
-#define irq_remapping_enabled	0
-
-static inline void setup_irq_remapping_ops(void) { }
-static inline int irq_remapping_supported(void) { return 0; }
-static inline int irq_remapping_prepare(void) { return -ENODEV; }
-static inline int irq_remapping_enable(void) { return -ENODEV; }
-static inline void irq_remapping_disable(void) { }
-static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
-static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
-static inline int setup_ioapic_remapped_entry(int irq,
-					      struct IO_APIC_route_entry *entry,
-					      unsigned int destination,
-					      int vector,
-					      struct io_apic_irq_attr *attr)
-{
-	return -ENODEV;
-}
-static inline int set_remapped_irq_affinity(struct irq_data *data,
-					    const struct cpumask *mask,
-					    bool force)
-{
-	return 0;
-}
-static inline void free_remapped_irq(int irq) { }
-static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
-					    unsigned int irq, unsigned int dest,
-					    struct msi_msg *msg, u8 hpet_id)
-{
-}
-static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
-{
-	return -ENODEV;
-}
-static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
-					 int index, int sub_handle)
-{
-	return -ENODEV;
-}
-static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	return -ENODEV;
-}
-#endif /* CONFIG_IRQ_REMAP */
-
-#endif /* __X86_INTR_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 000000000000..dcb0c7231028
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * This header file contains the interface of the interrupt remapping code to
+ * the x86 interrupt management code.
+ */
+
+#ifndef __X86_IRQ_REMAPPING_H
+#define __X86_IRQ_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+struct IO_APIC_route_entry;
+struct io_apic_irq_attr;
+struct pci_dev;
+
+extern int irq_remapping_enabled;
+
+extern void setup_irq_remapping_ops(void);
+extern int irq_remapping_supported(void);
+extern int irq_remapping_prepare(void);
+extern int irq_remapping_enable(void);
+extern void irq_remapping_disable(void);
+extern int irq_remapping_reenable(int);
+extern int irq_remap_enable_fault_handling(void);
+extern int setup_ioapic_remapped_entry(int irq,
+				       struct IO_APIC_route_entry *entry,
+				       unsigned int destination,
+				       int vector,
+				       struct io_apic_irq_attr *attr);
+extern int set_remapped_irq_affinity(struct irq_data *data,
+				     const struct cpumask *mask,
+				     bool force);
+extern void free_remapped_irq(int irq);
+extern void compose_remapped_msi_msg(struct pci_dev *pdev,
+				     unsigned int irq, unsigned int dest,
+				     struct msi_msg *msg, u8 hpet_id);
+extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
+extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+				  int index, int sub_handle);
+extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
+
+#else  /* CONFIG_IRQ_REMAP */
+
+#define irq_remapping_enabled	0
+
+static inline void setup_irq_remapping_ops(void) { }
+static inline int irq_remapping_supported(void) { return 0; }
+static inline int irq_remapping_prepare(void) { return -ENODEV; }
+static inline int irq_remapping_enable(void) { return -ENODEV; }
+static inline void irq_remapping_disable(void) { }
+static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
+static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
+static inline int setup_ioapic_remapped_entry(int irq,
+					      struct IO_APIC_route_entry *entry,
+					      unsigned int destination,
+					      int vector,
+					      struct io_apic_irq_attr *attr)
+{
+	return -ENODEV;
+}
+static inline int set_remapped_irq_affinity(struct irq_data *data,
+					    const struct cpumask *mask,
+					    bool force)
+{
+	return 0;
+}
+static inline void free_remapped_irq(int irq) { }
+static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
+					    unsigned int irq, unsigned int dest,
+					    struct msi_msg *msg, u8 hpet_id)
+{
+}
+static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
+{
+	return -ENODEV;
+}
+static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+					 int index, int sub_handle)
+{
+	return -ENODEV;
+}
+static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c02c666c4628..3722179a49db 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,7 +35,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
-#include <asm/intr_remapping.h>
+#include <asm/irq_remapping.h>
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index abbbcd4d1d71..ef0648cd7084 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -57,7 +57,7 @@
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
 #include <asm/setup.h>
-#include <asm/intr_remapping.h>
+#include <asm/irq_remapping.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
 
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 823e1cf8708f..3e5e82ae9f0d 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
 obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o
-obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o intr_remapping.o
+obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o
 obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
 obj-$(CONFIG_OMAP_IOVMM) += omap-iovmm.o
 obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index ee74f698eef8..5ef65cf66152 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -36,7 +36,7 @@
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
-#include <asm/intr_remapping.h>
+#include <asm/irq_remapping.h>
 #include <asm/iommu_table.h>
 
 #define PREFIX "DMAR: "
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index cef5b8226f3d..bf2fbaad5e22 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -42,7 +42,7 @@
 #include <linux/dmi.h>
 #include <linux/pci-ats.h>
 #include <linux/memblock.h>
-#include <asm/intr_remapping.h>
+#include <asm/irq_remapping.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 
diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c
deleted file mode 100644
index efeb601c782f..000000000000
--- a/drivers/iommu/intel_intr_remapping.c
+++ /dev/null
@@ -1,1065 +0,0 @@
-#include <linux/interrupt.h>
-#include <linux/dmar.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/hpet.h>
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/cpu.h>
-#include <linux/intel-iommu.h>
-#include <acpi/acpi.h>
-#include <asm/intr_remapping.h>
-#include <asm/pci-direct.h>
-#include <asm/msidef.h>
-
-#include "intr_remapping.h"
-
-struct ioapic_scope {
-	struct intel_iommu *iommu;
-	unsigned int id;
-	unsigned int bus;	/* PCI bus number */
-	unsigned int devfn;	/* PCI devfn number */
-};
-
-struct hpet_scope {
-	struct intel_iommu *iommu;
-	u8 id;
-	unsigned int bus;
-	unsigned int devfn;
-};
-
-#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
-#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
-
-static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
-static struct hpet_scope ir_hpet[MAX_HPET_TBS];
-static int ir_ioapic_num, ir_hpet_num;
-
-static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
-
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_get_chip_data(irq);
-	return cfg ? &cfg->irq_2_iommu : NULL;
-}
-
-int get_irte(int irq, struct irte *entry)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!entry || !irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-	*entry = *(irq_iommu->iommu->ir_table->base + index);
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return 0;
-}
-
-static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
-{
-	struct ir_table *table = iommu->ir_table;
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	u16 index, start_index;
-	unsigned int mask = 0;
-	unsigned long flags;
-	int i;
-
-	if (!count || !irq_iommu)
-		return -1;
-
-	/*
-	 * start the IRTE search from index 0.
-	 */
-	index = start_index = 0;
-
-	if (count > 1) {
-		count = __roundup_pow_of_two(count);
-		mask = ilog2(count);
-	}
-
-	if (mask > ecap_max_handle_mask(iommu->ecap)) {
-		printk(KERN_ERR
-		       "Requested mask %x exceeds the max invalidation handle"
-		       " mask value %Lx\n", mask,
-		       ecap_max_handle_mask(iommu->ecap));
-		return -1;
-	}
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-	do {
-		for (i = index; i < index + count; i++)
-			if  (table->base[i].present)
-				break;
-		/* empty index found */
-		if (i == index + count)
-			break;
-
-		index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
-
-		if (index == start_index) {
-			raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-			printk(KERN_ERR "can't allocate an IRTE\n");
-			return -1;
-		}
-	} while (1);
-
-	for (i = index; i < index + count; i++)
-		table->base[i].present = 1;
-
-	irq_iommu->iommu = iommu;
-	irq_iommu->irte_index =  index;
-	irq_iommu->sub_handle = 0;
-	irq_iommu->irte_mask = mask;
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return index;
-}
-
-static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
-{
-	struct qi_desc desc;
-
-	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
-		   | QI_IEC_SELECTIVE;
-	desc.high = 0;
-
-	return qi_submit_sync(&desc, iommu);
-}
-
-static int map_irq_to_irte_handle(int irq, u16 *sub_handle)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-	*sub_handle = irq_iommu->sub_handle;
-	index = irq_iommu->irte_index;
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return index;
-}
-
-static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	irq_iommu->iommu = iommu;
-	irq_iommu->irte_index = index;
-	irq_iommu->sub_handle = subhandle;
-	irq_iommu->irte_mask = 0;
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return 0;
-}
-
-static int modify_irte(int irq, struct irte *irte_modified)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	struct intel_iommu *iommu;
-	unsigned long flags;
-	struct irte *irte;
-	int rc, index;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	iommu = irq_iommu->iommu;
-
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-	irte = &iommu->ir_table->base[index];
-
-	set_64bit(&irte->low, irte_modified->low);
-	set_64bit(&irte->high, irte_modified->high);
-	__iommu_flush_cache(iommu, irte, sizeof(*irte));
-
-	rc = qi_flush_iec(iommu, index, 0);
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return rc;
-}
-
-static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
-{
-	int i;
-
-	for (i = 0; i < MAX_HPET_TBS; i++)
-		if (ir_hpet[i].id == hpet_id)
-			return ir_hpet[i].iommu;
-	return NULL;
-}
-
-static struct intel_iommu *map_ioapic_to_ir(int apic)
-{
-	int i;
-
-	for (i = 0; i < MAX_IO_APICS; i++)
-		if (ir_ioapic[i].id == apic)
-			return ir_ioapic[i].iommu;
-	return NULL;
-}
-
-static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
-{
-	struct dmar_drhd_unit *drhd;
-
-	drhd = dmar_find_matched_drhd_unit(dev);
-	if (!drhd)
-		return NULL;
-
-	return drhd->iommu;
-}
-
-static int clear_entries(struct irq_2_iommu *irq_iommu)
-{
-	struct irte *start, *entry, *end;
-	struct intel_iommu *iommu;
-	int index;
-
-	if (irq_iommu->sub_handle)
-		return 0;
-
-	iommu = irq_iommu->iommu;
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-
-	start = iommu->ir_table->base + index;
-	end = start + (1 << irq_iommu->irte_mask);
-
-	for (entry = start; entry < end; entry++) {
-		set_64bit(&entry->low, 0);
-		set_64bit(&entry->high, 0);
-	}
-
-	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
-}
-
-static int free_irte(int irq)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int rc;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	rc = clear_entries(irq_iommu);
-
-	irq_iommu->iommu = NULL;
-	irq_iommu->irte_index = 0;
-	irq_iommu->sub_handle = 0;
-	irq_iommu->irte_mask = 0;
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return rc;
-}
-
-/*
- * source validation type
- */
-#define SVT_NO_VERIFY		0x0  /* no verification is required */
-#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */
-#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
-
-/*
- * source-id qualifier
- */
-#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
-#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
-			      * the third least significant bit
-			      */
-#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
-			      * the second and third least significant bits
-			      */
-#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
-			      * the least three significant bits
-			      */
-
-/*
- * set SVT, SQ and SID fields of irte to verify
- * source ids of interrupt requests
- */
-static void set_irte_sid(struct irte *irte, unsigned int svt,
-			 unsigned int sq, unsigned int sid)
-{
-	if (disable_sourceid_checking)
-		svt = SVT_NO_VERIFY;
-	irte->svt = svt;
-	irte->sq = sq;
-	irte->sid = sid;
-}
-
-static int set_ioapic_sid(struct irte *irte, int apic)
-{
-	int i;
-	u16 sid = 0;
-
-	if (!irte)
-		return -1;
-
-	for (i = 0; i < MAX_IO_APICS; i++) {
-		if (ir_ioapic[i].id == apic) {
-			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
-			break;
-		}
-	}
-
-	if (sid == 0) {
-		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
-		return -1;
-	}
-
-	set_irte_sid(irte, 1, 0, sid);
-
-	return 0;
-}
-
-static int set_hpet_sid(struct irte *irte, u8 id)
-{
-	int i;
-	u16 sid = 0;
-
-	if (!irte)
-		return -1;
-
-	for (i = 0; i < MAX_HPET_TBS; i++) {
-		if (ir_hpet[i].id == id) {
-			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
-			break;
-		}
-	}
-
-	if (sid == 0) {
-		pr_warning("Failed to set source-id of HPET block (%d)\n", id);
-		return -1;
-	}
-
-	/*
-	 * Should really use SQ_ALL_16. Some platforms are broken.
-	 * While we figure out the right quirks for these broken platforms, use
-	 * SQ_13_IGNORE_3 for now.
-	 */
-	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
-
-	return 0;
-}
-
-static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
-{
-	struct pci_dev *bridge;
-
-	if (!irte || !dev)
-		return -1;
-
-	/* PCIe device or Root Complex integrated PCI device */
-	if (pci_is_pcie(dev) || !dev->bus->parent) {
-		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
-			     (dev->bus->number << 8) | dev->devfn);
-		return 0;
-	}
-
-	bridge = pci_find_upstream_pcie_bridge(dev);
-	if (bridge) {
-		if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */
-			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
-				(bridge->bus->number << 8) | dev->bus->number);
-		else /* this is a legacy PCI bridge */
-			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
-				(bridge->bus->number << 8) | bridge->devfn);
-	}
-
-	return 0;
-}
-
-static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
-{
-	u64 addr;
-	u32 sts;
-	unsigned long flags;
-
-	addr = virt_to_phys((void *)iommu->ir_table->base);
-
-	raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
-	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
-		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
-
-	/* Set interrupt-remapping table pointer */
-	iommu->gcmd |= DMA_GCMD_SIRTP;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_IRTPS), sts);
-	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-
-	/*
-	 * global invalidation of interrupt entry cache before enabling
-	 * interrupt-remapping.
-	 */
-	qi_global_iec(iommu);
-
-	raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
-	/* Enable interrupt-remapping */
-	iommu->gcmd |= DMA_GCMD_IRE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, (sts & DMA_GSTS_IRES), sts);
-
-	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-
-static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode)
-{
-	struct ir_table *ir_table;
-	struct page *pages;
-
-	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
-					     GFP_ATOMIC);
-
-	if (!iommu->ir_table)
-		return -ENOMEM;
-
-	pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
-				 INTR_REMAP_PAGE_ORDER);
-
-	if (!pages) {
-		printk(KERN_ERR "failed to allocate pages of order %d\n",
-		       INTR_REMAP_PAGE_ORDER);
-		kfree(iommu->ir_table);
-		return -ENOMEM;
-	}
-
-	ir_table->base = page_address(pages);
-
-	iommu_set_irq_remapping(iommu, mode);
-	return 0;
-}
-
-/*
- * Disable Interrupt Remapping.
- */
-static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
-{
-	unsigned long flags;
-	u32 sts;
-
-	if (!ecap_ir_support(iommu->ecap))
-		return;
-
-	/*
-	 * global invalidation of interrupt entry cache before disabling
-	 * interrupt-remapping.
-	 */
-	qi_global_iec(iommu);
-
-	raw_spin_lock_irqsave(&iommu->register_lock, flags);
-
-	sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
-	if (!(sts & DMA_GSTS_IRES))
-		goto end;
-
-	iommu->gcmd &= ~DMA_GCMD_IRE;
-	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
-
-	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-		      readl, !(sts & DMA_GSTS_IRES), sts);
-
-end:
-	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
-}
-
-static int __init dmar_x2apic_optout(void)
-{
-	struct acpi_table_dmar *dmar;
-	dmar = (struct acpi_table_dmar *)dmar_tbl;
-	if (!dmar || no_x2apic_optout)
-		return 0;
-	return dmar->flags & DMAR_X2APIC_OPT_OUT;
-}
-
-static int __init intel_irq_remapping_supported(void)
-{
-	struct dmar_drhd_unit *drhd;
-
-	if (disable_irq_remap)
-		return 0;
-
-	if (!dmar_ir_support())
-		return 0;
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			return 0;
-	}
-
-	return 1;
-}
-
-static int __init intel_enable_irq_remapping(void)
-{
-	struct dmar_drhd_unit *drhd;
-	int setup = 0;
-	int eim = 0;
-
-	if (parse_ioapics_under_ir() != 1) {
-		printk(KERN_INFO "Not enable interrupt remapping\n");
-		return -1;
-	}
-
-	if (x2apic_supported()) {
-		eim = !dmar_x2apic_optout();
-		WARN(!eim, KERN_WARNING
-			   "Your BIOS is broken and requested that x2apic be disabled\n"
-			   "This will leave your machine vulnerable to irq-injection attacks\n"
-			   "Use 'intremap=no_x2apic_optout' to override BIOS request\n");
-	}
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		/*
-		 * If the queued invalidation is already initialized,
-		 * shouldn't disable it.
-		 */
-		if (iommu->qi)
-			continue;
-
-		/*
-		 * Clear previous faults.
-		 */
-		dmar_fault(-1, iommu);
-
-		/*
-		 * Disable intr remapping and queued invalidation, if already
-		 * enabled prior to OS handover.
-		 */
-		iommu_disable_irq_remapping(iommu);
-
-		dmar_disable_qi(iommu);
-	}
-
-	/*
-	 * check for the Interrupt-remapping support
-	 */
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		if (eim && !ecap_eim_support(iommu->ecap)) {
-			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
-			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
-			return -1;
-		}
-	}
-
-	/*
-	 * Enable queued invalidation for all the DRHD's.
-	 */
-	for_each_drhd_unit(drhd) {
-		int ret;
-		struct intel_iommu *iommu = drhd->iommu;
-		ret = dmar_enable_qi(iommu);
-
-		if (ret) {
-			printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
-			       " invalidation, ecap %Lx, ret %d\n",
-			       drhd->reg_base_addr, iommu->ecap, ret);
-			return -1;
-		}
-	}
-
-	/*
-	 * Setup Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		if (intel_setup_irq_remapping(iommu, eim))
-			goto error;
-
-		setup = 1;
-	}
-
-	if (!setup)
-		goto error;
-
-	irq_remapping_enabled = 1;
-	pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");
-
-	return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
-
-error:
-	/*
-	 * handle error condition gracefully here!
-	 */
-	return -1;
-}
-
-static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_pci_path *path;
-	u8 bus;
-	int count;
-
-	bus = scope->bus;
-	path = (struct acpi_dmar_pci_path *)(scope + 1);
-	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
-		/ sizeof(struct acpi_dmar_pci_path);
-
-	while (--count > 0) {
-		/*
-		 * Access PCI directly due to the PCI
-		 * subsystem isn't initialized yet.
-		 */
-		bus = read_pci_config_byte(bus, path->dev, path->fn,
-					   PCI_SECONDARY_BUS);
-		path++;
-	}
-	ir_hpet[ir_hpet_num].bus   = bus;
-	ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
-	ir_hpet[ir_hpet_num].iommu = iommu;
-	ir_hpet[ir_hpet_num].id    = scope->enumeration_id;
-	ir_hpet_num++;
-}
-
-static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_pci_path *path;
-	u8 bus;
-	int count;
-
-	bus = scope->bus;
-	path = (struct acpi_dmar_pci_path *)(scope + 1);
-	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
-		/ sizeof(struct acpi_dmar_pci_path);
-
-	while (--count > 0) {
-		/*
-		 * Access PCI directly due to the PCI
-		 * subsystem isn't initialized yet.
-		 */
-		bus = read_pci_config_byte(bus, path->dev, path->fn,
-					   PCI_SECONDARY_BUS);
-		path++;
-	}
-
-	ir_ioapic[ir_ioapic_num].bus   = bus;
-	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
-	ir_ioapic[ir_ioapic_num].iommu = iommu;
-	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
-	ir_ioapic_num++;
-}
-
-static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
-				      struct intel_iommu *iommu)
-{
-	struct acpi_dmar_hardware_unit *drhd;
-	struct acpi_dmar_device_scope *scope;
-	void *start, *end;
-
-	drhd = (struct acpi_dmar_hardware_unit *)header;
-
-	start = (void *)(drhd + 1);
-	end = ((void *)drhd) + header->length;
-
-	while (start < end) {
-		scope = start;
-		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
-			if (ir_ioapic_num == MAX_IO_APICS) {
-				printk(KERN_WARNING "Exceeded Max IO APICS\n");
-				return -1;
-			}
-
-			printk(KERN_INFO "IOAPIC id %d under DRHD base "
-			       " 0x%Lx IOMMU %d\n", scope->enumeration_id,
-			       drhd->address, iommu->seq_id);
-
-			ir_parse_one_ioapic_scope(scope, iommu);
-		} else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
-			if (ir_hpet_num == MAX_HPET_TBS) {
-				printk(KERN_WARNING "Exceeded Max HPET blocks\n");
-				return -1;
-			}
-
-			printk(KERN_INFO "HPET id %d under DRHD base"
-			       " 0x%Lx\n", scope->enumeration_id,
-			       drhd->address);
-
-			ir_parse_one_hpet_scope(scope, iommu);
-		}
-		start += scope->length;
-	}
-
-	return 0;
-}
-
-/*
- * Finds the assocaition between IOAPIC's and its Interrupt-remapping
- * hardware unit.
- */
-int __init parse_ioapics_under_ir(void)
-{
-	struct dmar_drhd_unit *drhd;
-	int ir_supported = 0;
-
-	for_each_drhd_unit(drhd) {
-		struct intel_iommu *iommu = drhd->iommu;
-
-		if (ecap_ir_support(iommu->ecap)) {
-			if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
-				return -1;
-
-			ir_supported = 1;
-		}
-	}
-
-	if (ir_supported && ir_ioapic_num != nr_ioapics) {
-		printk(KERN_WARNING
-		       "Not all IO-APIC's listed under remapping hardware\n");
-		return -1;
-	}
-
-	return ir_supported;
-}
-
-int __init ir_dev_scope_init(void)
-{
-	if (!irq_remapping_enabled)
-		return 0;
-
-	return dmar_dev_scope_init();
-}
-rootfs_initcall(ir_dev_scope_init);
-
-static void disable_irq_remapping(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu = NULL;
-
-	/*
-	 * Disable Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_iommu(iommu, drhd) {
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		iommu_disable_irq_remapping(iommu);
-	}
-}
-
-static int reenable_irq_remapping(int eim)
-{
-	struct dmar_drhd_unit *drhd;
-	int setup = 0;
-	struct intel_iommu *iommu = NULL;
-
-	for_each_iommu(iommu, drhd)
-		if (iommu->qi)
-			dmar_reenable_qi(iommu);
-
-	/*
-	 * Setup Interrupt-remapping for all the DRHD's now.
-	 */
-	for_each_iommu(iommu, drhd) {
-		if (!ecap_ir_support(iommu->ecap))
-			continue;
-
-		/* Set up interrupt remapping for iommu.*/
-		iommu_set_irq_remapping(iommu, eim);
-		setup = 1;
-	}
-
-	if (!setup)
-		goto error;
-
-	return 0;
-
-error:
-	/*
-	 * handle error condition gracefully here!
-	 */
-	return -1;
-}
-
-static void prepare_irte(struct irte *irte, int vector,
-			 unsigned int dest)
-{
-	memset(irte, 0, sizeof(*irte));
-
-	irte->present = 1;
-	irte->dst_mode = apic->irq_dest_mode;
-	/*
-	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
-	 * actual level or edge trigger will be setup in the IO-APIC
-	 * RTE. This will help simplify level triggered irq migration.
-	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
-	 * irq migration in the presence of interrupt-remapping.
-	*/
-	irte->trigger_mode = 0;
-	irte->dlvry_mode = apic->irq_delivery_mode;
-	irte->vector = vector;
-	irte->dest_id = IRTE_DEST(dest);
-	irte->redir_hint = 1;
-}
-
-static int intel_setup_ioapic_entry(int irq,
-				    struct IO_APIC_route_entry *route_entry,
-				    unsigned int destination, int vector,
-				    struct io_apic_irq_attr *attr)
-{
-	int ioapic_id = mpc_ioapic_id(attr->ioapic);
-	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-	struct IR_IO_APIC_route_entry *entry;
-	struct irte irte;
-	int index;
-
-	if (!iommu) {
-		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
-		return -ENODEV;
-	}
-
-	entry = (struct IR_IO_APIC_route_entry *)route_entry;
-
-	index = alloc_irte(iommu, irq, 1);
-	if (index < 0) {
-		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
-		return -ENOMEM;
-	}
-
-	prepare_irte(&irte, vector, destination);
-
-	/* Set source-id of interrupt request */
-	set_ioapic_sid(&irte, ioapic_id);
-
-	modify_irte(irq, &irte);
-
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
-		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
-		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
-		"Avail:%X Vector:%02X Dest:%08X "
-		"SID:%04X SQ:%X SVT:%X)\n",
-		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
-		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
-		irte.avail, irte.vector, irte.dest_id,
-		irte.sid, irte.sq, irte.svt);
-
-	memset(entry, 0, sizeof(*entry));
-
-	entry->index2	= (index >> 15) & 0x1;
-	entry->zero	= 0;
-	entry->format	= 1;
-	entry->index	= (index & 0x7fff);
-	/*
-	 * IO-APIC RTE will be configured with virtual vector.
-	 * irq handler will do the explicit EOI to the io-apic.
-	 */
-	entry->vector	= attr->ioapic_pin;
-	entry->mask	= 0;			/* enable IRQ */
-	entry->trigger	= attr->trigger;
-	entry->polarity	= attr->polarity;
-
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			  bool force)
-{
-	struct irq_cfg *cfg = data->chip_data;
-	unsigned int dest, irq = data->irq;
-	struct irte irte;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
-
-	if (get_irte(irq, &irte))
-		return -EBUSY;
-
-	if (assign_irq_vector(irq, cfg, mask))
-		return -EBUSY;
-
-	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
-	irte.vector = cfg->vector;
-	irte.dest_id = IRTE_DEST(dest);
-
-	/*
-	 * Atomically updates the IRTE with the new destination, vector
-	 * and flushes the interrupt entry cache.
-	 */
-	modify_irte(irq, &irte);
-
-	/*
-	 * After this point, all the interrupts will start arriving
-	 * at the new destination. So, time to cleanup the previous
-	 * vector allocation.
-	 */
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	cpumask_copy(data->affinity, mask);
-	return 0;
-}
-
-static void intel_compose_msi_msg(struct pci_dev *pdev,
-				  unsigned int irq, unsigned int dest,
-				  struct msi_msg *msg, u8 hpet_id)
-{
-	struct irq_cfg *cfg;
-	struct irte irte;
-	u16 sub_handle;
-	int ir_index;
-
-	cfg = irq_get_chip_data(irq);
-
-	ir_index = map_irq_to_irte_handle(irq, &sub_handle);
-	BUG_ON(ir_index == -1);
-
-	prepare_irte(&irte, cfg->vector, dest);
-
-	/* Set source-id of interrupt request */
-	if (pdev)
-		set_msi_sid(&irte, pdev);
-	else
-		set_hpet_sid(&irte, hpet_id);
-
-	modify_irte(irq, &irte);
-
-	msg->address_hi = MSI_ADDR_BASE_HI;
-	msg->data = sub_handle;
-	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
-			  MSI_ADDR_IR_SHV |
-			  MSI_ADDR_IR_INDEX1(ir_index) |
-			  MSI_ADDR_IR_INDEX2(ir_index);
-}
-
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec)
-{
-	struct intel_iommu *iommu;
-	int index;
-
-	iommu = map_dev_to_ir(dev);
-	if (!iommu) {
-		printk(KERN_ERR
-		       "Unable to map PCI %s to iommu\n", pci_name(dev));
-		return -ENOENT;
-	}
-
-	index = alloc_irte(iommu, irq, nvec);
-	if (index < 0) {
-		printk(KERN_ERR
-		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
-		       pci_name(dev));
-		return -ENOSPC;
-	}
-	return index;
-}
-
-static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-			       int index, int sub_handle)
-{
-	struct intel_iommu *iommu;
-
-	iommu = map_dev_to_ir(pdev);
-	if (!iommu)
-		return -ENOENT;
-	/*
-	 * setup the mapping between the irq and the IRTE
-	 * base index, the sub_handle pointing to the
-	 * appropriate interrupt remap table entry.
-	 */
-	set_irte_irq(irq, iommu, index, sub_handle);
-
-	return 0;
-}
-
-static int intel_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
-	struct intel_iommu *iommu = map_hpet_to_ir(id);
-	int index;
-
-	if (!iommu)
-		return -1;
-
-	index = alloc_irte(iommu, irq, 1);
-	if (index < 0)
-		return -1;
-
-	return 0;
-}
-
-struct irq_remap_ops intel_irq_remap_ops = {
-	.supported		= intel_irq_remapping_supported,
-	.prepare		= dmar_table_init,
-	.enable			= intel_enable_irq_remapping,
-	.disable		= disable_irq_remapping,
-	.reenable		= reenable_irq_remapping,
-	.enable_faulting	= enable_drhd_fault_handling,
-	.setup_ioapic_entry	= intel_setup_ioapic_entry,
-	.set_affinity		= intel_ioapic_set_affinity,
-	.free_irq		= free_irte,
-	.compose_msi_msg	= intel_compose_msi_msg,
-	.msi_alloc_irq		= intel_msi_alloc_irq,
-	.msi_setup_irq		= intel_msi_setup_irq,
-	.setup_hpet_msi		= intel_setup_hpet_msi,
-};
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
new file mode 100644
index 000000000000..b4d39507681a
--- /dev/null
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -0,0 +1,1065 @@
+#include <linux/interrupt.h>
+#include <linux/dmar.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hpet.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <linux/intel-iommu.h>
+#include <acpi/acpi.h>
+#include <asm/irq_remapping.h>
+#include <asm/pci-direct.h>
+#include <asm/msidef.h>
+
+#include "irq_remapping.h"
+
+struct ioapic_scope {
+	struct intel_iommu *iommu;
+	unsigned int id;
+	unsigned int bus;	/* PCI bus number */
+	unsigned int devfn;	/* PCI devfn number */
+};
+
+struct hpet_scope {
+	struct intel_iommu *iommu;
+	u8 id;
+	unsigned int bus;
+	unsigned int devfn;
+};
+
+#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
+
+static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
+static struct hpet_scope ir_hpet[MAX_HPET_TBS];
+static int ir_ioapic_num, ir_hpet_num;
+
+static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	struct irq_cfg *cfg = irq_get_chip_data(irq);
+	return cfg ? &cfg->irq_2_iommu : NULL;
+}
+
+int get_irte(int irq, struct irte *entry)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int index;
+
+	if (!entry || !irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	*entry = *(irq_iommu->iommu->ir_table->base + index);
+
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+	return 0;
+}
+
+static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+{
+	struct ir_table *table = iommu->ir_table;
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	u16 index, start_index;
+	unsigned int mask = 0;
+	unsigned long flags;
+	int i;
+
+	if (!count || !irq_iommu)
+		return -1;
+
+	/*
+	 * start the IRTE search from index 0.
+	 */
+	index = start_index = 0;
+
+	if (count > 1) {
+		count = __roundup_pow_of_two(count);
+		mask = ilog2(count);
+	}
+
+	if (mask > ecap_max_handle_mask(iommu->ecap)) {
+		printk(KERN_ERR
+		       "Requested mask %x exceeds the max invalidation handle"
+		       " mask value %Lx\n", mask,
+		       ecap_max_handle_mask(iommu->ecap));
+		return -1;
+	}
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+	do {
+		for (i = index; i < index + count; i++)
+			if  (table->base[i].present)
+				break;
+		/* empty index found */
+		if (i == index + count)
+			break;
+
+		index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
+
+		if (index == start_index) {
+			raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+			printk(KERN_ERR "can't allocate an IRTE\n");
+			return -1;
+		}
+	} while (1);
+
+	for (i = index; i < index + count; i++)
+		table->base[i].present = 1;
+
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index =  index;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = mask;
+
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return index;
+}
+
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+{
+	struct qi_desc desc;
+
+	desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask)
+		   | QI_IEC_SELECTIVE;
+	desc.high = 0;
+
+	return qi_submit_sync(&desc, iommu);
+}
+
+static int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int index;
+
+	if (!irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+	*sub_handle = irq_iommu->sub_handle;
+	index = irq_iommu->irte_index;
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+	return index;
+}
+
+static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+
+	if (!irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	irq_iommu->iommu = iommu;
+	irq_iommu->irte_index = index;
+	irq_iommu->sub_handle = subhandle;
+	irq_iommu->irte_mask = 0;
+
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return 0;
+}
+
+static int modify_irte(int irq, struct irte *irte_modified)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	struct intel_iommu *iommu;
+	unsigned long flags;
+	struct irte *irte;
+	int rc, index;
+
+	if (!irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	iommu = irq_iommu->iommu;
+
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	irte = &iommu->ir_table->base[index];
+
+	set_64bit(&irte->low, irte_modified->low);
+	set_64bit(&irte->high, irte_modified->high);
+	__iommu_flush_cache(iommu, irte, sizeof(*irte));
+
+	rc = qi_flush_iec(iommu, index, 0);
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return rc;
+}
+
+static struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
+{
+	int i;
+
+	for (i = 0; i < MAX_HPET_TBS; i++)
+		if (ir_hpet[i].id == hpet_id)
+			return ir_hpet[i].iommu;
+	return NULL;
+}
+
+static struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+	int i;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].id == apic)
+			return ir_ioapic[i].iommu;
+	return NULL;
+}
+
+static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+	struct dmar_drhd_unit *drhd;
+
+	drhd = dmar_find_matched_drhd_unit(dev);
+	if (!drhd)
+		return NULL;
+
+	return drhd->iommu;
+}
+
+static int clear_entries(struct irq_2_iommu *irq_iommu)
+{
+	struct irte *start, *entry, *end;
+	struct intel_iommu *iommu;
+	int index;
+
+	if (irq_iommu->sub_handle)
+		return 0;
+
+	iommu = irq_iommu->iommu;
+	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+
+	start = iommu->ir_table->base + index;
+	end = start + (1 << irq_iommu->irte_mask);
+
+	for (entry = start; entry < end; entry++) {
+		set_64bit(&entry->low, 0);
+		set_64bit(&entry->high, 0);
+	}
+
+	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+}
+
+static int free_irte(int irq)
+{
+	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
+	unsigned long flags;
+	int rc;
+
+	if (!irq_iommu)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+
+	rc = clear_entries(irq_iommu);
+
+	irq_iommu->iommu = NULL;
+	irq_iommu->irte_index = 0;
+	irq_iommu->sub_handle = 0;
+	irq_iommu->irte_mask = 0;
+
+	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+
+	return rc;
+}
+
+/*
+ * source validation type
+ */
+#define SVT_NO_VERIFY		0x0  /* no verification is required */
+#define SVT_VERIFY_SID_SQ	0x1  /* verify using SID and SQ fields */
+#define SVT_VERIFY_BUS		0x2  /* verify bus of request-id */
+
+/*
+ * source-id qualifier
+ */
+#define SQ_ALL_16	0x0  /* verify all 16 bits of request-id */
+#define SQ_13_IGNORE_1	0x1  /* verify most significant 13 bits, ignore
+			      * the third least significant bit
+			      */
+#define SQ_13_IGNORE_2	0x2  /* verify most significant 13 bits, ignore
+			      * the second and third least significant bits
+			      */
+#define SQ_13_IGNORE_3	0x3  /* verify most significant 13 bits, ignore
+			      * the least three significant bits
+			      */
+
+/*
+ * set SVT, SQ and SID fields of irte to verify
+ * source ids of interrupt requests
+ */
+static void set_irte_sid(struct irte *irte, unsigned int svt,
+			 unsigned int sq, unsigned int sid)
+{
+	if (disable_sourceid_checking)
+		svt = SVT_NO_VERIFY;
+	irte->svt = svt;
+	irte->sq = sq;
+	irte->sid = sid;
+}
+
+static int set_ioapic_sid(struct irte *irte, int apic)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_IO_APICS; i++) {
+		if (ir_ioapic[i].id == apic) {
+			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			break;
+		}
+	}
+
+	if (sid == 0) {
+		pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic);
+		return -1;
+	}
+
+	set_irte_sid(irte, 1, 0, sid);
+
+	return 0;
+}
+
+static int set_hpet_sid(struct irte *irte, u8 id)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_HPET_TBS; i++) {
+		if (ir_hpet[i].id == id) {
+			sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+			break;
+		}
+	}
+
+	if (sid == 0) {
+		pr_warning("Failed to set source-id of HPET block (%d)\n", id);
+		return -1;
+	}
+
+	/*
+	 * Should really use SQ_ALL_16. Some platforms are broken.
+	 * While we figure out the right quirks for these broken platforms, use
+	 * SQ_13_IGNORE_3 for now.
+	 */
+	set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid);
+
+	return 0;
+}
+
+static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	struct pci_dev *bridge;
+
+	if (!irte || !dev)
+		return -1;
+
+	/* PCIe device or Root Complex integrated PCI device */
+	if (pci_is_pcie(dev) || !dev->bus->parent) {
+		set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+			     (dev->bus->number << 8) | dev->devfn);
+		return 0;
+	}
+
+	bridge = pci_find_upstream_pcie_bridge(dev);
+	if (bridge) {
+		if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */
+			set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16,
+				(bridge->bus->number << 8) | dev->bus->number);
+		else /* this is a legacy PCI bridge */
+			set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16,
+				(bridge->bus->number << 8) | bridge->devfn);
+	}
+
+	return 0;
+}
+
+static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
+{
+	u64 addr;
+	u32 sts;
+	unsigned long flags;
+
+	addr = virt_to_phys((void *)iommu->ir_table->base);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	dmar_writeq(iommu->reg + DMAR_IRTA_REG,
+		    (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE);
+
+	/* Set interrupt-remapping table pointer */
+	iommu->gcmd |= DMA_GCMD_SIRTP;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRTPS), sts);
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+	/*
+	 * global invalidation of interrupt entry cache before enabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	/* Enable interrupt-remapping */
+	iommu->gcmd |= DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, (sts & DMA_GSTS_IRES), sts);
+
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+
+static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode)
+{
+	struct ir_table *ir_table;
+	struct page *pages;
+
+	ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
+					     GFP_ATOMIC);
+
+	if (!iommu->ir_table)
+		return -ENOMEM;
+
+	pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
+				 INTR_REMAP_PAGE_ORDER);
+
+	if (!pages) {
+		printk(KERN_ERR "failed to allocate pages of order %d\n",
+		       INTR_REMAP_PAGE_ORDER);
+		kfree(iommu->ir_table);
+		return -ENOMEM;
+	}
+
+	ir_table->base = page_address(pages);
+
+	iommu_set_irq_remapping(iommu, mode);
+	return 0;
+}
+
+/*
+ * Disable Interrupt Remapping.
+ */
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
+{
+	unsigned long flags;
+	u32 sts;
+
+	if (!ecap_ir_support(iommu->ecap))
+		return;
+
+	/*
+	 * global invalidation of interrupt entry cache before disabling
+	 * interrupt-remapping.
+	 */
+	qi_global_iec(iommu);
+
+	raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+	sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
+	if (!(sts & DMA_GSTS_IRES))
+		goto end;
+
+	iommu->gcmd &= ~DMA_GCMD_IRE;
+	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+		      readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+static int __init dmar_x2apic_optout(void)
+{
+	struct acpi_table_dmar *dmar;
+	dmar = (struct acpi_table_dmar *)dmar_tbl;
+	if (!dmar || no_x2apic_optout)
+		return 0;
+	return dmar->flags & DMAR_X2APIC_OPT_OUT;
+}
+
+static int __init intel_irq_remapping_supported(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	if (disable_irq_remap)
+		return 0;
+
+	if (!dmar_ir_support())
+		return 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int __init intel_enable_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+	int eim = 0;
+
+	if (parse_ioapics_under_ir() != 1) {
+		printk(KERN_INFO "Not enable interrupt remapping\n");
+		return -1;
+	}
+
+	if (x2apic_supported()) {
+		eim = !dmar_x2apic_optout();
+		WARN(!eim, KERN_WARNING
+			   "Your BIOS is broken and requested that x2apic be disabled\n"
+			   "This will leave your machine vulnerable to irq-injection attacks\n"
+			   "Use 'intremap=no_x2apic_optout' to override BIOS request\n");
+	}
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		/*
+		 * If the queued invalidation is already initialized,
+		 * shouldn't disable it.
+		 */
+		if (iommu->qi)
+			continue;
+
+		/*
+		 * Clear previous faults.
+		 */
+		dmar_fault(-1, iommu);
+
+		/*
+		 * Disable intr remapping and queued invalidation, if already
+		 * enabled prior to OS handover.
+		 */
+		iommu_disable_irq_remapping(iommu);
+
+		dmar_disable_qi(iommu);
+	}
+
+	/*
+	 * check for the Interrupt-remapping support
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (eim && !ecap_eim_support(iommu->ecap)) {
+			printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
+			       " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
+			return -1;
+		}
+	}
+
+	/*
+	 * Enable queued invalidation for all the DRHD's.
+	 */
+	for_each_drhd_unit(drhd) {
+		int ret;
+		struct intel_iommu *iommu = drhd->iommu;
+		ret = dmar_enable_qi(iommu);
+
+		if (ret) {
+			printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
+			       " invalidation, ecap %Lx, ret %d\n",
+			       drhd->reg_base_addr, iommu->ecap, ret);
+			return -1;
+		}
+	}
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		if (intel_setup_irq_remapping(iommu, eim))
+			goto error;
+
+		setup = 1;
+	}
+
+	if (!setup)
+		goto error;
+
+	irq_remapping_enabled = 1;
+	pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");
+
+	return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
+static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+	ir_hpet[ir_hpet_num].bus   = bus;
+	ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_hpet[ir_hpet_num].iommu = iommu;
+	ir_hpet[ir_hpet_num].id    = scope->enumeration_id;
+	ir_hpet_num++;
+}
+
+static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	ir_ioapic[ir_ioapic_num].bus   = bus;
+	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_ioapic[ir_ioapic_num].iommu = iommu;
+	ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
+	ir_ioapic_num++;
+}
+
+static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_hardware_unit *drhd;
+	struct acpi_dmar_device_scope *scope;
+	void *start, *end;
+
+	drhd = (struct acpi_dmar_hardware_unit *)header;
+
+	start = (void *)(drhd + 1);
+	end = ((void *)drhd) + header->length;
+
+	while (start < end) {
+		scope = start;
+		if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+			if (ir_ioapic_num == MAX_IO_APICS) {
+				printk(KERN_WARNING "Exceeded Max IO APICS\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "IOAPIC id %d under DRHD base "
+			       " 0x%Lx IOMMU %d\n", scope->enumeration_id,
+			       drhd->address, iommu->seq_id);
+
+			ir_parse_one_ioapic_scope(scope, iommu);
+		} else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
+			if (ir_hpet_num == MAX_HPET_TBS) {
+				printk(KERN_WARNING "Exceeded Max HPET blocks\n");
+				return -1;
+			}
+
+			printk(KERN_INFO "HPET id %d under DRHD base"
+			       " 0x%Lx\n", scope->enumeration_id,
+			       drhd->address);
+
+			ir_parse_one_hpet_scope(scope, iommu);
+		}
+		start += scope->length;
+	}
+
+	return 0;
+}
+
+/*
+ * Finds the assocaition between IOAPIC's and its Interrupt-remapping
+ * hardware unit.
+ */
+int __init parse_ioapics_under_ir(void)
+{
+	struct dmar_drhd_unit *drhd;
+	int ir_supported = 0;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (ecap_ir_support(iommu->ecap)) {
+			if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
+				return -1;
+
+			ir_supported = 1;
+		}
+	}
+
+	if (ir_supported && ir_ioapic_num != nr_ioapics) {
+		printk(KERN_WARNING
+		       "Not all IO-APIC's listed under remapping hardware\n");
+		return -1;
+	}
+
+	return ir_supported;
+}
+
+int __init ir_dev_scope_init(void)
+{
+	if (!irq_remapping_enabled)
+		return 0;
+
+	return dmar_dev_scope_init();
+}
+rootfs_initcall(ir_dev_scope_init);
+
+static void disable_irq_remapping(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu = NULL;
+
+	/*
+	 * Disable Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		iommu_disable_irq_remapping(iommu);
+	}
+}
+
+static int reenable_irq_remapping(int eim)
+{
+	struct dmar_drhd_unit *drhd;
+	int setup = 0;
+	struct intel_iommu *iommu = NULL;
+
+	for_each_iommu(iommu, drhd)
+		if (iommu->qi)
+			dmar_reenable_qi(iommu);
+
+	/*
+	 * Setup Interrupt-remapping for all the DRHD's now.
+	 */
+	for_each_iommu(iommu, drhd) {
+		if (!ecap_ir_support(iommu->ecap))
+			continue;
+
+		/* Set up interrupt remapping for iommu.*/
+		iommu_set_irq_remapping(iommu, eim);
+		setup = 1;
+	}
+
+	if (!setup)
+		goto error;
+
+	return 0;
+
+error:
+	/*
+	 * handle error condition gracefully here!
+	 */
+	return -1;
+}
+
+static void prepare_irte(struct irte *irte, int vector,
+			 unsigned int dest)
+{
+	memset(irte, 0, sizeof(*irte));
+
+	irte->present = 1;
+	irte->dst_mode = apic->irq_dest_mode;
+	/*
+	 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+	 * actual level or edge trigger will be setup in the IO-APIC
+	 * RTE. This will help simplify level triggered irq migration.
+	 * For more details, see the comments (in io_apic.c) explainig IO-APIC
+	 * irq migration in the presence of interrupt-remapping.
+	*/
+	irte->trigger_mode = 0;
+	irte->dlvry_mode = apic->irq_delivery_mode;
+	irte->vector = vector;
+	irte->dest_id = IRTE_DEST(dest);
+	irte->redir_hint = 1;
+}
+
+static int intel_setup_ioapic_entry(int irq,
+				    struct IO_APIC_route_entry *route_entry,
+				    unsigned int destination, int vector,
+				    struct io_apic_irq_attr *attr)
+{
+	int ioapic_id = mpc_ioapic_id(attr->ioapic);
+	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
+	struct IR_IO_APIC_route_entry *entry;
+	struct irte irte;
+	int index;
+
+	if (!iommu) {
+		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
+		return -ENODEV;
+	}
+
+	entry = (struct IR_IO_APIC_route_entry *)route_entry;
+
+	index = alloc_irte(iommu, irq, 1);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
+		return -ENOMEM;
+	}
+
+	prepare_irte(&irte, vector, destination);
+
+	/* Set source-id of interrupt request */
+	set_ioapic_sid(&irte, ioapic_id);
+
+	modify_irte(irq, &irte);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+		"Avail:%X Vector:%02X Dest:%08X "
+		"SID:%04X SQ:%X SVT:%X)\n",
+		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
+		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+		irte.avail, irte.vector, irte.dest_id,
+		irte.sid, irte.sq, irte.svt);
+
+	memset(entry, 0, sizeof(*entry));
+
+	entry->index2	= (index >> 15) & 0x1;
+	entry->zero	= 0;
+	entry->format	= 1;
+	entry->index	= (index & 0x7fff);
+	/*
+	 * IO-APIC RTE will be configured with virtual vector.
+	 * irq handler will do the explicit EOI to the io-apic.
+	 */
+	entry->vector	= attr->ioapic_pin;
+	entry->mask	= 0;			/* enable IRQ */
+	entry->trigger	= attr->trigger;
+	entry->polarity	= attr->polarity;
+
+	/* Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+	if (attr->trigger)
+		entry->mask = 1;
+
+	return 0;
+}
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			  bool force)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	unsigned int dest, irq = data->irq;
+	struct irte irte;
+
+	if (!cpumask_intersects(mask, cpu_online_mask))
+		return -EINVAL;
+
+	if (get_irte(irq, &irte))
+		return -EBUSY;
+
+	if (assign_irq_vector(irq, cfg, mask))
+		return -EBUSY;
+
+	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	modify_irte(irq, &irte);
+
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+
+	cpumask_copy(data->affinity, mask);
+	return 0;
+}
+
+static void intel_compose_msi_msg(struct pci_dev *pdev,
+				  unsigned int irq, unsigned int dest,
+				  struct msi_msg *msg, u8 hpet_id)
+{
+	struct irq_cfg *cfg;
+	struct irte irte;
+	u16 sub_handle;
+	int ir_index;
+
+	cfg = irq_get_chip_data(irq);
+
+	ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+	BUG_ON(ir_index == -1);
+
+	prepare_irte(&irte, cfg->vector, dest);
+
+	/* Set source-id of interrupt request */
+	if (pdev)
+		set_msi_sid(&irte, pdev);
+	else
+		set_hpet_sid(&irte, hpet_id);
+
+	modify_irte(irq, &irte);
+
+	msg->address_hi = MSI_ADDR_BASE_HI;
+	msg->data = sub_handle;
+	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+			  MSI_ADDR_IR_SHV |
+			  MSI_ADDR_IR_INDEX1(ir_index) |
+			  MSI_ADDR_IR_INDEX2(ir_index);
+}
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec)
+{
+	struct intel_iommu *iommu;
+	int index;
+
+	iommu = map_dev_to_ir(dev);
+	if (!iommu) {
+		printk(KERN_ERR
+		       "Unable to map PCI %s to iommu\n", pci_name(dev));
+		return -ENOENT;
+	}
+
+	index = alloc_irte(iommu, irq, nvec);
+	if (index < 0) {
+		printk(KERN_ERR
+		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
+		       pci_name(dev));
+		return -ENOSPC;
+	}
+	return index;
+}
+
+static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
+			       int index, int sub_handle)
+{
+	struct intel_iommu *iommu;
+
+	iommu = map_dev_to_ir(pdev);
+	if (!iommu)
+		return -ENOENT;
+	/*
+	 * setup the mapping between the irq and the IRTE
+	 * base index, the sub_handle pointing to the
+	 * appropriate interrupt remap table entry.
+	 */
+	set_irte_irq(irq, iommu, index, sub_handle);
+
+	return 0;
+}
+
+static int intel_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+	struct intel_iommu *iommu = map_hpet_to_ir(id);
+	int index;
+
+	if (!iommu)
+		return -1;
+
+	index = alloc_irte(iommu, irq, 1);
+	if (index < 0)
+		return -1;
+
+	return 0;
+}
+
+struct irq_remap_ops intel_irq_remap_ops = {
+	.supported		= intel_irq_remapping_supported,
+	.prepare		= dmar_table_init,
+	.enable			= intel_enable_irq_remapping,
+	.disable		= disable_irq_remapping,
+	.reenable		= reenable_irq_remapping,
+	.enable_faulting	= enable_drhd_fault_handling,
+	.setup_ioapic_entry	= intel_setup_ioapic_entry,
+	.set_affinity		= intel_ioapic_set_affinity,
+	.free_irq		= free_irte,
+	.compose_msi_msg	= intel_compose_msi_msg,
+	.msi_alloc_irq		= intel_msi_alloc_irq,
+	.msi_setup_irq		= intel_msi_setup_irq,
+	.setup_hpet_msi		= intel_setup_hpet_msi,
+};
diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c
deleted file mode 100644
index 523a7b3a1205..000000000000
--- a/drivers/iommu/intr_remapping.c
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-
-#include "intr_remapping.h"
-
-int irq_remapping_enabled;
-
-int disable_irq_remap;
-int disable_sourceid_checking;
-int no_x2apic_optout;
-
-static struct irq_remap_ops *remap_ops;
-
-static __init int setup_nointremap(char *str)
-{
-	disable_irq_remap = 1;
-	return 0;
-}
-early_param("nointremap", setup_nointremap);
-
-static __init int setup_irqremap(char *str)
-{
-	if (!str)
-		return -EINVAL;
-
-	while (*str) {
-		if (!strncmp(str, "on", 2))
-			disable_irq_remap = 0;
-		else if (!strncmp(str, "off", 3))
-			disable_irq_remap = 1;
-		else if (!strncmp(str, "nosid", 5))
-			disable_sourceid_checking = 1;
-		else if (!strncmp(str, "no_x2apic_optout", 16))
-			no_x2apic_optout = 1;
-
-		str += strcspn(str, ",");
-		while (*str == ',')
-			str++;
-	}
-
-	return 0;
-}
-early_param("intremap", setup_irqremap);
-
-void __init setup_irq_remapping_ops(void)
-{
-	remap_ops = &intel_irq_remap_ops;
-}
-
-int irq_remapping_supported(void)
-{
-	if (disable_irq_remap)
-		return 0;
-
-	if (!remap_ops || !remap_ops->supported)
-		return 0;
-
-	return remap_ops->supported();
-}
-
-int __init irq_remapping_prepare(void)
-{
-	if (!remap_ops || !remap_ops->prepare)
-		return -ENODEV;
-
-	return remap_ops->prepare();
-}
-
-int __init irq_remapping_enable(void)
-{
-	if (!remap_ops || !remap_ops->enable)
-		return -ENODEV;
-
-	return remap_ops->enable();
-}
-
-void irq_remapping_disable(void)
-{
-	if (!remap_ops || !remap_ops->disable)
-		return;
-
-	remap_ops->disable();
-}
-
-int irq_remapping_reenable(int mode)
-{
-	if (!remap_ops || !remap_ops->reenable)
-		return 0;
-
-	return remap_ops->reenable(mode);
-}
-
-int __init irq_remap_enable_fault_handling(void)
-{
-	if (!remap_ops || !remap_ops->enable_faulting)
-		return -ENODEV;
-
-	return remap_ops->enable_faulting();
-}
-
-int setup_ioapic_remapped_entry(int irq,
-				struct IO_APIC_route_entry *entry,
-				unsigned int destination, int vector,
-				struct io_apic_irq_attr *attr)
-{
-	if (!remap_ops || !remap_ops->setup_ioapic_entry)
-		return -ENODEV;
-
-	return remap_ops->setup_ioapic_entry(irq, entry, destination,
-					     vector, attr);
-}
-
-int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask,
-			      bool force)
-{
-	if (!remap_ops || !remap_ops->set_affinity)
-		return 0;
-
-	return remap_ops->set_affinity(data, mask, force);
-}
-
-void free_remapped_irq(int irq)
-{
-	if (!remap_ops || !remap_ops->free_irq)
-		return;
-
-	remap_ops->free_irq(irq);
-}
-
-void compose_remapped_msi_msg(struct pci_dev *pdev,
-			      unsigned int irq, unsigned int dest,
-			      struct msi_msg *msg, u8 hpet_id)
-{
-	if (!remap_ops || !remap_ops->compose_msi_msg)
-		return;
-
-	remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
-}
-
-int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
-{
-	if (!remap_ops || !remap_ops->msi_alloc_irq)
-		return -ENODEV;
-
-	return remap_ops->msi_alloc_irq(pdev, irq, nvec);
-}
-
-int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
-			   int index, int sub_handle)
-{
-	if (!remap_ops || !remap_ops->msi_setup_irq)
-		return -ENODEV;
-
-	return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle);
-}
-
-int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	if (!remap_ops || !remap_ops->setup_hpet_msi)
-		return -ENODEV;
-
-	return remap_ops->setup_hpet_msi(irq, id);
-}
diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h
deleted file mode 100644
index bd5d98fec148..000000000000
--- a/drivers/iommu/intr_remapping.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2012 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * This header file contains stuff that is shared between different interrupt
- * remapping drivers but with no need to be visible outside of the IOMMU layer.
- */
-
-#ifndef __INTR_REMAPPING_H
-#define __INTR_REMAPPING_H
-
-#ifdef CONFIG_IRQ_REMAP
-
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_data;
-struct cpumask;
-struct pci_dev;
-struct msi_msg;
-
-extern int disable_irq_remap;
-extern int disable_sourceid_checking;
-extern int no_x2apic_optout;
-
-struct irq_remap_ops {
-	/* Check whether Interrupt Remapping is supported */
-	int (*supported)(void);
-
-	/* Initializes hardware and makes it ready for remapping interrupts */
-	int  (*prepare)(void);
-
-	/* Enables the remapping hardware */
-	int  (*enable)(void);
-
-	/* Disables the remapping hardware */
-	void (*disable)(void);
-
-	/* Reenables the remapping hardware */
-	int  (*reenable)(int);
-
-	/* Enable fault handling */
-	int  (*enable_faulting)(void);
-
-	/* IO-APIC setup routine */
-	int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
-				  unsigned int, int,
-				  struct io_apic_irq_attr *);
-
-	/* Set the CPU affinity of a remapped interrupt */
-	int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
-			    bool force);
-
-	/* Free an IRQ */
-	int (*free_irq)(int);
-
-	/* Create MSI msg to use for interrupt remapping */
-	void (*compose_msi_msg)(struct pci_dev *,
-				unsigned int, unsigned int,
-				struct msi_msg *, u8);
-
-	/* Allocate remapping resources for MSI */
-	int (*msi_alloc_irq)(struct pci_dev *, int, int);
-
-	/* Setup the remapped MSI irq */
-	int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);
-
-	/* Setup interrupt remapping for an HPET MSI */
-	int (*setup_hpet_msi)(unsigned int, unsigned int);
-};
-
-extern struct irq_remap_ops intel_irq_remap_ops;
-
-#endif /* CONFIG_IRQ_REMAP */
-
-#endif /* __INTR_REMAPPING_H */
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
new file mode 100644
index 000000000000..1cf350e02da8
--- /dev/null
+++ b/drivers/iommu/irq_remapping.c
@@ -0,0 +1,164 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include "irq_remapping.h"
+
+int irq_remapping_enabled;
+
+int disable_irq_remap;
+int disable_sourceid_checking;
+int no_x2apic_optout;
+
+static struct irq_remap_ops *remap_ops;
+
+static __init int setup_nointremap(char *str)
+{
+	disable_irq_remap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
+static __init int setup_irqremap(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "on", 2))
+			disable_irq_remap = 0;
+		else if (!strncmp(str, "off", 3))
+			disable_irq_remap = 1;
+		else if (!strncmp(str, "nosid", 5))
+			disable_sourceid_checking = 1;
+		else if (!strncmp(str, "no_x2apic_optout", 16))
+			no_x2apic_optout = 1;
+
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+
+	return 0;
+}
+early_param("intremap", setup_irqremap);
+
+void __init setup_irq_remapping_ops(void)
+{
+	remap_ops = &intel_irq_remap_ops;
+}
+
+int irq_remapping_supported(void)
+{
+	if (disable_irq_remap)
+		return 0;
+
+	if (!remap_ops || !remap_ops->supported)
+		return 0;
+
+	return remap_ops->supported();
+}
+
+int __init irq_remapping_prepare(void)
+{
+	if (!remap_ops || !remap_ops->prepare)
+		return -ENODEV;
+
+	return remap_ops->prepare();
+}
+
+int __init irq_remapping_enable(void)
+{
+	if (!remap_ops || !remap_ops->enable)
+		return -ENODEV;
+
+	return remap_ops->enable();
+}
+
+void irq_remapping_disable(void)
+{
+	if (!remap_ops || !remap_ops->disable)
+		return;
+
+	remap_ops->disable();
+}
+
+int irq_remapping_reenable(int mode)
+{
+	if (!remap_ops || !remap_ops->reenable)
+		return 0;
+
+	return remap_ops->reenable(mode);
+}
+
+int __init irq_remap_enable_fault_handling(void)
+{
+	if (!remap_ops || !remap_ops->enable_faulting)
+		return -ENODEV;
+
+	return remap_ops->enable_faulting();
+}
+
+int setup_ioapic_remapped_entry(int irq,
+				struct IO_APIC_route_entry *entry,
+				unsigned int destination, int vector,
+				struct io_apic_irq_attr *attr)
+{
+	if (!remap_ops || !remap_ops->setup_ioapic_entry)
+		return -ENODEV;
+
+	return remap_ops->setup_ioapic_entry(irq, entry, destination,
+					     vector, attr);
+}
+
+int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+			      bool force)
+{
+	if (!remap_ops || !remap_ops->set_affinity)
+		return 0;
+
+	return remap_ops->set_affinity(data, mask, force);
+}
+
+void free_remapped_irq(int irq)
+{
+	if (!remap_ops || !remap_ops->free_irq)
+		return;
+
+	remap_ops->free_irq(irq);
+}
+
+void compose_remapped_msi_msg(struct pci_dev *pdev,
+			      unsigned int irq, unsigned int dest,
+			      struct msi_msg *msg, u8 hpet_id)
+{
+	if (!remap_ops || !remap_ops->compose_msi_msg)
+		return;
+
+	remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+}
+
+int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
+{
+	if (!remap_ops || !remap_ops->msi_alloc_irq)
+		return -ENODEV;
+
+	return remap_ops->msi_alloc_irq(pdev, irq, nvec);
+}
+
+int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
+			   int index, int sub_handle)
+{
+	if (!remap_ops || !remap_ops->msi_setup_irq)
+		return -ENODEV;
+
+	return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle);
+}
+
+int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
+{
+	if (!remap_ops || !remap_ops->setup_hpet_msi)
+		return -ENODEV;
+
+	return remap_ops->setup_hpet_msi(irq, id);
+}
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
new file mode 100644
index 000000000000..b12974cc1dfe
--- /dev/null
+++ b/drivers/iommu/irq_remapping.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * This header file contains stuff that is shared between different interrupt
+ * remapping drivers but with no need to be visible outside of the IOMMU layer.
+ */
+
+#ifndef __IRQ_REMAPPING_H
+#define __IRQ_REMAPPING_H
+
+#ifdef CONFIG_IRQ_REMAP
+
+struct IO_APIC_route_entry;
+struct io_apic_irq_attr;
+struct irq_data;
+struct cpumask;
+struct pci_dev;
+struct msi_msg;
+
+extern int disable_irq_remap;
+extern int disable_sourceid_checking;
+extern int no_x2apic_optout;
+
+struct irq_remap_ops {
+	/* Check whether Interrupt Remapping is supported */
+	int (*supported)(void);
+
+	/* Initializes hardware and makes it ready for remapping interrupts */
+	int  (*prepare)(void);
+
+	/* Enables the remapping hardware */
+	int  (*enable)(void);
+
+	/* Disables the remapping hardware */
+	void (*disable)(void);
+
+	/* Reenables the remapping hardware */
+	int  (*reenable)(int);
+
+	/* Enable fault handling */
+	int  (*enable_faulting)(void);
+
+	/* IO-APIC setup routine */
+	int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
+				  unsigned int, int,
+				  struct io_apic_irq_attr *);
+
+	/* Set the CPU affinity of a remapped interrupt */
+	int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
+			    bool force);
+
+	/* Free an IRQ */
+	int (*free_irq)(int);
+
+	/* Create MSI msg to use for interrupt remapping */
+	void (*compose_msi_msg)(struct pci_dev *,
+				unsigned int, unsigned int,
+				struct msi_msg *, u8);
+
+	/* Allocate remapping resources for MSI */
+	int (*msi_alloc_irq)(struct pci_dev *, int, int);
+
+	/* Setup the remapped MSI irq */
+	int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);
+
+	/* Setup interrupt remapping for an HPET MSI */
+	int (*setup_hpet_msi)(unsigned int, unsigned int);
+};
+
+extern struct irq_remap_ops intel_irq_remap_ops;
+
+#endif /* CONFIG_IRQ_REMAP */
+
+#endif /* __IRQ_REMAPPING_H */
-- 
cgit v1.2.3


From d2aa37411b8e65d57d2c5ae36f0222274292020d Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Thu, 12 Apr 2012 18:18:24 +0100
Subject: x86/olpc/xo1/sci: Produce wakeup events for buttons and switches

Produce wakeup events for the XO-1's power button, lid switch
and ebook switch, taking care to only produce events when the
states have changed.

Signed-off-by: Daniel Drake <dsd@laptop.org>
Cc: dilinger@queued.net
Cc: pgf@laptop.org
Link: http://lkml.kernel.org/r/20120412171824.D14C49D401E@zog.reactivated.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 1d4c783d7325..4b93ff46cec3 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/pm_wakeup.h>
 #include <linux/mfd/core.h>
 #include <linux/power_supply.h>
 #include <linux/suspend.h>
@@ -83,8 +84,12 @@ static void send_ebook_state(void)
 		return;
 	}
 
+	if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state)
+		return; /* Nothing new to report. */
+
 	input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
 	input_sync(ebook_switch_idev);
+	pm_wakeup_event(&ebook_switch_idev->dev, 0);
 }
 
 static void flip_lid_inverter(void)
@@ -123,8 +128,12 @@ static void detect_lid_state(void)
 /* Report current lid switch state through input layer */
 static void send_lid_state(void)
 {
+	if (!!test_bit(SW_LID, lid_switch_idev->sw) == !lid_open)
+		return; /* Nothing new to report. */
+
 	input_report_switch(lid_switch_idev, SW_LID, !lid_open);
 	input_sync(lid_switch_idev);
+	pm_wakeup_event(&lid_switch_idev->dev, 0);
 }
 
 static ssize_t lid_wake_mode_show(struct device *dev,
@@ -213,11 +222,18 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
 
 	dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe);
 
-	if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) {
-		input_report_key(power_button_idev, KEY_POWER, 1);
-		input_sync(power_button_idev);
-		input_report_key(power_button_idev, KEY_POWER, 0);
-		input_sync(power_button_idev);
+	if (sts & CS5536_PWRBTN_FLAG) {
+		if (!(sts & CS5536_WAK_FLAG)) {
+			/* Only report power button input when it was pressed
+			 * during regular operation (as opposed to when it
+			 * was used to wake the system). */
+			input_report_key(power_button_idev, KEY_POWER, 1);
+			input_sync(power_button_idev);
+			input_report_key(power_button_idev, KEY_POWER, 0);
+			input_sync(power_button_idev);
+		}
+		/* Report the wakeup event in all cases. */
+		pm_wakeup_event(&power_button_idev->dev, 0);
 	}
 
 	if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
-- 
cgit v1.2.3


From c2c21e9bb17549e8add4ff76931bcec2e2d3ad48 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Wed, 18 Apr 2012 23:34:02 +0100
Subject: x86/olpc/xo1/sci: Report RTC wakeup events

When the system is woken due to a RTC event, report the wakeup
event on the relevant rtc device (if it can be found).

Signed-off-by: Daniel Drake <dsd@laptop.org>
Cc: dilinger@queued.net
Cc: pgf@laptop.org
Link: http://lkml.kernel.org/r/20120418223402.D73249D401E@zog.reactivated.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c | 17 +++++++++++++++--
 include/linux/cs5535.h                |  1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 4b93ff46cec3..04b8c73659c5 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -236,6 +236,18 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
 		pm_wakeup_event(&power_button_idev->dev, 0);
 	}
 
+	if ((sts & (CS5536_RTC_FLAG | CS5536_WAK_FLAG)) ==
+			(CS5536_RTC_FLAG | CS5536_WAK_FLAG)) {
+		/* When the system is woken by the RTC alarm, report the
+		 * event on the rtc device. */
+		struct device *rtc = bus_find_device_by_name(
+			&platform_bus_type, NULL, "rtc_cmos");
+		if (rtc) {
+			pm_wakeup_event(rtc, 0);
+			put_device(rtc);
+		}
+	}
+
 	if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
 		cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
 		schedule_work(&sci_work);
@@ -326,9 +338,10 @@ static int __devinit setup_sci_interrupt(struct platform_device *pdev)
 		outb(lo, CS5536_PIC_INT_SEL2);
 	}
 
-	/* Enable SCI from power button, and clear pending interrupts */
+	/* Enable interesting SCI events, and clear pending interrupts */
 	sts = inl(acpi_base + CS5536_PM1_STS);
-	outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS);
+	outl(((CS5536_PM_PWRBTN | CS5536_PM_RTC) << 16) | 0xffff,
+	     acpi_base + CS5536_PM1_STS);
 
 	r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev);
 	if (r)
diff --git a/include/linux/cs5535.h b/include/linux/cs5535.h
index c077aec3a6ff..cfe83239d7f0 100644
--- a/include/linux/cs5535.h
+++ b/include/linux/cs5535.h
@@ -95,6 +95,7 @@ static inline int cs5535_pic_unreqz_select_high(unsigned int group,
 
 /* CS5536_PM1_STS bits */
 #define CS5536_WAK_FLAG		(1 << 15)
+#define CS5536_RTC_FLAG		(1 << 10)
 #define CS5536_PWRBTN_FLAG	(1 << 8)
 
 /* CS5536_PM1_EN bits */
-- 
cgit v1.2.3


From 396e2c6fed4ff13b53ce0e573105531cf53b0cad Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 2 Apr 2012 15:15:55 +0100
Subject: x86: Clear HPET configuration registers on startup

While Linux itself has been calling hpet_disable() for quite a
while, having e.g. a secondary (kexec) kernel depend on such
behavior of the primary (crashed) environment is fragile. It
particularly broke until very recently when the primary
environment was Xen based, as that hypervisor did not clear any
of the HPET settings it may have used.

Rather than blindly (and incompletely) clearing certain HPET
settings in hpet_disable(), latch the config register settings
during boot and restore then here.

(Note on the hpet_set_mode() change: Now that we're clearing the
level bit upon initialization, there's no need anymore to do so
here.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F79D0BB020000780007C02D@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hpet.c | 59 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714e..70bce5db1bb9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -319,8 +319,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		now = hpet_readl(HPET_COUNTER);
 		cmp = now + (unsigned int) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		/* Make sure we use edge triggered interrupts */
-		cfg &= ~HPET_TN_LEVEL;
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
 		hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -787,15 +785,16 @@ static int hpet_clocksource_register(void)
 	return 0;
 }
 
+static u32 *hpet_boot_cfg;
+
 /**
  * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
  */
 int __init hpet_enable(void)
 {
-	unsigned long hpet_period;
-	unsigned int id;
+	u32 hpet_period, cfg, id;
 	u64 freq;
-	int i;
+	unsigned int i, last;
 
 	if (!is_hpet_capable())
 		return 0;
@@ -847,15 +846,45 @@ int __init hpet_enable(void)
 	id = hpet_readl(HPET_ID);
 	hpet_print_config();
 
+	last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
 #ifdef CONFIG_HPET_EMULATE_RTC
 	/*
 	 * The legacy routing mode needs at least two channels, tick timer
 	 * and the rtc emulation channel.
 	 */
-	if (!(id & HPET_ID_NUMBER))
+	if (!last)
 		goto out_nohpet;
 #endif
 
+	cfg = hpet_readl(HPET_CFG);
+	hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
+				GFP_KERNEL);
+	if (hpet_boot_cfg)
+		*hpet_boot_cfg = cfg;
+	else
+		pr_warn("HPET initial state will not be saved\n");
+	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+	hpet_writel(cfg, HPET_Tn_CFG(i));
+	if (cfg)
+		pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
+			cfg);
+
+	for (i = 0; i <= last; ++i) {
+		cfg = hpet_readl(HPET_Tn_CFG(i));
+		if (hpet_boot_cfg)
+			hpet_boot_cfg[i + 1] = cfg;
+		cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+		hpet_writel(cfg, HPET_Tn_CFG(i));
+		cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+			 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+			 | HPET_TN_FSB | HPET_TN_FSB_CAP);
+		if (cfg)
+			pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+				cfg, i);
+	}
+	hpet_print_config();
+
 	if (hpet_clocksource_register())
 		goto out_nohpet;
 
@@ -923,14 +952,28 @@ fs_initcall(hpet_late_init);
 void hpet_disable(void)
 {
 	if (is_hpet_capable() && hpet_virt_address) {
-		unsigned int cfg = hpet_readl(HPET_CFG);
+		unsigned int cfg = hpet_readl(HPET_CFG), id, last;
 
-		if (hpet_legacy_int_enabled) {
+		if (hpet_boot_cfg)
+			cfg = *hpet_boot_cfg;
+		else if (hpet_legacy_int_enabled) {
 			cfg &= ~HPET_CFG_LEGACY;
 			hpet_legacy_int_enabled = 0;
 		}
 		cfg &= ~HPET_CFG_ENABLE;
 		hpet_writel(cfg, HPET_CFG);
+
+		if (!hpet_boot_cfg)
+			return;
+
+		id = hpet_readl(HPET_ID);
+		last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+
+		for (id = 0; id <= last; ++id)
+			hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+
+		if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+			hpet_writel(*hpet_boot_cfg, HPET_CFG);
 	}
 }
 
-- 
cgit v1.2.3


From b2d6aba9657c7e3d027dd43ac7d7c405e0079d46 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 2 Apr 2012 15:17:36 +0100
Subject: x86: Allow multiple values to be specified with "hpet="

This is particularly to be able to specify "hpet=force,verbose",
as "force" ought to be a primary candidate for also wanting to
use "verbose".

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F79D120020000780007C031@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hpet.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 70bce5db1bb9..9cc7b4392f7c 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -94,13 +94,18 @@ static int hpet_verbose;
 
 static int __init hpet_setup(char *str)
 {
-	if (str) {
+	while (str) {
+		char *next = strchr(str, ',');
+
+		if (next)
+			*next++ = 0;
 		if (!strncmp("disable", str, 7))
 			boot_hpet_disable = 1;
 		if (!strncmp("force", str, 5))
 			hpet_force_user = 1;
 		if (!strncmp("verbose", str, 7))
 			hpet_verbose = 1;
+		str = next;
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From ddc5681ed33a279fdc188e98e71f0c539f08c6e6 Mon Sep 17 00:00:00 2001
From: Shai Fultheim <shai@scalemp.com>
Date: Fri, 20 Apr 2012 01:09:11 +0300
Subject: x86/cache_info: Fix setup of l2/l3 ids

On some architectures (such as vSMP), it is possible to have
CPUs with a different number of cores sharing the same cache.

The current implementation implicitly assumes that all CPUs will
have the same number of cores sharing caches, and as a result,
different CPUs can end up with the same l2/l3 ids.

Fix this by masking out the shared cache bits, instead of
shifting the APICID. By doing so, it is guaranteed that the
generated cache ids are always unique.

Signed-off-by: Shai Fultheim <shai@scalemp.com>
[ rebased, simplified, and reworded the commit message]
Signed-off-by: Ido Yariv <ido@wizery.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Dave Jones <davej@redhat.com>
Link: http://lkml.kernel.org/r/1334873351-31142-1-git-send-email-ido@wizery.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b8f3653dddbc..9a7c90d80bc4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 					new_l2 = this_leaf.size/1024;
 					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 					index_msb = get_count_order(num_threads_sharing);
-					l2_id = c->apicid >> index_msb;
+					l2_id = c->apicid & ~((1 << index_msb) - 1);
 					break;
 				case 3:
 					new_l3 = this_leaf.size/1024;
 					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 					index_msb = get_count_order(
 							num_threads_sharing);
-					l3_id = c->apicid >> index_msb;
+					l3_id = c->apicid & ~((1 << index_msb) - 1);
 					break;
 				default:
 					break;
-- 
cgit v1.2.3


From 42fa4250436304d4650fa271f37671f6cee24e08 Mon Sep 17 00:00:00 2001
From: Shai Fultheim <shai@scalemp.com>
Date: Fri, 20 Apr 2012 01:12:32 +0300
Subject: x86: Conditionally update time when ack-ing pending irqs

On virtual environments, apic_read could take a long time. As a
result, under certain conditions the ack pending loop may exit
without any queued irqs left, but after more than one second. A
warning will be printed needlessly in this case.

If the loop is about to exit regardless of max_loops, don't
update it.

Signed-off-by: Shai Fultheim <shai@scalemp.com>
[ rebased and reworded the commit message]
Signed-off-by: Ido Yariv <ido@wizery.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1334873552-31346-1-git-send-email-ido@wizery.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index edc24480469f..3beab627190e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1325,11 +1325,13 @@ void __cpuinit setup_local_APIC(void)
 			       acked);
 			break;
 		}
-		if (cpu_has_tsc) {
-			rdtscll(ntsc);
-			max_loops = (cpu_khz << 10) - (ntsc - tsc);
-		} else
-			max_loops--;
+		if (queued) {
+			if (cpu_has_tsc) {
+				rdtscll(ntsc);
+				max_loops = (cpu_khz << 10) - (ntsc - tsc);
+			} else
+				max_loops--;
+		}
 	} while (queued && max_loops > 0);
 	WARN_ON(max_loops <= 0);
 
-- 
cgit v1.2.3


From 19209bbb8612004bc20a1f70ff12926f99fe2643 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Mon, 30 Apr 2012 12:26:56 +0530
Subject: x86/sched: Make mwait_usable() heed to "idle=" kernel parameters
 properly

The checks that exist in mwait_usable() for "idle=" kernel
parameters are insufficient. As a result, mwait_usable() can
return 1 even if "idle=nomwait" or "idle=poll" or "idle=halt"
parameters are passed.

Of these cases, incorrect handling of idle=nomwait is a
universal problem since mwait can get used for usual CPU idling.
However the rest of the cases are problematic only during CPU
Hotplug (offline) because, in the CPU offline path, the function
mwait_play_dead() is called, which might result in mwait being
used in the offline CPUs, if mwait_usable() happens to return 1.

Fix these issues by checking for the boot time "idle=" kernel
parameter properly in mwait_usable().

The first issue (usual cpu idling) is demonstrated below:

Before applying the patch (dmesg snippet):

 [    0.000000] Command line: [...] idle=nomwait
 [    0.000000] Kernel command line: [...] idle=nomwait
 [    0.000000]  RCU dyntick-idle grace-period acceleration is enabled.
 [    0.140606] using mwait in idle threads.  <======= mwait being used
 [    4.303986] cpuidle: using governor ladder
 [    4.308232] cpuidle: using governor menu

After applying the patch:

 [    0.000000] Command line: [...] idle=nomwait
 [    0.000000] Kernel command line: [...] idle=nomwait
 [    0.000000]  RCU dyntick-idle grace-period acceleration is enabled.
 [    4.264100] cpuidle: using governor ladder
 [    4.268342] cpuidle: using governor menu

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Acked-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: venki@google.com
Cc: suresh.b.siddha@intel.com
Cc: Borislav Petkov <bp@amd64.org>
Cc: lenb@kernel.org
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Link: http://lkml.kernel.org/r/4F9E37B8.30001@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..ad57d832d96f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -594,9 +594,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
 {
 	u32 eax, ebx, ecx, edx;
 
+	/* Use mwait if idle=mwait boot option is given */
 	if (boot_option_idle_override == IDLE_FORCE_MWAIT)
 		return 1;
 
+	/*
+	 * Any idle= boot option other than idle=mwait means that we must not
+	 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+	 */
+	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+		return 0;
+
 	if (c->cpuid_level < MWAIT_INFO)
 		return 0;
 
-- 
cgit v1.2.3


From 74d24b219bc4ebb20b75d63af2bb577bc1b10b5e Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2012 15:32:55 +0800
Subject: resources: add resource_overlaps()

Add resource_overlaps(), which returns true if two resources overlap at all.

Use this to replace the complicated check in coalesce_windows().

Signed-Off-By: Wei Yang <weiyang@linux.vnet.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/acpi.c    | 12 +-----------
 include/linux/ioport.h |  7 +++++++
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 8a17b23f8c84..fc09c2754e08 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -245,13 +245,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	return AE_OK;
 }
 
-static bool resource_contains(struct resource *res, resource_size_t point)
-{
-	if (res->start <= point && point <= res->end)
-		return true;
-	return false;
-}
-
 static void coalesce_windows(struct pci_root_info *info, unsigned long type)
 {
 	int i, j;
@@ -272,10 +265,7 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type)
 			 * our resources no longer match the ACPI _CRS, but
 			 * the kernel resource tree doesn't allow overlaps.
 			 */
-			if (resource_contains(res1, res2->start) ||
-			    resource_contains(res1, res2->end) ||
-			    resource_contains(res2, res1->start) ||
-			    resource_contains(res2, res1->end)) {
+			if (resource_overlaps(res1, res2)) {
 				res1->start = min(res1->start, res2->start);
 				res1->end = max(res1->end, res2->end);
 				dev_info(&info->bridge->dev,
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e885ba23de70..589e0e75efae 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -223,5 +223,12 @@ extern int
 walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *));
 
+/* True if any part of r1 overlaps r2 */
+static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
+{
+       return (r1->start <= r2->end && r1->end >= r2->start);
+}
+
+
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
-- 
cgit v1.2.3


From 9438ef7f4ea73d5430a330fc206f97826eb9fb16 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 7 May 2012 19:19:56 +0200
Subject: x86/apic: Fix UP boot crash

Commit 31b3c9d72340 ("xen/x86: Implement x86_apic_ops") implemented
this:

... without considering that on UP the function pointer might be NULL.

Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Link: http://lkml.kernel.org/n/tip-3pfty0ml4yp62phbkchichh0@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/setup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8526317c5f0b..7e67c5a71061 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1012,7 +1012,8 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 
 	init_apic_mappings();
-	x86_io_apic_ops.init();
+	if (x86_io_apic_ops.init)
+		x86_io_apic_ops.init();
 
 	kvm_guest_init();
 
-- 
cgit v1.2.3


From e826abd523913f63eb03b59746ffb16153c53dc4 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkhan@gmail.com>
Date: Sun, 6 May 2012 11:11:04 -0600
Subject: x86, microcode: microcode_core.c simple_strtoul cleanup

Change reload_for_cpu() in kernel/microcode_core.c to call kstrtoul()
instead of calling obsoleted simple_strtoul().

Signed-off-by: Shuah Khan <shuahkhan@gmail.com>
Reviewed-by: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/1336324264.2897.9.camel@lorien2
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/microcode_core.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9bda6d6035c..fbdfc6917180 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev,
 {
 	unsigned long val;
 	int cpu = dev->id;
-	int ret = 0;
-	char *end;
+	ssize_t ret = 0;
 
-	val = simple_strtoul(buf, &end, 0);
-	if (end == buf)
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	if (val == 1) {
 		get_online_cpus();
-- 
cgit v1.2.3


From ca1182387e57470460294ce1e39e2d5518809811 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 30 Mar 2012 15:37:07 -0400
Subject: xen/setup: Only print "Freeing XXX-YYY pfn range: Z pages freed" if Z
 > 0

Otherwise we can get these meaningless:
Freeing  bad80-badf4 pfn range: 0 pages freed

We also can do this for the summary ones - no point of printing
"Set 0 page(s) to 1-1 mapping"

Acked-by: David Vrabel <david.vrabel@citrix.com>
[v1: Extended to the summary printks]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1ba8dff26753..7b0ab77b8479 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -114,8 +114,9 @@ static unsigned long __init xen_release_chunk(unsigned long start,
 			len++;
 		}
 	}
-	printk(KERN_INFO "Freeing  %lx-%lx pfn range: %lu pages freed\n",
-	       start, end, len);
+	if (len)
+		printk(KERN_INFO "Freeing  %lx-%lx pfn range: %lu pages freed\n",
+		       start, end, len);
 
 	return len;
 }
@@ -162,8 +163,10 @@ static unsigned long __init xen_set_identity_and_release(
 		}
 	}
 
-	printk(KERN_INFO "Released %lu pages of unused memory\n", released);
-	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+	if (released)
+		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+	if (identity)
+		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
 
 	return released;
 }
-- 
cgit v1.2.3


From 2e2fb75475c2fc74c98100f1468c8195fee49f3b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 6 Apr 2012 10:07:11 -0400
Subject: xen/setup: Populate freed MFNs from non-RAM E820 entries and gaps to
 E820 RAM

When the Xen hypervisor boots a PV kernel it hands it two pieces
of information: nr_pages and a made up E820 entry.

The nr_pages value defines the range from zero to nr_pages of PFNs
which have a valid Machine Frame Number (MFN) underneath it. The
E820 mirrors that (with the VGA hole):
BIOS-provided physical RAM map:
 Xen: 0000000000000000 - 00000000000a0000 (usable)
 Xen: 00000000000a0000 - 0000000000100000 (reserved)
 Xen: 0000000000100000 - 0000000080800000 (usable)

The fun comes when a PV guest that is run with a machine E820 - that
can either be the initial domain or a PCI PV guest, where the E820
looks like the normal thing:

BIOS-provided physical RAM map:
 Xen: 0000000000000000 - 000000000009e000 (usable)
 Xen: 000000000009ec00 - 0000000000100000 (reserved)
 Xen: 0000000000100000 - 0000000020000000 (usable)
 Xen: 0000000020000000 - 0000000020200000 (reserved)
 Xen: 0000000020200000 - 0000000040000000 (usable)
 Xen: 0000000040000000 - 0000000040200000 (reserved)
 Xen: 0000000040200000 - 00000000bad80000 (usable)
 Xen: 00000000bad80000 - 00000000badc9000 (ACPI NVS)
..
With that overlaying the nr_pages directly on the E820 does not
work as there are gaps and non-RAM regions that won't be used
by the memory allocator. The 'xen_release_chunk' helps with that
by punching holes in the P2M (PFN to MFN lookup tree) for those
regions and tells us that:

Freeing  20000-20200 pfn range: 512 pages freed
Freeing  40000-40200 pfn range: 512 pages freed
Freeing  bad80-badf4 pfn range: 116 pages freed
Freeing  badf6-bae7f pfn range: 137 pages freed
Freeing  bb000-100000 pfn range: 282624 pages freed
Released 283999 pages of unused memory

Those 283999 pages are subtracted from the nr_pages and are returned
to the hypervisor. The end result is that the initial domain
boots with 1GB less memory as the nr_pages has been subtracted by
the amount of pages residing within the PCI hole. It can balloon up
to that if desired using 'xl mem-set 0 8092', but the balloon driver
is not always compiled in for the initial domain.

This patch, implements the populate hypercall (XENMEM_populate_physmap)
which increases the the domain with the same amount of pages that
were released.

The other solution (that did not work) was to transplant the MFN in
the P2M tree - the ones that were going to be freed were put in
the E820_RAM regions past the nr_pages. But the modifications to the
M2P array (the other side of creating PTEs) were not carried away.
As the hypervisor is the only one capable of modifying that and the
only two hypercalls that would do this are: the update_va_mapping
(which won't work, as during initial bootup only PFNs up to nr_pages
are mapped in the guest) or via the populate hypercall.

The end result is that the kernel can now boot with the
nr_pages without having to subtract the 283999 pages.

On a 8GB machine, with various dom0_mem= parameters this is what we get:

no dom0_mem
-Memory: 6485264k/9435136k available (5817k kernel code, 1136060k absent, 1813812k reserved, 2899k data, 696k init)
+Memory: 7619036k/9435136k available (5817k kernel code, 1136060k absent, 680040k reserved, 2899k data, 696k init)

dom0_mem=3G
-Memory: 2616536k/9435136k available (5817k kernel code, 1136060k absent, 5682540k reserved, 2899k data, 696k init)
+Memory: 2703776k/9435136k available (5817k kernel code, 1136060k absent, 5595300k reserved, 2899k data, 696k init)

dom0_mem=max:3G
-Memory: 2696732k/4281724k available (5817k kernel code, 1136060k absent, 448932k reserved, 2899k data, 696k init)
+Memory: 2702204k/4281724k available (5817k kernel code, 1136060k absent, 443460k reserved, 2899k data, 696k init)

And the 'xm list' or 'xl list' now reflect what the dom0_mem=
argument is.

Acked-by: David Vrabel <david.vrabel@citrix.com>
[v2: Use populate hypercall]
[v3: Remove debug printks]
[v4: Simplify code]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 112 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7b0ab77b8479..710af36e6dfb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -26,7 +26,6 @@
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
-
 #include "xen-ops.h"
 #include "vdso.h"
 
@@ -120,7 +119,105 @@ static unsigned long __init xen_release_chunk(unsigned long start,
 
 	return len;
 }
+static unsigned long __init xen_populate_physmap(unsigned long start,
+						 unsigned long end)
+{
+	struct xen_memory_reservation reservation = {
+		.address_bits = 0,
+		.extent_order = 0,
+		.domid        = DOMID_SELF
+	};
+	unsigned long len = 0;
+	int ret;
+
+	for (pfn = start; pfn < end; pfn++) {
+		unsigned long frame;
+
+		/* Make sure pfn does not exists to start with */
+		if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY)
+			continue;
 
+		frame = pfn;
+		set_xen_guest_handle(reservation.extent_start, &frame);
+		reservation.nr_extents = 1;
+
+		ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+		WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret);
+		if (ret == 1) {
+			if (!early_set_phys_to_machine(pfn, frame)) {
+				set_xen_guest_handle(reservation.extent_start, &frame);
+				reservation.nr_extents = 1;
+				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+							&reservation);
+				break;
+			}
+			len++;
+		} else
+			break;
+	}
+	if (len)
+		printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n",
+		       start, end, len);
+	return len;
+}
+static unsigned long __init xen_populate_chunk(
+	const struct e820entry *list, size_t map_size,
+	unsigned long max_pfn, unsigned long *last_pfn,
+	unsigned long credits_left)
+{
+	const struct e820entry *entry;
+	unsigned int i;
+	unsigned long done = 0;
+	unsigned long dest_pfn;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		unsigned long credits = credits_left;
+		unsigned long s_pfn;
+		unsigned long e_pfn;
+		unsigned long pfns;
+		long capacity;
+
+		if (credits <= 0)
+			break;
+
+		if (entry->type != E820_RAM)
+			continue;
+
+		e_pfn = PFN_UP(entry->addr + entry->size);
+
+		/* We only care about E820 after the xen_start_info->nr_pages */
+		if (e_pfn <= max_pfn)
+			continue;
+
+		s_pfn = PFN_DOWN(entry->addr);
+		/* If the E820 falls within the nr_pages, we want to start
+		 * at the nr_pages PFN.
+		 * If that would mean going past the E820 entry, skip it
+		 */
+		if (s_pfn <= max_pfn) {
+			capacity = e_pfn - max_pfn;
+			dest_pfn = max_pfn;
+		} else {
+			/* last_pfn MUST be within E820_RAM regions */
+			if (*last_pfn && e_pfn >= *last_pfn)
+				s_pfn = *last_pfn;
+			capacity = e_pfn - s_pfn;
+			dest_pfn = s_pfn;
+		}
+		/* If we had filled this E820_RAM entry, go to the next one. */
+		if (capacity <= 0)
+			continue;
+
+		if (credits > capacity)
+			credits = capacity;
+
+		pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits);
+		done += pfns;
+		credits_left -= pfns;
+		*last_pfn = (dest_pfn + pfns);
+	}
+	return done;
+}
 static unsigned long __init xen_set_identity_and_release(
 	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 {
@@ -143,7 +240,6 @@ static unsigned long __init xen_set_identity_and_release(
 	 */
 	for (i = 0, entry = list; i < map_size; i++, entry++) {
 		phys_addr_t end = entry->addr + entry->size;
-
 		if (entry->type == E820_RAM || i == map_size - 1) {
 			unsigned long start_pfn = PFN_DOWN(start);
 			unsigned long end_pfn = PFN_UP(end);
@@ -220,7 +316,9 @@ char * __init xen_memory_setup(void)
 	int rc;
 	struct xen_memory_map memmap;
 	unsigned long max_pages;
+	unsigned long last_pfn = 0;
 	unsigned long extra_pages = 0;
+	unsigned long populated;
 	int i;
 	int op;
 
@@ -260,8 +358,19 @@ char * __init xen_memory_setup(void)
 	 */
 	xen_released_pages = xen_set_identity_and_release(
 		map, memmap.nr_entries, max_pfn);
-	extra_pages += xen_released_pages;
 
+	/*
+	 * Populate back the non-RAM pages and E820 gaps that had been
+	 * released. */
+	populated = xen_populate_chunk(map, memmap.nr_entries,
+			max_pfn, &last_pfn, xen_released_pages);
+
+	extra_pages += (xen_released_pages - populated);
+
+	if (last_pfn > max_pfn) {
+		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
+		mem_end = PFN_PHYS(max_pfn);
+	}
 	/*
 	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
 	 * factor the base size.  On non-highmem systems, the base
@@ -275,7 +384,6 @@ char * __init xen_memory_setup(void)
 	 */
 	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
 			  extra_pages);
-
 	i = 0;
 	while (i < memmap.nr_entries) {
 		u64 addr = map[i].addr;
-- 
cgit v1.2.3


From 96dc08b35c4af8cb5810450602590706f2593a5f Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 6 Apr 2012 16:10:20 -0400
Subject: xen/setup: Combine the two hypercall functions - since they are quite
 similar.

They use the same set of arguments, so it is just the matter
of using the proper hypercall.

Acked-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 81 +++++++++++++++++++---------------------------------
 1 file changed, 30 insertions(+), 51 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 710af36e6dfb..30ac05a8d28f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
 		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
-static unsigned long __init xen_release_chunk(unsigned long start,
-					      unsigned long end)
+static unsigned long __init xen_do_chunk(unsigned long start,
+					 unsigned long end, bool release)
 {
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
@@ -95,60 +95,36 @@ static unsigned long __init xen_release_chunk(unsigned long start,
 	unsigned long pfn;
 	int ret;
 
-	for(pfn = start; pfn < end; pfn++) {
-		unsigned long mfn = pfn_to_mfn(pfn);
-
-		/* Make sure pfn exists to start with */
-		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
-			continue;
-
-		set_xen_guest_handle(reservation.extent_start, &mfn);
-		reservation.nr_extents = 1;
-
-		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
-					   &reservation);
-		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
-		if (ret == 1) {
-			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
-			len++;
-		}
-	}
-	if (len)
-		printk(KERN_INFO "Freeing  %lx-%lx pfn range: %lu pages freed\n",
-		       start, end, len);
-
-	return len;
-}
-static unsigned long __init xen_populate_physmap(unsigned long start,
-						 unsigned long end)
-{
-	struct xen_memory_reservation reservation = {
-		.address_bits = 0,
-		.extent_order = 0,
-		.domid        = DOMID_SELF
-	};
-	unsigned long len = 0;
-	int ret;
-
 	for (pfn = start; pfn < end; pfn++) {
 		unsigned long frame;
+		unsigned long mfn = pfn_to_mfn(pfn);
 
-		/* Make sure pfn does not exists to start with */
-		if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY)
-			continue;
-
-		frame = pfn;
+		if (release) {
+			/* Make sure pfn exists to start with */
+			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+				continue;
+			frame = mfn;
+		} else {
+			if (mfn != INVALID_P2M_ENTRY)
+				continue;
+			frame = pfn;
+		}
 		set_xen_guest_handle(reservation.extent_start, &frame);
 		reservation.nr_extents = 1;
 
-		ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
-		WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret);
+		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
+					   &reservation);
+		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
+		     release ? "release" : "populate", pfn, ret);
+
 		if (ret == 1) {
-			if (!early_set_phys_to_machine(pfn, frame)) {
+			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
+				if (release)
+					break;
 				set_xen_guest_handle(reservation.extent_start, &frame);
 				reservation.nr_extents = 1;
 				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
-							&reservation);
+							   &reservation);
 				break;
 			}
 			len++;
@@ -156,8 +132,11 @@ static unsigned long __init xen_populate_physmap(unsigned long start,
 			break;
 	}
 	if (len)
-		printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n",
-		       start, end, len);
+		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
+		       release ? "Freeing" : "Populating",
+		       start, end, len,
+		       release ? "freed" : "added");
+
 	return len;
 }
 static unsigned long __init xen_populate_chunk(
@@ -211,7 +190,7 @@ static unsigned long __init xen_populate_chunk(
 		if (credits > capacity)
 			credits = capacity;
 
-		pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits);
+		pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
 		done += pfns;
 		credits_left -= pfns;
 		*last_pfn = (dest_pfn + pfns);
@@ -249,8 +228,8 @@ static unsigned long __init xen_set_identity_and_release(
 
 			if (start_pfn < end_pfn) {
 				if (start_pfn < nr_pages)
-					released += xen_release_chunk(
-						start_pfn, min(end_pfn, nr_pages));
+					released += xen_do_chunk(
+						start_pfn, min(end_pfn, nr_pages), true);
 
 				identity += set_phys_range_identity(
 					start_pfn, end_pfn);
-- 
cgit v1.2.3


From 83d51ab473dddde7df858015070ed22b84ebe9a9 Mon Sep 17 00:00:00 2001
From: David Vrabel <dvrabel@cantab.net>
Date: Thu, 3 May 2012 16:15:42 +0100
Subject: xen/setup: update VA mapping when releasing memory during setup

In xen_memory_setup(), if a page that is being released has a VA
mapping this must also be updated.  Otherwise, the page will be not
released completely -- it will still be referenced in Xen and won't be
freed util the mapping is removed and this prevents it from being
reallocated at a different PFN.

This was already being done for the ISA memory region in
xen_ident_map_ISA() but on many systems this was omitting a few pages
as many systems marked a few pages below the ISA memory region as
reserved in the e820 map.

This fixes errors such as:

(XEN) page_alloc.c:1148:d0 Over-allocation for domain 0: 2097153 > 2097152
(XEN) memory.c:133:d0 Could not allocate order=0 extent: id=0 memflags=0 (0 of 17)

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c |  1 -
 arch/x86/xen/mmu.c       | 23 -----------------------
 arch/x86/xen/setup.c     | 41 ++++++++++++++++++++++++++++++++++-------
 arch/x86/xen/xen-ops.h   |  1 -
 4 files changed, 34 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fe06bf4ef0e3..ac90e5629508 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1308,7 +1308,6 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_raw_console_write("mapping kernel into physical memory\n");
 	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
-	xen_ident_map_ISA();
 
 	/* Allocate and initialize top and mid mfn levels for p2m structure */
 	xen_build_mfn_list_list();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 91dc2871e336..c9a351925a0c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1929,29 +1929,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
 }
 
-void __init xen_ident_map_ISA(void)
-{
-	unsigned long pa;
-
-	/*
-	 * If we're dom0, then linear map the ISA machine addresses into
-	 * the kernel's address space.
-	 */
-	if (!xen_initial_domain())
-		return;
-
-	xen_raw_printk("Xen: setup ISA identity maps\n");
-
-	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
-		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
-
-		if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
-			BUG();
-	}
-
-	xen_flush_tlb();
-}
-
 static void __init xen_post_allocator_init(void)
 {
 	pv_mmu_ops.set_pte = xen_set_pte;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 30ac05a8d28f..3ebba0753d38 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -139,6 +139,13 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 
 	return len;
 }
+
+static unsigned long __init xen_release_chunk(unsigned long start,
+					      unsigned long end)
+{
+	return xen_do_chunk(start, end, true);
+}
+
 static unsigned long __init xen_populate_chunk(
 	const struct e820entry *list, size_t map_size,
 	unsigned long max_pfn, unsigned long *last_pfn,
@@ -197,6 +204,29 @@ static unsigned long __init xen_populate_chunk(
 	}
 	return done;
 }
+
+static void __init xen_set_identity_and_release_chunk(
+	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+	unsigned long *released, unsigned long *identity)
+{
+	unsigned long pfn;
+
+	/*
+	 * If the PFNs are currently mapped, the VA mapping also needs
+	 * to be updated to be 1:1.
+	 */
+	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+		(void)HYPERVISOR_update_va_mapping(
+			(unsigned long)__va(pfn << PAGE_SHIFT),
+			mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+	if (start_pfn < nr_pages)
+		*released += xen_release_chunk(
+			start_pfn, min(end_pfn, nr_pages));
+
+	*identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
 static unsigned long __init xen_set_identity_and_release(
 	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 {
@@ -226,14 +256,11 @@ static unsigned long __init xen_set_identity_and_release(
 			if (entry->type == E820_RAM)
 				end_pfn = PFN_UP(entry->addr);
 
-			if (start_pfn < end_pfn) {
-				if (start_pfn < nr_pages)
-					released += xen_do_chunk(
-						start_pfn, min(end_pfn, nr_pages), true);
+			if (start_pfn < end_pfn)
+				xen_set_identity_and_release_chunk(
+					start_pfn, end_pfn, nr_pages,
+					&released, &identity);
 
-				identity += set_phys_range_identity(
-					start_pfn, end_pfn);
-			}
 			start = end;
 		}
 	}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b095739ccd4c..506fa08d934a 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -28,7 +28,6 @@ void xen_setup_shared_info(void);
 void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
 pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
-void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 extern unsigned long xen_max_p2m_pfn;
 
-- 
cgit v1.2.3


From f447d56d36af18c5104ff29dcb1327c0c0ac3634 Mon Sep 17 00:00:00 2001
From: Ben Guthro <ben@guthro.net>
Date: Sat, 21 Apr 2012 00:11:04 +0800
Subject: xen: implement apic ipi interface

Map native ipi vector to xen vector.
Implement apic ipi interface with xen_send_IPI_one.

Tested-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ben Guthro <ben@guthro.net>
Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c |  9 ++++++
 arch/x86/xen/smp.c       | 81 +++++++++++++++++++++++++++++++++++++++++++++---
 arch/x86/xen/smp.h       | 12 +++++++
 3 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/xen/smp.h

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4f51bebac02c..1ed61c2bf633 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -74,6 +74,7 @@
 
 #include "xen-ops.h"
 #include "mmu.h"
+#include "smp.h"
 #include "multicalls.h"
 
 EXPORT_SYMBOL_GPL(hypercall_page);
@@ -849,6 +850,14 @@ static void set_xen_basic_apic_ops(void)
 	apic->icr_write = xen_apic_icr_write;
 	apic->wait_icr_idle = xen_apic_wait_icr_idle;
 	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+
+#ifdef CONFIG_SMP
+	apic->send_IPI_allbutself = xen_send_IPI_allbutself;
+	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
+	apic->send_IPI_mask = xen_send_IPI_mask;
+	apic->send_IPI_all = xen_send_IPI_all;
+	apic->send_IPI_self = xen_send_IPI_self;
+#endif
 }
 
 #endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 5fac6919b957..2dc6628c1520 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -465,8 +465,8 @@ static void xen_smp_send_reschedule(int cpu)
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
 
-static void xen_send_IPI_mask(const struct cpumask *mask,
-			      enum ipi_vector vector)
+static void __xen_send_IPI_mask(const struct cpumask *mask,
+			      int vector)
 {
 	unsigned cpu;
 
@@ -478,7 +478,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
 	int cpu;
 
-	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+	__xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
 	/* Make sure other vcpus get a chance to run if they need to. */
 	for_each_cpu(cpu, mask) {
@@ -491,10 +491,83 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 
 static void xen_smp_send_call_function_single_ipi(int cpu)
 {
-	xen_send_IPI_mask(cpumask_of(cpu),
+	__xen_send_IPI_mask(cpumask_of(cpu),
 			  XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 
+static inline int xen_map_vector(int vector)
+{
+	int xen_vector;
+
+	switch (vector) {
+	case RESCHEDULE_VECTOR:
+		xen_vector = XEN_RESCHEDULE_VECTOR;
+		break;
+	case CALL_FUNCTION_VECTOR:
+		xen_vector = XEN_CALL_FUNCTION_VECTOR;
+		break;
+	case CALL_FUNCTION_SINGLE_VECTOR:
+		xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
+		break;
+	default:
+		xen_vector = -1;
+		printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
+			vector);
+	}
+
+	return xen_vector;
+}
+
+void xen_send_IPI_mask(const struct cpumask *mask,
+			      int vector)
+{
+	int xen_vector = xen_map_vector(vector);
+
+	if (xen_vector >= 0)
+		__xen_send_IPI_mask(mask, xen_vector);
+}
+
+void xen_send_IPI_all(int vector)
+{
+	int xen_vector = xen_map_vector(vector);
+
+	if (xen_vector >= 0)
+		__xen_send_IPI_mask(cpu_online_mask, xen_vector);
+}
+
+void xen_send_IPI_self(int vector)
+{
+	int xen_vector = xen_map_vector(vector);
+
+	if (xen_vector >= 0)
+		xen_send_IPI_one(smp_processor_id(), xen_vector);
+}
+
+void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+				int vector)
+{
+	unsigned cpu;
+	unsigned int this_cpu = smp_processor_id();
+
+	if (!(num_online_cpus() > 1))
+		return;
+
+	for_each_cpu_and(cpu, mask, cpu_online_mask) {
+		if (this_cpu == cpu)
+			continue;
+
+		xen_smp_send_call_function_single_ipi(cpu);
+	}
+}
+
+void xen_send_IPI_allbutself(int vector)
+{
+	int xen_vector = xen_map_vector(vector);
+
+	if (xen_vector >= 0)
+		xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector);
+}
+
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
 	irq_enter();
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 000000000000..8981a76d081a
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,12 @@
+#ifndef _XEN_SMP_H
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+			      int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+				int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void physflat_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+#endif
-- 
cgit v1.2.3


From 1ff2b0c303698e486f1e0886b4d9876200ef8ca5 Mon Sep 17 00:00:00 2001
From: Lin Ming <mlin@ss.pku.edu.cn>
Date: Sat, 21 Apr 2012 00:11:05 +0800
Subject: xen: implement IRQ_WORK_VECTOR handler

Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/xen/events.h |  1 +
 arch/x86/xen/smp.c                | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 1df35417c412..cc146d51449e 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -6,6 +6,7 @@ enum ipi_vector {
 	XEN_CALL_FUNCTION_VECTOR,
 	XEN_CALL_FUNCTION_SINGLE_VECTOR,
 	XEN_SPIN_UNLOCK_VECTOR,
+	XEN_IRQ_WORK_VECTOR,
 
 	XEN_NR_IPIS,
 };
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 2dc6628c1520..3ec3f8eb19fc 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
+#include <linux/irq_work.h>
 
 #include <asm/paravirt.h>
 #include <asm/desc.h>
@@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map;
 static DEFINE_PER_CPU(int, xen_resched_irq);
 static DEFINE_PER_CPU(int, xen_callfunc_irq);
 static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
+static DEFINE_PER_CPU(int, xen_irq_work);
 static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
 
 /*
  * Reschedule call back.
@@ -143,6 +146,17 @@ static int xen_smp_intr_init(unsigned int cpu)
 		goto fail;
 	per_cpu(xen_callfuncsingle_irq, cpu) = rc;
 
+	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
+				    cpu,
+				    xen_irq_work_interrupt,
+				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				    callfunc_name,
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(xen_irq_work, cpu) = rc;
+
 	return 0;
 
  fail:
@@ -155,6 +169,8 @@ static int xen_smp_intr_init(unsigned int cpu)
 	if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
 		unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
 				       NULL);
+	if (per_cpu(xen_irq_work, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
 
 	return rc;
 }
@@ -509,6 +525,9 @@ static inline int xen_map_vector(int vector)
 	case CALL_FUNCTION_SINGLE_VECTOR:
 		xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
 		break;
+	case IRQ_WORK_VECTOR:
+		xen_vector = XEN_IRQ_WORK_VECTOR;
+		break;
 	default:
 		xen_vector = -1;
 		printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -588,6 +607,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
+{
+	irq_enter();
+	irq_work_run();
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_exit();
+
+	return IRQ_HANDLED;
+}
+
 static const struct smp_ops xen_smp_ops __initconst = {
 	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
 	.smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -634,6 +663,7 @@ static void xen_hvm_cpu_die(unsigned int cpu)
 	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
 	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
 	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
 	native_cpu_die(cpu);
 }
 
-- 
cgit v1.2.3


From 211063dc159695bd6072c5393e9bc729481c6ede Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 8 Dec 2011 17:32:23 +0800
Subject: xen/acpi/sleep: Enable ACPI sleep via the __acpi_os_prepare_sleep

Provide the registration callback to call in the Xen's
ACPI sleep functionality. This means that during S3/S5
we make a hypercall XENPF_enter_acpi_sleep with the
proper PM1A/PM1B registers.

Based of Ke Yu's <ke.yu@intel.com> initial idea.
[ From http://xenbits.xensource.com/linux-2.6.18-xen.hg
change c68699484a65 ]

[v1: Added Copyright and license]
[v2: Added check if PM1A/B the 16-bits MSB contain something. The spec
     only uses 16-bits but might have more in future]
Signed-off-by: Liang Tang <liang.tang@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c |  3 +++
 drivers/xen/Makefile     |  2 +-
 drivers/xen/acpi.c       | 62 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/xen/acpi.h       | 58 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 drivers/xen/acpi.c
 create mode 100644 include/xen/acpi.h

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 1ed61c2bf633..eca90e5be1e7 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
 #include <xen/page.h>
 #include <xen/hvm.h>
 #include <xen/hvc-console.h>
+#include <xen/acpi.h>
 
 #include <asm/paravirt.h>
 #include <asm/apic.h>
@@ -1373,6 +1374,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
+
+		xen_acpi_sleep_register();
 	}
 		
 
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 9adc5be57b13..fc3488631136 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
 obj-$(CONFIG_XEN_PVHVM)			+= platform-pci.o
 obj-$(CONFIG_XEN_TMEM)			+= tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
-obj-$(CONFIG_XEN_DOM0)			+= pci.o
+obj-$(CONFIG_XEN_DOM0)			+= pci.o acpi.o
 obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
 obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 000000000000..119d42a2bf57
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,62 @@
+/******************************************************************************
+ * acpi.c
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke ke.yu@intel.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+				     u32 pm1a_cnt, u32 pm1b_cnt)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_enter_acpi_sleep,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u = {
+			.enter_acpi_sleep = {
+				.pm1a_cnt_val = (u16)pm1a_cnt,
+				.pm1b_cnt_val = (u16)pm1b_cnt,
+				.sleep_state = sleep_state,
+			},
+		},
+	};
+
+	if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) {
+		WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!"
+		     "Email xen-devel@lists.xensource.com  Thank you.\n", \
+		     pm1a_cnt, pm1b_cnt);
+		return -1;
+	}
+
+	HYPERVISOR_dom0_op(&op);
+	return 1;
+}
diff --git a/include/xen/acpi.h b/include/xen/acpi.h
new file mode 100644
index 000000000000..48a9c0171b65
--- /dev/null
+++ b/include/xen/acpi.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * acpi.h
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke <ke.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_ACPI_H
+#define _XEN_ACPI_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_XEN_DOM0
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <linux/acpi.h>
+
+int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+				     u32 pm1a_cnt, u32 pm1b_cnd);
+
+static inline void xen_acpi_sleep_register(void)
+{
+	if (xen_initial_domain())
+		acpi_os_set_prepare_sleep(
+			&xen_acpi_notify_hypervisor_state);
+}
+#else
+static inline void xen_acpi_sleep_register(void)
+{
+}
+#endif
+
+#endif	/* _XEN_ACPI_H */
-- 
cgit v1.2.3


From 399988eea194a8453e283fdd2da968d1fd39a7cf Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 8 May 2012 00:08:52 -0700
Subject: irq_remap: Fix compiler warning with CONFIG_IRQ_REMAP=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the below compiler warning:

arch/x86/include/asm/irq_remapping.h:72:19: warning: ‘struct IO_APIC_route_entry’ declared inside parameter list [enabled by default]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: joro@8bytes.org
Cc: iommu@lists.linux-foundation.org
Cc: Joerg Roedel <joerg.roedel@amd.com>
Link: http://lkml.kernel.org/r/1336460934-23592-1-git-send-email-suresh.b.siddha@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/irq_remapping.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index dcb0c7231028..5fb9bbbd2f14 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -22,11 +22,9 @@
 #ifndef __X86_IRQ_REMAPPING_H
 #define __X86_IRQ_REMAPPING_H
 
-#ifdef CONFIG_IRQ_REMAP
+#include <asm/io_apic.h>
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct pci_dev;
+#ifdef CONFIG_IRQ_REMAP
 
 extern int irq_remapping_enabled;
 
-- 
cgit v1.2.3


From d1ecad6eee8629c6b425580aad76cf99b85956e9 Mon Sep 17 00:00:00 2001
From: Márton Németh <nm127@freemail.hu>
Date: Tue, 8 May 2012 00:24:20 -0700
Subject: x86/apic: Only compile local function if used with
 !CONFIG_GENERIC_PENDING_IRQ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local function io_apic_level_ack_pending() is only called
from io_apic_level_ack_pending(). The later function is only
compiled if CONFIG_GENERIC_PENDING_IRQ is defined. Move the
io_apic_level_ack_pending() to the existing #ifdef
CONFIG_GENERIC_PENDING_IRQ code block.

This will remove the following warning message during compiling
without CONFIG_GENERIC_PENDING_IRQ defined:

 * arch/x86/kernel/apic/io_apic.c:382: warning: ‘io_apic_level_ack_pending’ defined but not used

Signed-off-by: Márton Németh <nm127@freemail.hu>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Link: http://lkml.kernel.org/r/1336461860.2296.3.camel@sbsiddha-mobl2
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/io_apic.c | 46 +++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 973539c128a4..e245365670a4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -346,29 +346,6 @@ void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
-	struct irq_pin_list *entry;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-		int pin;
-
-		pin = entry->pin;
-		reg = io_apic_read(entry->apic, 0x10 + pin*2);
-		/* Is the remote IRR bit set? */
-		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
-			raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-			return true;
-		}
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	return false;
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -2519,6 +2496,29 @@ static void ack_apic_edge(struct irq_data *data)
 atomic_t irq_mis_count;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
+		unsigned int reg;
+		int pin;
+
+		pin = entry->pin;
+		reg = io_apic_read(entry->apic, 0x10 + pin*2);
+		/* Is the remote IRR bit set? */
+		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+			raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+			return true;
+		}
+	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return false;
+}
+
 static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
 {
 	/* If we are moving the irq we need to mask it */
-- 
cgit v1.2.3


From 85f7f656274fa0ba109dd8774db3887d42de5c6b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 May 2012 17:59:49 +0000
Subject: x86: Use kick_all_cpus_sync()

Use kick_all_cpus_sync() and remove cpu_idle_wait().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120507175652.190382227@linutronix.de
Cc: x86@kernel.org
---
 arch/x86/Kconfig                 |  3 ---
 arch/x86/include/asm/processor.h |  2 --
 arch/x86/kernel/apm_32.c         |  2 +-
 arch/x86/kernel/process.c        | 20 --------------------
 4 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 046bf4bd2510..98876f55a2e0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -161,9 +161,6 @@ config RWSEM_GENERIC_SPINLOCK
 config RWSEM_XCHGADD_ALGORITHM
 	def_bool X86_XADD
 
-config ARCH_HAS_CPU_IDLE_WAIT
-	def_bool y
-
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa7dcceb6c0..ccbb1ea99ccb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -974,8 +974,6 @@ extern bool cpu_has_amd_erratum(const int *);
 #define cpu_has_amd_erratum(x)	(false)
 #endif /* CONFIG_CPU_SUP_AMD */
 
-void cpu_idle_wait(void);
-
 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 459e78cbf61e..07b0c0db466c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2401,7 +2401,7 @@ static void __exit apm_exit(void)
 		 * (pm_idle), Wait for all processors to update cached/local
 		 * copies of pm_idle before proceeding.
 		 */
-		cpu_idle_wait();
+		kick_all_cpus_sync();
 	}
 	if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
 	    && (apm_info.connection_version > 0x0100)) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8aa532fa015d..8215458f6af5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -525,26 +525,6 @@ void stop_this_cpu(void *dummy)
 	}
 }
 
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
-	smp_mb();
-	/* kick all the CPUs so that they exit out of pm_idle */
-	smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-- 
cgit v1.2.3


From 6c0a9fa62feb7e9fdefa9720bcc03040c9b0b311 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 May 2012 15:05:40 +0000
Subject: fork: Remove the weak insanity

We error out when compiling with gcc4.1.[01] as it miscompiles
__weak. The workaround with magic defines is not longer
necessary. Make it __weak again.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120505150141.306358267@linutronix.de
---
 arch/sh/include/asm/thread_info.h  | 1 -
 arch/x86/include/asm/thread_info.h | 1 -
 kernel/fork.c                      | 8 +-------
 3 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index 20ee40af16e9..09963d4018cb 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -98,7 +98,6 @@ static inline struct thread_info *current_thread_info(void)
 extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node);
 extern void free_thread_info(struct thread_info *ti);
 extern void arch_task_cache_init(void);
-#define arch_task_cache_init arch_task_cache_init
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void init_thread_xstate(void);
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6df8ccd715..8692a166dd4e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -284,6 +284,5 @@ static inline bool is_ia32_task(void)
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
-#define arch_task_cache_init arch_task_cache_init
 #endif
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..a79b36e2e912 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -203,13 +203,7 @@ void __put_task_struct(struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(__put_task_struct);
 
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
 
 void __init fork_init(unsigned long mempages)
 {
-- 
cgit v1.2.3


From 38e7c572ce7310def003d8bb7c34260f5d8118cb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 May 2012 15:05:42 +0000
Subject: x86: Use common threadinfo allocator

The only difference is the free_thread_info function, which frees
xstate.

Use the new arch_release_task_struct() function instead and switch
over to the core allocator.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20120505150141.559556763@linutronix.de
Cc: x86@kernel.org
---
 arch/x86/include/asm/boot.h          |  2 +-
 arch/x86/include/asm/page_32_types.h |  4 ++--
 arch/x86/include/asm/page_64_types.h |  4 ++--
 arch/x86/include/asm/thread_info.h   | 20 +-------------------
 arch/x86/kernel/irq_32.c             |  8 ++++----
 arch/x86/kernel/process.c            |  5 ++---
 6 files changed, 12 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 5e1a2eef3e7c..b13fe63bdc59 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -19,7 +19,7 @@
 #ifdef CONFIG_X86_64
 #define MIN_KERNEL_ALIGN_LG2	PMD_SHIFT
 #else
-#define MIN_KERNEL_ALIGN_LG2	(PAGE_SHIFT + THREAD_ORDER)
+#define MIN_KERNEL_ALIGN_LG2	(PAGE_SHIFT + THREAD_SIZE_ORDER)
 #endif
 #define MIN_KERNEL_ALIGN	(_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
 
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index ade619ff9e2a..ef17af013475 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,8 +15,8 @@
  */
 #define __PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
 
-#define THREAD_ORDER	1
-#define THREAD_SIZE 	(PAGE_SIZE << THREAD_ORDER)
+#define THREAD_SIZE_ORDER	1
+#define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
 
 #define STACKFAULT_STACK 0
 #define DOUBLEFAULT_STACK 1
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 7639dbf5d223..320f7bb95f76 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,8 +1,8 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_ORDER	1
-#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
+#define THREAD_SIZE_ORDER	1
+#define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
 #define EXCEPTION_STACK_ORDER 0
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8692a166dd4e..73cfe0d309c9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -155,24 +155,6 @@ struct thread_info {
 
 #define PREEMPT_ACTIVE		0x10000000
 
-/* thread information allocation */
-#ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
-#else
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
-#endif
-
-#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-
-#define alloc_thread_info_node(tsk, node)				\
-({									\
-	struct page *page = alloc_pages_node(node, THREAD_FLAGS,	\
-					     THREAD_ORDER);		\
-	struct thread_info *ret = page ? page_address(page) : NULL;	\
-									\
-	ret;								\
-})
-
 #ifdef CONFIG_X86_32
 
 #define STACK_WARN	(THREAD_SIZE/8)
@@ -282,7 +264,7 @@ static inline bool is_ia32_task(void)
 
 #ifndef __ASSEMBLY__
 extern void arch_task_cache_init(void);
-extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
+extern void arch_release_task_struct(struct task_struct *tsk);
 #endif
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 58b7f27cb3e9..344faf8d0d62 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
 		return;
 
 	irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
-					       THREAD_FLAGS,
-					       THREAD_ORDER));
+					       THREADINFO_GFP,
+					       THREAD_SIZE_ORDER));
 	memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
 	per_cpu(hardirq_ctx, cpu) = irqctx;
 
 	irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
-					       THREAD_FLAGS,
-					       THREAD_ORDER));
+					       THREADINFO_GFP,
+					       THREAD_SIZE_ORDER));
 	memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8215458f6af5..e8173154800d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -76,10 +76,9 @@ void free_thread_xstate(struct task_struct *tsk)
 	fpu_free(&tsk->thread.fpu);
 }
 
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
 {
-	free_thread_xstate(ti->task);
-	free_pages((unsigned long)ti, THREAD_ORDER);
+	free_thread_xstate(tsk);
 }
 
 void arch_task_cache_init(void)
-- 
cgit v1.2.3


From fba60c620a6a9ec11140c179e5d0fe0bc3c3ea29 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Tue, 8 May 2012 15:21:18 +0100
Subject: x86-64: Eliminate dead ia32 syscall handlers

None of the three routines being removed here was actually
hooked up anywhere, so they all represented dead code.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4FA947FE020000780008247F@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/sys_ia32.c | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index aec2202a596c..edca9c0a79cc 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -287,11 +287,6 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
 	return ret;
 }
 
-asmlinkage long sys32_alarm(unsigned int seconds)
-{
-	return alarm_setitimer(seconds);
-}
-
 asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
 			      int options)
 {
@@ -300,11 +295,6 @@ asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
 
 /* 32-bit timeval and related flotsam.  */
 
-asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
-{
-	return sys_sysfs(option, arg1, arg2);
-}
-
 asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
 				    struct compat_timespec __user *interval)
 {
@@ -375,19 +365,6 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
 }
 
 
-asmlinkage long sys32_personality(unsigned long personality)
-{
-	int ret;
-
-	if (personality(current->personality) == PER_LINUX32 &&
-		personality == PER_LINUX)
-		personality = PER_LINUX32;
-	ret = sys_personality(personality);
-	if (ret == PER_LINUX32)
-		ret = PER_LINUX;
-	return ret;
-}
-
 asmlinkage long sys32_sendfile(int out_fd, int in_fd,
 			       compat_off_t __user *offset, s32 count)
 {
-- 
cgit v1.2.3


From 433de739bbc22a5b2c87602116566ce27e3b4cab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:24 +0300
Subject: x86, realmode: 16-bit real-mode code support for relocs tool

A new option is added to the relocs tool called '--realmode'.
This option causes the generation of 16-bit segment relocations
and 32-bit linear relocations for the real-mode code. When
the real-mode code is moved to the low-memory during kernel
initialization, these relocation entries can be used to
relocate the code properly.

In the assembly code 16-bit segment relocations must be relative
to the 'real_mode_seg' absolute symbol. Linear relocations must be
relative to a symbol prefixed with 'pa_'.

16-bit segment relocation is used to load cs:ip in 16-bit code.
Linear relocations are used in the 32-bit code for relocatable
data references. They are declared in the linker script of the
real-mode code.

The relocs tool is moved to scripts/x86-relocs.c so it will
be compiled before building the arch/x86 tree.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-2-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/Makefile |  11 +-
 arch/x86/boot/compressed/relocs.c | 678 --------------------------------
 scripts/.gitignore                |   1 +
 scripts/Makefile                  |   3 +
 scripts/x86-relocs.c              | 797 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 806 insertions(+), 684 deletions(-)
 delete mode 100644 arch/x86/boot/compressed/relocs.c
 create mode 100644 scripts/x86-relocs.c

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fd55a2ff3ad8..0435e8a2d20e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -40,13 +40,12 @@ OBJCOPYFLAGS_vmlinux.bin :=  -R .comment -S
 $(obj)/vmlinux.bin: vmlinux FORCE
 	$(call if_changed,objcopy)
 
+targets += vmlinux.bin.all vmlinux.relocs
 
-targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
-
-quiet_cmd_relocs = RELOCS  $@
-      cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
+CMD_RELOCS = scripts/x86-relocs
+quiet_cmd_relocs = RELOCS $@
+      cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
+$(obj)/vmlinux.relocs: vmlinux FORCE
 	$(call if_changed,relocs)
 
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
deleted file mode 100644
index fb7117a4ade1..000000000000
--- a/arch/x86/boot/compressed/relocs.c
+++ /dev/null
@@ -1,678 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <elf.h>
-#include <byteswap.h>
-#define USE_BSD
-#include <endian.h>
-#include <regex.h>
-#include <tools/le_byteshift.h>
-
-static void die(char *fmt, ...);
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-static Elf32_Ehdr ehdr;
-static unsigned long reloc_count, reloc_idx;
-static unsigned long *relocs;
-
-struct section {
-	Elf32_Shdr     shdr;
-	struct section *link;
-	Elf32_Sym      *symtab;
-	Elf32_Rel      *reltab;
-	char           *strtab;
-};
-static struct section *secs;
-
-/*
- * Following symbols have been audited. There values are constant and do
- * not change if bzImage is loaded at a different physical address than
- * the address for which it has been compiled. Don't warn user about
- * absolute relocations present w.r.t these symbols.
- */
-static const char abs_sym_regex[] =
-	"^(xen_irq_disable_direct_reloc$|"
-	"xen_save_fl_direct_reloc$|"
-	"VDSO|"
-	"__crc_)";
-static regex_t abs_sym_regex_c;
-static int is_abs_reloc(const char *sym_name)
-{
-	return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0);
-}
-
-/*
- * These symbols are known to be relative, even if the linker marks them
- * as absolute (typically defined outside any section in the linker script.)
- */
-static const char rel_sym_regex[] =
-	"^_end$";
-static regex_t rel_sym_regex_c;
-static int is_rel_reloc(const char *sym_name)
-{
-	return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0);
-}
-
-static void regex_init(void)
-{
-        char errbuf[128];
-        int err;
-	
-        err = regcomp(&abs_sym_regex_c, abs_sym_regex,
-                      REG_EXTENDED|REG_NOSUB);
-        if (err) {
-                regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf);
-                die("%s", errbuf);
-        }
-
-        err = regcomp(&rel_sym_regex_c, rel_sym_regex,
-                      REG_EXTENDED|REG_NOSUB);
-        if (err) {
-                regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf);
-                die("%s", errbuf);
-        }
-}
-
-static void die(char *fmt, ...)
-{
-	va_list ap;
-	va_start(ap, fmt);
-	vfprintf(stderr, fmt, ap);
-	va_end(ap);
-	exit(1);
-}
-
-static const char *sym_type(unsigned type)
-{
-	static const char *type_name[] = {
-#define SYM_TYPE(X) [X] = #X
-		SYM_TYPE(STT_NOTYPE),
-		SYM_TYPE(STT_OBJECT),
-		SYM_TYPE(STT_FUNC),
-		SYM_TYPE(STT_SECTION),
-		SYM_TYPE(STT_FILE),
-		SYM_TYPE(STT_COMMON),
-		SYM_TYPE(STT_TLS),
-#undef SYM_TYPE
-	};
-	const char *name = "unknown sym type name";
-	if (type < ARRAY_SIZE(type_name)) {
-		name = type_name[type];
-	}
-	return name;
-}
-
-static const char *sym_bind(unsigned bind)
-{
-	static const char *bind_name[] = {
-#define SYM_BIND(X) [X] = #X
-		SYM_BIND(STB_LOCAL),
-		SYM_BIND(STB_GLOBAL),
-		SYM_BIND(STB_WEAK),
-#undef SYM_BIND
-	};
-	const char *name = "unknown sym bind name";
-	if (bind < ARRAY_SIZE(bind_name)) {
-		name = bind_name[bind];
-	}
-	return name;
-}
-
-static const char *sym_visibility(unsigned visibility)
-{
-	static const char *visibility_name[] = {
-#define SYM_VISIBILITY(X) [X] = #X
-		SYM_VISIBILITY(STV_DEFAULT),
-		SYM_VISIBILITY(STV_INTERNAL),
-		SYM_VISIBILITY(STV_HIDDEN),
-		SYM_VISIBILITY(STV_PROTECTED),
-#undef SYM_VISIBILITY
-	};
-	const char *name = "unknown sym visibility name";
-	if (visibility < ARRAY_SIZE(visibility_name)) {
-		name = visibility_name[visibility];
-	}
-	return name;
-}
-
-static const char *rel_type(unsigned type)
-{
-	static const char *type_name[] = {
-#define REL_TYPE(X) [X] = #X
-		REL_TYPE(R_386_NONE),
-		REL_TYPE(R_386_32),
-		REL_TYPE(R_386_PC32),
-		REL_TYPE(R_386_GOT32),
-		REL_TYPE(R_386_PLT32),
-		REL_TYPE(R_386_COPY),
-		REL_TYPE(R_386_GLOB_DAT),
-		REL_TYPE(R_386_JMP_SLOT),
-		REL_TYPE(R_386_RELATIVE),
-		REL_TYPE(R_386_GOTOFF),
-		REL_TYPE(R_386_GOTPC),
-#undef REL_TYPE
-	};
-	const char *name = "unknown type rel type name";
-	if (type < ARRAY_SIZE(type_name) && type_name[type]) {
-		name = type_name[type];
-	}
-	return name;
-}
-
-static const char *sec_name(unsigned shndx)
-{
-	const char *sec_strtab;
-	const char *name;
-	sec_strtab = secs[ehdr.e_shstrndx].strtab;
-	name = "<noname>";
-	if (shndx < ehdr.e_shnum) {
-		name = sec_strtab + secs[shndx].shdr.sh_name;
-	}
-	else if (shndx == SHN_ABS) {
-		name = "ABSOLUTE";
-	}
-	else if (shndx == SHN_COMMON) {
-		name = "COMMON";
-	}
-	return name;
-}
-
-static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
-{
-	const char *name;
-	name = "<noname>";
-	if (sym->st_name) {
-		name = sym_strtab + sym->st_name;
-	}
-	else {
-		name = sec_name(secs[sym->st_shndx].shdr.sh_name);
-	}
-	return name;
-}
-
-
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#endif
-#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#endif
-
-static uint16_t elf16_to_cpu(uint16_t val)
-{
-	return le16_to_cpu(val);
-}
-
-static uint32_t elf32_to_cpu(uint32_t val)
-{
-	return le32_to_cpu(val);
-}
-
-static void read_ehdr(FILE *fp)
-{
-	if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
-		die("Cannot read ELF header: %s\n",
-			strerror(errno));
-	}
-	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
-		die("No ELF magic\n");
-	}
-	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
-		die("Not a 32 bit executable\n");
-	}
-	if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
-		die("Not a LSB ELF executable\n");
-	}
-	if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
-		die("Unknown ELF version\n");
-	}
-	/* Convert the fields to native endian */
-	ehdr.e_type      = elf16_to_cpu(ehdr.e_type);
-	ehdr.e_machine   = elf16_to_cpu(ehdr.e_machine);
-	ehdr.e_version   = elf32_to_cpu(ehdr.e_version);
-	ehdr.e_entry     = elf32_to_cpu(ehdr.e_entry);
-	ehdr.e_phoff     = elf32_to_cpu(ehdr.e_phoff);
-	ehdr.e_shoff     = elf32_to_cpu(ehdr.e_shoff);
-	ehdr.e_flags     = elf32_to_cpu(ehdr.e_flags);
-	ehdr.e_ehsize    = elf16_to_cpu(ehdr.e_ehsize);
-	ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
-	ehdr.e_phnum     = elf16_to_cpu(ehdr.e_phnum);
-	ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
-	ehdr.e_shnum     = elf16_to_cpu(ehdr.e_shnum);
-	ehdr.e_shstrndx  = elf16_to_cpu(ehdr.e_shstrndx);
-
-	if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
-		die("Unsupported ELF header type\n");
-	}
-	if (ehdr.e_machine != EM_386) {
-		die("Not for x86\n");
-	}
-	if (ehdr.e_version != EV_CURRENT) {
-		die("Unknown ELF version\n");
-	}
-	if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
-		die("Bad Elf header size\n");
-	}
-	if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
-		die("Bad program header entry\n");
-	}
-	if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
-		die("Bad section header entry\n");
-	}
-	if (ehdr.e_shstrndx >= ehdr.e_shnum) {
-		die("String table index out of bounds\n");
-	}
-}
-
-static void read_shdrs(FILE *fp)
-{
-	int i;
-	Elf32_Shdr shdr;
-
-	secs = calloc(ehdr.e_shnum, sizeof(struct section));
-	if (!secs) {
-		die("Unable to allocate %d section headers\n",
-		    ehdr.e_shnum);
-	}
-	if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
-		die("Seek to %d failed: %s\n",
-			ehdr.e_shoff, strerror(errno));
-	}
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (fread(&shdr, sizeof shdr, 1, fp) != 1)
-			die("Cannot read ELF section headers %d/%d: %s\n",
-			    i, ehdr.e_shnum, strerror(errno));
-		sec->shdr.sh_name      = elf32_to_cpu(shdr.sh_name);
-		sec->shdr.sh_type      = elf32_to_cpu(shdr.sh_type);
-		sec->shdr.sh_flags     = elf32_to_cpu(shdr.sh_flags);
-		sec->shdr.sh_addr      = elf32_to_cpu(shdr.sh_addr);
-		sec->shdr.sh_offset    = elf32_to_cpu(shdr.sh_offset);
-		sec->shdr.sh_size      = elf32_to_cpu(shdr.sh_size);
-		sec->shdr.sh_link      = elf32_to_cpu(shdr.sh_link);
-		sec->shdr.sh_info      = elf32_to_cpu(shdr.sh_info);
-		sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
-		sec->shdr.sh_entsize   = elf32_to_cpu(shdr.sh_entsize);
-		if (sec->shdr.sh_link < ehdr.e_shnum)
-			sec->link = &secs[sec->shdr.sh_link];
-	}
-
-}
-
-static void read_strtabs(FILE *fp)
-{
-	int i;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_STRTAB) {
-			continue;
-		}
-		sec->strtab = malloc(sec->shdr.sh_size);
-		if (!sec->strtab) {
-			die("malloc of %d bytes for strtab failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-	}
-}
-
-static void read_symtabs(FILE *fp)
-{
-	int i,j;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_SYMTAB) {
-			continue;
-		}
-		sec->symtab = malloc(sec->shdr.sh_size);
-		if (!sec->symtab) {
-			die("malloc of %d bytes for symtab failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
-			Elf32_Sym *sym = &sec->symtab[j];
-			sym->st_name  = elf32_to_cpu(sym->st_name);
-			sym->st_value = elf32_to_cpu(sym->st_value);
-			sym->st_size  = elf32_to_cpu(sym->st_size);
-			sym->st_shndx = elf16_to_cpu(sym->st_shndx);
-		}
-	}
-}
-
-
-static void read_relocs(FILE *fp)
-{
-	int i,j;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec->reltab = malloc(sec->shdr.sh_size);
-		if (!sec->reltab) {
-			die("malloc of %d bytes for relocs failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel = &sec->reltab[j];
-			rel->r_offset = elf32_to_cpu(rel->r_offset);
-			rel->r_info   = elf32_to_cpu(rel->r_info);
-		}
-	}
-}
-
-
-static void print_absolute_symbols(void)
-{
-	int i;
-	printf("Absolute symbols\n");
-	printf(" Num:    Value Size  Type       Bind        Visibility  Name\n");
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		char *sym_strtab;
-		int j;
-
-		if (sec->shdr.sh_type != SHT_SYMTAB) {
-			continue;
-		}
-		sym_strtab = sec->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
-			Elf32_Sym *sym;
-			const char *name;
-			sym = &sec->symtab[j];
-			name = sym_name(sym_strtab, sym);
-			if (sym->st_shndx != SHN_ABS) {
-				continue;
-			}
-			printf("%5d %08x %5d %10s %10s %12s %s\n",
-				j, sym->st_value, sym->st_size,
-				sym_type(ELF32_ST_TYPE(sym->st_info)),
-				sym_bind(ELF32_ST_BIND(sym->st_info)),
-				sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
-				name);
-		}
-	}
-	printf("\n");
-}
-
-static void print_absolute_relocs(void)
-{
-	int i, printed = 0;
-
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		struct section *sec_applies, *sec_symtab;
-		char *sym_strtab;
-		Elf32_Sym *sh_symtab;
-		int j;
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec_symtab  = sec->link;
-		sec_applies = &secs[sec->shdr.sh_info];
-		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
-			continue;
-		}
-		sh_symtab  = sec_symtab->symtab;
-		sym_strtab = sec_symtab->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel;
-			Elf32_Sym *sym;
-			const char *name;
-			rel = &sec->reltab[j];
-			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
-			name = sym_name(sym_strtab, sym);
-			if (sym->st_shndx != SHN_ABS) {
-				continue;
-			}
-
-			/* Absolute symbols are not relocated if bzImage is
-			 * loaded at a non-compiled address. Display a warning
-			 * to user at compile time about the absolute
-			 * relocations present.
-			 *
-			 * User need to audit the code to make sure
-			 * some symbols which should have been section
-			 * relative have not become absolute because of some
-			 * linker optimization or wrong programming usage.
-			 *
-			 * Before warning check if this absolute symbol
-			 * relocation is harmless.
-			 */
-			if (is_abs_reloc(name) || is_rel_reloc(name))
-				continue;
-
-			if (!printed) {
-				printf("WARNING: Absolute relocations"
-					" present\n");
-				printf("Offset     Info     Type     Sym.Value "
-					"Sym.Name\n");
-				printed = 1;
-			}
-
-			printf("%08x %08x %10s %08x  %s\n",
-				rel->r_offset,
-				rel->r_info,
-				rel_type(ELF32_R_TYPE(rel->r_info)),
-				sym->st_value,
-				name);
-		}
-	}
-
-	if (printed)
-		printf("\n");
-}
-
-static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
-{
-	int i;
-	/* Walk through the relocations */
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		char *sym_strtab;
-		Elf32_Sym *sh_symtab;
-		struct section *sec_applies, *sec_symtab;
-		int j;
-		struct section *sec = &secs[i];
-
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec_symtab  = sec->link;
-		sec_applies = &secs[sec->shdr.sh_info];
-		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
-			continue;
-		}
-		sh_symtab = sec_symtab->symtab;
-		sym_strtab = sec_symtab->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel;
-			Elf32_Sym *sym;
-			unsigned r_type;
-			rel = &sec->reltab[j];
-			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
-			r_type = ELF32_R_TYPE(rel->r_info);
-			/* Don't visit relocations to absolute symbols */
-			if (sym->st_shndx == SHN_ABS &&
-			    !is_rel_reloc(sym_name(sym_strtab, sym))) {
-				continue;
-			}
-			switch (r_type) {
-			case R_386_NONE:
-			case R_386_PC32:
-				/*
-				 * NONE can be ignored and and PC relative
-				 * relocations don't need to be adjusted.
-				 */
-				break;
-			case R_386_32:
-				/* Visit relocations that need to be adjusted */
-				visit(rel, sym);
-				break;
-			default:
-				die("Unsupported relocation type: %s (%d)\n",
-				    rel_type(r_type), r_type);
-				break;
-			}
-		}
-	}
-}
-
-static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
-	reloc_count += 1;
-}
-
-static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
-	/* Remember the address that needs to be adjusted. */
-	relocs[reloc_idx++] = rel->r_offset;
-}
-
-static int cmp_relocs(const void *va, const void *vb)
-{
-	const unsigned long *a, *b;
-	a = va; b = vb;
-	return (*a == *b)? 0 : (*a > *b)? 1 : -1;
-}
-
-static void emit_relocs(int as_text)
-{
-	int i;
-	/* Count how many relocations I have and allocate space for them. */
-	reloc_count = 0;
-	walk_relocs(count_reloc);
-	relocs = malloc(reloc_count * sizeof(relocs[0]));
-	if (!relocs) {
-		die("malloc of %d entries for relocs failed\n",
-			reloc_count);
-	}
-	/* Collect up the relocations */
-	reloc_idx = 0;
-	walk_relocs(collect_reloc);
-
-	/* Order the relocations for more efficient processing */
-	qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
-
-	/* Print the relocations */
-	if (as_text) {
-		/* Print the relocations in a form suitable that
-		 * gas will like.
-		 */
-		printf(".section \".data.reloc\",\"a\"\n");
-		printf(".balign 4\n");
-		for (i = 0; i < reloc_count; i++) {
-			printf("\t .long 0x%08lx\n", relocs[i]);
-		}
-		printf("\n");
-	}
-	else {
-		unsigned char buf[4];
-		/* Print a stop */
-		fwrite("\0\0\0\0", 4, 1, stdout);
-		/* Now print each relocation */
-		for (i = 0; i < reloc_count; i++) {
-			put_unaligned_le32(relocs[i], buf);
-			fwrite(buf, 4, 1, stdout);
-		}
-	}
-}
-
-static void usage(void)
-{
-	die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
-}
-
-int main(int argc, char **argv)
-{
-	int show_absolute_syms, show_absolute_relocs;
-	int as_text;
-	const char *fname;
-	FILE *fp;
-	int i;
-
-	regex_init();
-
-	show_absolute_syms = 0;
-	show_absolute_relocs = 0;
-	as_text = 0;
-	fname = NULL;
-	for (i = 1; i < argc; i++) {
-		char *arg = argv[i];
-		if (*arg == '-') {
-			if (strcmp(argv[1], "--abs-syms") == 0) {
-				show_absolute_syms = 1;
-				continue;
-			}
-
-			if (strcmp(argv[1], "--abs-relocs") == 0) {
-				show_absolute_relocs = 1;
-				continue;
-			}
-			else if (strcmp(argv[1], "--text") == 0) {
-				as_text = 1;
-				continue;
-			}
-		}
-		else if (!fname) {
-			fname = arg;
-			continue;
-		}
-		usage();
-	}
-	if (!fname) {
-		usage();
-	}
-	fp = fopen(fname, "r");
-	if (!fp) {
-		die("Cannot open %s: %s\n",
-			fname, strerror(errno));
-	}
-	read_ehdr(fp);
-	read_shdrs(fp);
-	read_strtabs(fp);
-	read_symtabs(fp);
-	read_relocs(fp);
-	if (show_absolute_syms) {
-		print_absolute_symbols();
-		return 0;
-	}
-	if (show_absolute_relocs) {
-		print_absolute_relocs();
-		return 0;
-	}
-	emit_relocs(as_text);
-	return 0;
-}
diff --git a/scripts/.gitignore b/scripts/.gitignore
index 105b21f08185..68c0f32fdc9b 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -9,3 +9,4 @@ unifdef
 ihex2fw
 recordmcount
 docproc
+x86-relocs
diff --git a/scripts/Makefile b/scripts/Makefile
index df7678febf27..a241359d2c82 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -8,11 +8,14 @@
 # conmakehash:	 Create arrays for initializing the kernel console tables
 # docproc:       Used in Documentation/DocBook
 
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+
 hostprogs-$(CONFIG_KALLSYMS)     += kallsyms
 hostprogs-$(CONFIG_LOGO)         += pnmtologo
 hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(CONFIG_IKCONFIG)     += bin2c
 hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
+hostprogs-$(CONFIG_X86)          += x86-relocs
 
 always		:= $(hostprogs-y) $(hostprogs-m)
 
diff --git a/scripts/x86-relocs.c b/scripts/x86-relocs.c
new file mode 100644
index 000000000000..02914706e5b9
--- /dev/null
+++ b/scripts/x86-relocs.c
@@ -0,0 +1,797 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <elf.h>
+#include <byteswap.h>
+#define USE_BSD
+#include <endian.h>
+#include <regex.h>
+#include <tools/le_byteshift.h>
+
+static void die(char *fmt, ...);
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+static Elf32_Ehdr ehdr;
+static unsigned long reloc_count, reloc_idx;
+static unsigned long *relocs;
+static unsigned long reloc16_count, reloc16_idx;
+static unsigned long *relocs16;
+
+struct section {
+	Elf32_Shdr     shdr;
+	struct section *link;
+	Elf32_Sym      *symtab;
+	Elf32_Rel      *reltab;
+	char           *strtab;
+};
+static struct section *secs;
+
+enum symtype {
+	S_ABS,
+	S_REL,
+	S_SEG,
+	S_LIN,
+	S_NSYMTYPES
+};
+
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+/*
+ * Following symbols have been audited. There values are constant and do
+ * not change if bzImage is loaded at a different physical address than
+ * the address for which it has been compiled. Don't warn user about
+ * absolute relocations present w.r.t these symbols.
+ */
+	[S_ABS] =
+	"^(xen_irq_disable_direct_reloc$|"
+	"xen_save_fl_direct_reloc$|"
+	"VDSO|"
+	"__crc_)",
+
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+	[S_REL] =
+	"^_end$",
+};
+
+
+static const char * const sym_regex_realmode[S_NSYMTYPES] = {
+/*
+ * These are 16-bit segment symbols when compiling 16-bit code.
+ */
+	[S_SEG] =
+	"^real_mode_seg$",
+
+/*
+ * These are offsets belonging to segments, as opposed to linear addresses,
+ * when compiling 16-bit code.
+ */
+	[S_LIN] =
+	"^pa_",
+};
+
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
+static int is_reloc(enum symtype type, const char *sym_name)
+{
+	return sym_regex[type] &&
+		!regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+}
+
+static void regex_init(int use_real_mode)
+{
+        char errbuf[128];
+        int err;
+	int i;
+
+	if (use_real_mode)
+		sym_regex = sym_regex_realmode;
+	else
+		sym_regex = sym_regex_kernel;
+
+	for (i = 0; i < S_NSYMTYPES; i++) {
+		if (!sym_regex[i])
+			continue;
+
+		err = regcomp(&sym_regex_c[i], sym_regex[i],
+			      REG_EXTENDED|REG_NOSUB);
+
+		if (err) {
+			regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf);
+			die("%s", errbuf);
+		}
+        }
+}
+
+static void die(char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	exit(1);
+}
+
+static const char *sym_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define SYM_TYPE(X) [X] = #X
+		SYM_TYPE(STT_NOTYPE),
+		SYM_TYPE(STT_OBJECT),
+		SYM_TYPE(STT_FUNC),
+		SYM_TYPE(STT_SECTION),
+		SYM_TYPE(STT_FILE),
+		SYM_TYPE(STT_COMMON),
+		SYM_TYPE(STT_TLS),
+#undef SYM_TYPE
+	};
+	const char *name = "unknown sym type name";
+	if (type < ARRAY_SIZE(type_name)) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sym_bind(unsigned bind)
+{
+	static const char *bind_name[] = {
+#define SYM_BIND(X) [X] = #X
+		SYM_BIND(STB_LOCAL),
+		SYM_BIND(STB_GLOBAL),
+		SYM_BIND(STB_WEAK),
+#undef SYM_BIND
+	};
+	const char *name = "unknown sym bind name";
+	if (bind < ARRAY_SIZE(bind_name)) {
+		name = bind_name[bind];
+	}
+	return name;
+}
+
+static const char *sym_visibility(unsigned visibility)
+{
+	static const char *visibility_name[] = {
+#define SYM_VISIBILITY(X) [X] = #X
+		SYM_VISIBILITY(STV_DEFAULT),
+		SYM_VISIBILITY(STV_INTERNAL),
+		SYM_VISIBILITY(STV_HIDDEN),
+		SYM_VISIBILITY(STV_PROTECTED),
+#undef SYM_VISIBILITY
+	};
+	const char *name = "unknown sym visibility name";
+	if (visibility < ARRAY_SIZE(visibility_name)) {
+		name = visibility_name[visibility];
+	}
+	return name;
+}
+
+static const char *rel_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define REL_TYPE(X) [X] = #X
+		REL_TYPE(R_386_NONE),
+		REL_TYPE(R_386_32),
+		REL_TYPE(R_386_PC32),
+		REL_TYPE(R_386_GOT32),
+		REL_TYPE(R_386_PLT32),
+		REL_TYPE(R_386_COPY),
+		REL_TYPE(R_386_GLOB_DAT),
+		REL_TYPE(R_386_JMP_SLOT),
+		REL_TYPE(R_386_RELATIVE),
+		REL_TYPE(R_386_GOTOFF),
+		REL_TYPE(R_386_GOTPC),
+		REL_TYPE(R_386_8),
+		REL_TYPE(R_386_PC8),
+		REL_TYPE(R_386_16),
+		REL_TYPE(R_386_PC16),
+#undef REL_TYPE
+	};
+	const char *name = "unknown type rel type name";
+	if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sec_name(unsigned shndx)
+{
+	const char *sec_strtab;
+	const char *name;
+	sec_strtab = secs[ehdr.e_shstrndx].strtab;
+	name = "<noname>";
+	if (shndx < ehdr.e_shnum) {
+		name = sec_strtab + secs[shndx].shdr.sh_name;
+	}
+	else if (shndx == SHN_ABS) {
+		name = "ABSOLUTE";
+	}
+	else if (shndx == SHN_COMMON) {
+		name = "COMMON";
+	}
+	return name;
+}
+
+static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
+{
+	const char *name;
+	name = "<noname>";
+	if (sym->st_name) {
+		name = sym_strtab + sym->st_name;
+	}
+	else {
+		name = sec_name(sym->st_shndx);
+	}
+	return name;
+}
+
+
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define le16_to_cpu(val) (val)
+#define le32_to_cpu(val) (val)
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+#define le16_to_cpu(val) bswap_16(val)
+#define le32_to_cpu(val) bswap_32(val)
+#endif
+
+static uint16_t elf16_to_cpu(uint16_t val)
+{
+	return le16_to_cpu(val);
+}
+
+static uint32_t elf32_to_cpu(uint32_t val)
+{
+	return le32_to_cpu(val);
+}
+
+static void read_ehdr(FILE *fp)
+{
+	if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
+		die("Cannot read ELF header: %s\n",
+			strerror(errno));
+	}
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+		die("No ELF magic\n");
+	}
+	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
+		die("Not a 32 bit executable\n");
+	}
+	if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+		die("Not a LSB ELF executable\n");
+	}
+	if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	/* Convert the fields to native endian */
+	ehdr.e_type      = elf16_to_cpu(ehdr.e_type);
+	ehdr.e_machine   = elf16_to_cpu(ehdr.e_machine);
+	ehdr.e_version   = elf32_to_cpu(ehdr.e_version);
+	ehdr.e_entry     = elf32_to_cpu(ehdr.e_entry);
+	ehdr.e_phoff     = elf32_to_cpu(ehdr.e_phoff);
+	ehdr.e_shoff     = elf32_to_cpu(ehdr.e_shoff);
+	ehdr.e_flags     = elf32_to_cpu(ehdr.e_flags);
+	ehdr.e_ehsize    = elf16_to_cpu(ehdr.e_ehsize);
+	ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
+	ehdr.e_phnum     = elf16_to_cpu(ehdr.e_phnum);
+	ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
+	ehdr.e_shnum     = elf16_to_cpu(ehdr.e_shnum);
+	ehdr.e_shstrndx  = elf16_to_cpu(ehdr.e_shstrndx);
+
+	if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
+		die("Unsupported ELF header type\n");
+	}
+	if (ehdr.e_machine != EM_386) {
+		die("Not for x86\n");
+	}
+	if (ehdr.e_version != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
+		die("Bad Elf header size\n");
+	}
+	if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
+		die("Bad program header entry\n");
+	}
+	if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
+		die("Bad section header entry\n");
+	}
+	if (ehdr.e_shstrndx >= ehdr.e_shnum) {
+		die("String table index out of bounds\n");
+	}
+}
+
+static void read_shdrs(FILE *fp)
+{
+	int i;
+	Elf32_Shdr shdr;
+
+	secs = calloc(ehdr.e_shnum, sizeof(struct section));
+	if (!secs) {
+		die("Unable to allocate %d section headers\n",
+		    ehdr.e_shnum);
+	}
+	if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
+		die("Seek to %d failed: %s\n",
+			ehdr.e_shoff, strerror(errno));
+	}
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (fread(&shdr, sizeof shdr, 1, fp) != 1)
+			die("Cannot read ELF section headers %d/%d: %s\n",
+			    i, ehdr.e_shnum, strerror(errno));
+		sec->shdr.sh_name      = elf32_to_cpu(shdr.sh_name);
+		sec->shdr.sh_type      = elf32_to_cpu(shdr.sh_type);
+		sec->shdr.sh_flags     = elf32_to_cpu(shdr.sh_flags);
+		sec->shdr.sh_addr      = elf32_to_cpu(shdr.sh_addr);
+		sec->shdr.sh_offset    = elf32_to_cpu(shdr.sh_offset);
+		sec->shdr.sh_size      = elf32_to_cpu(shdr.sh_size);
+		sec->shdr.sh_link      = elf32_to_cpu(shdr.sh_link);
+		sec->shdr.sh_info      = elf32_to_cpu(shdr.sh_info);
+		sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
+		sec->shdr.sh_entsize   = elf32_to_cpu(shdr.sh_entsize);
+		if (sec->shdr.sh_link < ehdr.e_shnum)
+			sec->link = &secs[sec->shdr.sh_link];
+	}
+
+}
+
+static void read_strtabs(FILE *fp)
+{
+	int i;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_STRTAB) {
+			continue;
+		}
+		sec->strtab = malloc(sec->shdr.sh_size);
+		if (!sec->strtab) {
+			die("malloc of %d bytes for strtab failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+	}
+}
+
+static void read_symtabs(FILE *fp)
+{
+	int i,j;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		sec->symtab = malloc(sec->shdr.sh_size);
+		if (!sec->symtab) {
+			die("malloc of %d bytes for symtab failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
+			Elf32_Sym *sym = &sec->symtab[j];
+			sym->st_name  = elf32_to_cpu(sym->st_name);
+			sym->st_value = elf32_to_cpu(sym->st_value);
+			sym->st_size  = elf32_to_cpu(sym->st_size);
+			sym->st_shndx = elf16_to_cpu(sym->st_shndx);
+		}
+	}
+}
+
+
+static void read_relocs(FILE *fp)
+{
+	int i,j;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec->reltab = malloc(sec->shdr.sh_size);
+		if (!sec->reltab) {
+			die("malloc of %d bytes for relocs failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel = &sec->reltab[j];
+			rel->r_offset = elf32_to_cpu(rel->r_offset);
+			rel->r_info   = elf32_to_cpu(rel->r_info);
+		}
+	}
+}
+
+
+static void print_absolute_symbols(void)
+{
+	int i;
+	printf("Absolute symbols\n");
+	printf(" Num:    Value Size  Type       Bind        Visibility  Name\n");
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		char *sym_strtab;
+		int j;
+
+		if (sec->shdr.sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		sym_strtab = sec->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
+			Elf32_Sym *sym;
+			const char *name;
+			sym = &sec->symtab[j];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+			printf("%5d %08x %5d %10s %10s %12s %s\n",
+				j, sym->st_value, sym->st_size,
+				sym_type(ELF32_ST_TYPE(sym->st_info)),
+				sym_bind(ELF32_ST_BIND(sym->st_info)),
+				sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
+				name);
+		}
+	}
+	printf("\n");
+}
+
+static void print_absolute_relocs(void)
+{
+	int i, printed = 0;
+
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		struct section *sec_applies, *sec_symtab;
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		int j;
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = sec->link;
+		sec_applies = &secs[sec->shdr.sh_info];
+		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab  = sec_symtab->symtab;
+		sym_strtab = sec_symtab->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			const char *name;
+			rel = &sec->reltab[j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+
+			/* Absolute symbols are not relocated if bzImage is
+			 * loaded at a non-compiled address. Display a warning
+			 * to user at compile time about the absolute
+			 * relocations present.
+			 *
+			 * User need to audit the code to make sure
+			 * some symbols which should have been section
+			 * relative have not become absolute because of some
+			 * linker optimization or wrong programming usage.
+			 *
+			 * Before warning check if this absolute symbol
+			 * relocation is harmless.
+			 */
+			if (is_reloc(S_ABS, name) || is_reloc(S_REL, name))
+				continue;
+
+			if (!printed) {
+				printf("WARNING: Absolute relocations"
+					" present\n");
+				printf("Offset     Info     Type     Sym.Value "
+					"Sym.Name\n");
+				printed = 1;
+			}
+
+			printf("%08x %08x %10s %08x  %s\n",
+				rel->r_offset,
+				rel->r_info,
+				rel_type(ELF32_R_TYPE(rel->r_info)),
+				sym->st_value,
+				name);
+		}
+	}
+
+	if (printed)
+		printf("\n");
+}
+
+static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
+			int use_real_mode)
+{
+	int i;
+	/* Walk through the relocations */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		struct section *sec_applies, *sec_symtab;
+		int j;
+		struct section *sec = &secs[i];
+
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = sec->link;
+		sec_applies = &secs[sec->shdr.sh_info];
+		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab = sec_symtab->symtab;
+		sym_strtab = sec_symtab->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			unsigned r_type;
+			const char *symname;
+			rel = &sec->reltab[j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			r_type = ELF32_R_TYPE(rel->r_info);
+
+			switch (r_type) {
+			case R_386_NONE:
+			case R_386_PC32:
+			case R_386_PC16:
+			case R_386_PC8:
+				/*
+				 * NONE can be ignored and and PC relative
+				 * relocations don't need to be adjusted.
+				 */
+				break;
+
+			case R_386_16:
+				symname = sym_name(sym_strtab, sym);
+				if (!use_real_mode)
+					goto bad;
+				if (sym->st_shndx == SHN_ABS) {
+					if (is_reloc(S_ABS, symname))
+						break;
+					else if (!is_reloc(S_SEG, symname))
+						goto bad;
+				} else {
+					if (is_reloc(S_LIN, symname))
+						goto bad;
+					else
+						break;
+				}
+				visit(rel, sym);
+				break;
+
+			case R_386_32:
+				symname = sym_name(sym_strtab, sym);
+				if (sym->st_shndx == SHN_ABS) {
+					if (is_reloc(S_ABS, symname))
+						break;
+					else if (!is_reloc(S_REL, symname))
+						goto bad;
+				} else {
+					if (use_real_mode &&
+					    !is_reloc(S_LIN, symname))
+						break;
+				}
+				visit(rel, sym);
+				break;
+			default:
+				die("Unsupported relocation type: %s (%d)\n",
+				    rel_type(r_type), r_type);
+				break;
+			bad:
+				symname = sym_name(sym_strtab, sym);
+				die("Invalid %s relocation: %s\n",
+				    rel_type(r_type), symname);
+			}
+		}
+	}
+}
+
+static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
+		reloc16_count++;
+	else
+		reloc_count++;
+}
+
+static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	/* Remember the address that needs to be adjusted. */
+	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
+		relocs16[reloc16_idx++] = rel->r_offset;
+	else
+		relocs[reloc_idx++] = rel->r_offset;
+}
+
+static int cmp_relocs(const void *va, const void *vb)
+{
+	const unsigned long *a, *b;
+	a = va; b = vb;
+	return (*a == *b)? 0 : (*a > *b)? 1 : -1;
+}
+
+static int write32(unsigned int v, FILE *f)
+{
+	unsigned char buf[4];
+
+	put_unaligned_le32(v, buf);
+	return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
+}
+
+static void emit_relocs(int as_text, int use_real_mode)
+{
+	int i;
+	/* Count how many relocations I have and allocate space for them. */
+	reloc_count = 0;
+	walk_relocs(count_reloc, use_real_mode);
+	relocs = malloc(reloc_count * sizeof(relocs[0]));
+	if (!relocs) {
+		die("malloc of %d entries for relocs failed\n",
+			reloc_count);
+	}
+
+	relocs16 = malloc(reloc16_count * sizeof(relocs[0]));
+	if (!relocs16) {
+		die("malloc of %d entries for relocs16 failed\n",
+			reloc16_count);
+	}
+	/* Collect up the relocations */
+	reloc_idx = 0;
+	walk_relocs(collect_reloc, use_real_mode);
+
+	if (reloc16_count && !use_real_mode)
+		die("Segment relocations found but --realmode not specified\n");
+
+	/* Order the relocations for more efficient processing */
+	qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
+	qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs);
+
+	/* Print the relocations */
+	if (as_text) {
+		/* Print the relocations in a form suitable that
+		 * gas will like.
+		 */
+		printf(".section \".data.reloc\",\"a\"\n");
+		printf(".balign 4\n");
+		if (use_real_mode) {
+			printf("\t.long %lu\n", reloc16_count);
+			for (i = 0; i < reloc16_count; i++)
+				printf("\t.long 0x%08lx\n", relocs16[i]);
+			printf("\t.long %lu\n", reloc_count);
+			for (i = 0; i < reloc_count; i++) {
+				printf("\t.long 0x%08lx\n", relocs[i]);
+			}
+		} else {
+			/* Print a stop */
+			printf("\t.long 0x%08lx\n", (unsigned long)0);
+			for (i = 0; i < reloc_count; i++) {
+				printf("\t.long 0x%08lx\n", relocs[i]);
+			}
+		}
+
+		printf("\n");
+	}
+	else {
+		if (use_real_mode) {
+			write32(reloc16_count, stdout);
+			for (i = 0; i < reloc16_count; i++)
+				write32(relocs16[i], stdout);
+			write32(reloc_count, stdout);
+
+			/* Now print each relocation */
+			for (i = 0; i < reloc_count; i++)
+				write32(relocs[i], stdout);
+		} else {
+			/* Print a stop */
+			write32(0, stdout);
+
+			/* Now print each relocation */
+			for (i = 0; i < reloc_count; i++) {
+				write32(relocs[i], stdout);
+			}
+		}
+	}
+}
+
+static void usage(void)
+{
+	die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n");
+}
+
+int main(int argc, char **argv)
+{
+	int show_absolute_syms, show_absolute_relocs;
+	int as_text, use_real_mode;
+	const char *fname;
+	FILE *fp;
+	int i;
+
+	show_absolute_syms = 0;
+	show_absolute_relocs = 0;
+	as_text = 0;
+	use_real_mode = 0;
+	fname = NULL;
+	for (i = 1; i < argc; i++) {
+		char *arg = argv[i];
+		if (*arg == '-') {
+			if (strcmp(arg, "--abs-syms") == 0) {
+				show_absolute_syms = 1;
+				continue;
+			}
+			if (strcmp(arg, "--abs-relocs") == 0) {
+				show_absolute_relocs = 1;
+				continue;
+			}
+			if (strcmp(arg, "--text") == 0) {
+				as_text = 1;
+				continue;
+			}
+			if (strcmp(arg, "--realmode") == 0) {
+				use_real_mode = 1;
+				continue;
+			}
+		}
+		else if (!fname) {
+			fname = arg;
+			continue;
+		}
+		usage();
+	}
+	if (!fname) {
+		usage();
+	}
+	regex_init(use_real_mode);
+	fp = fopen(fname, "r");
+	if (!fp) {
+		die("Cannot open %s: %s\n",
+			fname, strerror(errno));
+	}
+	read_ehdr(fp);
+	read_shdrs(fp);
+	read_strtabs(fp);
+	read_symtabs(fp);
+	read_relocs(fp);
+	if (show_absolute_syms) {
+		print_absolute_symbols();
+		return 0;
+	}
+	if (show_absolute_relocs) {
+		print_absolute_relocs();
+		return 0;
+	}
+	emit_relocs(as_text, use_real_mode);
+	return 0;
+}
-- 
cgit v1.2.3


From b3266bd6ff52efb9e57c7fbfff4c8f7363dfaab3 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:25 +0300
Subject: x86, realmode: realmode.bin infrastructure

Create realmode.bin and realmode.relocs files. Piggy
pack them into relocatable object that will be included
into .init.data section of the main kernel image.

The first file includes binary image of the real-mode code.
The latter file includes all relocations. The layout of the
binary image is specified in realmode.lds.S. The makefile
generates pa_ prefixed symbols for each exported global.
These are used in 32-bit code and in realmode header to
define symbols that need to be relocated.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-3-git-send-email-jarkko.sakkinen@intel.com
Originally-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kbuild                     |  2 +-
 arch/x86/realmode/Makefile          | 20 +++++++++++
 arch/x86/realmode/rm/.gitignore     |  3 ++
 arch/x86/realmode/rm/Makefile       | 63 ++++++++++++++++++++++++++++++++++
 arch/x86/realmode/rm/header.S       | 16 +++++++++
 arch/x86/realmode/rm/realmode.lds.S | 68 +++++++++++++++++++++++++++++++++++++
 arch/x86/realmode/rmpiggy.S         | 18 ++++++++++
 7 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/realmode/Makefile
 create mode 100644 arch/x86/realmode/rm/.gitignore
 create mode 100644 arch/x86/realmode/rm/Makefile
 create mode 100644 arch/x86/realmode/rm/header.S
 create mode 100644 arch/x86/realmode/rm/realmode.lds.S
 create mode 100644 arch/x86/realmode/rmpiggy.S

(limited to 'arch/x86')

diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e9dec6cadd1..e5287d8517aa 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,4 +1,3 @@
-
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
@@ -7,6 +6,7 @@ obj-$(CONFIG_XEN) += xen/
 # lguest paravirtualization support
 obj-$(CONFIG_LGUEST_GUEST) += lguest/
 
+obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
 
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
new file mode 100644
index 000000000000..f22a4f8d99d6
--- /dev/null
+++ b/arch/x86/realmode/Makefile
@@ -0,0 +1,20 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+subdir- := rm
+
+obj-y += rmpiggy.o
+
+$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin
+
+$(obj)/rm/realmode.bin: FORCE
+	$(Q)$(MAKE) $(build)=$(obj)/rm $@
+
+$(obj)/rm/realmode.relocs: FORCE
+	$(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore
new file mode 100644
index 000000000000..b6ed3a2555cb
--- /dev/null
+++ b/arch/x86/realmode/rm/.gitignore
@@ -0,0 +1,3 @@
+pasyms.h
+realmode.lds
+realmode.relocs
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
new file mode 100644
index 000000000000..7c3f202cbccf
--- /dev/null
+++ b/arch/x86/realmode/rm/Makefile
@@ -0,0 +1,63 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+subdir- := wakeup
+
+always := realmode.bin
+
+realmode-y			+= header.o
+
+targets	+= $(realmode-y)
+
+REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
+
+sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
+
+quiet_cmd_pasyms = PASYMS  $@
+      cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
+		   sed $(sed-pasyms) | sort | uniq > $@
+
+$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
+	$(call if_changed,pasyms)
+
+$(obj)/realmode.lds: $(obj)/pasyms.h
+
+LDFLAGS_realmode.elf := --emit-relocs -T
+CPPFLAGS_realmode.lds += -P -C -I$(obj)
+
+$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
+	$(call if_changed,ld)
+
+OBJCOPYFLAGS_realmode.bin := -O binary
+
+$(obj)/realmode.bin: $(obj)/realmode.elf
+	$(call if_changed,objcopy)
+
+quiet_cmd_relocs = RELOCS  $@
+      cmd_relocs = scripts/x86-relocs --realmode $< > $@
+$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
+	$(call if_changed,relocs)
+
+# ---------------------------------------------------------------------------
+
+# How to compile the 16-bit code.  Note we always compile for -march=i386,
+# that way we can complain to the user if the CPU is insufficient.
+KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \
+		   -DDISABLE_BRANCH_PROFILING \
+		   -Wall -Wstrict-prototypes \
+		   -march=i386 -mregparm=3 \
+		   -include $(srctree)/$(src)/../../boot/code16gcc.h \
+		   -fno-strict-aliasing -fomit-frame-pointer \
+		   $(call cc-option, -ffreestanding) \
+		   $(call cc-option, -fno-toplevel-reorder,\
+			$(call cc-option, -fno-unit-at-a-time)) \
+		   $(call cc-option, -fno-stack-protector) \
+		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
new file mode 100644
index 000000000000..7be17f2c65a3
--- /dev/null
+++ b/arch/x86/realmode/rm/header.S
@@ -0,0 +1,16 @@
+/*
+ * Real-mode blob header; this should match realmode.h and be
+ * readonly; for mutable data instead add pointers into the .data
+ * or .bss sections as appropriate.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+		.section ".header", "a"
+
+ENTRY(real_mode_header)
+		.long	pa_text_start
+		.long	pa_ro_end
+		.long	pa_end
+END(real_mode_header)
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
new file mode 100644
index 000000000000..c5b8a4f31ba3
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -0,0 +1,68 @@
+/*
+ * realmode.lds.S
+ *
+ * Linker script for the real-mode code
+ */
+
+#include <asm/page_types.h>
+
+#undef i386
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS
+{
+	real_mode_seg = 0;
+
+	. = 0;
+	.header : {
+		pa_real_mode_base = .;
+		*(.header)
+	}
+
+	. = ALIGN(4);
+	.rodata : {
+		*(.rodata)
+		*(.rodata.*)
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.text : {
+		pa_text_start = .;
+		*(.text)
+		*(.text.*)
+	}
+
+	.text32 : {
+		*(.text32)
+		*(.text32.*)
+		pa_ro_end = .;
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.data : {
+		*(.data)
+		*(.data.*)
+	}
+
+	. = ALIGN(128);
+	.bss : {
+		*(.bss*)
+	}
+
+	/* End signature for integrity checking */
+	. = ALIGN(4);
+	.signature : {
+		*(.signature)
+		pa_end = .;
+	}
+
+	/DISCARD/ : {
+		*(.note*)
+		*(.debug*)
+		*(.eh_frame*)
+	}
+
+#include "pasyms.h"
+}
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
new file mode 100644
index 000000000000..6047d7f604cf
--- /dev/null
+++ b/arch/x86/realmode/rmpiggy.S
@@ -0,0 +1,18 @@
+/*
+ * Wrapper script for the realmode binary as a transport object
+ * before copying to low memory.
+ */
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+	.section ".init.data","aw"
+
+	.balign PAGE_SIZE
+
+ENTRY(real_mode_blob)
+	.incbin	"arch/x86/realmode/rm/realmode.bin"
+END(real_mode_blob)
+
+ENTRY(real_mode_relocs)
+	.incbin	"arch/x86/realmode/rm/realmode.relocs"
+END(real_mode_relocs)
-- 
cgit v1.2.3


From 084ee1c641a068bfd1194d545f7dc9ab2043eb35 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:26 +0300
Subject: x86, realmode: Relocator for realmode code

Implements relocator for real mode code that is called
as part of setup_arch(). Processes segment relocations
and linear relocations. Real-mode code is relocated to
a free hole below 1 MB.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-4-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/realmode.h | 26 ++++++++++++++
 arch/x86/kernel/Makefile        |  1 +
 arch/x86/kernel/realmode.c      | 79 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup.c         |  2 ++
 4 files changed, 108 insertions(+)
 create mode 100644 arch/x86/include/asm/realmode.h
 create mode 100644 arch/x86/kernel/realmode.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
new file mode 100644
index 000000000000..dc1bba534c14
--- /dev/null
+++ b/arch/x86/include/asm/realmode.h
@@ -0,0 +1,26 @@
+#ifndef _ARCH_X86_REALMODE_H
+#define _ARCH_X86_REALMODE_H
+
+#include <linux/types.h>
+#include <asm/io.h>
+
+/* This must match data at realmode.S */
+struct real_mode_header {
+	u32	text_start;
+	u32	ro_end;
+	u32	end;
+} __attribute__((__packed__));
+
+extern struct real_mode_header real_mode_header;
+extern unsigned char *real_mode_base;
+
+extern unsigned long init_rsp;
+extern unsigned long initial_code;
+extern unsigned long initial_gs;
+
+extern unsigned char real_mode_blob[];
+extern unsigned char real_mode_relocs[];
+
+extern void __init setup_real_mode(void);
+
+#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 532d2e090e6f..f9e19d4eb984 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,6 +36,7 @@ obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
 obj-y				+= trampoline.o trampoline_$(BITS).o
+obj-y				+= realmode.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
new file mode 100644
index 000000000000..7415c42547ac
--- /dev/null
+++ b/arch/x86/kernel/realmode.c
@@ -0,0 +1,79 @@
+#include <linux/io.h>
+#include <linux/memblock.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/realmode.h>
+
+unsigned char *real_mode_base;
+struct real_mode_header real_mode_header;
+
+void __init setup_real_mode(void)
+{
+	phys_addr_t mem;
+	u16 real_mode_seg;
+	u32 *rel;
+	u32 count;
+	u32 *ptr;
+	u16 *seg;
+	int i;
+
+	struct real_mode_header *header =
+		(struct real_mode_header *) real_mode_blob;
+
+	size_t size = PAGE_ALIGN(header->end);
+
+	/* Has to be in very low memory so we can execute real-mode AP code. */
+	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+	if (!mem)
+		panic("Cannot allocate trampoline\n");
+
+	real_mode_base = __va(mem);
+	memblock_reserve(mem, size);
+
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       real_mode_base, (unsigned long long)mem, size);
+
+	memcpy(real_mode_base, real_mode_blob, size);
+
+	real_mode_seg = __pa(real_mode_base) >> 4;
+	rel = (u32 *) real_mode_relocs;
+
+	/* 16-bit segment relocations. */
+	count = rel[0];
+	rel = &rel[1];
+	for (i = 0; i < count; i++) {
+		seg = (u16 *) (real_mode_base + rel[i]);
+		*seg = real_mode_seg;
+	}
+
+	/* 32-bit linear relocations. */
+	count = rel[i];
+	rel =  &rel[i + 1];
+	for (i = 0; i < count; i++) {
+		ptr = (u32 *) (real_mode_base + rel[i]);
+		*ptr += __pa(real_mode_base);
+	}
+
+	/* Copied header will contain relocated physical addresses. */
+	memcpy(&real_mode_header, real_mode_base,
+	       sizeof(struct real_mode_header));
+}
+
+/*
+ * set_real_mode_permissions() gets called very early, to guarantee the
+ * availability of low memory.  This is before the proper kernel page
+ * tables are set up, so we cannot set page permissions in that
+ * function.  Thus, we use an arch_initcall instead.
+ */
+static int __init set_real_mode_permissions(void)
+{
+	size_t all_size =
+		PAGE_ALIGN(real_mode_header.end) -
+		__pa(real_mode_base);
+
+	set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT);
+	return 0;
+}
+
+arch_initcall(set_real_mode_permissions);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..56e41242a6b8 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -74,6 +74,7 @@
 #include <asm/mtrr.h>
 #include <asm/apic.h>
 #include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -918,6 +919,7 @@ void __init setup_arch(char **cmdline_p)
 			max_pfn_mapped<<PAGE_SHIFT);
 
 	setup_trampolines();
+	setup_real_mode();
 
 	init_gbpages();
 
-- 
cgit v1.2.3


From 5a8c9aebe04a78b069828d364798d5f24c5a42bd Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:27 +0300
Subject: x86, realmode: Move reboot_32.S to unified realmode code

Migrated reboot_32.S from x86_trampoline to the real-mode
blob.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-5-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/realmode.h  |   4 ++
 arch/x86/kernel/Makefile         |   1 -
 arch/x86/kernel/reboot.c         |  25 +-------
 arch/x86/kernel/reboot_32.S      | 135 ---------------------------------------
 arch/x86/realmode/rm/Makefile    |   1 +
 arch/x86/realmode/rm/header.S    |   3 +
 arch/x86/realmode/rm/reboot_32.S | 134 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 145 insertions(+), 158 deletions(-)
 delete mode 100644 arch/x86/kernel/reboot_32.S
 create mode 100644 arch/x86/realmode/rm/reboot_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index dc1bba534c14..bf26b0681931 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -9,6 +9,10 @@ struct real_mode_header {
 	u32	text_start;
 	u32	ro_end;
 	u32	end;
+	/* reboot */
+#ifdef CONFIG_X86_32
+	u32	machine_real_restart_asm;
+#endif
 } __attribute__((__packed__));
 
 extern struct real_mode_header real_mode_header;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f9e19d4eb984..b71ef35c7d77 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -49,7 +49,6 @@ obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
 obj-y				+= reboot.o
-obj-$(CONFIG_X86_32)		+= reboot_32.o
 obj-$(CONFIG_MCA)		+= mca_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d840e69a853c..050eff29a4bb 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -24,6 +24,7 @@
 #ifdef CONFIG_X86_32
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
+# include <asm/realmode.h>
 #else
 # include <asm/x86_init.h>
 #endif
@@ -332,15 +333,10 @@ static int __init reboot_init(void)
 }
 core_initcall(reboot_init);
 
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
 void machine_real_restart(unsigned int type)
 {
-	void *restart_va;
-	unsigned long restart_pa;
-	void (*restart_lowmem)(unsigned int);
-	u64 *lowmem_gdt;
+	void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
+		real_mode_header.machine_real_restart_asm;
 
 	local_irq_disable();
 
@@ -369,21 +365,6 @@ void machine_real_restart(unsigned int type)
 	   too. */
 	*((unsigned short *)0x472) = reboot_mode;
 
-	/* Patch the GDT in the low memory trampoline */
-	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
-	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
-	restart_pa = virt_to_phys(restart_va);
-	restart_lowmem = (void (*)(unsigned int))restart_pa;
-
-	/* GDT[0]: GDT self-pointer */
-	lowmem_gdt[0] =
-		(u64)(sizeof(machine_real_restart_gdt) - 1) +
-		((u64)virt_to_phys(lowmem_gdt) << 16);
-	/* GDT[1]: 64K real mode code segment */
-	lowmem_gdt[1] =
-		GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
 	/* Jump to the identity-mapped low memory code */
 	restart_lowmem(type);
 }
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
deleted file mode 100644
index 1d5c46df0d78..000000000000
--- a/arch/x86/kernel/reboot_32.S
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/*
- * The following code and data reboots the machine by switching to real
- * mode and jumping to the BIOS reset entry point, as if the CPU has
- * really been reset.  The previous version asked the keyboard
- * controller to pulse the CPU reset line, which is more thorough, but
- * doesn't work with at least one type of 486 motherboard.  It is easy
- * to stop this code working; hence the copious comments.
- *
- * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
- */
-	.section ".x86_trampoline","a"
-	.balign 16
-	.code32
-ENTRY(machine_real_restart_asm)
-r_base = .
-	/* Get our own relocated address */
-	call	1f
-1:	popl	%ebx
-	subl	$(1b - r_base), %ebx
-
-	/* Compute the equivalent real-mode segment */
-	movl	%ebx, %ecx
-	shrl	$4, %ecx
-	
-	/* Patch post-real-mode segment jump */
-	movw	(dispatch_table - r_base)(%ebx,%eax,2),%ax
-	movw	%ax, (101f - r_base)(%ebx)
-	movw	%cx, (102f - r_base)(%ebx)
-
-	/* Set up the IDT for real mode. */
-	lidtl	(machine_real_restart_idt - r_base)(%ebx)
-
-	/*
-	 * Set up a GDT from which we can load segment descriptors for real
-	 * mode.  The GDT is not used in real mode; it is just needed here to
-	 * prepare the descriptors.
-	 */
-	lgdtl	(machine_real_restart_gdt - r_base)(%ebx)
-
-	/*
-	 * Load the data segment registers with 16-bit compatible values
-	 */
-	movl	$16, %ecx
-	movl	%ecx, %ds
-	movl	%ecx, %es
-	movl	%ecx, %fs
-	movl	%ecx, %gs
-	movl	%ecx, %ss
-	ljmpl	$8, $1f - r_base
-
-/*
- * This is 16-bit protected mode code to disable paging and the cache,
- * switch to real mode and jump to the BIOS reset code.
- *
- * The instruction that switches to real mode by writing to CR0 must be
- * followed immediately by a far jump instruction, which set CS to a
- * valid value for real mode, and flushes the prefetch queue to avoid
- * running instructions that have already been decoded in protected
- * mode.
- *
- * Clears all the flags except ET, especially PG (paging), PE
- * (protected-mode enable) and TS (task switch for coprocessor state
- * save).  Flushes the TLB after paging has been disabled.  Sets CD and
- * NW, to disable the cache on a 486, and invalidates the cache.  This
- * is more like the state of a 486 after reset.  I don't know if
- * something else should be done for other chips.
- *
- * More could be done here to set up the registers as if a CPU reset had
- * occurred; hopefully real BIOSs don't assume much.  This is not the
- * actual BIOS entry point, anyway (that is at 0xfffffff0).
- *
- * Most of this work is probably excessive, but it is what is tested.
- */
-	.code16
-1:
-	xorl	%ecx, %ecx
-	movl	%cr0, %eax
-	andl	$0x00000011, %eax
-	orl	$0x60000000, %eax
-	movl	%eax, %cr0
-	movl	%ecx, %cr3
-	movl	%cr0, %edx
-	andl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
-	jz	2f
-	wbinvd
-2:
-	andb	$0x10, %al
-	movl	%eax, %cr0
-	.byte	0xea			/* ljmpw */
-101:	.word	0			/* Offset */
-102:	.word	0			/* Segment */
-
-bios:
-	ljmpw	$0xf000, $0xfff0
-
-apm:
-	movw	$0x1000, %ax
-	movw	%ax, %ss
-	movw	$0xf000, %sp
-	movw	$0x5307, %ax
-	movw	$0x0001, %bx
-	movw	$0x0003, %cx
-	int	$0x15
-
-END(machine_real_restart_asm)
-
-	.balign 16
-	/* These must match <asm/reboot.h */
-dispatch_table:
-	.word	bios - r_base
-	.word	apm - r_base
-END(dispatch_table)
-
-	.balign 16
-machine_real_restart_idt:
-	.word	0xffff		/* Length - real mode default value */
-	.long	0		/* Base - real mode default value */
-END(machine_real_restart_idt)
-
-	.balign 16
-ENTRY(machine_real_restart_gdt)
-	.quad	0		/* Self-pointer, filled in by PM code */
-	.quad	0		/* 16-bit code segment, filled in by PM code */
-	/*
-	 * 16-bit data segment with the selector value 16 = 0x10 and
-	 * base value 0x100; since this is consistent with real mode
-	 * semantics we don't have to reload the segments once CR0.PE = 0.
-	 */
-	.quad	GDT_ENTRY(0x0093, 0x100, 0xffff)
-END(machine_real_restart_gdt)
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 7c3f202cbccf..3f851c488593 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -12,6 +12,7 @@ subdir- := wakeup
 always := realmode.bin
 
 realmode-y			+= header.o
+realmode-$(CONFIG_X86_32)	+= reboot_32.o
 
 targets	+= $(realmode-y)
 
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 7be17f2c65a3..db21401c0c57 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -13,4 +13,7 @@ ENTRY(real_mode_header)
 		.long	pa_text_start
 		.long	pa_ro_end
 		.long	pa_end
+#ifdef CONFIG_X86_32
+		.long	pa_machine_real_restart_asm
+#endif
 END(real_mode_header)
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
new file mode 100644
index 000000000000..83803c222b4a
--- /dev/null
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -0,0 +1,134 @@
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+
+/*
+ * The following code and data reboots the machine by switching to real
+ * mode and jumping to the BIOS reset entry point, as if the CPU has
+ * really been reset.  The previous version asked the keyboard
+ * controller to pulse the CPU reset line, which is more thorough, but
+ * doesn't work with at least one type of 486 motherboard.  It is easy
+ * to stop this code working; hence the copious comments.
+ *
+ * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
+ */
+	.section ".text32", "ax"
+	.code32
+	.globl machine_real_restart_asm
+
+	.balign 16
+machine_real_restart_asm:
+	/* Set up the IDT for real mode. */
+	lidtl	pa_machine_real_restart_idt
+
+	/*
+	 * Set up a GDT from which we can load segment descriptors for real
+	 * mode.  The GDT is not used in real mode; it is just needed here to
+	 * prepare the descriptors.
+	 */
+	lgdtl	pa_machine_real_restart_gdt
+
+	/*
+	 * Load the data segment registers with 16-bit compatible values
+	 */
+	movl	$16, %ecx
+	movl	%ecx, %ds
+	movl	%ecx, %es
+	movl	%ecx, %fs
+	movl	%ecx, %gs
+	movl	%ecx, %ss
+	ljmpw	$8, $1f
+
+/*
+ * This is 16-bit protected mode code to disable paging and the cache,
+ * switch to real mode and jump to the BIOS reset code.
+ *
+ * The instruction that switches to real mode by writing to CR0 must be
+ * followed immediately by a far jump instruction, which set CS to a
+ * valid value for real mode, and flushes the prefetch queue to avoid
+ * running instructions that have already been decoded in protected
+ * mode.
+ *
+ * Clears all the flags except ET, especially PG (paging), PE
+ * (protected-mode enable) and TS (task switch for coprocessor state
+ * save).  Flushes the TLB after paging has been disabled.  Sets CD and
+ * NW, to disable the cache on a 486, and invalidates the cache.  This
+ * is more like the state of a 486 after reset.  I don't know if
+ * something else should be done for other chips.
+ *
+ * More could be done here to set up the registers as if a CPU reset had
+ * occurred; hopefully real BIOSs don't assume much.  This is not the
+ * actual BIOS entry point, anyway (that is at 0xfffffff0).
+ *
+ * Most of this work is probably excessive, but it is what is tested.
+ */
+	.text
+	.code16
+
+	.balign 16
+machine_real_restart_asm16:
+1:
+	xorl	%ecx, %ecx
+	movl	%cr0, %edx
+	andl	$0x00000011, %edx
+	orl	$0x60000000, %edx
+	movl	%edx, %cr0
+	movl	%ecx, %cr3
+	movl	%cr0, %edx
+	andl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
+	jz	2f
+	wbinvd
+2:
+	andb	$0x10, %dl
+	movl	%edx, %cr0
+	.byte	0xea			/* ljmpw */
+	.word	3f			/* Offset */
+	.word	real_mode_seg		/* Segment */
+
+3:
+	testb	$0, %al
+	jz	bios
+
+apm:
+	movw	$0x1000, %ax
+	movw	%ax, %ss
+	movw	$0xf000, %sp
+	movw	$0x5307, %ax
+	movw	$0x0001, %bx
+	movw	$0x0003, %cx
+	int	$0x15
+	/* This should never return... */
+
+bios:
+	ljmpw	$0xf000, $0xfff0
+
+	.section ".rodata", "a"
+	.globl	machine_real_restart_idt, machine_real_restart_gdt
+
+	.balign 16
+machine_real_restart_idt:
+	.word	0xffff		/* Length - real mode default value */
+	.long	0		/* Base - real mode default value */
+
+	.balign 16
+machine_real_restart_gdt:
+	/* Self-pointer */
+	.word	0xffff		/* Length - real mode default value */
+	.long	pa_machine_real_restart_gdt
+	.word	0
+
+	/*
+	 * 16-bit code segment pointing to real_mode_seg
+	 * Selector value 8
+	 */
+	.word	0xffff		/* Limit */
+	.long	0x9b000000 + pa_real_mode_base
+	.word	0
+
+	/*
+	 * 16-bit data segment with the selector value 16 = 0x10 and
+	 * base value 0x100; since this is consistent with real mode
+	 * semantics we don't have to reload the segments once CR0.PE = 0.
+	 */
+	.quad	GDT_ENTRY(0x0093, 0x100, 0xffff)
-- 
cgit v1.2.3


From 48927bbb97c7d4cf343c05827ab9ac30c60678cb Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:28 +0300
Subject: x86, realmode: Move SMP trampoline to unified realmode code

Migrated SMP trampoline code to the real mode blob.
SMP trampoline code is not yet removed from
.x86_trampoline because it is needed by the wakeup
code.

[ hpa: always enable compiling startup_32_smp in head_32.S... it is
  only a few instructions which go into .init on UP builds, and it makes
  the rest of the code less #ifdef ugly. ]

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-6-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/realmode.h      |  18 ++++
 arch/x86/kernel/head_32.S            |   5 +-
 arch/x86/kernel/head_64.S            |   4 -
 arch/x86/kernel/realmode.c           |  14 +++
 arch/x86/kernel/smpboot.c            |  18 ++--
 arch/x86/realmode/rm/Makefile        |   1 +
 arch/x86/realmode/rm/header.S        |  11 +++
 arch/x86/realmode/rm/trampoline_32.S |  86 +++++++++++++++++
 arch/x86/realmode/rm/trampoline_64.S | 175 +++++++++++++++++++++++++++++++++++
 9 files changed, 316 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/realmode/rm/trampoline_32.S
 create mode 100644 arch/x86/realmode/rm/trampoline_64.S

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index bf26b0681931..9b4a5da5e22e 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -12,6 +12,17 @@ struct real_mode_header {
 	/* reboot */
 #ifdef CONFIG_X86_32
 	u32	machine_real_restart_asm;
+#endif
+	/* SMP trampoline */
+	u32	trampoline_data;
+	u32	trampoline_status;
+#ifdef CONFIG_X86_32
+	u32	startup_32_smp;
+	u32	boot_gdt;
+#else
+	u32	startup_64_smp;
+	u32	level3_ident_pgt;
+	u32	level3_kernel_pgt;
 #endif
 } __attribute__((__packed__));
 
@@ -25,6 +36,13 @@ extern unsigned long initial_gs;
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
 
+#ifdef CONFIG_X86_32
+extern unsigned char startup_32_smp[];
+extern unsigned char boot_gdt[];
+#else
+extern unsigned char secondary_startup_64[];
+#endif
+
 extern void __init setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce0be7cd085e..a3c2b4ffebc6 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -273,10 +273,7 @@ num_subarch_entries = (. - subarch_entries) / 4
  * If cpu hotplug is not supported then this code can go in init section
  * which will be freed later
  */
-
 __CPUINIT
-
-#ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
 	cld
 	movl $(__BOOT_DS),%eax
@@ -287,7 +284,7 @@ ENTRY(startup_32_smp)
 	movl pa(stack_start),%ecx
 	movl %eax,%ss
 	leal -__PAGE_OFFSET(%ecx),%esp
-#endif /* CONFIG_SMP */
+
 default_entry:
 
 /*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 40f4eb3766d1..d70bc2eb202b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,6 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Fixup trampoline */
-	addq	%rbp, trampoline_level4_pgt + 0(%rip)
-	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
-
 	/* Due to ENTRY(), sometimes the empty space gets filled with
 	 * zeros. Better take a jmp than relying on empty space being
 	 * filled with 0x90 (nop)
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index 7415c42547ac..a465775b32f2 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -58,6 +58,20 @@ void __init setup_real_mode(void)
 	/* Copied header will contain relocated physical addresses. */
 	memcpy(&real_mode_header, real_mode_base,
 	       sizeof(struct real_mode_header));
+
+#ifdef CONFIG_X86_32
+	*((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp);
+	*((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt);
+#else
+	*((u64 *) __va(real_mode_header.startup_64_smp)) =
+		(u64) __pa(secondary_startup_64);
+
+	*((u64 *) __va(real_mode_header.level3_ident_pgt)) =
+		__pa(level3_ident_pgt) + _KERNPG_TABLE;
+
+	*((u64 *) __va(real_mode_header.level3_kernel_pgt)) =
+		__pa(level3_kernel_pgt) + _KERNPG_TABLE;
+#endif
 }
 
 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..c7971ea74bd0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,7 +57,7 @@
 #include <asm/nmi.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
 #include <asm/pgtable.h>
@@ -73,6 +73,8 @@
 #include <asm/smpboot_hooks.h>
 #include <asm/i8259.h>
 
+#include <asm/realmode.h>
+
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
@@ -662,8 +664,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
  */
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 {
+	volatile u32 *trampoline_status =
+		(volatile u32 *) __va(real_mode_header.trampoline_status);
+	/* start_ip had better be page-aligned! */
+	unsigned long start_ip = real_mode_header.trampoline_data;
+
 	unsigned long boot_error = 0;
-	unsigned long start_ip;
 	int timeout;
 	struct create_idle c_idle = {
 		.cpu	= cpu,
@@ -713,9 +719,6 @@ do_rest:
 	initial_code = (unsigned long)start_secondary;
 	stack_start  = c_idle.idle->thread.sp;
 
-	/* start_ip had better be page-aligned! */
-	start_ip = trampoline_address();
-
 	/* So we see what's up */
 	announce_cpu(cpu, apicid);
 
@@ -778,8 +781,7 @@ do_rest:
 			pr_debug("CPU%d: has booted.\n", cpu);
 		} else {
 			boot_error = 1;
-			if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
-			    == 0xA5A5A5A5)
+			if (*trampoline_status == 0xA5A5A5A5)
 				/* trampoline started but...? */
 				pr_err("CPU%d: Stuck ??\n", cpu);
 			else
@@ -805,7 +807,7 @@ do_rest:
 	}
 
 	/* mark "stuck" area as not stuck */
-	*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
+	*trampoline_status = 0;
 
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 		/*
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3f851c488593..56ec64f94e69 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -13,6 +13,7 @@ always := realmode.bin
 
 realmode-y			+= header.o
 realmode-$(CONFIG_X86_32)	+= reboot_32.o
+realmode-y			+= trampoline_$(BITS).o
 
 targets	+= $(realmode-y)
 
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index db21401c0c57..a97900409c61 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -15,5 +15,16 @@ ENTRY(real_mode_header)
 		.long	pa_end
 #ifdef CONFIG_X86_32
 		.long	pa_machine_real_restart_asm
+#endif
+		/* SMP trampoline */
+		.long	pa_trampoline_data
+		.long	pa_trampoline_status
+#ifdef CONFIG_X86_32
+		.long	pa_startup_32_smp
+		.long	pa_boot_gdt
+#else
+		.long	pa_startup_64_smp
+		.long	pa_level3_ident_pgt
+		.long	pa_level3_kernel_pgt
 #endif
 END(real_mode_header)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
new file mode 100644
index 000000000000..18cb7fc9fad4
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -0,0 +1,86 @@
+/*
+ *
+ *	Trampoline.S	Derived from Setup.S by Linus Torvalds
+ *
+ *	4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ *	This is only used for booting secondary CPUs in SMP machine
+ *
+ *	Entry: CS:IP point to the start of our code, we are
+ *	in real mode with no stack, but the rest of the
+ *	trampoline page to make our stack and everything else
+ *	is a mystery.
+ *
+ *	We jump into arch/x86/kernel/head_32.S.
+ *
+ *	On entry to trampoline_data, the processor is in real mode
+ *	with 16-bit addressing and 16-bit data.  CS has some value
+ *	and IP is zero.  Thus, we load CS to the physical segment
+ *	of the real mode code before doing anything further.
+ *
+ *	The structure real_mode_header includes entries that need
+ *	to be set up before executing this code:
+ *
+ *	startup_32_smp
+ *	boot_gdt
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+
+	.text
+	.code16
+	.globl trampoline_data
+
+	.balign PAGE_SIZE
+trampoline_data:
+	wbinvd			# Needed for NUMA-Q should be harmless for others
+
+	.byte	0xea		# ljmpw
+	.word	1f		# Offset
+	.word	real_mode_seg	# Segment
+1:
+	mov	%cs, %ax	# Code and data in the same place
+	mov	%ax, %ds
+
+	cli			# We should be safe anyway
+
+	movl	$0xA5A5A5A5, trampoline_status
+				# write marker for master knows we're running
+
+	/* GDT tables in non default location kernel can be beyond 16MB and
+	 * lgdt will not be able to load the address as in real mode default
+	 * operand size is 16bit. Use lgdtl instead to force operand size
+	 * to 32 bit.
+	 */
+
+	lidtl	boot_idt_descr		# load idt with 0, 0
+	lgdtl	boot_gdt_descr		# load gdt with whatever is appropriate
+
+	xor	%ax, %ax
+	inc	%ax			# protected mode (PE) bit
+	lmsw	%ax			# into protected mode
+
+	# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
+	ljmpl	*(startup_32_smp)
+
+	.data
+	.globl startup_32_smp, boot_gdt, trampoline_status
+
+boot_gdt_descr:
+	.word	__BOOT_DS + 7			# gdt limit
+boot_gdt:
+	.long	0				# gdt base
+
+boot_idt_descr:
+	.word	0				# idt limit = 0
+	.long	0				# idt base = 0L
+
+trampoline_status:
+	.long	0
+
+startup_32_smp:
+	.long	0x00000000
+	.word	__BOOT_CS, 0
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
new file mode 100644
index 000000000000..063da008d520
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -0,0 +1,175 @@
+/*
+ *
+ *	Trampoline.S	Derived from Setup.S by Linus Torvalds
+ *
+ *	4 Jan 1997 Michael Chastain: changed to gnu as.
+ *	15 Sept 2005 Eric Biederman: 64bit PIC support
+ *
+ *	Entry: CS:IP point to the start of our code, we are
+ *	in real mode with no stack, but the rest of the
+ *	trampoline page to make our stack and everything else
+ *	is a mystery.
+ *
+ *	On entry to trampoline_data, the processor is in real mode
+ *	with 16-bit addressing and 16-bit data.  CS has some value
+ *	and IP is zero.  Thus, data addresses need to be absolute
+ *	(no relocation) and are taken with regard to r_base.
+ *
+ *	With the addition of trampoline_level4_pgt this code can
+ *	now enter a 64bit kernel that lives at arbitrary 64bit
+ *	physical addresses.
+ *
+ *	If you work on this file, check the object module with objdump
+ *	--full-contents --reloc to make sure there are no relocation
+ *	entries.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include <asm/processor-flags.h>
+
+	.text
+	.balign PAGE_SIZE
+	.code16
+
+ENTRY(trampoline_data)
+	cli			# We should be safe anyway
+	wbinvd
+
+	.byte	0xea		# ljmpw
+	.word	1f		# Offset
+	.word	real_mode_seg	# Segment
+1:
+	mov	%cs, %ax	# Code and data in the same place
+	mov	%ax, %ds
+	mov	%ax, %es
+	mov	%ax, %ss
+
+	movl	$0xA5A5A5A5, trampoline_status
+	# write marker for master knows we're running
+
+	# Setup stack
+	movw	$trampoline_stack_end, %sp
+
+	call	verify_cpu		# Verify the cpu supports long mode
+	testl   %eax, %eax		# Check for return code
+	jnz	no_longmode
+
+	/*
+	 * GDT tables in non default location kernel can be beyond 16MB and
+	 * lgdt will not be able to load the address as in real mode default
+	 * operand size is 16bit. Use lgdtl instead to force operand size
+	 * to 32 bit.
+	 */
+
+	lidtl	tidt	# load idt with 0, 0
+	lgdtl	tgdt	# load gdt with whatever is appropriate
+
+	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit
+	lmsw	%ax			# into protected mode
+
+	# flush prefetch and jump to startup_32
+	ljmpl	*(startup_32_vector)
+
+no_longmode:
+	hlt
+	jmp no_longmode
+#include "../kernel/verify_cpu.S"
+
+	.code32
+	.balign 4
+ENTRY(startup_32)
+	movl	$__KERNEL_DS, %eax	# Initialize the %ds segment register
+	movl	%eax, %ds
+
+	movl	$X86_CR4_PAE, %eax
+	movl	%eax, %cr4		# Enable PAE mode
+
+	movl	pa_startup_64_smp, %esi
+	movl	pa_startup_64_smp_high, %edi
+
+					# Setup trampoline 4 level pagetables
+	leal	pa_trampoline_level4_pgt, %eax
+	movl	%eax, %cr3
+
+	movl	$MSR_EFER, %ecx
+	movl	$(1 << _EFER_LME), %eax	# Enable Long Mode
+	xorl	%edx, %edx
+	wrmsr
+
+	# Enable paging and in turn activate Long Mode
+	# Enable protected mode
+	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
+	movl	%eax, %cr0
+
+	/*
+	 * At this point we're in long mode but in 32bit compatibility mode
+	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 */
+	ljmpl	*(pa_startup_64_vector)
+
+	.code64
+	.balign 4
+ENTRY(startup_64)
+	# Now jump into the kernel using virtual addresses
+	movl	%edi, %eax
+	shlq	$32, %rax
+	addl	%esi, %eax
+	jmp	*%rax
+
+	# Careful these need to be in the same 64K segment as the above;
+tidt:
+	.word	0			# idt limit = 0
+	.word	0, 0			# idt base = 0L
+
+	# Duplicate the global descriptor table
+	# so the kernel can live anywhere
+	.balign 4
+	.globl tgdt
+tgdt:
+	.short	tgdt_end - tgdt		# gdt limit
+	.long	pa_tgdt
+	.short	0
+	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
+	.quad	0x00af9b000000ffff	# __KERNEL_CS
+	.quad	0x00cf93000000ffff	# __KERNEL_DS
+tgdt_end:
+
+	.balign 4
+startup_32_vector:
+	.long	pa_startup_32
+	.word	__KERNEL32_CS, 0
+
+	.balign 4
+	.globl startup_64_vector
+startup_64_vector:
+	.long	pa_startup_64
+	.word	__KERNEL_CS, 0
+
+	.data
+
+	.balign 4
+ENTRY(trampoline_status)
+	.long	0
+
+trampoline_stack:
+	.org 0x1000
+trampoline_stack_end:
+
+	.globl	level3_ident_pgt
+	.globl	level3_kernel_pgt
+ENTRY(trampoline_level4_pgt)
+	level3_ident_pgt:	.quad	0
+	.fill 510,8,0
+	level3_kernel_pgt:	.quad	0
+
+	.globl	startup_64_smp
+	.globl	startup_64_smp_high
+startup_64_smp:		.long 0
+startup_64_smp_high:	.long 0
-- 
cgit v1.2.3


From c9b77ccb52a5c77233b0e557b7d4417b00ef4012 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:29 +0300
Subject: x86, realmode: Move ACPI wakeup to unified realmode code

Migrated ACPI wakeup code to the real-mode blob.
Code existing in .x86_trampoline  can be completely
removed. Static descriptor table in wakeup_asm.S is
courtesy of H. Peter Anvin.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-7-git-send-email-jarkko.sakkinen@intel.com
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/acpi.h                |   2 -
 arch/x86/include/asm/realmode.h            |   4 +
 arch/x86/include/asm/trampoline.h          |  39 ------
 arch/x86/kernel/Makefile                   |   1 -
 arch/x86/kernel/acpi/Makefile              |   9 +-
 arch/x86/kernel/acpi/realmode/.gitignore   |   3 -
 arch/x86/kernel/acpi/realmode/Makefile     |  59 ---------
 arch/x86/kernel/acpi/realmode/bioscall.S   |   1 -
 arch/x86/kernel/acpi/realmode/copy.S       |   1 -
 arch/x86/kernel/acpi/realmode/regs.c       |   1 -
 arch/x86/kernel/acpi/realmode/video-bios.c |   1 -
 arch/x86/kernel/acpi/realmode/video-mode.c |   1 -
 arch/x86/kernel/acpi/realmode/video-vesa.c |   1 -
 arch/x86/kernel/acpi/realmode/video-vga.c  |   1 -
 arch/x86/kernel/acpi/realmode/wakemain.c   |  81 -------------
 arch/x86/kernel/acpi/realmode/wakeup.S     | 170 --------------------------
 arch/x86/kernel/acpi/realmode/wakeup.h     |  48 --------
 arch/x86/kernel/acpi/realmode/wakeup.lds.S |  62 ----------
 arch/x86/kernel/acpi/sleep.c               |  33 +----
 arch/x86/kernel/acpi/sleep.h               |   2 +-
 arch/x86/kernel/acpi/wakeup_rm.S           |  12 --
 arch/x86/kernel/head32.c                   |   1 -
 arch/x86/kernel/head64.c                   |   1 -
 arch/x86/kernel/mpparse.c                  |   1 -
 arch/x86/kernel/setup.c                    |   2 -
 arch/x86/kernel/tboot.c                    |   5 +-
 arch/x86/kernel/trampoline.c               |  42 -------
 arch/x86/kernel/trampoline_32.S            |  83 -------------
 arch/x86/kernel/trampoline_64.S            | 171 --------------------------
 arch/x86/kernel/vmlinux.lds.S              |  12 --
 arch/x86/realmode/rm/Makefile              |   4 +
 arch/x86/realmode/rm/header.S              |   5 +
 arch/x86/realmode/rm/realmode.lds.S        |   4 +
 arch/x86/realmode/rm/wakeup/.gitignore     |   3 +
 arch/x86/realmode/rm/wakeup/Makefile       |  33 +++++
 arch/x86/realmode/rm/wakeup/bioscall.S     |   1 +
 arch/x86/realmode/rm/wakeup/copy.S         |   1 +
 arch/x86/realmode/rm/wakeup/regs.c         |   1 +
 arch/x86/realmode/rm/wakeup/video-bios.c   |   1 +
 arch/x86/realmode/rm/wakeup/video-mode.c   |   1 +
 arch/x86/realmode/rm/wakeup/video-vesa.c   |   1 +
 arch/x86/realmode/rm/wakeup/video-vga.c    |   1 +
 arch/x86/realmode/rm/wakeup/wakemain.c     |  82 +++++++++++++
 arch/x86/realmode/rm/wakeup/wakeup.h       |  41 +++++++
 arch/x86/realmode/rm/wakeup/wakeup_asm.S   | 189 +++++++++++++++++++++++++++++
 drivers/acpi/sleep.c                       |   8 +-
 46 files changed, 386 insertions(+), 840 deletions(-)
 delete mode 100644 arch/x86/include/asm/trampoline.h
 delete mode 100644 arch/x86/kernel/acpi/realmode/.gitignore
 delete mode 100644 arch/x86/kernel/acpi/realmode/Makefile
 delete mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S
 delete mode 100644 arch/x86/kernel/acpi/realmode/copy.S
 delete mode 100644 arch/x86/kernel/acpi/realmode/regs.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/video-bios.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/video-mode.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/video-vesa.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/video-vga.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/wakemain.c
 delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.S
 delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.h
 delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.lds.S
 delete mode 100644 arch/x86/kernel/acpi/wakeup_rm.S
 delete mode 100644 arch/x86/kernel/trampoline.c
 delete mode 100644 arch/x86/kernel/trampoline_32.S
 delete mode 100644 arch/x86/kernel/trampoline_64.S
 create mode 100644 arch/x86/realmode/rm/wakeup/.gitignore
 create mode 100644 arch/x86/realmode/rm/wakeup/Makefile
 create mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S
 create mode 100644 arch/x86/realmode/rm/wakeup/copy.S
 create mode 100644 arch/x86/realmode/rm/wakeup/regs.c
 create mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c
 create mode 100644 arch/x86/realmode/rm/wakeup/video-mode.c
 create mode 100644 arch/x86/realmode/rm/wakeup/video-vesa.c
 create mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c
 create mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c
 create mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h
 create mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 610001d385dd..724aa441de7d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,7 +29,6 @@
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
-#include <asm/trampoline.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -118,7 +117,6 @@ static inline void acpi_disable_pci(void)
 extern int acpi_suspend_lowlevel(void);
 
 extern const unsigned char acpi_wakeup_code[];
-#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
 
 /* early initialization routine */
 extern void acpi_reserve_wakeup_memory(void);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 9b4a5da5e22e..1bfc74d213a4 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -24,6 +24,10 @@ struct real_mode_header {
 	u32	level3_ident_pgt;
 	u32	level3_kernel_pgt;
 #endif
+#ifdef CONFIG_ACPI_SLEEP
+	u32	wakeup_start;
+	u32	wakeup_header;
+#endif
 } __attribute__((__packed__));
 
 extern struct real_mode_header real_mode_header;
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
deleted file mode 100644
index feca3118a73b..000000000000
--- a/arch/x86/include/asm/trampoline.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ASM_X86_TRAMPOLINE_H
-#define _ASM_X86_TRAMPOLINE_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <asm/io.h>
-
-/*
- * Trampoline 80x86 program as an array.  These are in the init rodata
- * segment, but that's okay, because we only care about the relative
- * addresses of the symbols.
- */
-extern const unsigned char x86_trampoline_start [];
-extern const unsigned char x86_trampoline_end   [];
-extern unsigned char *x86_trampoline_base;
-
-extern unsigned long init_rsp;
-extern unsigned long initial_code;
-extern unsigned long initial_gs;
-
-extern void __init setup_trampolines(void);
-
-extern const unsigned char trampoline_data[];
-extern const unsigned char trampoline_status[];
-
-#define TRAMPOLINE_SYM(x)						\
-	((void *)(x86_trampoline_base +					\
-		  ((const unsigned char *)(x) - x86_trampoline_start)))
-
-/* Address of the SMP trampoline */
-static inline unsigned long trampoline_address(void)
-{
-	return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
-}
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b71ef35c7d77..4a20f4441ffe 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
-obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= realmode.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3ef..163b22581472 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
-subdir-				:= realmode
-
 obj-$(CONFIG_ACPI)		+= boot.o
-obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
 obj-y				+= cstate.o
 endif
 
-$(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef5..000000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always		:= wakeup.bin
-targets		:= wakeup.elf wakeup.lds
-
-wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter.  In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y	+= video-vga.o
-wakeup-y	+= video-vesa.o
-wakeup-y	+= video-bios.o
-
-targets		+= $(wakeup-y)
-
-bootsrc		:= $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code.  Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
-		   -I$(srctree)/$(bootsrc) \
-		   $(cflags-y) \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(bootsrc)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-			$(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS	+= $(call cc-option, -m32)
-KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf	:= -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
-	$(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin	:= -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
-	$(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
deleted file mode 100644
index 883962d9eef2..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "wakeup.h"
-#include "boot.h"
-
-static void udelay(int loops)
-{
-	while (loops--)
-		io_delay();	/* Approximately 1 us */
-}
-
-static void beep(unsigned int hz)
-{
-	u8 enable;
-
-	if (!hz) {
-		enable = 0x00;		/* Turn off speaker */
-	} else {
-		u16 div = 1193181/hz;
-
-		outb(0xb6, 0x43);	/* Ctr 2, squarewave, load, binary */
-		io_delay();
-		outb(div, 0x42);	/* LSB of counter */
-		io_delay();
-		outb(div >> 8, 0x42);	/* MSB of counter */
-		io_delay();
-
-		enable = 0x03;		/* Turn on speaker */
-	}
-	inb(0x61);		/* Dummy read of System Control Port B */
-	io_delay();
-	outb(enable, 0x61);	/* Enable timer 2 output to speaker */
-	io_delay();
-}
-
-#define DOT_HZ		880
-#define DASH_HZ		587
-#define US_PER_DOT	125000
-
-/* Okay, this is totally silly, but it's kind of fun. */
-static void send_morse(const char *pattern)
-{
-	char s;
-
-	while ((s = *pattern++)) {
-		switch (s) {
-		case '.':
-			beep(DOT_HZ);
-			udelay(US_PER_DOT);
-			beep(0);
-			udelay(US_PER_DOT);
-			break;
-		case '-':
-			beep(DASH_HZ);
-			udelay(US_PER_DOT * 3);
-			beep(0);
-			udelay(US_PER_DOT);
-			break;
-		default:	/* Assume it's a space */
-			udelay(US_PER_DOT * 3);
-			break;
-		}
-	}
-}
-
-void main(void)
-{
-	/* Kill machine if structures are wrong */
-	if (wakeup_header.real_magic != 0x12345678)
-		while (1);
-
-	if (wakeup_header.realmode_flags & 4)
-		send_morse("...-");
-
-	if (wakeup_header.realmode_flags & 1)
-		asm volatile("lcallw   $0xc000,$3");
-
-	if (wakeup_header.realmode_flags & 2) {
-		/* Need to call BIOS */
-		probe_cards(0);
-		set_mode(wakeup_header.video_mode);
-	}
-}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index b4fd836e4053..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-#include "wakeup.h"
-
-	.code16
-	.section ".jump", "ax"
-	.globl	_start
-_start:
-	cli
-	jmp	wakeup_code
-
-/* This should match the structure in wakeup.h */
-		.section ".header", "a"
-		.globl	wakeup_header
-wakeup_header:
-video_mode:	.short	0	/* Video mode number */
-pmode_return:	.byte	0x66, 0xea	/* ljmpl */
-		.long	0	/* offset goes here */
-		.short	__KERNEL_CS
-pmode_cr0:	.long	0	/* Saved %cr0 */
-pmode_cr3:	.long	0	/* Saved %cr3 */
-pmode_cr4:	.long	0	/* Saved %cr4 */
-pmode_efer:	.quad	0	/* Saved EFER */
-pmode_gdt:	.quad	0
-pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
-pmode_behavior:	.long	0	/* Wakeup behavior flags */
-realmode_flags:	.long	0
-real_magic:	.long	0
-trampoline_segment:	.word 0
-_pad1:		.byte	0
-wakeup_jmp:	.byte	0xea	/* ljmpw */
-wakeup_jmp_off:	.word	3f
-wakeup_jmp_seg:	.word	0
-wakeup_gdt:	.quad	0, 0, 0
-signature:	.long	WAKEUP_HEADER_SIGNATURE
-
-	.text
-	.code16
-wakeup_code:
-	cld
-
-	/* Apparently some dimwit BIOS programmers don't know how to
-	   program a PM to RM transition, and we might end up here with
-	   junk in the data segment descriptor registers.  The only way
-	   to repair that is to go into PM and fix it ourselves... */
-	movw	$16, %cx
-	lgdtl	%cs:wakeup_gdt
-	movl	%cr0, %eax
-	orb	$X86_CR0_PE, %al
-	movl	%eax, %cr0
-	jmp	1f
-1:	ljmpw	$8, $2f
-2:
-	movw	%cx, %ds
-	movw	%cx, %es
-	movw	%cx, %ss
-	movw	%cx, %fs
-	movw	%cx, %gs
-
-	andb	$~X86_CR0_PE, %al
-	movl	%eax, %cr0
-	jmp	wakeup_jmp
-3:
-	/* Set up segments */
-	movw	%cs, %ax
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %ss
-	lidtl	wakeup_idt
-
-	movl	$wakeup_stack_end, %esp
-
-	/* Clear the EFLAGS */
-	pushl	$0
-	popfl
-
-	/* Check header signature... */
-	movl	signature, %eax
-	cmpl	$WAKEUP_HEADER_SIGNATURE, %eax
-	jne	bogus_real_magic
-
-	/* Check we really have everything... */
-	movl	end_signature, %eax
-	cmpl	$WAKEUP_END_SIGNATURE, %eax
-	jne	bogus_real_magic
-
-	/* Call the C code */
-	calll	main
-
-	/* Restore MISC_ENABLE before entering protected mode, in case
-	   BIOS decided to clear XD_DISABLE during S3. */
-	movl	pmode_behavior, %eax
-	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
-	jnc	1f
-
-	movl	pmode_misc_en, %eax
-	movl	pmode_misc_en + 4, %edx
-	movl	$MSR_IA32_MISC_ENABLE, %ecx
-	wrmsr
-1:
-
-	/* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
-	/* This could also be done in C code... */
-	movl	pmode_cr3, %eax
-	movl	%eax, %cr3
-
-	movl	pmode_cr4, %ecx
-	jecxz	1f
-	movl	%ecx, %cr4
-1:
-	movl	pmode_efer, %eax
-	movl	pmode_efer + 4, %edx
-	movl	%eax, %ecx
-	orl	%edx, %ecx
-	jz	1f
-	movl	$MSR_EFER, %ecx
-	wrmsr
-1:
-
-	lgdtl	pmode_gdt
-
-	/* This really couldn't... */
-	movl	pmode_cr0, %eax
-	movl	%eax, %cr0
-	jmp	pmode_return
-#else
-	pushw	$0
-	pushw	trampoline_segment
-	pushw	$0
-	lret
-#endif
-
-bogus_real_magic:
-1:
-	hlt
-	jmp	1b
-
-	.data
-	.balign	8
-
-	/* This is the standard real-mode IDT */
-wakeup_idt:
-	.word	0xffff		/* limit */
-	.long	0		/* address */
-	.word	0
-
-	.globl	HEAP, heap_end
-HEAP:
-	.long	wakeup_heap
-heap_end:
-	.long	wakeup_stack
-
-	.bss
-wakeup_heap:
-	.space	2048
-wakeup_stack:
-	.space	2048
-wakeup_stack_end:
-
-	.section ".signature","a"
-end_signature:
-	.long	WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
deleted file mode 100644
index 97a29e1430e3..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Definitions for the wakeup data structure at the head of the
- * wakeup code.
- */
-
-#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* This must match data at wakeup.S */
-struct wakeup_header {
-	u16 video_mode;		/* Video mode number */
-	u16 _jmp1;		/* ljmpl opcode, 32-bit only */
-	u32 pmode_entry;	/* Protected mode resume point, 32-bit only */
-	u16 _jmp2;		/* CS value, 32-bit only */
-	u32 pmode_cr0;		/* Protected mode cr0 */
-	u32 pmode_cr3;		/* Protected mode cr3 */
-	u32 pmode_cr4;		/* Protected mode cr4 */
-	u32 pmode_efer_low;	/* Protected mode EFER */
-	u32 pmode_efer_high;
-	u64 pmode_gdt;
-	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
-	u32 pmode_misc_en_high;
-	u32 pmode_behavior;	/* Wakeup routine behavior flags */
-	u32 realmode_flags;
-	u32 real_magic;
-	u16 trampoline_segment;	/* segment with trampoline code, 64-bit only */
-	u8  _pad1;
-	u8  wakeup_jmp;
-	u16 wakeup_jmp_off;
-	u16 wakeup_jmp_seg;
-	u64 wakeup_gdt[3];
-	u32 signature;		/* To check we have correct structure */
-} __attribute__((__packed__));
-
-extern struct wakeup_header wakeup_header;
-#endif
-
-#define WAKEUP_HEADER_OFFSET	8
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE	0x65a22c82
-
-/* Wakeup behavior bits */
-#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
-
-#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1b..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
-	. = 0;
-	.jump	: {
-		*(.jump)
-	} = 0x90909090
-
-	. = WAKEUP_HEADER_OFFSET;
-	.header : {
-		*(.header)
-	}
-
-	. = ALIGN(16);
-	.text : {
-		 *(.text*)
-	} = 0x90909090
-
-	. = ALIGN(16);
-	.rodata : {
-		*(.rodata*)
-	}
-
-	.videocards : {
-		video_cards = .;
-		*(.videocards)
-		video_cards_end = .;
-	}
-
-	. = ALIGN(16);
-	.data : {
-		 *(.data*)
-	}
-
-	. = ALIGN(16);
-	.bss :	{
-		__bss_start = .;
-		*(.bss)
-		__bss_end = .;
-	}
-
-	.signature : {
-		*(.signature)
-	}
-
-	_end = .;
-
-	/DISCARD/ : {
-		*(.note*)
-	}
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 146a49c763a4..d941b62da4b6 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
+#include <asm/realmode.h>
 
-#include "realmode/wakeup.h"
+#include "../../realmode/rm/wakeup/wakeup.h"
 #include "sleep.h"
 
 unsigned long acpi_realmode_flags;
@@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void)
  */
 int acpi_suspend_lowlevel(void)
 {
-	struct wakeup_header *header;
-	/* address in low memory of the wakeup routine. */
-	char *acpi_realmode;
+	struct wakeup_header *header =
+		(struct wakeup_header *) __va(real_mode_header.wakeup_header);
 
-	acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
-
-	header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
 	if (header->signature != WAKEUP_HEADER_SIGNATURE) {
 		printk(KERN_ERR "wakeup header does not match\n");
 		return -EINVAL;
@@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void)
 
 	header->video_mode = saved_video_mode;
 
-	header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
-	/*
-	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
-	 * that is, with limits set to 4 GB.  At least the Lenovo
-	 * Thinkpad X61 is known to need this for the video BIOS
-	 * initialization quirk to work; this is likely to also
-	 * be the case for other laptops or integrated video devices.
-	 */
-
-	/* GDT[0]: GDT self-pointer */
-	header->wakeup_gdt[0] =
-		(u64)(sizeof(header->wakeup_gdt) - 1) +
-		((u64)__pa(&header->wakeup_gdt) << 16);
-	/* GDT[1]: big real mode-like code segment */
-	header->wakeup_gdt[1] =
-		GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
-	/* GDT[2]: big real mode-like data segment */
-	header->wakeup_gdt[2] =
-		GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
-
 #ifndef CONFIG_64BIT
 	store_gdt((struct desc_ptr *)&header->pmode_gdt);
 
@@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void)
 	header->pmode_cr3 = (u32)__pa(&initial_page_table);
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
-	header->trampoline_segment = trampoline_address() >> 4;
 #ifdef CONFIG_SMP
 	stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index d68677a2a010..5653a5791ec9 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,8 +2,8 @@
  *	Variables and functions used by the code in sleep.c
  */
 
-#include <asm/trampoline.h>
 #include <linux/linkage.h>
+#include <asm/realmode.h>
 
 extern unsigned long saved_video_mode;
 extern long saved_magic;
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2c..000000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
-#include <asm/page_types.h>
-
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
-	.globl	acpi_wakeup_code
-acpi_wakeup_code:
-	.incbin	"arch/x86/kernel/acpi/realmode/wakeup.bin"
-	.size	acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d50..c18f59d10101 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,7 +14,6 @@
 #include <asm/sections.h>
 #include <asm/e820.h>
 #include <asm/page.h>
-#include <asm/trampoline.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d3..037df57a99ac 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,7 +24,6 @@
 #include <asm/sections.h>
 #include <asm/kdebug.h>
 #include <asm/e820.h>
-#include <asm/trampoline.h>
 #include <asm/bios_ebda.h>
 
 static void __init zap_identity_mappings(void)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ca470e4c92dc..f44d31157353 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
 #include <asm/proto.h>
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
-#include <asm/trampoline.h>
 #include <asm/setup.h>
 #include <asm/smp.h>
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 56e41242a6b8..7a14fece9cfc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,7 +73,6 @@
 
 #include <asm/mtrr.h>
 #include <asm/apic.h>
-#include <asm/trampoline.h>
 #include <asm/realmode.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
@@ -918,7 +917,6 @@ void __init setup_arch(char **cmdline_p)
 	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
 			max_pfn_mapped<<PAGE_SHIFT);
 
-	setup_trampolines();
 	setup_real_mode();
 
 	init_gbpages();
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 6410744ac5cb..c136e2325062 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -32,7 +32,7 @@
 #include <linux/mm.h>
 #include <linux/tboot.h>
 
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/processor.h>
 #include <asm/bootparam.h>
 #include <asm/pgtable.h>
@@ -201,7 +201,8 @@ static int tboot_setup_sleep(void)
 		add_mac_region(e820.map[i].addr, e820.map[i].size);
 	}
 
-	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+	tboot->acpi_sinfo.kernel_s3_resume_vector =
+		real_mode_header.wakeup_start;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad6..000000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/trampoline.h>
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-
-unsigned char *x86_trampoline_base;
-
-void __init setup_trampolines(void)
-{
-	phys_addr_t mem;
-	size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
-	/* Has to be in very low memory so we can execute real-mode AP code. */
-	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
-
-	x86_trampoline_base = __va(mem);
-	memblock_reserve(mem, size);
-
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       x86_trampoline_base, (unsigned long long)mem, size);
-
-	memcpy(x86_trampoline_base, x86_trampoline_start, size);
-}
-
-/*
- * setup_trampolines() gets called very early, to guarantee the
- * availability of low memory.  This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function.  Thus, we use an arch_initcall instead.
- */
-static int __init configure_trampolines(void)
-{
-	size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
-	set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
-	return 0;
-}
-arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7fd..000000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *
- *	Trampoline.S	Derived from Setup.S by Linus Torvalds
- *
- *	4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- *	This is only used for booting secondary CPUs in SMP machine
- *
- *	Entry: CS:IP point to the start of our code, we are 
- *	in real mode with no stack, but the rest of the 
- *	trampoline page to make our stack and everything else
- *	is a mystery.
- *
- *	We jump into arch/x86/kernel/head_32.S.
- *
- *	On entry to trampoline_data, the processor is in real mode
- *	with 16-bit addressing and 16-bit data.  CS has some value
- *	and IP is zero.  Thus, data addresses need to be absolute
- *	(no relocation) and are taken with regard to r_base.
- *
- *	If you work on this file, check the object module with
- *	objdump --reloc to make sure there are no relocation
- *	entries except for:
- *
- *	TYPE              VALUE
- *	R_386_32          startup_32_smp
- *	R_386_32          boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-#ifdef CONFIG_SMP
-
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
-	.code16
-
-ENTRY(trampoline_data)
-r_base = .
-	wbinvd			# Needed for NUMA-Q should be harmless for others
-	mov	%cs, %ax	# Code and data in the same place
-	mov	%ax, %ds
-
-	cli			# We should be safe anyway
-
-	movl	$0xA5A5A5A5, trampoline_status - r_base
-				# write marker for master knows we're running
-
-	/* GDT tables in non default location kernel can be beyond 16MB and
-	 * lgdt will not be able to load the address as in real mode default
-	 * operand size is 16bit. Use lgdtl instead to force operand size
-	 * to 32 bit.
-	 */
-
-	lidtl	boot_idt_descr - r_base	# load idt with 0, 0
-	lgdtl	boot_gdt_descr - r_base	# load gdt with whatever is appropriate
-
-	xor	%ax, %ax
-	inc	%ax		# protected mode (PE) bit
-	lmsw	%ax		# into protected mode
-	# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
-	ljmpl	$__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
-	# These need to be in the same 64K segment as the above;
-	# hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
-	.word	__BOOT_DS + 7			# gdt limit
-	.long	boot_gdt - __PAGE_OFFSET	# gdt base
-
-boot_idt_descr:
-	.word	0				# idt limit = 0
-	.long	0				# idt base = 0L
-
-ENTRY(trampoline_status)
-	.long	0
-
-.globl trampoline_end
-trampoline_end:
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index 09ff51799e96..000000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- *
- *	Trampoline.S	Derived from Setup.S by Linus Torvalds
- *
- *	4 Jan 1997 Michael Chastain: changed to gnu as.
- *	15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- *	Entry: CS:IP point to the start of our code, we are 
- *	in real mode with no stack, but the rest of the 
- *	trampoline page to make our stack and everything else
- *	is a mystery.
- *
- *	On entry to trampoline_data, the processor is in real mode
- *	with 16-bit addressing and 16-bit data.  CS has some value
- *	and IP is zero.  Thus, data addresses need to be absolute
- *	(no relocation) and are taken with regard to r_base.
- *
- *	With the addition of trampoline_level4_pgt this code can
- *	now enter a 64bit kernel that lives at arbitrary 64bit
- *	physical addresses.
- *
- *	If you work on this file, check the object module with objdump
- *	--full-contents --reloc to make sure there are no relocation
- *	entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
-	.code16
-
-ENTRY(trampoline_data)
-r_base = .
-	cli			# We should be safe anyway
-	wbinvd
-	mov	%cs, %ax	# Code and data in the same place
-	mov	%ax, %ds
-	mov	%ax, %es
-	mov	%ax, %ss
-
-
-	movl	$0xA5A5A5A5, trampoline_status - r_base
-				# write marker for master knows we're running
-
-					# Setup stack
-	movw	$(trampoline_stack_end - r_base), %sp
-
-	call	verify_cpu		# Verify the cpu supports long mode
-	testl   %eax, %eax		# Check for return code
-	jnz	no_longmode
-
-	mov	%cs, %ax
-	movzx	%ax, %esi		# Find the 32bit trampoline location
-	shll	$4, %esi
-
-					# Fixup the absolute vectors
-	leal	(startup_32 - r_base)(%esi), %eax
-	movl	%eax, startup_32_vector - r_base
-	leal	(startup_64 - r_base)(%esi), %eax
-	movl	%eax, startup_64_vector - r_base
-	leal	(tgdt - r_base)(%esi), %eax
-	movl	%eax, (tgdt + 2 - r_base)
-
-	/*
-	 * GDT tables in non default location kernel can be beyond 16MB and
-	 * lgdt will not be able to load the address as in real mode default
-	 * operand size is 16bit. Use lgdtl instead to force operand size
-	 * to 32 bit.
-	 */
-
-	lidtl	tidt - r_base	# load idt with 0, 0
-	lgdtl	tgdt - r_base	# load gdt with whatever is appropriate
-
-	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit
-	lmsw	%ax			# into protected mode
-
-	# flush prefetch and jump to startup_32
-	ljmpl	*(startup_32_vector - r_base)
-
-	.code32
-	.balign 4
-startup_32:
-	movl	$__KERNEL_DS, %eax	# Initialize the %ds segment register
-	movl	%eax, %ds
-
-	movl	$X86_CR4_PAE, %eax
-	movl	%eax, %cr4		# Enable PAE mode
-
-					# Setup trampoline 4 level pagetables
-	leal	(trampoline_level4_pgt - r_base)(%esi), %eax
-	movl	%eax, %cr3
-
-	movl	$MSR_EFER, %ecx
-	movl	$(1 << _EFER_LME), %eax	# Enable Long Mode
-	xorl	%edx, %edx
-	wrmsr
-
-	# Enable paging and in turn activate Long Mode
-	# Enable protected mode
-	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
-	movl	%eax, %cr0
-
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
-	 */
-	ljmp	*(startup_64_vector - r_base)(%esi)
-
-	.code64
-	.balign 4
-startup_64:
-	# Now jump into the kernel using virtual addresses
-	movq	$secondary_startup_64, %rax
-	jmp	*%rax
-
-	.code16
-no_longmode:
-	hlt
-	jmp no_longmode
-#include "verify_cpu.S"
-
-	.balign 4
-	# Careful these need to be in the same 64K segment as the above;
-tidt:
-	.word	0			# idt limit = 0
-	.word	0, 0			# idt base = 0L
-
-	# Duplicate the global descriptor table
-	# so the kernel can live anywhere
-	.balign 4
-tgdt:
-	.short	tgdt_end - tgdt		# gdt limit
-	.long	tgdt - r_base
-	.short 0
-	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
-	.quad	0x00af9b000000ffff	# __KERNEL_CS
-	.quad	0x00cf93000000ffff	# __KERNEL_DS
-tgdt_end:
-
-	.balign 4
-startup_32_vector:
-	.long	startup_32 - r_base
-	.word	__KERNEL32_CS, 0
-
-	.balign 4
-startup_64_vector:
-	.long	startup_64 - r_base
-	.word	__KERNEL_CS, 0
-
-	.balign 4
-ENTRY(trampoline_status)
-	.long	0
-
-trampoline_stack:
-	.org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	510,8,0
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901a..22a1530146a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,18 +197,6 @@ SECTIONS
 
 	INIT_DATA_SECTION(16)
 
-	/*
-	 * Code and data for a variety of lowlevel trampolines, to be
-	 * copied into base memory (< 1 MiB) during initialization.
-	 * Since it is copied early, the main copy can be discarded
-	 * afterwards.
-	 */
-	 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
-		x86_trampoline_start = .;
-		*(.x86_trampoline)
-		x86_trampoline_end = .;
-	}
-
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
 		__x86_cpu_dev_start = .;
 		*(.x86_cpu_dev.init)
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 56ec64f94e69..2432acb6b04f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -14,9 +14,13 @@ always := realmode.bin
 realmode-y			+= header.o
 realmode-$(CONFIG_X86_32)	+= reboot_32.o
 realmode-y			+= trampoline_$(BITS).o
+realmode-$(CONFIG_ACPI_SLEEP)	+= wakeup/wakeup.o
 
 targets	+= $(realmode-y)
 
+$(obj)/wakeup/wakeup.o: FORCE
+	$(Q)$(MAKE) $(build)=$(obj)/wakeup $@
+
 REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
 
 sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index a97900409c61..730b1316c099 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -26,5 +26,10 @@ ENTRY(real_mode_header)
 		.long	pa_startup_64_smp
 		.long	pa_level3_ident_pgt
 		.long	pa_level3_kernel_pgt
+#endif
+		/* ACPI sleep */
+#ifdef CONFIG_ACPI_SLEEP
+		.long	pa_wakeup_start
+		.long	pa_wakeup_header
 #endif
 END(real_mode_header)
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
index c5b8a4f31ba3..91b83ea55c37 100644
--- a/arch/x86/realmode/rm/realmode.lds.S
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -25,6 +25,10 @@ SECTIONS
 	.rodata : {
 		*(.rodata)
 		*(.rodata.*)
+		. = ALIGN(16);
+		video_cards = .;
+		*(.videocards)
+		video_cards_end = .;
 	}
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore
new file mode 100644
index 000000000000..58f1f48a58f8
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/.gitignore
@@ -0,0 +1,3 @@
+wakeup.bin
+wakeup.elf
+wakeup.lds
diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile
new file mode 100644
index 000000000000..4c8533240cdd
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/Makefile
@@ -0,0 +1,33 @@
+#
+# arch/x86/kernel/acpi/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+
+always		:= wakeup.o
+
+wakeup-y	+= wakeup_asm.o wakemain.o video-mode.o
+wakeup-y	+= copy.o bioscall.o regs.o
+
+# The link order of the video-*.o modules can matter.  In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-y	+= video-vga.o
+wakeup-y	+= video-vesa.o
+wakeup-y	+= video-bios.o
+
+targets		+= $(wakeup-y)
+
+WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
+
+LDFLAGS_wakeup.o := -m elf_i386 -r
+$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE
+	$(call if_changed,ld)
+
+bootsrc := $(src)/../../../boot
+
+ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc)
+asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc)
diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S
new file mode 100644
index 000000000000..dc59ebee69d8
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/copy.S
@@ -0,0 +1 @@
+#include "../../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c
new file mode 100644
index 000000000000..7deabc144a27
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/video-bios.c
@@ -0,0 +1 @@
+#include "../../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c
new file mode 100644
index 000000000000..328ad209f113
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/video-mode.c
@@ -0,0 +1 @@
+#include "../../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c
new file mode 100644
index 000000000000..9dbb9672226a
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/video-vesa.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c b/arch/x86/realmode/rm/wakeup/video-vga.c
new file mode 100644
index 000000000000..bcc81255f374
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/video-vga.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vga.c"
diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c
new file mode 100644
index 000000000000..91405d515ec6
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/wakemain.c
@@ -0,0 +1,82 @@
+#include "wakeup.h"
+#include "boot.h"
+
+static void udelay(int loops)
+{
+	while (loops--)
+		io_delay();	/* Approximately 1 us */
+}
+
+static void beep(unsigned int hz)
+{
+	u8 enable;
+
+	if (!hz) {
+		enable = 0x00;		/* Turn off speaker */
+	} else {
+		u16 div = 1193181/hz;
+
+		outb(0xb6, 0x43);	/* Ctr 2, squarewave, load, binary */
+		io_delay();
+		outb(div, 0x42);	/* LSB of counter */
+		io_delay();
+		outb(div >> 8, 0x42);	/* MSB of counter */
+		io_delay();
+
+		enable = 0x03;		/* Turn on speaker */
+	}
+	inb(0x61);		/* Dummy read of System Control Port B */
+	io_delay();
+	outb(enable, 0x61);	/* Enable timer 2 output to speaker */
+	io_delay();
+}
+
+#define DOT_HZ		880
+#define DASH_HZ		587
+#define US_PER_DOT	125000
+
+/* Okay, this is totally silly, but it's kind of fun. */
+static void send_morse(const char *pattern)
+{
+	char s;
+
+	while ((s = *pattern++)) {
+		switch (s) {
+		case '.':
+			beep(DOT_HZ);
+			udelay(US_PER_DOT);
+			beep(0);
+			udelay(US_PER_DOT);
+			break;
+		case '-':
+			beep(DASH_HZ);
+			udelay(US_PER_DOT * 3);
+			beep(0);
+			udelay(US_PER_DOT);
+			break;
+		default:	/* Assume it's a space */
+			udelay(US_PER_DOT * 3);
+			break;
+		}
+	}
+}
+
+void main(void)
+{
+	/* Kill machine if structures are wrong */
+	if (wakeup_header.real_magic != 0x12345678)
+		while (1)
+			;
+
+	if (wakeup_header.realmode_flags & 4)
+		send_morse("...-");
+
+	if (wakeup_header.realmode_flags & 1)
+		asm volatile("lcallw   $0xc000,$3");
+
+	if (wakeup_header.realmode_flags & 2) {
+		/* Need to call BIOS */
+		probe_cards(0);
+		set_mode(wakeup_header.video_mode);
+	}
+}
diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h
new file mode 100644
index 000000000000..2dfaf06b8af1
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/wakeup.h
@@ -0,0 +1,41 @@
+/*
+ * Definitions for the wakeup data structure at the head of the
+ * wakeup code.
+ */
+
+#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/* This must match data at wakeup.S */
+struct wakeup_header {
+	u16 video_mode;		/* Video mode number */
+	u32 pmode_entry;	/* Protected mode resume point, 32-bit only */
+	u16 pmode_cs;
+	u32 pmode_cr0;		/* Protected mode cr0 */
+	u32 pmode_cr3;		/* Protected mode cr3 */
+	u32 pmode_cr4;		/* Protected mode cr4 */
+	u32 pmode_efer_low;	/* Protected mode EFER */
+	u32 pmode_efer_high;
+	u64 pmode_gdt;
+	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
+	u32 pmode_misc_en_high;
+	u32 pmode_behavior;	/* Wakeup routine behavior flags */
+	u32 realmode_flags;
+	u32 real_magic;
+	u32 signature;		/* To check we have correct structure */
+} __attribute__((__packed__));
+
+extern struct wakeup_header wakeup_header;
+#endif
+
+#define WAKEUP_HEADER_OFFSET	8
+#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
+#define WAKEUP_END_SIGNATURE	0x65a22c82
+
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
+
+#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
new file mode 100644
index 000000000000..b61126cb599e
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
@@ -0,0 +1,189 @@
+/*
+ * ACPI wakeup real mode startup stub
+ */
+#include <asm/segment.h>
+#include <asm/msr-index.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include "wakeup.h"
+
+		.code16
+
+/* This should match the structure in wakeup.h */
+		.section ".data", "aw"
+		.globl	wakeup_header
+wakeup_header:
+video_mode:	.short	0	/* Video mode number */
+pmode_entry:	.long	0
+pmode_cs:	.short	__KERNEL_CS
+pmode_cr0:	.long	0	/* Saved %cr0 */
+pmode_cr3:	.long	0	/* Saved %cr3 */
+pmode_cr4:	.long	0	/* Saved %cr4 */
+pmode_efer:	.quad	0	/* Saved EFER */
+pmode_gdt:	.quad	0
+pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
+pmode_behavior:	.long	0	/* Wakeup behavior flags */
+realmode_flags:	.long	0
+real_magic:	.long	0
+signature:	.long	WAKEUP_HEADER_SIGNATURE
+		.size	wakeup_header, .-wakeup_header
+
+	.text
+	.code16
+	.globl	wakeup_start
+wakeup_start:
+	cli
+	cld
+
+	.byte	0xea		/* ljmpw */
+	.word	3f
+	.word	real_mode_seg
+3:
+	/* Apparently some dimwit BIOS programmers don't know how to
+	   program a PM to RM transition, and we might end up here with
+	   junk in the data segment descriptor registers.  The only way
+	   to repair that is to go into PM and fix it ourselves... */
+	movw	$16, %cx
+	lgdtl	%cs:wakeup_gdt
+	movl	%cr0, %eax
+	orb	$X86_CR0_PE, %al
+	movl	%eax, %cr0
+	ljmpw	$8, $2f
+2:
+	movw	%cx, %ds
+	movw	%cx, %es
+	movw	%cx, %ss
+	movw	%cx, %fs
+	movw	%cx, %gs
+
+	andb	$~X86_CR0_PE, %al
+	movl	%eax, %cr0
+	.byte	0xea		/* ljmpw */
+	.word	3f
+	.word	real_mode_seg
+3:
+	/* Set up segments */
+	movw	%cs, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %ss
+	lidtl	wakeup_idt
+
+	movl	$wakeup_stack_end, %esp
+
+	/* Clear the EFLAGS */
+	pushl	$0
+	popfl
+
+	/* Check header signature... */
+	movl	signature, %eax
+	cmpl	$WAKEUP_HEADER_SIGNATURE, %eax
+	jne	bogus_real_magic
+
+	/* Check we really have everything... */
+	movl	end_signature, %eax
+	cmpl	$WAKEUP_END_SIGNATURE, %eax
+	jne	bogus_real_magic
+
+	/* Call the C code */
+	calll	main
+
+	/* Restore MISC_ENABLE before entering protected mode, in case
+	   BIOS decided to clear XD_DISABLE during S3. */
+	movl	pmode_behavior, %eax
+	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+	jnc	1f
+
+	movl	pmode_misc_en, %eax
+	movl	pmode_misc_en + 4, %edx
+	movl	$MSR_IA32_MISC_ENABLE, %ecx
+	wrmsr
+1:
+
+	/* Do any other stuff... */
+
+#ifndef CONFIG_64BIT
+	/* This could also be done in C code... */
+	movl	pmode_cr3, %eax
+	movl	%eax, %cr3
+
+	movl	pmode_cr4, %ecx
+	jecxz	1f
+	movl	%ecx, %cr4
+1:
+	movl	pmode_efer, %eax
+	movl	pmode_efer + 4, %edx
+	movl	%eax, %ecx
+	orl	%edx, %ecx
+	jz	1f
+	movl	$MSR_EFER, %ecx
+	wrmsr
+1:
+
+	lgdtl	pmode_gdt
+
+	/* This really couldn't... */
+	movl	pmode_cr0, %eax
+	movl	%eax, %cr0
+	ljmpl	*pmode_entry
+#else
+	jmp	trampoline_data
+#endif
+
+bogus_real_magic:
+1:
+	hlt
+	jmp	1b
+
+	.section ".rodata","a"
+
+	/*
+	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
+	 * that is, with limits set to 4 GB.  At least the Lenovo
+	 * Thinkpad X61 is known to need this for the video BIOS
+	 * initialization quirk to work; this is likely to also
+	 * be the case for other laptops or integrated video devices.
+	 */
+
+	.globl	wakeup_gdt
+	.balign	16
+wakeup_gdt:
+	.word	3*8-1		/* Self-descriptor */
+	.long	pa_wakeup_gdt
+	.word	0
+
+	.word	0xffff		/* 16-bit code segment @ real_mode_base */
+	.long	0x9b000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+
+	.word	0xffff		/* 16-bit data segment @ real_mode_base */
+	.long	0x93000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+	.size	wakeup_gdt, .-wakeup_gdt
+
+	.data
+	.balign	8
+
+	/* This is the standard real-mode IDT */
+wakeup_idt:
+	.word	0xffff		/* limit */
+	.long	0		/* address */
+	.word	0
+
+	.globl	HEAP, heap_end
+HEAP:
+	.long	wakeup_heap
+heap_end:
+	.long	wakeup_stack
+
+	.bss
+wakeup_heap:
+	.space	2048
+wakeup_stack:
+	.space	2048
+wakeup_stack_end:
+
+	.section ".signature","a"
+end_signature:
+	.long	WAKEUP_END_SIGNATURE
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index eb6fd233764b..e77aa4a1c9f6 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -25,6 +25,8 @@
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 
+#include <asm/realmode.h>
+
 #include "internal.h"
 #include "sleep.h"
 
@@ -91,13 +93,13 @@ static struct notifier_block tts_notifier = {
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
+	unsigned long wakeup_pa = real_mode_header.wakeup_start;
 	/* do we have a wakeup address for S2 and S3? */
 	if (acpi_state == ACPI_STATE_S3) {
-		if (!acpi_wakeup_address) {
+		if (!wakeup_pa)
 			return -EFAULT;
-		}
 		acpi_set_firmware_waking_vector(
-				(acpi_physical_address)acpi_wakeup_address);
+				(acpi_physical_address)wakeup_pa);
 
 	}
 	ACPI_FLUSH_CPU_CACHE();
-- 
cgit v1.2.3


From f156ffc439951b63cfa9f4d999a8d54267f13282 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:30 +0300
Subject: x86, realmode: Set permission for real mode pages

Set proper permissions for rodata, text and data, removing the
realmode trampoline area as a remaining RWX memory mapping in the
kernel.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-8-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/realmode.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index a465775b32f2..d85ac20bb4eb 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -86,7 +86,21 @@ static int __init set_real_mode_permissions(void)
 		PAGE_ALIGN(real_mode_header.end) -
 		__pa(real_mode_base);
 
-	set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT);
+	size_t ro_size =
+		PAGE_ALIGN(real_mode_header.ro_end) -
+		__pa(real_mode_base);
+
+	size_t text_size =
+		PAGE_ALIGN(real_mode_header.ro_end) -
+		real_mode_header.text_start;
+
+	unsigned long text_start =
+		(unsigned long) __va(real_mode_header.text_start);
+
+	set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT);
+	set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT);
+	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 487f50ffeb142d8f86fff6e43a8852ce3d46c173 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:32 +0300
Subject: x86, realmode: Add .text64 section, make barrier symbols absolute

Add a .text64 section.  The purpose of this is to keep 16-, 32- and
64-bit code segregated into separate sections, mainly to keep
disassembly sane.

Move barrier symbols out of sections to avoid the "symbol in empty
section" problem in some versions of GNU ld.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-10-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/realmode.lds.S | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
index 91b83ea55c37..4d4afcaf5f02 100644
--- a/arch/x86/realmode/rm/realmode.lds.S
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -32,8 +32,8 @@ SECTIONS
 	}
 
 	. = ALIGN(PAGE_SIZE);
+	pa_text_start = .;
 	.text : {
-		pa_text_start = .;
 		*(.text)
 		*(.text.*)
 	}
@@ -41,9 +41,14 @@ SECTIONS
 	.text32 : {
 		*(.text32)
 		*(.text32.*)
-		pa_ro_end = .;
 	}
 
+	.text64 : {
+		*(.text64)
+		*(.text64.*)
+	}
+	pa_ro_end = .;
+
 	. = ALIGN(PAGE_SIZE);
 	.data : {
 		*(.data)
@@ -59,8 +64,8 @@ SECTIONS
 	. = ALIGN(4);
 	.signature : {
 		*(.signature)
-		pa_end = .;
 	}
+	pa_end = .;
 
 	/DISCARD/ : {
 		*(.note*)
-- 
cgit v1.2.3


From 024742861124ef26dae4cfc620250f8f47ac934a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:33 +0300
Subject: x86, realmode: Move bits to the proper sections in trampoline_64.S

Move various bits to the sections they really belong in in
trampoline_64.S.  Use GLOBAL() rather than ENTRY() for data objects:
ENTRY() should only be used with code and forces alignment to 16
bytes.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-11-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/trampoline_64.S | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 063da008d520..66c58cf15503 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -80,6 +80,7 @@ no_longmode:
 	jmp no_longmode
 #include "../kernel/verify_cpu.S"
 
+	.section ".text32","ax"
 	.code32
 	.balign 4
 ENTRY(startup_32)
@@ -114,6 +115,7 @@ ENTRY(startup_32)
 	 */
 	ljmpl	*(pa_startup_64_vector)
 
+	.section ".text64","ax"
 	.code64
 	.balign 4
 ENTRY(startup_64)
@@ -123,7 +125,8 @@ ENTRY(startup_64)
 	addl	%esi, %eax
 	jmp	*%rax
 
-	# Careful these need to be in the same 64K segment as the above;
+	.section ".rodata","a"
+	.balign	16
 tidt:
 	.word	0			# idt limit = 0
 	.word	0, 0			# idt base = 0L
@@ -153,9 +156,8 @@ startup_64_vector:
 	.word	__KERNEL_CS, 0
 
 	.data
-
 	.balign 4
-ENTRY(trampoline_status)
+GLOBAL(trampoline_status)
 	.long	0
 
 trampoline_stack:
@@ -164,7 +166,7 @@ trampoline_stack_end:
 
 	.globl	level3_ident_pgt
 	.globl	level3_kernel_pgt
-ENTRY(trampoline_level4_pgt)
+GLOBAL(trampoline_level4_pgt)
 	level3_ident_pgt:	.quad	0
 	.fill 510,8,0
 	level3_kernel_pgt:	.quad	0
-- 
cgit v1.2.3


From f7436a9da902922a48cccc208099763b87d6171f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:34 +0300
Subject: x86, realmode: Align .data section in trampoline_32.S

Specify the alignment of the .data section in trampoline_32.S.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-12-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/trampoline_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 18cb7fc9fad4..1f9e3316f73d 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -68,7 +68,7 @@ trampoline_data:
 
 	.data
 	.globl startup_32_smp, boot_gdt, trampoline_status
-
+	.balign	4
 boot_gdt_descr:
 	.word	__BOOT_DS + 7			# gdt limit
 boot_gdt:
-- 
cgit v1.2.3


From 056a43a6d3ab903a798d8ee4435ad67d6fccc3e6 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:35 +0300
Subject: x86, realmode: Remove indirect jumps in trampoline_64.S

Remove indirect jumps in trampoline_64.S which are no longer
necessary: the realmode code can relocate the absolute jumps
correctly from the start.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-13-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/trampoline_64.S | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 66c58cf15503..77b72b45d705 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -73,7 +73,7 @@ ENTRY(trampoline_data)
 	lmsw	%ax			# into protected mode
 
 	# flush prefetch and jump to startup_32
-	ljmpl	*(startup_32_vector)
+	ljmpl	$__KERNEL32_CS, $pa_startup_32
 
 no_longmode:
 	hlt
@@ -113,7 +113,7 @@ ENTRY(startup_32)
 	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
 	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
 	 */
-	ljmpl	*(pa_startup_64_vector)
+	ljmpl	$__KERNEL_CS, $pa_startup_64
 
 	.section ".text64","ax"
 	.code64
@@ -144,17 +144,6 @@ tgdt:
 	.quad	0x00cf93000000ffff	# __KERNEL_DS
 tgdt_end:
 
-	.balign 4
-startup_32_vector:
-	.long	pa_startup_32
-	.word	__KERNEL32_CS, 0
-
-	.balign 4
-	.globl startup_64_vector
-startup_64_vector:
-	.long	pa_startup_64
-	.word	__KERNEL_CS, 0
-
 	.data
 	.balign 4
 GLOBAL(trampoline_status)
-- 
cgit v1.2.3


From 968ff9ee56f1e3ed4ff4a6d10185865dc77d8f7e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:36 +0300
Subject: x86, realmode: Remove indirect jumps in trampoline_32 and wakeup_asm

Remove indirect jumps in trampoline_32.S and the 32-bit part of
wakeup_asm.S.  There exist systems which are known to do weird
things if an SMI comes in right after a mode switch, and the
safest way to deal with it is to always follow with a simple
absolute far jump.  In the 64-bit code we then to a register
indirect near jump; follow that pattern for the 32-bit code.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-14-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/trampoline_32.S     | 22 +++++++++++++---------
 arch/x86/realmode/rm/wakeup/wakeup_asm.S |  8 +++++---
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 1f9e3316f73d..1315ef48dbf1 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -47,24 +47,29 @@ trampoline_data:
 
 	cli			# We should be safe anyway
 
+	movl	startup_32_smp, %eax	# where we need to go
+
 	movl	$0xA5A5A5A5, trampoline_status
 				# write marker for master knows we're running
 
-	/* GDT tables in non default location kernel can be beyond 16MB and
+	/*
+	 * GDT tables in non default location kernel can be beyond 16MB and
 	 * lgdt will not be able to load the address as in real mode default
 	 * operand size is 16bit. Use lgdtl instead to force operand size
 	 * to 32 bit.
 	 */
-
 	lidtl	boot_idt_descr		# load idt with 0, 0
 	lgdtl	boot_gdt_descr		# load gdt with whatever is appropriate
 
-	xor	%ax, %ax
-	inc	%ax			# protected mode (PE) bit
-	lmsw	%ax			# into protected mode
+	movw	$1, %dx			# protected mode (PE) bit
+	lmsw	%dx			# into protected mode
 
-	# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
-	ljmpl	*(startup_32_smp)
+	ljmpl	$__BOOT_CS, $pa_startup_32
+
+	.section ".text32","ax"
+	.code32
+ENTRY(startup_32)			# note: also used from wakeup_asm.S
+	jmp	*%eax
 
 	.data
 	.globl startup_32_smp, boot_gdt, trampoline_status
@@ -82,5 +87,4 @@ trampoline_status:
 	.long	0
 
 startup_32_smp:
-	.long	0x00000000
-	.word	__BOOT_CS, 0
+	.long	0
diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
index b61126cb599e..4c5c5f2bfbec 100644
--- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
@@ -124,9 +124,11 @@ wakeup_start:
 	lgdtl	pmode_gdt
 
 	/* This really couldn't... */
-	movl	pmode_cr0, %eax
-	movl	%eax, %cr0
-	ljmpl	*pmode_entry
+	movl	pmode_entry, %eax
+	movl	pmode_cr0, %ecx
+	movl	%ecx, %cr0
+	ljmpl	$__KERNEL_CS, $pa_startup_32
+	/* -> jmp *%eax in trampoline_32.S */
 #else
 	jmp	trampoline_data
 #endif
-- 
cgit v1.2.3


From e5684ec438a094bec0f7d5c52652c0901b48b613 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:37 +0300
Subject: x86, realmode: Replace open-coded ljmpw with a macro

We cannot code an ljmpw to the real-mode segment directly, because gas
refuses to assemble an ljmp with a symbolic segment.  Instead of
open-coding it everywhere, define a macro and use it for this case.

This is specifically an ljmpw from a 16-bit segment.  This is okay, as
one should never enter real mode from a 32-bit segment: if one do, the
CPU ends up in a bizarre (and useless) mode sometimes called "unreal
mode" where segments behave like real mode but the default address and
operand sizes is 32 bits.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-15-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/realmode.h          | 16 ++++++++++++++++
 arch/x86/realmode/rm/reboot_32.S         |  6 ++----
 arch/x86/realmode/rm/trampoline_32.S     |  5 ++---
 arch/x86/realmode/rm/trampoline_64.S     |  5 ++---
 arch/x86/realmode/rm/wakeup/wakeup_asm.S |  9 +++------
 5 files changed, 25 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/realmode/rm/realmode.h

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
new file mode 100644
index 000000000000..15ab6335f843
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.h
@@ -0,0 +1,16 @@
+#ifndef ARCH_X86_REALMODE_RM_REALMODE_H
+#define ARCH_X86_REALMODE_RM_REALMODE_H
+
+#ifdef __ASSEMBLY__
+
+/*
+ * 16-bit ljmpw to the real_mode_seg
+ *
+ * This must be open-coded since gas will choke on using a
+ * relocatable symbol for the segment portion.
+ */
+#define LJMPW_RM(to)	.byte 0xea ; .word (to), real_mode_seg
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index 83803c222b4a..e90f8c4bbae2 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -2,6 +2,7 @@
 #include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
+#include "realmode.h"
 
 /*
  * The following code and data reboots the machine by switching to real
@@ -82,10 +83,7 @@ machine_real_restart_asm16:
 2:
 	andb	$0x10, %dl
 	movl	%edx, %cr0
-	.byte	0xea			/* ljmpw */
-	.word	3f			/* Offset */
-	.word	real_mode_seg		/* Segment */
-
+	LJMPW_RM(3f)
 3:
 	testb	$0, %al
 	jz	bios
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 1315ef48dbf1..279f82ef7a9e 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -29,6 +29,7 @@
 #include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
+#include "realmode.h"
 
 	.text
 	.code16
@@ -38,9 +39,7 @@
 trampoline_data:
 	wbinvd			# Needed for NUMA-Q should be harmless for others
 
-	.byte	0xea		# ljmpw
-	.word	1f		# Offset
-	.word	real_mode_seg	# Segment
+	LJMPW_RM(1f)
 1:
 	mov	%cs, %ax	# Code and data in the same place
 	mov	%ax, %ds
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 77b72b45d705..7459c52f0c25 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -31,6 +31,7 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include "realmode.h"
 
 	.text
 	.balign PAGE_SIZE
@@ -40,9 +41,7 @@ ENTRY(trampoline_data)
 	cli			# We should be safe anyway
 	wbinvd
 
-	.byte	0xea		# ljmpw
-	.word	1f		# Offset
-	.word	real_mode_seg	# Segment
+	LJMPW_RM(1f)
 1:
 	mov	%cs, %ax	# Code and data in the same place
 	mov	%ax, %ds
diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
index 4c5c5f2bfbec..8064e1c3591b 100644
--- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
@@ -6,6 +6,7 @@
 #include <asm/page_types.h>
 #include <asm/pgtable_types.h>
 #include <asm/processor-flags.h>
+#include "../realmode.h"
 #include "wakeup.h"
 
 		.code16
@@ -36,9 +37,7 @@ wakeup_start:
 	cli
 	cld
 
-	.byte	0xea		/* ljmpw */
-	.word	3f
-	.word	real_mode_seg
+	LJMPW_RM(3f)
 3:
 	/* Apparently some dimwit BIOS programmers don't know how to
 	   program a PM to RM transition, and we might end up here with
@@ -59,9 +58,7 @@ wakeup_start:
 
 	andb	$~X86_CR0_PE, %al
 	movl	%eax, %cr0
-	.byte	0xea		/* ljmpw */
-	.word	3f
-	.word	real_mode_seg
+	LJMPW_RM(3f)
 3:
 	/* Set up segments */
 	movw	%cs, %ax
-- 
cgit v1.2.3


From be60828920d23758da8124bed771404a0438f369 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:38 +0300
Subject: x86, realmode: Move trampoline_*.S early in the link order

Move trampoline_*.S earlier in the link order so it ends up being
first in the text segment; since the SIPI vector requires 4K alignment
it otherwise ends up padding the .text segment with that much
completely unnecessarily.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-16-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 2432acb6b04f..2423142b4da4 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -12,8 +12,8 @@ subdir- := wakeup
 always := realmode.bin
 
 realmode-y			+= header.o
-realmode-$(CONFIG_X86_32)	+= reboot_32.o
 realmode-y			+= trampoline_$(BITS).o
+realmode-$(CONFIG_X86_32)	+= reboot_32.o
 realmode-$(CONFIG_ACPI_SLEEP)	+= wakeup/wakeup.o
 
 targets	+= $(realmode-y)
-- 
cgit v1.2.3


From 6feb592dceaed1a6cf26c9747b1180958d5156cd Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 8 May 2012 21:22:39 +0300
Subject: x86, realmode: Fix always-zero test in reboot_32.S

A test instruction is an "and", and an and with zero is always zero.
This would cause us to always take the BIOS path, not the APM path, in
case anyone actually cares...

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-17-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/reboot_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index e90f8c4bbae2..50ba994ba921 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -85,7 +85,7 @@ machine_real_restart_asm16:
 	movl	%edx, %cr0
 	LJMPW_RM(3f)
 3:
-	testb	$0, %al
+	andw	%ax, %ax
 	jz	bios
 
 apm:
-- 
cgit v1.2.3


From 8e029fcdd8702719c9179317cae9ef84ebe7027e Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:40 +0300
Subject: x86, realmode: fix 64-bit wakeup sequence

There were number of issues in wakeup sequence:

- Wakeup stack was placed in hardcoded address.
- NX bit in EFER was not enabled.
- Initialization incorrectly set physical address
of secondary_startup_64.
- Some alignment issues.

This patch fixes these issues and in addition:

- Unifies coding conventions in .S files.
- Sets alignments of code and data right.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-18-git-send-email-jarkko.sakkinen@intel.com
Originally-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <len.brown@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/realmode.c               |  2 +-
 arch/x86/realmode/rm/Makefile            |  1 +
 arch/x86/realmode/rm/header.S            |  2 +-
 arch/x86/realmode/rm/reboot_32.S         | 18 ++++----
 arch/x86/realmode/rm/stack.S             | 19 ++++++++
 arch/x86/realmode/rm/trampoline_32.S     | 29 ++++++------
 arch/x86/realmode/rm/trampoline_64.S     | 67 ++++++++++++----------------
 arch/x86/realmode/rm/wakeup/wakeup_asm.S | 75 +++++++++++++++-----------------
 arch/x86/realmode/rmpiggy.S              |  4 +-
 9 files changed, 110 insertions(+), 107 deletions(-)
 create mode 100644 arch/x86/realmode/rm/stack.S

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index d85ac20bb4eb..e7bf82a409bf 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -64,7 +64,7 @@ void __init setup_real_mode(void)
 	*((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt);
 #else
 	*((u64 *) __va(real_mode_header.startup_64_smp)) =
-		(u64) __pa(secondary_startup_64);
+		(u64)secondary_startup_64;
 
 	*((u64 *) __va(real_mode_header.level3_ident_pgt)) =
 		__pa(level3_ident_pgt) + _KERNPG_TABLE;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 2423142b4da4..c2c27a41ab8f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -13,6 +13,7 @@ always := realmode.bin
 
 realmode-y			+= header.o
 realmode-y			+= trampoline_$(BITS).o
+realmode-y			+= stack.o
 realmode-$(CONFIG_X86_32)	+= reboot_32.o
 realmode-$(CONFIG_ACPI_SLEEP)	+= wakeup/wakeup.o
 
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 730b1316c099..a91ec8f6b15f 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -9,7 +9,7 @@
 
 		.section ".header", "a"
 
-ENTRY(real_mode_header)
+GLOBAL(real_mode_header)
 		.long	pa_text_start
 		.long	pa_ro_end
 		.long	pa_end
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index 50ba994ba921..8d9bfd13a93e 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -16,10 +16,9 @@
  */
 	.section ".text32", "ax"
 	.code32
-	.globl machine_real_restart_asm
 
-	.balign 16
-machine_real_restart_asm:
+	.balign	16
+ENTRY(machine_real_restart_asm)
 	/* Set up the IDT for real mode. */
 	lidtl	pa_machine_real_restart_idt
 
@@ -67,7 +66,7 @@ machine_real_restart_asm:
 	.text
 	.code16
 
-	.balign 16
+	.balign	16
 machine_real_restart_asm16:
 1:
 	xorl	%ecx, %ecx
@@ -102,15 +101,15 @@ bios:
 	ljmpw	$0xf000, $0xfff0
 
 	.section ".rodata", "a"
-	.globl	machine_real_restart_idt, machine_real_restart_gdt
 
-	.balign 16
-machine_real_restart_idt:
+	.balign	16
+GLOBAL(machine_real_restart_idt)
 	.word	0xffff		/* Length - real mode default value */
 	.long	0		/* Base - real mode default value */
+END(machine_real_restart_idt)
 
-	.balign 16
-machine_real_restart_gdt:
+	.balign	16
+GLOBAL(machine_real_restart_gdt)
 	/* Self-pointer */
 	.word	0xffff		/* Length - real mode default value */
 	.long	pa_machine_real_restart_gdt
@@ -130,3 +129,4 @@ machine_real_restart_gdt:
 	 * semantics we don't have to reload the segments once CR0.PE = 0.
 	 */
 	.quad	GDT_ENTRY(0x0093, 0x100, 0xffff)
+END(machine_real_restart_gdt)
diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S
new file mode 100644
index 000000000000..867ae87adfae
--- /dev/null
+++ b/arch/x86/realmode/rm/stack.S
@@ -0,0 +1,19 @@
+/*
+ * Common heap and stack allocations
+ */
+
+#include <linux/linkage.h>
+
+	.data
+GLOBAL(HEAP)
+	.long	rm_heap
+GLOBAL(heap_end)
+	.long	rm_stack
+
+	.bss
+	.balign	16
+GLOBAL(rm_heap)
+	.space	2048
+GLOBAL(rm_stack)
+	.space	2048
+GLOBAL(rm_stack_end)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 279f82ef7a9e..1ecdbb59191b 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -33,10 +33,9 @@
 
 	.text
 	.code16
-	.globl trampoline_data
 
-	.balign PAGE_SIZE
-trampoline_data:
+	.balign	PAGE_SIZE
+ENTRY(trampoline_data)
 	wbinvd			# Needed for NUMA-Q should be harmless for others
 
 	LJMPW_RM(1f)
@@ -70,20 +69,22 @@ trampoline_data:
 ENTRY(startup_32)			# note: also used from wakeup_asm.S
 	jmp	*%eax
 
-	.data
-	.globl startup_32_smp, boot_gdt, trampoline_status
-	.balign	4
-boot_gdt_descr:
-	.word	__BOOT_DS + 7			# gdt limit
-boot_gdt:
-	.long	0				# gdt base
+	.section ".rodata","a"
 
+	.balign	4
 boot_idt_descr:
 	.word	0				# idt limit = 0
 	.long	0				# idt base = 0L
 
-trampoline_status:
-	.long	0
+	.data
 
-startup_32_smp:
-	.long	0
+boot_gdt_descr:
+	.word	__BOOT_DS + 7			# gdt limit
+GLOBAL(boot_gdt)
+	.long	0				# gdt base
+
+	.bss
+
+	.balign	4
+GLOBAL(trampoline_status)	.space	4
+GLOBAL(startup_32_smp)		.space	4
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 7459c52f0c25..f71ea0800d3d 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -52,7 +52,7 @@ ENTRY(trampoline_data)
 	# write marker for master knows we're running
 
 	# Setup stack
-	movw	$trampoline_stack_end, %sp
+	movl	$rm_stack_end, %esp
 
 	call	verify_cpu		# Verify the cpu supports long mode
 	testl   %eax, %eax		# Check for return code
@@ -68,8 +68,11 @@ ENTRY(trampoline_data)
 	lidtl	tidt	# load idt with 0, 0
 	lgdtl	tgdt	# load gdt with whatever is appropriate
 
-	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit
-	lmsw	%ax			# into protected mode
+	movw	$__KERNEL_DS, %dx	# Data segment descriptor
+
+	# Enable protected mode
+	movl	$X86_CR0_PE, %eax	# protected mode (PE) bit
+	movl	%eax, %cr0		# into protected mode
 
 	# flush prefetch and jump to startup_32
 	ljmpl	$__KERNEL32_CS, $pa_startup_32
@@ -83,27 +86,27 @@ no_longmode:
 	.code32
 	.balign 4
 ENTRY(startup_32)
-	movl	$__KERNEL_DS, %eax	# Initialize the %ds segment register
-	movl	%eax, %ds
+	movl	%edx, %ss
+	addl	$pa_real_mode_base, %esp
+	movl	%edx, %ds
+	movl	%edx, %es
+	movl	%edx, %fs
+	movl	%edx, %gs
 
 	movl	$X86_CR4_PAE, %eax
 	movl	%eax, %cr4		# Enable PAE mode
 
-	movl	pa_startup_64_smp, %esi
-	movl	pa_startup_64_smp_high, %edi
-
-					# Setup trampoline 4 level pagetables
-	leal	pa_trampoline_level4_pgt, %eax
+	# Setup trampoline 4 level pagetables
+	movl	$pa_level3_ident_pgt, %eax
 	movl	%eax, %cr3
 
 	movl	$MSR_EFER, %ecx
-	movl	$(1 << _EFER_LME), %eax	# Enable Long Mode
+	movl	$((1 << _EFER_LME) | (1 << _EFER_NX)), %eax	# Enable Long Mode
 	xorl	%edx, %edx
 	wrmsr
 
 	# Enable paging and in turn activate Long Mode
-	# Enable protected mode
-	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
+	movl	$(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
 	movl	%eax, %cr0
 
 	/*
@@ -119,10 +122,7 @@ ENTRY(startup_32)
 	.balign 4
 ENTRY(startup_64)
 	# Now jump into the kernel using virtual addresses
-	movl	%edi, %eax
-	shlq	$32, %rax
-	addl	%esi, %eax
-	jmp	*%rax
+	jmpq	*startup_64_smp(%rip)
 
 	.section ".rodata","a"
 	.balign	16
@@ -132,10 +132,10 @@ tidt:
 
 	# Duplicate the global descriptor table
 	# so the kernel can live anywhere
-	.balign 4
+	.balign 16
 	.globl tgdt
 tgdt:
-	.short	tgdt_end - tgdt		# gdt limit
+	.short	tgdt_end - tgdt - 1	# gdt limit
 	.long	pa_tgdt
 	.short	0
 	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
@@ -143,23 +143,12 @@ tgdt:
 	.quad	0x00cf93000000ffff	# __KERNEL_DS
 tgdt_end:
 
-	.data
-	.balign 4
-GLOBAL(trampoline_status)
-	.long	0
-
-trampoline_stack:
-	.org 0x1000
-trampoline_stack_end:
-
-	.globl	level3_ident_pgt
-	.globl	level3_kernel_pgt
-GLOBAL(trampoline_level4_pgt)
-	level3_ident_pgt:	.quad	0
-	.fill 510,8,0
-	level3_kernel_pgt:	.quad	0
-
-	.globl	startup_64_smp
-	.globl	startup_64_smp_high
-startup_64_smp:		.long 0
-startup_64_smp_high:	.long 0
+	.bss
+
+	.balign	PAGE_SIZE
+GLOBAL(level3_ident_pgt)	.space	511*8
+GLOBAL(level3_kernel_pgt)	.space	8
+
+	.balign	8
+GLOBAL(startup_64_smp)		.space	8
+GLOBAL(trampoline_status)	.space	4
diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
index 8064e1c3591b..f81c1cd99eaf 100644
--- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
@@ -1,6 +1,7 @@
 /*
  * ACPI wakeup real mode startup stub
  */
+#include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/msr-index.h>
 #include <asm/page_types.h>
@@ -9,31 +10,33 @@
 #include "../realmode.h"
 #include "wakeup.h"
 
-		.code16
+	.code16
 
 /* This should match the structure in wakeup.h */
-		.section ".data", "aw"
-		.globl	wakeup_header
-wakeup_header:
-video_mode:	.short	0	/* Video mode number */
-pmode_entry:	.long	0
-pmode_cs:	.short	__KERNEL_CS
-pmode_cr0:	.long	0	/* Saved %cr0 */
-pmode_cr3:	.long	0	/* Saved %cr3 */
-pmode_cr4:	.long	0	/* Saved %cr4 */
-pmode_efer:	.quad	0	/* Saved EFER */
-pmode_gdt:	.quad	0
-pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
-pmode_behavior:	.long	0	/* Wakeup behavior flags */
-realmode_flags:	.long	0
-real_magic:	.long	0
-signature:	.long	WAKEUP_HEADER_SIGNATURE
-		.size	wakeup_header, .-wakeup_header
+	.section ".data", "aw"
+
+	.balign	16
+GLOBAL(wakeup_header)
+	video_mode:	.short	0	/* Video mode number */
+	pmode_entry:	.long	0
+	pmode_cs:	.short	__KERNEL_CS
+	pmode_cr0:	.long	0	/* Saved %cr0 */
+	pmode_cr3:	.long	0	/* Saved %cr3 */
+	pmode_cr4:	.long	0	/* Saved %cr4 */
+	pmode_efer:	.quad	0	/* Saved EFER */
+	pmode_gdt:	.quad	0
+	pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
+	pmode_behavior:	.long	0	/* Wakeup behavior flags */
+	realmode_flags:	.long	0
+	real_magic:	.long	0
+	signature:	.long	WAKEUP_HEADER_SIGNATURE
+END(wakeup_header)
 
 	.text
 	.code16
-	.globl	wakeup_start
-wakeup_start:
+
+	.balign	16
+ENTRY(wakeup_start)
 	cli
 	cld
 
@@ -62,12 +65,14 @@ wakeup_start:
 3:
 	/* Set up segments */
 	movw	%cs, %ax
+	movw	%ax, %ss
+	movl	$rm_stack_end, %esp
 	movw	%ax, %ds
 	movw	%ax, %es
-	movw	%ax, %ss
-	lidtl	wakeup_idt
+	movw	%ax, %fs
+	movw	%ax, %gs
 
-	movl	$wakeup_stack_end, %esp
+	lidtl	wakeup_idt
 
 	/* Clear the EFLAGS */
 	pushl	$0
@@ -145,9 +150,8 @@ bogus_real_magic:
 	 * be the case for other laptops or integrated video devices.
 	 */
 
-	.globl	wakeup_gdt
 	.balign	16
-wakeup_gdt:
+GLOBAL(wakeup_gdt)
 	.word	3*8-1		/* Self-descriptor */
 	.long	pa_wakeup_gdt
 	.word	0
@@ -159,29 +163,18 @@ wakeup_gdt:
 	.word	0xffff		/* 16-bit data segment @ real_mode_base */
 	.long	0x93000000 + pa_real_mode_base
 	.word	0x008f		/* big real mode */
-	.size	wakeup_gdt, .-wakeup_gdt
+END(wakeup_gdt)
 
-	.data
+	.section ".rodata","a"
 	.balign	8
 
 	/* This is the standard real-mode IDT */
-wakeup_idt:
+	.balign	16
+GLOBAL(wakeup_idt)
 	.word	0xffff		/* limit */
 	.long	0		/* address */
 	.word	0
-
-	.globl	HEAP, heap_end
-HEAP:
-	.long	wakeup_heap
-heap_end:
-	.long	wakeup_stack
-
-	.bss
-wakeup_heap:
-	.space	2048
-wakeup_stack:
-	.space	2048
-wakeup_stack_end:
+END(wakeup_idt)
 
 	.section ".signature","a"
 end_signature:
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
index 6047d7f604cf..fd72a99d12ae 100644
--- a/arch/x86/realmode/rmpiggy.S
+++ b/arch/x86/realmode/rmpiggy.S
@@ -9,10 +9,10 @@
 
 	.balign PAGE_SIZE
 
-ENTRY(real_mode_blob)
+GLOBAL(real_mode_blob)
 	.incbin	"arch/x86/realmode/rm/realmode.bin"
 END(real_mode_blob)
 
-ENTRY(real_mode_relocs)
+GLOBAL(real_mode_relocs)
 	.incbin	"arch/x86/realmode/rm/realmode.relocs"
 END(real_mode_relocs)
-- 
cgit v1.2.3


From b429dbf6e866bd6dadb56fae66f61f611cde57ff Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:41 +0300
Subject: x86, realmode: don't copy real_mode_header

Replaced copying of real_mode_header with a pointer
to beginning of RM memory.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-19-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/realmode.h     |  5 ++--
 arch/x86/kernel/acpi/sleep.c        |  2 +-
 arch/x86/kernel/realmode.c          | 57 ++++++++++++++++---------------------
 arch/x86/kernel/reboot.c            |  2 +-
 arch/x86/kernel/smpboot.c           |  4 +--
 arch/x86/kernel/tboot.c             |  2 +-
 arch/x86/realmode/rm/header.S       |  1 -
 arch/x86/realmode/rm/realmode.lds.S |  1 -
 arch/x86/realmode/rmpiggy.S         |  2 ++
 drivers/acpi/sleep.c                |  2 +-
 10 files changed, 35 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 1bfc74d213a4..d3ae49f4c3ef 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -8,7 +8,6 @@
 struct real_mode_header {
 	u32	text_start;
 	u32	ro_end;
-	u32	end;
 	/* reboot */
 #ifdef CONFIG_X86_32
 	u32	machine_real_restart_asm;
@@ -30,8 +29,8 @@ struct real_mode_header {
 #endif
 } __attribute__((__packed__));
 
-extern struct real_mode_header real_mode_header;
-extern unsigned char *real_mode_base;
+extern struct real_mode_header *real_mode_header;
+extern unsigned char real_mode_blob_end[];
 
 extern unsigned long init_rsp;
 extern unsigned long initial_code;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d941b62da4b6..6ca3f54ebe7d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -38,7 +38,7 @@ asmlinkage void acpi_enter_s3(void)
 int acpi_suspend_lowlevel(void)
 {
 	struct wakeup_header *header =
-		(struct wakeup_header *) __va(real_mode_header.wakeup_header);
+		(struct wakeup_header *) __va(real_mode_header->wakeup_header);
 
 	if (header->signature != WAKEUP_HEADER_SIGNATURE) {
 		printk(KERN_ERR "wakeup header does not match\n");
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index e7bf82a409bf..632c810ec8ea 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -5,8 +5,7 @@
 #include <asm/pgtable.h>
 #include <asm/realmode.h>
 
-unsigned char *real_mode_base;
-struct real_mode_header real_mode_header;
+struct real_mode_header *real_mode_header;
 
 void __init setup_real_mode(void)
 {
@@ -17,33 +16,32 @@ void __init setup_real_mode(void)
 	u32 *ptr;
 	u16 *seg;
 	int i;
+	unsigned char *base;
 
-	struct real_mode_header *header =
-		(struct real_mode_header *) real_mode_blob;
-
-	size_t size = PAGE_ALIGN(header->end);
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 
 	/* Has to be in very low memory so we can execute real-mode AP code. */
 	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
 	if (!mem)
 		panic("Cannot allocate trampoline\n");
 
-	real_mode_base = __va(mem);
+	base = __va(mem);
 	memblock_reserve(mem, size);
+	real_mode_header = (struct real_mode_header *) base;
 
 	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       real_mode_base, (unsigned long long)mem, size);
+	       base, (unsigned long long)mem, size);
 
-	memcpy(real_mode_base, real_mode_blob, size);
+	memcpy(base, real_mode_blob, size);
 
-	real_mode_seg = __pa(real_mode_base) >> 4;
+	real_mode_seg = __pa(base) >> 4;
 	rel = (u32 *) real_mode_relocs;
 
 	/* 16-bit segment relocations. */
 	count = rel[0];
 	rel = &rel[1];
 	for (i = 0; i < count; i++) {
-		seg = (u16 *) (real_mode_base + rel[i]);
+		seg = (u16 *) (base + rel[i]);
 		*seg = real_mode_seg;
 	}
 
@@ -51,25 +49,21 @@ void __init setup_real_mode(void)
 	count = rel[i];
 	rel =  &rel[i + 1];
 	for (i = 0; i < count; i++) {
-		ptr = (u32 *) (real_mode_base + rel[i]);
-		*ptr += __pa(real_mode_base);
+		ptr = (u32 *) (base + rel[i]);
+		*ptr += __pa(base);
 	}
 
-	/* Copied header will contain relocated physical addresses. */
-	memcpy(&real_mode_header, real_mode_base,
-	       sizeof(struct real_mode_header));
-
 #ifdef CONFIG_X86_32
-	*((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp);
-	*((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt);
+	*((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp);
+	*((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt);
 #else
-	*((u64 *) __va(real_mode_header.startup_64_smp)) =
+	*((u64 *) __va(real_mode_header->startup_64_smp)) =
 		(u64)secondary_startup_64;
 
-	*((u64 *) __va(real_mode_header.level3_ident_pgt)) =
+	*((u64 *) __va(real_mode_header->level3_ident_pgt)) =
 		__pa(level3_ident_pgt) + _KERNPG_TABLE;
 
-	*((u64 *) __va(real_mode_header.level3_kernel_pgt)) =
+	*((u64 *) __va(real_mode_header->level3_kernel_pgt)) =
 		__pa(level3_kernel_pgt) + _KERNPG_TABLE;
 #endif
 }
@@ -82,23 +76,22 @@ void __init setup_real_mode(void)
  */
 static int __init set_real_mode_permissions(void)
 {
-	size_t all_size =
-		PAGE_ALIGN(real_mode_header.end) -
-		__pa(real_mode_base);
+	unsigned char *base = (unsigned char *) real_mode_header;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 
 	size_t ro_size =
-		PAGE_ALIGN(real_mode_header.ro_end) -
-		__pa(real_mode_base);
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		__pa(base);
 
 	size_t text_size =
-		PAGE_ALIGN(real_mode_header.ro_end) -
-		real_mode_header.text_start;
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		real_mode_header->text_start;
 
 	unsigned long text_start =
-		(unsigned long) __va(real_mode_header.text_start);
+		(unsigned long) __va(real_mode_header->text_start);
 
-	set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT);
-	set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT);
+	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
+	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
 	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
 
 	return 0;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 050eff29a4bb..658f856f09a3 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -336,7 +336,7 @@ core_initcall(reboot_init);
 void machine_real_restart(unsigned int type)
 {
 	void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
-		real_mode_header.machine_real_restart_asm;
+		real_mode_header->machine_real_restart_asm;
 
 	local_irq_disable();
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c7971ea74bd0..b8c0661e2341 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -665,9 +665,9 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 {
 	volatile u32 *trampoline_status =
-		(volatile u32 *) __va(real_mode_header.trampoline_status);
+		(volatile u32 *) __va(real_mode_header->trampoline_status);
 	/* start_ip had better be page-aligned! */
-	unsigned long start_ip = real_mode_header.trampoline_data;
+	unsigned long start_ip = real_mode_header->trampoline_data;
 
 	unsigned long boot_error = 0;
 	int timeout;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c136e2325062..65adda4fde93 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -202,7 +202,7 @@ static int tboot_setup_sleep(void)
 	}
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector =
-		real_mode_header.wakeup_start;
+		real_mode_header->wakeup_start;
 
 	return 0;
 }
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index a91ec8f6b15f..c83005c4d455 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -12,7 +12,6 @@
 GLOBAL(real_mode_header)
 		.long	pa_text_start
 		.long	pa_ro_end
-		.long	pa_end
 #ifdef CONFIG_X86_32
 		.long	pa_machine_real_restart_asm
 #endif
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
index 4d4afcaf5f02..86b2e8d6b1f1 100644
--- a/arch/x86/realmode/rm/realmode.lds.S
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -65,7 +65,6 @@ SECTIONS
 	.signature : {
 		*(.signature)
 	}
-	pa_end = .;
 
 	/DISCARD/ : {
 		*(.note*)
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
index fd72a99d12ae..204c6ece0e97 100644
--- a/arch/x86/realmode/rmpiggy.S
+++ b/arch/x86/realmode/rmpiggy.S
@@ -13,6 +13,8 @@ GLOBAL(real_mode_blob)
 	.incbin	"arch/x86/realmode/rm/realmode.bin"
 END(real_mode_blob)
 
+GLOBAL(real_mode_blob_end);
+
 GLOBAL(real_mode_relocs)
 	.incbin	"arch/x86/realmode/rm/realmode.relocs"
 END(real_mode_relocs)
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index e77aa4a1c9f6..06139005c4dd 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -93,7 +93,7 @@ static struct notifier_block tts_notifier = {
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
-	unsigned long wakeup_pa = real_mode_header.wakeup_start;
+	unsigned long wakeup_pa = real_mode_header->wakeup_start;
 	/* do we have a wakeup address for S2 and S3? */
 	if (acpi_state == ACPI_STATE_S3) {
 		if (!wakeup_pa)
-- 
cgit v1.2.3


From c4845474a01f699966272536e8416222e3f2d2cb Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:42 +0300
Subject: x86, realmode: flattened rm hierachy

Simplified hierarchy under rm directory to a flat
directory because it is not anymore really justified
to have own directory for wakeup code. It only adds
more complexity.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-20-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/acpi/sleep.c             |   2 +-
 arch/x86/realmode/rm/Makefile            |  20 ++--
 arch/x86/realmode/rm/bioscall.S          |   1 +
 arch/x86/realmode/rm/copy.S              |   1 +
 arch/x86/realmode/rm/regs.c              |   1 +
 arch/x86/realmode/rm/video-bios.c        |   1 +
 arch/x86/realmode/rm/video-mode.c        |   1 +
 arch/x86/realmode/rm/video-vesa.c        |   1 +
 arch/x86/realmode/rm/video-vga.c         |   1 +
 arch/x86/realmode/rm/wakemain.c          |  82 ++++++++++++++
 arch/x86/realmode/rm/wakeup.h            |  41 +++++++
 arch/x86/realmode/rm/wakeup/.gitignore   |   3 -
 arch/x86/realmode/rm/wakeup/Makefile     |  33 ------
 arch/x86/realmode/rm/wakeup/bioscall.S   |   1 -
 arch/x86/realmode/rm/wakeup/copy.S       |   1 -
 arch/x86/realmode/rm/wakeup/regs.c       |   1 -
 arch/x86/realmode/rm/wakeup/video-bios.c |   1 -
 arch/x86/realmode/rm/wakeup/video-mode.c |   1 -
 arch/x86/realmode/rm/wakeup/video-vesa.c |   1 -
 arch/x86/realmode/rm/wakeup/video-vga.c  |   1 -
 arch/x86/realmode/rm/wakeup/wakemain.c   |  82 --------------
 arch/x86/realmode/rm/wakeup/wakeup.h     |  41 -------
 arch/x86/realmode/rm/wakeup/wakeup_asm.S | 181 -------------------------------
 arch/x86/realmode/rm/wakeup_asm.S        | 181 +++++++++++++++++++++++++++++++
 24 files changed, 325 insertions(+), 355 deletions(-)
 create mode 100644 arch/x86/realmode/rm/bioscall.S
 create mode 100644 arch/x86/realmode/rm/copy.S
 create mode 100644 arch/x86/realmode/rm/regs.c
 create mode 100644 arch/x86/realmode/rm/video-bios.c
 create mode 100644 arch/x86/realmode/rm/video-mode.c
 create mode 100644 arch/x86/realmode/rm/video-vesa.c
 create mode 100644 arch/x86/realmode/rm/video-vga.c
 create mode 100644 arch/x86/realmode/rm/wakemain.c
 create mode 100644 arch/x86/realmode/rm/wakeup.h
 delete mode 100644 arch/x86/realmode/rm/wakeup/.gitignore
 delete mode 100644 arch/x86/realmode/rm/wakeup/Makefile
 delete mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S
 delete mode 100644 arch/x86/realmode/rm/wakeup/copy.S
 delete mode 100644 arch/x86/realmode/rm/wakeup/regs.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/video-mode.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/video-vesa.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c
 delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h
 delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S
 create mode 100644 arch/x86/realmode/rm/wakeup_asm.S

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 6ca3f54ebe7d..95bf99de9058 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,7 +16,7 @@
 #include <asm/cacheflush.h>
 #include <asm/realmode.h>
 
-#include "../../realmode/rm/wakeup/wakeup.h"
+#include "../../realmode/rm/wakeup.h"
 #include "sleep.h"
 
 unsigned long acpi_realmode_flags;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index c2c27a41ab8f..fc8854b09dfa 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,21 +7,26 @@
 #
 #
 
-subdir- := wakeup
-
 always := realmode.bin
 
 realmode-y			+= header.o
 realmode-y			+= trampoline_$(BITS).o
 realmode-y			+= stack.o
 realmode-$(CONFIG_X86_32)	+= reboot_32.o
-realmode-$(CONFIG_ACPI_SLEEP)	+= wakeup/wakeup.o
+realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)
+
+wakeup-objs	:= wakeup_asm.o wakemain.o video-mode.o
+wakeup-objs	+= copy.o bioscall.o regs.o
+# The link order of the video-*.o modules can matter.  In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-objs	+= video-vga.o
+wakeup-objs	+= video-vesa.o
+wakeup-objs	+= video-bios.o
 
 targets	+= $(realmode-y)
 
-$(obj)/wakeup/wakeup.o: FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/wakeup $@
-
 REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
 
 sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
@@ -55,7 +60,8 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
 
 # How to compile the 16-bit code.  Note we always compile for -march=i386,
 # that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \
+KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
+		   -I$(srctree)/arch/x86/boot \
 		   -DDISABLE_BRANCH_PROFILING \
 		   -Wall -Wstrict-prototypes \
 		   -march=i386 -mregparm=3 \
diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S
new file mode 100644
index 000000000000..16162d197918
--- /dev/null
+++ b/arch/x86/realmode/rm/bioscall.S
@@ -0,0 +1 @@
+#include "../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S
new file mode 100644
index 000000000000..b785e6f38fdd
--- /dev/null
+++ b/arch/x86/realmode/rm/copy.S
@@ -0,0 +1 @@
+#include "../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c
new file mode 100644
index 000000000000..fbb15b9f9ca9
--- /dev/null
+++ b/arch/x86/realmode/rm/regs.c
@@ -0,0 +1 @@
+#include "../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c
new file mode 100644
index 000000000000..848b25aaf11b
--- /dev/null
+++ b/arch/x86/realmode/rm/video-bios.c
@@ -0,0 +1 @@
+#include "../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c
new file mode 100644
index 000000000000..2a98b7e2368b
--- /dev/null
+++ b/arch/x86/realmode/rm/video-mode.c
@@ -0,0 +1 @@
+#include "../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/video-vesa.c b/arch/x86/realmode/rm/video-vesa.c
new file mode 100644
index 000000000000..413edddb51e5
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vesa.c
@@ -0,0 +1 @@
+#include "../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c
new file mode 100644
index 000000000000..3085f5c9d288
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vga.c
@@ -0,0 +1 @@
+#include "../../boot/video-vga.c"
diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c
new file mode 100644
index 000000000000..91405d515ec6
--- /dev/null
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -0,0 +1,82 @@
+#include "wakeup.h"
+#include "boot.h"
+
+static void udelay(int loops)
+{
+	while (loops--)
+		io_delay();	/* Approximately 1 us */
+}
+
+static void beep(unsigned int hz)
+{
+	u8 enable;
+
+	if (!hz) {
+		enable = 0x00;		/* Turn off speaker */
+	} else {
+		u16 div = 1193181/hz;
+
+		outb(0xb6, 0x43);	/* Ctr 2, squarewave, load, binary */
+		io_delay();
+		outb(div, 0x42);	/* LSB of counter */
+		io_delay();
+		outb(div >> 8, 0x42);	/* MSB of counter */
+		io_delay();
+
+		enable = 0x03;		/* Turn on speaker */
+	}
+	inb(0x61);		/* Dummy read of System Control Port B */
+	io_delay();
+	outb(enable, 0x61);	/* Enable timer 2 output to speaker */
+	io_delay();
+}
+
+#define DOT_HZ		880
+#define DASH_HZ		587
+#define US_PER_DOT	125000
+
+/* Okay, this is totally silly, but it's kind of fun. */
+static void send_morse(const char *pattern)
+{
+	char s;
+
+	while ((s = *pattern++)) {
+		switch (s) {
+		case '.':
+			beep(DOT_HZ);
+			udelay(US_PER_DOT);
+			beep(0);
+			udelay(US_PER_DOT);
+			break;
+		case '-':
+			beep(DASH_HZ);
+			udelay(US_PER_DOT * 3);
+			beep(0);
+			udelay(US_PER_DOT);
+			break;
+		default:	/* Assume it's a space */
+			udelay(US_PER_DOT * 3);
+			break;
+		}
+	}
+}
+
+void main(void)
+{
+	/* Kill machine if structures are wrong */
+	if (wakeup_header.real_magic != 0x12345678)
+		while (1)
+			;
+
+	if (wakeup_header.realmode_flags & 4)
+		send_morse("...-");
+
+	if (wakeup_header.realmode_flags & 1)
+		asm volatile("lcallw   $0xc000,$3");
+
+	if (wakeup_header.realmode_flags & 2) {
+		/* Need to call BIOS */
+		probe_cards(0);
+		set_mode(wakeup_header.video_mode);
+	}
+}
diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h
new file mode 100644
index 000000000000..2dfaf06b8af1
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -0,0 +1,41 @@
+/*
+ * Definitions for the wakeup data structure at the head of the
+ * wakeup code.
+ */
+
+#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/* This must match data at wakeup.S */
+struct wakeup_header {
+	u16 video_mode;		/* Video mode number */
+	u32 pmode_entry;	/* Protected mode resume point, 32-bit only */
+	u16 pmode_cs;
+	u32 pmode_cr0;		/* Protected mode cr0 */
+	u32 pmode_cr3;		/* Protected mode cr3 */
+	u32 pmode_cr4;		/* Protected mode cr4 */
+	u32 pmode_efer_low;	/* Protected mode EFER */
+	u32 pmode_efer_high;
+	u64 pmode_gdt;
+	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
+	u32 pmode_misc_en_high;
+	u32 pmode_behavior;	/* Wakeup routine behavior flags */
+	u32 realmode_flags;
+	u32 real_magic;
+	u32 signature;		/* To check we have correct structure */
+} __attribute__((__packed__));
+
+extern struct wakeup_header wakeup_header;
+#endif
+
+#define WAKEUP_HEADER_OFFSET	8
+#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
+#define WAKEUP_END_SIGNATURE	0x65a22c82
+
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
+
+#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/realmode/rm/wakeup/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile
deleted file mode 100644
index 4c8533240cdd..000000000000
--- a/arch/x86/realmode/rm/wakeup/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always		:= wakeup.o
-
-wakeup-y	+= wakeup_asm.o wakemain.o video-mode.o
-wakeup-y	+= copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter.  In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y	+= video-vga.o
-wakeup-y	+= video-vesa.o
-wakeup-y	+= video-bios.o
-
-targets		+= $(wakeup-y)
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.o := -m elf_i386 -r
-$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE
-	$(call if_changed,ld)
-
-bootsrc := $(src)/../../../boot
-
-ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc)
-asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc)
diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/realmode/rm/wakeup/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/realmode/rm/wakeup/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/realmode/rm/wakeup/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/realmode/rm/wakeup/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/realmode/rm/wakeup/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/realmode/rm/wakeup/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c b/arch/x86/realmode/rm/wakeup/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/realmode/rm/wakeup/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c
deleted file mode 100644
index 91405d515ec6..000000000000
--- a/arch/x86/realmode/rm/wakeup/wakemain.c
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "wakeup.h"
-#include "boot.h"
-
-static void udelay(int loops)
-{
-	while (loops--)
-		io_delay();	/* Approximately 1 us */
-}
-
-static void beep(unsigned int hz)
-{
-	u8 enable;
-
-	if (!hz) {
-		enable = 0x00;		/* Turn off speaker */
-	} else {
-		u16 div = 1193181/hz;
-
-		outb(0xb6, 0x43);	/* Ctr 2, squarewave, load, binary */
-		io_delay();
-		outb(div, 0x42);	/* LSB of counter */
-		io_delay();
-		outb(div >> 8, 0x42);	/* MSB of counter */
-		io_delay();
-
-		enable = 0x03;		/* Turn on speaker */
-	}
-	inb(0x61);		/* Dummy read of System Control Port B */
-	io_delay();
-	outb(enable, 0x61);	/* Enable timer 2 output to speaker */
-	io_delay();
-}
-
-#define DOT_HZ		880
-#define DASH_HZ		587
-#define US_PER_DOT	125000
-
-/* Okay, this is totally silly, but it's kind of fun. */
-static void send_morse(const char *pattern)
-{
-	char s;
-
-	while ((s = *pattern++)) {
-		switch (s) {
-		case '.':
-			beep(DOT_HZ);
-			udelay(US_PER_DOT);
-			beep(0);
-			udelay(US_PER_DOT);
-			break;
-		case '-':
-			beep(DASH_HZ);
-			udelay(US_PER_DOT * 3);
-			beep(0);
-			udelay(US_PER_DOT);
-			break;
-		default:	/* Assume it's a space */
-			udelay(US_PER_DOT * 3);
-			break;
-		}
-	}
-}
-
-void main(void)
-{
-	/* Kill machine if structures are wrong */
-	if (wakeup_header.real_magic != 0x12345678)
-		while (1)
-			;
-
-	if (wakeup_header.realmode_flags & 4)
-		send_morse("...-");
-
-	if (wakeup_header.realmode_flags & 1)
-		asm volatile("lcallw   $0xc000,$3");
-
-	if (wakeup_header.realmode_flags & 2) {
-		/* Need to call BIOS */
-		probe_cards(0);
-		set_mode(wakeup_header.video_mode);
-	}
-}
diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h
deleted file mode 100644
index 2dfaf06b8af1..000000000000
--- a/arch/x86/realmode/rm/wakeup/wakeup.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Definitions for the wakeup data structure at the head of the
- * wakeup code.
- */
-
-#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* This must match data at wakeup.S */
-struct wakeup_header {
-	u16 video_mode;		/* Video mode number */
-	u32 pmode_entry;	/* Protected mode resume point, 32-bit only */
-	u16 pmode_cs;
-	u32 pmode_cr0;		/* Protected mode cr0 */
-	u32 pmode_cr3;		/* Protected mode cr3 */
-	u32 pmode_cr4;		/* Protected mode cr4 */
-	u32 pmode_efer_low;	/* Protected mode EFER */
-	u32 pmode_efer_high;
-	u64 pmode_gdt;
-	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
-	u32 pmode_misc_en_high;
-	u32 pmode_behavior;	/* Wakeup routine behavior flags */
-	u32 realmode_flags;
-	u32 real_magic;
-	u32 signature;		/* To check we have correct structure */
-} __attribute__((__packed__));
-
-extern struct wakeup_header wakeup_header;
-#endif
-
-#define WAKEUP_HEADER_OFFSET	8
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE	0x65a22c82
-
-/* Wakeup behavior bits */
-#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
-
-#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S
deleted file mode 100644
index f81c1cd99eaf..000000000000
--- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-#include "../realmode.h"
-#include "wakeup.h"
-
-	.code16
-
-/* This should match the structure in wakeup.h */
-	.section ".data", "aw"
-
-	.balign	16
-GLOBAL(wakeup_header)
-	video_mode:	.short	0	/* Video mode number */
-	pmode_entry:	.long	0
-	pmode_cs:	.short	__KERNEL_CS
-	pmode_cr0:	.long	0	/* Saved %cr0 */
-	pmode_cr3:	.long	0	/* Saved %cr3 */
-	pmode_cr4:	.long	0	/* Saved %cr4 */
-	pmode_efer:	.quad	0	/* Saved EFER */
-	pmode_gdt:	.quad	0
-	pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
-	pmode_behavior:	.long	0	/* Wakeup behavior flags */
-	realmode_flags:	.long	0
-	real_magic:	.long	0
-	signature:	.long	WAKEUP_HEADER_SIGNATURE
-END(wakeup_header)
-
-	.text
-	.code16
-
-	.balign	16
-ENTRY(wakeup_start)
-	cli
-	cld
-
-	LJMPW_RM(3f)
-3:
-	/* Apparently some dimwit BIOS programmers don't know how to
-	   program a PM to RM transition, and we might end up here with
-	   junk in the data segment descriptor registers.  The only way
-	   to repair that is to go into PM and fix it ourselves... */
-	movw	$16, %cx
-	lgdtl	%cs:wakeup_gdt
-	movl	%cr0, %eax
-	orb	$X86_CR0_PE, %al
-	movl	%eax, %cr0
-	ljmpw	$8, $2f
-2:
-	movw	%cx, %ds
-	movw	%cx, %es
-	movw	%cx, %ss
-	movw	%cx, %fs
-	movw	%cx, %gs
-
-	andb	$~X86_CR0_PE, %al
-	movl	%eax, %cr0
-	LJMPW_RM(3f)
-3:
-	/* Set up segments */
-	movw	%cs, %ax
-	movw	%ax, %ss
-	movl	$rm_stack_end, %esp
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %fs
-	movw	%ax, %gs
-
-	lidtl	wakeup_idt
-
-	/* Clear the EFLAGS */
-	pushl	$0
-	popfl
-
-	/* Check header signature... */
-	movl	signature, %eax
-	cmpl	$WAKEUP_HEADER_SIGNATURE, %eax
-	jne	bogus_real_magic
-
-	/* Check we really have everything... */
-	movl	end_signature, %eax
-	cmpl	$WAKEUP_END_SIGNATURE, %eax
-	jne	bogus_real_magic
-
-	/* Call the C code */
-	calll	main
-
-	/* Restore MISC_ENABLE before entering protected mode, in case
-	   BIOS decided to clear XD_DISABLE during S3. */
-	movl	pmode_behavior, %eax
-	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
-	jnc	1f
-
-	movl	pmode_misc_en, %eax
-	movl	pmode_misc_en + 4, %edx
-	movl	$MSR_IA32_MISC_ENABLE, %ecx
-	wrmsr
-1:
-
-	/* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
-	/* This could also be done in C code... */
-	movl	pmode_cr3, %eax
-	movl	%eax, %cr3
-
-	movl	pmode_cr4, %ecx
-	jecxz	1f
-	movl	%ecx, %cr4
-1:
-	movl	pmode_efer, %eax
-	movl	pmode_efer + 4, %edx
-	movl	%eax, %ecx
-	orl	%edx, %ecx
-	jz	1f
-	movl	$MSR_EFER, %ecx
-	wrmsr
-1:
-
-	lgdtl	pmode_gdt
-
-	/* This really couldn't... */
-	movl	pmode_entry, %eax
-	movl	pmode_cr0, %ecx
-	movl	%ecx, %cr0
-	ljmpl	$__KERNEL_CS, $pa_startup_32
-	/* -> jmp *%eax in trampoline_32.S */
-#else
-	jmp	trampoline_data
-#endif
-
-bogus_real_magic:
-1:
-	hlt
-	jmp	1b
-
-	.section ".rodata","a"
-
-	/*
-	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
-	 * that is, with limits set to 4 GB.  At least the Lenovo
-	 * Thinkpad X61 is known to need this for the video BIOS
-	 * initialization quirk to work; this is likely to also
-	 * be the case for other laptops or integrated video devices.
-	 */
-
-	.balign	16
-GLOBAL(wakeup_gdt)
-	.word	3*8-1		/* Self-descriptor */
-	.long	pa_wakeup_gdt
-	.word	0
-
-	.word	0xffff		/* 16-bit code segment @ real_mode_base */
-	.long	0x9b000000 + pa_real_mode_base
-	.word	0x008f		/* big real mode */
-
-	.word	0xffff		/* 16-bit data segment @ real_mode_base */
-	.long	0x93000000 + pa_real_mode_base
-	.word	0x008f		/* big real mode */
-END(wakeup_gdt)
-
-	.section ".rodata","a"
-	.balign	8
-
-	/* This is the standard real-mode IDT */
-	.balign	16
-GLOBAL(wakeup_idt)
-	.word	0xffff		/* limit */
-	.long	0		/* address */
-	.word	0
-END(wakeup_idt)
-
-	.section ".signature","a"
-end_signature:
-	.long	WAKEUP_END_SIGNATURE
diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S
new file mode 100644
index 000000000000..8a57c5a05fbc
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -0,0 +1,181 @@
+/*
+ * ACPI wakeup real mode startup stub
+ */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/msr-index.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include "realmode.h"
+#include "wakeup.h"
+
+	.code16
+
+/* This should match the structure in wakeup.h */
+	.section ".data", "aw"
+
+	.balign	16
+GLOBAL(wakeup_header)
+	video_mode:	.short	0	/* Video mode number */
+	pmode_entry:	.long	0
+	pmode_cs:	.short	__KERNEL_CS
+	pmode_cr0:	.long	0	/* Saved %cr0 */
+	pmode_cr3:	.long	0	/* Saved %cr3 */
+	pmode_cr4:	.long	0	/* Saved %cr4 */
+	pmode_efer:	.quad	0	/* Saved EFER */
+	pmode_gdt:	.quad	0
+	pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
+	pmode_behavior:	.long	0	/* Wakeup behavior flags */
+	realmode_flags:	.long	0
+	real_magic:	.long	0
+	signature:	.long	WAKEUP_HEADER_SIGNATURE
+END(wakeup_header)
+
+	.text
+	.code16
+
+	.balign	16
+ENTRY(wakeup_start)
+	cli
+	cld
+
+	LJMPW_RM(3f)
+3:
+	/* Apparently some dimwit BIOS programmers don't know how to
+	   program a PM to RM transition, and we might end up here with
+	   junk in the data segment descriptor registers.  The only way
+	   to repair that is to go into PM and fix it ourselves... */
+	movw	$16, %cx
+	lgdtl	%cs:wakeup_gdt
+	movl	%cr0, %eax
+	orb	$X86_CR0_PE, %al
+	movl	%eax, %cr0
+	ljmpw	$8, $2f
+2:
+	movw	%cx, %ds
+	movw	%cx, %es
+	movw	%cx, %ss
+	movw	%cx, %fs
+	movw	%cx, %gs
+
+	andb	$~X86_CR0_PE, %al
+	movl	%eax, %cr0
+	LJMPW_RM(3f)
+3:
+	/* Set up segments */
+	movw	%cs, %ax
+	movw	%ax, %ss
+	movl	$rm_stack_end, %esp
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %fs
+	movw	%ax, %gs
+
+	lidtl	wakeup_idt
+
+	/* Clear the EFLAGS */
+	pushl	$0
+	popfl
+
+	/* Check header signature... */
+	movl	signature, %eax
+	cmpl	$WAKEUP_HEADER_SIGNATURE, %eax
+	jne	bogus_real_magic
+
+	/* Check we really have everything... */
+	movl	end_signature, %eax
+	cmpl	$WAKEUP_END_SIGNATURE, %eax
+	jne	bogus_real_magic
+
+	/* Call the C code */
+	calll	main
+
+	/* Restore MISC_ENABLE before entering protected mode, in case
+	   BIOS decided to clear XD_DISABLE during S3. */
+	movl	pmode_behavior, %eax
+	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+	jnc	1f
+
+	movl	pmode_misc_en, %eax
+	movl	pmode_misc_en + 4, %edx
+	movl	$MSR_IA32_MISC_ENABLE, %ecx
+	wrmsr
+1:
+
+	/* Do any other stuff... */
+
+#ifndef CONFIG_64BIT
+	/* This could also be done in C code... */
+	movl	pmode_cr3, %eax
+	movl	%eax, %cr3
+
+	movl	pmode_cr4, %ecx
+	jecxz	1f
+	movl	%ecx, %cr4
+1:
+	movl	pmode_efer, %eax
+	movl	pmode_efer + 4, %edx
+	movl	%eax, %ecx
+	orl	%edx, %ecx
+	jz	1f
+	movl	$MSR_EFER, %ecx
+	wrmsr
+1:
+
+	lgdtl	pmode_gdt
+
+	/* This really couldn't... */
+	movl	pmode_entry, %eax
+	movl	pmode_cr0, %ecx
+	movl	%ecx, %cr0
+	ljmpl	$__KERNEL_CS, $pa_startup_32
+	/* -> jmp *%eax in trampoline_32.S */
+#else
+	jmp	trampoline_data
+#endif
+
+bogus_real_magic:
+1:
+	hlt
+	jmp	1b
+
+	.section ".rodata","a"
+
+	/*
+	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
+	 * that is, with limits set to 4 GB.  At least the Lenovo
+	 * Thinkpad X61 is known to need this for the video BIOS
+	 * initialization quirk to work; this is likely to also
+	 * be the case for other laptops or integrated video devices.
+	 */
+
+	.balign	16
+GLOBAL(wakeup_gdt)
+	.word	3*8-1		/* Self-descriptor */
+	.long	pa_wakeup_gdt
+	.word	0
+
+	.word	0xffff		/* 16-bit code segment @ real_mode_base */
+	.long	0x9b000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+
+	.word	0xffff		/* 16-bit data segment @ real_mode_base */
+	.long	0x93000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+END(wakeup_gdt)
+
+	.section ".rodata","a"
+	.balign	8
+
+	/* This is the standard real-mode IDT */
+	.balign	16
+GLOBAL(wakeup_idt)
+	.word	0xffff		/* limit */
+	.long	0		/* address */
+	.word	0
+END(wakeup_idt)
+
+	.section ".signature","a"
+end_signature:
+	.long	WAKEUP_END_SIGNATURE
-- 
cgit v1.2.3


From f37240f16bec91f15ce564515f70a6ca9715ce96 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:43 +0300
Subject: x86, realmode: header for trampoline code

Added header for trampoline code that can be used to supply
input data to it. This makes interface between real mode code
and kernel cleaner and simpler. Replaced two confusing pointers
to level4 pgt in trampoline_64.S with a single pointer to the
beginning of the page table.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-21-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/realmode.h          | 32 +++++++++++++++++-----------
 arch/x86/kernel/realmode.c               | 27 +++++++++++++-----------
 arch/x86/kernel/smpboot.c                |  2 +-
 arch/x86/realmode/rm/header.S            | 35 ++++++++++++++-----------------
 arch/x86/realmode/rm/trampoline_32.S     | 36 ++++++--------------------------
 arch/x86/realmode/rm/trampoline_64.S     | 18 +++++-----------
 arch/x86/realmode/rm/trampoline_common.S | 23 ++++++++++++++++++++
 arch/x86/realmode/rm/wakeup_asm.S        |  2 +-
 8 files changed, 87 insertions(+), 88 deletions(-)
 create mode 100644 arch/x86/realmode/rm/trampoline_common.S

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index d3ae49f4c3ef..1421eed1c8e8 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -8,24 +8,32 @@
 struct real_mode_header {
 	u32	text_start;
 	u32	ro_end;
-	/* reboot */
-#ifdef CONFIG_X86_32
-	u32	machine_real_restart_asm;
-#endif
 	/* SMP trampoline */
-	u32	trampoline_data;
+	u32	trampoline_start;
 	u32	trampoline_status;
-#ifdef CONFIG_X86_32
-	u32	startup_32_smp;
-	u32	boot_gdt;
-#else
-	u32	startup_64_smp;
-	u32	level3_ident_pgt;
-	u32	level3_kernel_pgt;
+	u32	trampoline_header;
+#ifdef CONFIG_X86_64
+	u32	trampoline_pgd;
 #endif
+	/* ACPI S3 wakeup */
 #ifdef CONFIG_ACPI_SLEEP
 	u32	wakeup_start;
 	u32	wakeup_header;
+#endif
+	/* APM/BIOS reboot */
+#ifdef CONFIG_X86_32
+	u32	machine_real_restart_asm;
+#endif
+} __attribute__((__packed__));
+
+/* This must match data at trampoline_32/64.S */
+struct trampoline_header {
+#ifdef CONFIG_X86_32
+	u32 start;
+	u16 gdt_limit;
+	u32 gdt_base;
+#else
+	u64 start;
 #endif
 } __attribute__((__packed__));
 
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index 632c810ec8ea..712fba8fd774 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -17,8 +17,11 @@ void __init setup_real_mode(void)
 	u16 *seg;
 	int i;
 	unsigned char *base;
-
+	struct trampoline_header *trampoline_header;
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+#ifdef CONFIG_X86_64
+	u64 *trampoline_pgd;
+#endif
 
 	/* Has to be in very low memory so we can execute real-mode AP code. */
 	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
@@ -28,7 +31,6 @@ void __init setup_real_mode(void)
 	base = __va(mem);
 	memblock_reserve(mem, size);
 	real_mode_header = (struct real_mode_header *) base;
-
 	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
 	       base, (unsigned long long)mem, size);
 
@@ -53,18 +55,19 @@ void __init setup_real_mode(void)
 		*ptr += __pa(base);
 	}
 
+	/* Must be perfomed *after* relocation. */
+	trampoline_header = (struct trampoline_header *)
+		__va(real_mode_header->trampoline_header);
+
 #ifdef CONFIG_X86_32
-	*((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp);
-	*((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt);
+	trampoline_header->start = __pa(startup_32_smp);
+	trampoline_header->gdt_limit = __BOOT_DS + 7;
+	trampoline_header->gdt_base = __pa(boot_gdt);
 #else
-	*((u64 *) __va(real_mode_header->startup_64_smp)) =
-		(u64)secondary_startup_64;
-
-	*((u64 *) __va(real_mode_header->level3_ident_pgt)) =
-		__pa(level3_ident_pgt) + _KERNPG_TABLE;
-
-	*((u64 *) __va(real_mode_header->level3_kernel_pgt)) =
-		__pa(level3_kernel_pgt) + _KERNPG_TABLE;
+	trampoline_header->start = (u64) secondary_startup_64;
+	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
+	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
 #endif
 }
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b8c0661e2341..757c4b1d0a02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -667,7 +667,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	volatile u32 *trampoline_status =
 		(volatile u32 *) __va(real_mode_header->trampoline_status);
 	/* start_ip had better be page-aligned! */
-	unsigned long start_ip = real_mode_header->trampoline_data;
+	unsigned long start_ip = real_mode_header->trampoline_start;
 
 	unsigned long boot_error = 0;
 	int timeout;
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index c83005c4d455..b4c32632bf16 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -7,28 +7,25 @@
 #include <linux/linkage.h>
 #include <asm/page_types.h>
 
-		.section ".header", "a"
+	.section ".header", "a"
 
 GLOBAL(real_mode_header)
-		.long	pa_text_start
-		.long	pa_ro_end
-#ifdef CONFIG_X86_32
-		.long	pa_machine_real_restart_asm
-#endif
-		/* SMP trampoline */
-		.long	pa_trampoline_data
-		.long	pa_trampoline_status
-#ifdef CONFIG_X86_32
-		.long	pa_startup_32_smp
-		.long	pa_boot_gdt
-#else
-		.long	pa_startup_64_smp
-		.long	pa_level3_ident_pgt
-		.long	pa_level3_kernel_pgt
+	.long	pa_text_start
+	.long	pa_ro_end
+	/* SMP trampoline */
+	.long	pa_trampoline_start
+	.long	pa_trampoline_status
+	.long	pa_trampoline_header
+#ifdef CONFIG_X86_64
+	.long	pa_trampoline_pgd;
 #endif
-		/* ACPI sleep */
+	/* ACPI S3 wakeup */
 #ifdef CONFIG_ACPI_SLEEP
-		.long	pa_wakeup_start
-		.long	pa_wakeup_header
+	.long	pa_wakeup_start
+	.long	pa_wakeup_header
+#endif
+	/* APM/BIOS reboot */
+#ifdef CONFIG_X86_32
+	.long	pa_machine_real_restart_asm
 #endif
 END(real_mode_header)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 1ecdbb59191b..6fc064b4d2b9 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -13,16 +13,10 @@
  *
  *	We jump into arch/x86/kernel/head_32.S.
  *
- *	On entry to trampoline_data, the processor is in real mode
+ *	On entry to trampoline_start, the processor is in real mode
  *	with 16-bit addressing and 16-bit data.  CS has some value
  *	and IP is zero.  Thus, we load CS to the physical segment
  *	of the real mode code before doing anything further.
- *
- *	The structure real_mode_header includes entries that need
- *	to be set up before executing this code:
- *
- *	startup_32_smp
- *	boot_gdt
  */
 
 #include <linux/linkage.h>
@@ -35,7 +29,7 @@
 	.code16
 
 	.balign	PAGE_SIZE
-ENTRY(trampoline_data)
+ENTRY(trampoline_start)
 	wbinvd			# Needed for NUMA-Q should be harmless for others
 
 	LJMPW_RM(1f)
@@ -45,7 +39,7 @@ ENTRY(trampoline_data)
 
 	cli			# We should be safe anyway
 
-	movl	startup_32_smp, %eax	# where we need to go
+	movl	tr_start, %eax	# where we need to go
 
 	movl	$0xA5A5A5A5, trampoline_status
 				# write marker for master knows we're running
@@ -56,8 +50,8 @@ ENTRY(trampoline_data)
 	 * operand size is 16bit. Use lgdtl instead to force operand size
 	 * to 32 bit.
 	 */
-	lidtl	boot_idt_descr		# load idt with 0, 0
-	lgdtl	boot_gdt_descr		# load gdt with whatever is appropriate
+	lidtl	tr_idt			# load idt with 0, 0
+	lgdtl	tr_gdt			# load gdt with whatever is appropriate
 
 	movw	$1, %dx			# protected mode (PE) bit
 	lmsw	%dx			# into protected mode
@@ -69,22 +63,4 @@ ENTRY(trampoline_data)
 ENTRY(startup_32)			# note: also used from wakeup_asm.S
 	jmp	*%eax
 
-	.section ".rodata","a"
-
-	.balign	4
-boot_idt_descr:
-	.word	0				# idt limit = 0
-	.long	0				# idt base = 0L
-
-	.data
-
-boot_gdt_descr:
-	.word	__BOOT_DS + 7			# gdt limit
-GLOBAL(boot_gdt)
-	.long	0				# gdt base
-
-	.bss
-
-	.balign	4
-GLOBAL(trampoline_status)	.space	4
-GLOBAL(startup_32_smp)		.space	4
+#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index f71ea0800d3d..3f7293239365 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -10,7 +10,7 @@
  *	trampoline page to make our stack and everything else
  *	is a mystery.
  *
- *	On entry to trampoline_data, the processor is in real mode
+ *	On entry to trampoline_start, the processor is in real mode
  *	with 16-bit addressing and 16-bit data.  CS has some value
  *	and IP is zero.  Thus, data addresses need to be absolute
  *	(no relocation) and are taken with regard to r_base.
@@ -37,7 +37,7 @@
 	.balign PAGE_SIZE
 	.code16
 
-ENTRY(trampoline_data)
+ENTRY(trampoline_start)
 	cli			# We should be safe anyway
 	wbinvd
 
@@ -97,7 +97,7 @@ ENTRY(startup_32)
 	movl	%eax, %cr4		# Enable PAE mode
 
 	# Setup trampoline 4 level pagetables
-	movl	$pa_level3_ident_pgt, %eax
+	movl	$pa_trampoline_pgd, %eax
 	movl	%eax, %cr3
 
 	movl	$MSR_EFER, %ecx
@@ -122,7 +122,7 @@ ENTRY(startup_32)
 	.balign 4
 ENTRY(startup_64)
 	# Now jump into the kernel using virtual addresses
-	jmpq	*startup_64_smp(%rip)
+	jmpq	*tr_start(%rip)
 
 	.section ".rodata","a"
 	.balign	16
@@ -143,12 +143,4 @@ tgdt:
 	.quad	0x00cf93000000ffff	# __KERNEL_DS
 tgdt_end:
 
-	.bss
-
-	.balign	PAGE_SIZE
-GLOBAL(level3_ident_pgt)	.space	511*8
-GLOBAL(level3_kernel_pgt)	.space	8
-
-	.balign	8
-GLOBAL(startup_64_smp)		.space	8
-GLOBAL(trampoline_status)	.space	4
+#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
new file mode 100644
index 000000000000..c3f951c468c5
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -0,0 +1,23 @@
+	.section ".rodata","a"
+
+	.balign	4
+tr_idt: .fill 1, 6, 0
+
+	.bss
+
+	.balign	4
+GLOBAL(trampoline_status)	.space	4
+
+GLOBAL(trampoline_header)
+#ifdef CONFIG_X86_32
+	tr_start:		.space	4
+	tr_gdt:			.space	6
+#else
+	tr_start:		.space	8
+#endif
+END(trampoline_header)
+
+#ifdef CONFIG_X86_64
+	.balign	PAGE_SIZE
+GLOBAL(trampoline_pgd)		.space	PAGE_SIZE
+#endif
diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S
index 8a57c5a05fbc..46108f05e04e 100644
--- a/arch/x86/realmode/rm/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -132,7 +132,7 @@ ENTRY(wakeup_start)
 	ljmpl	$__KERNEL_CS, $pa_startup_32
 	/* -> jmp *%eax in trampoline_32.S */
 #else
-	jmp	trampoline_data
+	jmp	trampoline_start
 #endif
 
 bogus_real_magic:
-- 
cgit v1.2.3


From b2d0b7a061bfddd27155c7dcd53f365d9dc0c7c3 Mon Sep 17 00:00:00 2001
From: Joshua Cov <joshuacov@googlemail.com>
Date: Fri, 13 Apr 2012 21:08:26 +0200
Subject: keyboard: Use BIOS Keyboard variable to set Numlock

The PC BIOS does provide a NUMLOCK flag containing the desired state
of this LED. This patch sets the current state according to the data
in the bios.

[ hpa: fixed __weak declaration without definition, changed "inline"
  to "static inline" ]

Signed-Off-By: Joshua Cov <joshuacov@googlemail.com>
Link: http://lkml.kernel.org/r/CAKL7Q7rvq87TNS1T_Km8fW_5OzS%2BSbYazLXKxW-6ztOxo3zorg@mail.gmail.com
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/parisc/include/asm/kbdleds.h | 19 +++++++++++++++++++
 arch/x86/boot/main.c              | 18 ++++++++++++------
 arch/x86/include/asm/bootparam.h  |  3 ++-
 arch/x86/include/asm/kbdleds.h    | 17 +++++++++++++++++
 drivers/tty/vt/keyboard.c         | 20 ++++++++------------
 5 files changed, 58 insertions(+), 19 deletions(-)
 create mode 100644 arch/parisc/include/asm/kbdleds.h
 create mode 100644 arch/x86/include/asm/kbdleds.h

(limited to 'arch/x86')

diff --git a/arch/parisc/include/asm/kbdleds.h b/arch/parisc/include/asm/kbdleds.h
new file mode 100644
index 000000000000..2e2e75a83c28
--- /dev/null
+++ b/arch/parisc/include/asm/kbdleds.h
@@ -0,0 +1,19 @@
+#ifndef _ASM_PARISC_KBDLEDS_H
+#define _ASM_PARISC_KBDLEDS_H
+
+/*
+ * On HIL keyboards of PARISC machines there is no NumLock key and
+ * everyone expects the keypad to be used for numbers. That's why
+ * we can safely turn on the NUMLOCK bit.
+ */
+
+static inline int kbd_defleds(void)
+{
+#if defined(CONFIG_KEYBOARD_HIL) || defined(CONFIG_KEYBOARD_HIL_OLD)
+	return 1 << VC_NUMLOCK;
+#else
+	return 0;
+#endif
+}
+
+#endif /* _ASM_PARISC_KBDLEDS_H */
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 40358c8905be..cf6083d444f4 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -57,14 +57,20 @@ static void copy_boot_params(void)
 }
 
 /*
- * Set the keyboard repeat rate to maximum.  Unclear why this
+ * Query the keyboard lock status as given by the BIOS, and
+ * set the keyboard repeat rate to maximum.  Unclear why the latter
  * is done here; this might be possible to kill off as stale code.
  */
-static void keyboard_set_repeat(void)
+static void keyboard_init(void)
 {
-	struct biosregs ireg;
+	struct biosregs ireg, oreg;
 	initregs(&ireg);
-	ireg.ax = 0x0305;
+
+	ireg.ah = 0x02;		/* Get keyboard status */
+	intcall(0x16, &ireg, &oreg);
+	boot_params.kbd_status = oreg.al;
+
+	ireg.ax = 0x0305;	/* Set keyboard repeat rate */
 	intcall(0x16, &ireg, NULL);
 }
 
@@ -151,8 +157,8 @@ void main(void)
 	/* Detect memory layout */
 	detect_memory();
 
-	/* Set keyboard repeat rate (why?) */
-	keyboard_set_repeat();
+	/* Set keyboard repeat rate (why?) and query the lock flags */
+	keyboard_init();
 
 	/* Query MCA information */
 	query_mca();
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 2f90c51cc49d..eb45aa6b1f27 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -112,7 +112,8 @@ struct boot_params {
 	__u8  e820_entries;				/* 0x1e8 */
 	__u8  eddbuf_entries;				/* 0x1e9 */
 	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
-	__u8  _pad6[6];					/* 0x1eb */
+	__u8  kbd_status;				/* 0x1eb */
+	__u8  _pad6[5];					/* 0x1ec */
 	struct setup_header hdr;    /* setup header */	/* 0x1f1 */
 	__u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
 	__u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];	/* 0x290 */
diff --git a/arch/x86/include/asm/kbdleds.h b/arch/x86/include/asm/kbdleds.h
new file mode 100644
index 000000000000..f27ac5ff597d
--- /dev/null
+++ b/arch/x86/include/asm/kbdleds.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_KBDLEDS_H
+#define _ASM_X86_KBDLEDS_H
+
+/*
+ * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on.
+ * This seems a good reason to start with NumLock off. That's why on X86 we
+ * ask the bios for the correct state.
+ */
+
+#include <asm/setup.h>
+
+static inline int kbd_defleds(void)
+{
+	return boot_params.kbd_status & 0x20 ? (1 << VC_NUMLOCK) : 0;
+}
+
+#endif /* _ASM_X86_KBDLEDS_H */
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 86dd1e302bb3..b021a1817666 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -53,17 +53,13 @@ extern void ctrl_alt_del(void);
 
 #define KBD_DEFMODE ((1 << VC_REPEAT) | (1 << VC_META))
 
-/*
- * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on.
- * This seems a good reason to start with NumLock off. On HIL keyboards
- * of PARISC machines however there is no NumLock key and everyone expects the
- * keypad to be used for numbers.
- */
-
-#if defined(CONFIG_PARISC) && (defined(CONFIG_KEYBOARD_HIL) || defined(CONFIG_KEYBOARD_HIL_OLD))
-#define KBD_DEFLEDS (1 << VC_NUMLOCK)
+#if defined(CONFIG_X86) || defined(CONFIG_PARISC)
+#include <asm/kbdleds.h>
 #else
-#define KBD_DEFLEDS 0
+static inline int kbd_defleds(void)
+{
+	return 0;
+}
 #endif
 
 #define KBD_DEFLOCK 0
@@ -1512,8 +1508,8 @@ int __init kbd_init(void)
 	int error;
 
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
-		kbd_table[i].ledflagstate = KBD_DEFLEDS;
-		kbd_table[i].default_ledflagstate = KBD_DEFLEDS;
+		kbd_table[i].ledflagstate = kbd_defleds();
+		kbd_table[i].default_ledflagstate = kbd_defleds();
 		kbd_table[i].ledmode = LED_SHOW_FLAGS;
 		kbd_table[i].lockstate = KBD_DEFLOCK;
 		kbd_table[i].slockstate = 0;
-- 
cgit v1.2.3


From f2604c141a00c00b92b7fd2f9d2455517fdd6c15 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:44 +0300
Subject: x86, realmode: move relocs from scripts/ to arch/x86/tools

Moved relocs tool from scripts/ to arch/x86/tools because
it is architecture specific script. Added new target archscripts
that can be used to build scripts needed building an architecture.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-22-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
---
 Makefile                          |   9 +-
 arch/x86/Makefile                 |   3 +
 arch/x86/boot/compressed/Makefile |   4 +-
 arch/x86/realmode/rm/Makefile     |   2 +-
 arch/x86/tools/.gitignore         |   1 +
 arch/x86/tools/Makefile           |   4 +
 arch/x86/tools/relocs.c           | 804 ++++++++++++++++++++++++++++++++++++++
 scripts/Makefile                  |   1 -
 scripts/x86-relocs.c              | 804 --------------------------------------
 9 files changed, 821 insertions(+), 811 deletions(-)
 create mode 100644 arch/x86/tools/.gitignore
 create mode 100644 arch/x86/tools/relocs.c
 delete mode 100644 scripts/x86-relocs.c

(limited to 'arch/x86')

diff --git a/Makefile b/Makefile
index 9e384ae6c403..ed1bd29dc03c 100644
--- a/Makefile
+++ b/Makefile
@@ -442,7 +442,7 @@ asm-generic:
 
 no-dot-config-targets := clean mrproper distclean \
 			 cscope gtags TAGS tags help %docs check% coccicheck \
-			 include/linux/version.h headers_% archheaders \
+			 include/linux/version.h headers_% archheaders archscripts \
 			 kernelversion %src-pkg
 
 config-targets := 0
@@ -979,7 +979,7 @@ prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \
                    include/config/auto.conf
 	$(cmd_crmodverdir)
 
-archprepare: archheaders prepare1 scripts_basic
+archprepare: archheaders archscripts prepare1 scripts_basic
 
 prepare0: archprepare FORCE
 	$(Q)$(MAKE) $(build)=.
@@ -1049,8 +1049,11 @@ hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm)
 PHONY += archheaders
 archheaders:
 
+PHONY += archscripts
+archscripts:
+
 PHONY += __headers
-__headers: include/linux/version.h scripts_basic asm-generic archheaders FORCE
+__headers: include/linux/version.h scripts_basic asm-generic archheaders archscripts FORCE
 	$(Q)$(MAKE) $(build)=scripts build_unifdef
 
 PHONY += headers_install_all
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..94e91e401da9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -134,6 +134,9 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
 
+archscripts:
+	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
+
 ###
 # Syscall table generation
 
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0435e8a2d20e..e398bb5d63bb 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -42,8 +42,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 targets += vmlinux.bin.all vmlinux.relocs
 
-CMD_RELOCS = scripts/x86-relocs
-quiet_cmd_relocs = RELOCS $@
+CMD_RELOCS = arch/x86/tools/relocs
+quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
 $(obj)/vmlinux.relocs: vmlinux FORCE
 	$(call if_changed,relocs)
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index fc8854b09dfa..de40bc44b92f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -52,7 +52,7 @@ $(obj)/realmode.bin: $(obj)/realmode.elf
 	$(call if_changed,objcopy)
 
 quiet_cmd_relocs = RELOCS  $@
-      cmd_relocs = scripts/x86-relocs --realmode $< > $@
+      cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
 $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
 	$(call if_changed,relocs)
 
diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore
new file mode 100644
index 000000000000..be0ed065249b
--- /dev/null
+++ b/arch/x86/tools/.gitignore
@@ -0,0 +1 @@
+relocs
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index d511aa97533a..733057b435b0 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -36,3 +36,7 @@ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x
 $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 
 $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+hostprogs-y	+= relocs
+relocs: $(obj)/relocs
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
new file mode 100644
index 000000000000..74e16bb15dc4
--- /dev/null
+++ b/arch/x86/tools/relocs.c
@@ -0,0 +1,804 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <elf.h>
+#include <byteswap.h>
+#define USE_BSD
+#include <endian.h>
+#include <regex.h>
+#include <tools/le_byteshift.h>
+
+static void die(char *fmt, ...);
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+static Elf32_Ehdr ehdr;
+static unsigned long reloc_count, reloc_idx;
+static unsigned long *relocs;
+static unsigned long reloc16_count, reloc16_idx;
+static unsigned long *relocs16;
+
+struct section {
+	Elf32_Shdr     shdr;
+	struct section *link;
+	Elf32_Sym      *symtab;
+	Elf32_Rel      *reltab;
+	char           *strtab;
+};
+static struct section *secs;
+
+enum symtype {
+	S_ABS,
+	S_REL,
+	S_SEG,
+	S_LIN,
+	S_NSYMTYPES
+};
+
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+/*
+ * Following symbols have been audited. There values are constant and do
+ * not change if bzImage is loaded at a different physical address than
+ * the address for which it has been compiled. Don't warn user about
+ * absolute relocations present w.r.t these symbols.
+ */
+	[S_ABS] =
+	"^(xen_irq_disable_direct_reloc$|"
+	"xen_save_fl_direct_reloc$|"
+	"VDSO|"
+	"__crc_)",
+
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+	[S_REL] =
+	"^_end$",
+};
+
+
+static const char * const sym_regex_realmode[S_NSYMTYPES] = {
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+	[S_REL] =
+	"^pa_",
+
+/*
+ * These are 16-bit segment symbols when compiling 16-bit code.
+ */
+	[S_SEG] =
+	"^real_mode_seg$",
+
+/*
+ * These are offsets belonging to segments, as opposed to linear addresses,
+ * when compiling 16-bit code.
+ */
+	[S_LIN] =
+	"^pa_",
+};
+
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
+static int is_reloc(enum symtype type, const char *sym_name)
+{
+	return sym_regex[type] &&
+		!regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+}
+
+static void regex_init(int use_real_mode)
+{
+        char errbuf[128];
+        int err;
+	int i;
+
+	if (use_real_mode)
+		sym_regex = sym_regex_realmode;
+	else
+		sym_regex = sym_regex_kernel;
+
+	for (i = 0; i < S_NSYMTYPES; i++) {
+		if (!sym_regex[i])
+			continue;
+
+		err = regcomp(&sym_regex_c[i], sym_regex[i],
+			      REG_EXTENDED|REG_NOSUB);
+
+		if (err) {
+			regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf);
+			die("%s", errbuf);
+		}
+        }
+}
+
+static void die(char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	exit(1);
+}
+
+static const char *sym_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define SYM_TYPE(X) [X] = #X
+		SYM_TYPE(STT_NOTYPE),
+		SYM_TYPE(STT_OBJECT),
+		SYM_TYPE(STT_FUNC),
+		SYM_TYPE(STT_SECTION),
+		SYM_TYPE(STT_FILE),
+		SYM_TYPE(STT_COMMON),
+		SYM_TYPE(STT_TLS),
+#undef SYM_TYPE
+	};
+	const char *name = "unknown sym type name";
+	if (type < ARRAY_SIZE(type_name)) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sym_bind(unsigned bind)
+{
+	static const char *bind_name[] = {
+#define SYM_BIND(X) [X] = #X
+		SYM_BIND(STB_LOCAL),
+		SYM_BIND(STB_GLOBAL),
+		SYM_BIND(STB_WEAK),
+#undef SYM_BIND
+	};
+	const char *name = "unknown sym bind name";
+	if (bind < ARRAY_SIZE(bind_name)) {
+		name = bind_name[bind];
+	}
+	return name;
+}
+
+static const char *sym_visibility(unsigned visibility)
+{
+	static const char *visibility_name[] = {
+#define SYM_VISIBILITY(X) [X] = #X
+		SYM_VISIBILITY(STV_DEFAULT),
+		SYM_VISIBILITY(STV_INTERNAL),
+		SYM_VISIBILITY(STV_HIDDEN),
+		SYM_VISIBILITY(STV_PROTECTED),
+#undef SYM_VISIBILITY
+	};
+	const char *name = "unknown sym visibility name";
+	if (visibility < ARRAY_SIZE(visibility_name)) {
+		name = visibility_name[visibility];
+	}
+	return name;
+}
+
+static const char *rel_type(unsigned type)
+{
+	static const char *type_name[] = {
+#define REL_TYPE(X) [X] = #X
+		REL_TYPE(R_386_NONE),
+		REL_TYPE(R_386_32),
+		REL_TYPE(R_386_PC32),
+		REL_TYPE(R_386_GOT32),
+		REL_TYPE(R_386_PLT32),
+		REL_TYPE(R_386_COPY),
+		REL_TYPE(R_386_GLOB_DAT),
+		REL_TYPE(R_386_JMP_SLOT),
+		REL_TYPE(R_386_RELATIVE),
+		REL_TYPE(R_386_GOTOFF),
+		REL_TYPE(R_386_GOTPC),
+		REL_TYPE(R_386_8),
+		REL_TYPE(R_386_PC8),
+		REL_TYPE(R_386_16),
+		REL_TYPE(R_386_PC16),
+#undef REL_TYPE
+	};
+	const char *name = "unknown type rel type name";
+	if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+		name = type_name[type];
+	}
+	return name;
+}
+
+static const char *sec_name(unsigned shndx)
+{
+	const char *sec_strtab;
+	const char *name;
+	sec_strtab = secs[ehdr.e_shstrndx].strtab;
+	name = "<noname>";
+	if (shndx < ehdr.e_shnum) {
+		name = sec_strtab + secs[shndx].shdr.sh_name;
+	}
+	else if (shndx == SHN_ABS) {
+		name = "ABSOLUTE";
+	}
+	else if (shndx == SHN_COMMON) {
+		name = "COMMON";
+	}
+	return name;
+}
+
+static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
+{
+	const char *name;
+	name = "<noname>";
+	if (sym->st_name) {
+		name = sym_strtab + sym->st_name;
+	}
+	else {
+		name = sec_name(sym->st_shndx);
+	}
+	return name;
+}
+
+
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define le16_to_cpu(val) (val)
+#define le32_to_cpu(val) (val)
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+#define le16_to_cpu(val) bswap_16(val)
+#define le32_to_cpu(val) bswap_32(val)
+#endif
+
+static uint16_t elf16_to_cpu(uint16_t val)
+{
+	return le16_to_cpu(val);
+}
+
+static uint32_t elf32_to_cpu(uint32_t val)
+{
+	return le32_to_cpu(val);
+}
+
+static void read_ehdr(FILE *fp)
+{
+	if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
+		die("Cannot read ELF header: %s\n",
+			strerror(errno));
+	}
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+		die("No ELF magic\n");
+	}
+	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
+		die("Not a 32 bit executable\n");
+	}
+	if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+		die("Not a LSB ELF executable\n");
+	}
+	if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	/* Convert the fields to native endian */
+	ehdr.e_type      = elf16_to_cpu(ehdr.e_type);
+	ehdr.e_machine   = elf16_to_cpu(ehdr.e_machine);
+	ehdr.e_version   = elf32_to_cpu(ehdr.e_version);
+	ehdr.e_entry     = elf32_to_cpu(ehdr.e_entry);
+	ehdr.e_phoff     = elf32_to_cpu(ehdr.e_phoff);
+	ehdr.e_shoff     = elf32_to_cpu(ehdr.e_shoff);
+	ehdr.e_flags     = elf32_to_cpu(ehdr.e_flags);
+	ehdr.e_ehsize    = elf16_to_cpu(ehdr.e_ehsize);
+	ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
+	ehdr.e_phnum     = elf16_to_cpu(ehdr.e_phnum);
+	ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
+	ehdr.e_shnum     = elf16_to_cpu(ehdr.e_shnum);
+	ehdr.e_shstrndx  = elf16_to_cpu(ehdr.e_shstrndx);
+
+	if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
+		die("Unsupported ELF header type\n");
+	}
+	if (ehdr.e_machine != EM_386) {
+		die("Not for x86\n");
+	}
+	if (ehdr.e_version != EV_CURRENT) {
+		die("Unknown ELF version\n");
+	}
+	if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
+		die("Bad Elf header size\n");
+	}
+	if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
+		die("Bad program header entry\n");
+	}
+	if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
+		die("Bad section header entry\n");
+	}
+	if (ehdr.e_shstrndx >= ehdr.e_shnum) {
+		die("String table index out of bounds\n");
+	}
+}
+
+static void read_shdrs(FILE *fp)
+{
+	int i;
+	Elf32_Shdr shdr;
+
+	secs = calloc(ehdr.e_shnum, sizeof(struct section));
+	if (!secs) {
+		die("Unable to allocate %d section headers\n",
+		    ehdr.e_shnum);
+	}
+	if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
+		die("Seek to %d failed: %s\n",
+			ehdr.e_shoff, strerror(errno));
+	}
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (fread(&shdr, sizeof shdr, 1, fp) != 1)
+			die("Cannot read ELF section headers %d/%d: %s\n",
+			    i, ehdr.e_shnum, strerror(errno));
+		sec->shdr.sh_name      = elf32_to_cpu(shdr.sh_name);
+		sec->shdr.sh_type      = elf32_to_cpu(shdr.sh_type);
+		sec->shdr.sh_flags     = elf32_to_cpu(shdr.sh_flags);
+		sec->shdr.sh_addr      = elf32_to_cpu(shdr.sh_addr);
+		sec->shdr.sh_offset    = elf32_to_cpu(shdr.sh_offset);
+		sec->shdr.sh_size      = elf32_to_cpu(shdr.sh_size);
+		sec->shdr.sh_link      = elf32_to_cpu(shdr.sh_link);
+		sec->shdr.sh_info      = elf32_to_cpu(shdr.sh_info);
+		sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
+		sec->shdr.sh_entsize   = elf32_to_cpu(shdr.sh_entsize);
+		if (sec->shdr.sh_link < ehdr.e_shnum)
+			sec->link = &secs[sec->shdr.sh_link];
+	}
+
+}
+
+static void read_strtabs(FILE *fp)
+{
+	int i;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_STRTAB) {
+			continue;
+		}
+		sec->strtab = malloc(sec->shdr.sh_size);
+		if (!sec->strtab) {
+			die("malloc of %d bytes for strtab failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+	}
+}
+
+static void read_symtabs(FILE *fp)
+{
+	int i,j;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		sec->symtab = malloc(sec->shdr.sh_size);
+		if (!sec->symtab) {
+			die("malloc of %d bytes for symtab failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
+			Elf32_Sym *sym = &sec->symtab[j];
+			sym->st_name  = elf32_to_cpu(sym->st_name);
+			sym->st_value = elf32_to_cpu(sym->st_value);
+			sym->st_size  = elf32_to_cpu(sym->st_size);
+			sym->st_shndx = elf16_to_cpu(sym->st_shndx);
+		}
+	}
+}
+
+
+static void read_relocs(FILE *fp)
+{
+	int i,j;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec->reltab = malloc(sec->shdr.sh_size);
+		if (!sec->reltab) {
+			die("malloc of %d bytes for relocs failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read symbol table: %s\n",
+				strerror(errno));
+		}
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel = &sec->reltab[j];
+			rel->r_offset = elf32_to_cpu(rel->r_offset);
+			rel->r_info   = elf32_to_cpu(rel->r_info);
+		}
+	}
+}
+
+
+static void print_absolute_symbols(void)
+{
+	int i;
+	printf("Absolute symbols\n");
+	printf(" Num:    Value Size  Type       Bind        Visibility  Name\n");
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		char *sym_strtab;
+		int j;
+
+		if (sec->shdr.sh_type != SHT_SYMTAB) {
+			continue;
+		}
+		sym_strtab = sec->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
+			Elf32_Sym *sym;
+			const char *name;
+			sym = &sec->symtab[j];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+			printf("%5d %08x %5d %10s %10s %12s %s\n",
+				j, sym->st_value, sym->st_size,
+				sym_type(ELF32_ST_TYPE(sym->st_info)),
+				sym_bind(ELF32_ST_BIND(sym->st_info)),
+				sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
+				name);
+		}
+	}
+	printf("\n");
+}
+
+static void print_absolute_relocs(void)
+{
+	int i, printed = 0;
+
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		struct section *sec_applies, *sec_symtab;
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		int j;
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = sec->link;
+		sec_applies = &secs[sec->shdr.sh_info];
+		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab  = sec_symtab->symtab;
+		sym_strtab = sec_symtab->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			const char *name;
+			rel = &sec->reltab[j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			name = sym_name(sym_strtab, sym);
+			if (sym->st_shndx != SHN_ABS) {
+				continue;
+			}
+
+			/* Absolute symbols are not relocated if bzImage is
+			 * loaded at a non-compiled address. Display a warning
+			 * to user at compile time about the absolute
+			 * relocations present.
+			 *
+			 * User need to audit the code to make sure
+			 * some symbols which should have been section
+			 * relative have not become absolute because of some
+			 * linker optimization or wrong programming usage.
+			 *
+			 * Before warning check if this absolute symbol
+			 * relocation is harmless.
+			 */
+			if (is_reloc(S_ABS, name) || is_reloc(S_REL, name))
+				continue;
+
+			if (!printed) {
+				printf("WARNING: Absolute relocations"
+					" present\n");
+				printf("Offset     Info     Type     Sym.Value "
+					"Sym.Name\n");
+				printed = 1;
+			}
+
+			printf("%08x %08x %10s %08x  %s\n",
+				rel->r_offset,
+				rel->r_info,
+				rel_type(ELF32_R_TYPE(rel->r_info)),
+				sym->st_value,
+				name);
+		}
+	}
+
+	if (printed)
+		printf("\n");
+}
+
+static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
+			int use_real_mode)
+{
+	int i;
+	/* Walk through the relocations */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		char *sym_strtab;
+		Elf32_Sym *sh_symtab;
+		struct section *sec_applies, *sec_symtab;
+		int j;
+		struct section *sec = &secs[i];
+
+		if (sec->shdr.sh_type != SHT_REL) {
+			continue;
+		}
+		sec_symtab  = sec->link;
+		sec_applies = &secs[sec->shdr.sh_info];
+		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+			continue;
+		}
+		sh_symtab = sec_symtab->symtab;
+		sym_strtab = sec_symtab->link->strtab;
+		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
+			Elf32_Rel *rel;
+			Elf32_Sym *sym;
+			unsigned r_type;
+			const char *symname;
+			rel = &sec->reltab[j];
+			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
+			r_type = ELF32_R_TYPE(rel->r_info);
+
+			switch (r_type) {
+			case R_386_NONE:
+			case R_386_PC32:
+			case R_386_PC16:
+			case R_386_PC8:
+				/*
+				 * NONE can be ignored and and PC relative
+				 * relocations don't need to be adjusted.
+				 */
+				break;
+
+			case R_386_16:
+				symname = sym_name(sym_strtab, sym);
+				if (!use_real_mode)
+					goto bad;
+				if (sym->st_shndx == SHN_ABS) {
+					if (is_reloc(S_ABS, symname))
+						break;
+					else if (!is_reloc(S_SEG, symname))
+						goto bad;
+				} else {
+					if (is_reloc(S_LIN, symname))
+						goto bad;
+					else
+						break;
+				}
+				visit(rel, sym);
+				break;
+
+			case R_386_32:
+				symname = sym_name(sym_strtab, sym);
+				if (sym->st_shndx == SHN_ABS) {
+					if (is_reloc(S_ABS, symname))
+						break;
+					else if (!is_reloc(S_REL, symname))
+						goto bad;
+				} else {
+					if (use_real_mode &&
+					    !is_reloc(S_LIN, symname))
+						break;
+				}
+				visit(rel, sym);
+				break;
+			default:
+				die("Unsupported relocation type: %s (%d)\n",
+				    rel_type(r_type), r_type);
+				break;
+			bad:
+				symname = sym_name(sym_strtab, sym);
+				die("Invalid %s relocation: %s\n",
+				    rel_type(r_type), symname);
+			}
+		}
+	}
+}
+
+static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
+		reloc16_count++;
+	else
+		reloc_count++;
+}
+
+static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
+{
+	/* Remember the address that needs to be adjusted. */
+	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
+		relocs16[reloc16_idx++] = rel->r_offset;
+	else
+		relocs[reloc_idx++] = rel->r_offset;
+}
+
+static int cmp_relocs(const void *va, const void *vb)
+{
+	const unsigned long *a, *b;
+	a = va; b = vb;
+	return (*a == *b)? 0 : (*a > *b)? 1 : -1;
+}
+
+static int write32(unsigned int v, FILE *f)
+{
+	unsigned char buf[4];
+
+	put_unaligned_le32(v, buf);
+	return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
+}
+
+static void emit_relocs(int as_text, int use_real_mode)
+{
+	int i;
+	/* Count how many relocations I have and allocate space for them. */
+	reloc_count = 0;
+	walk_relocs(count_reloc, use_real_mode);
+	relocs = malloc(reloc_count * sizeof(relocs[0]));
+	if (!relocs) {
+		die("malloc of %d entries for relocs failed\n",
+			reloc_count);
+	}
+
+	relocs16 = malloc(reloc16_count * sizeof(relocs[0]));
+	if (!relocs16) {
+		die("malloc of %d entries for relocs16 failed\n",
+			reloc16_count);
+	}
+	/* Collect up the relocations */
+	reloc_idx = 0;
+	walk_relocs(collect_reloc, use_real_mode);
+
+	if (reloc16_count && !use_real_mode)
+		die("Segment relocations found but --realmode not specified\n");
+
+	/* Order the relocations for more efficient processing */
+	qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
+	qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs);
+
+	/* Print the relocations */
+	if (as_text) {
+		/* Print the relocations in a form suitable that
+		 * gas will like.
+		 */
+		printf(".section \".data.reloc\",\"a\"\n");
+		printf(".balign 4\n");
+		if (use_real_mode) {
+			printf("\t.long %lu\n", reloc16_count);
+			for (i = 0; i < reloc16_count; i++)
+				printf("\t.long 0x%08lx\n", relocs16[i]);
+			printf("\t.long %lu\n", reloc_count);
+			for (i = 0; i < reloc_count; i++) {
+				printf("\t.long 0x%08lx\n", relocs[i]);
+			}
+		} else {
+			/* Print a stop */
+			printf("\t.long 0x%08lx\n", (unsigned long)0);
+			for (i = 0; i < reloc_count; i++) {
+				printf("\t.long 0x%08lx\n", relocs[i]);
+			}
+		}
+
+		printf("\n");
+	}
+	else {
+		if (use_real_mode) {
+			write32(reloc16_count, stdout);
+			for (i = 0; i < reloc16_count; i++)
+				write32(relocs16[i], stdout);
+			write32(reloc_count, stdout);
+
+			/* Now print each relocation */
+			for (i = 0; i < reloc_count; i++)
+				write32(relocs[i], stdout);
+		} else {
+			/* Print a stop */
+			write32(0, stdout);
+
+			/* Now print each relocation */
+			for (i = 0; i < reloc_count; i++) {
+				write32(relocs[i], stdout);
+			}
+		}
+	}
+}
+
+static void usage(void)
+{
+	die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n");
+}
+
+int main(int argc, char **argv)
+{
+	int show_absolute_syms, show_absolute_relocs;
+	int as_text, use_real_mode;
+	const char *fname;
+	FILE *fp;
+	int i;
+
+	show_absolute_syms = 0;
+	show_absolute_relocs = 0;
+	as_text = 0;
+	use_real_mode = 0;
+	fname = NULL;
+	for (i = 1; i < argc; i++) {
+		char *arg = argv[i];
+		if (*arg == '-') {
+			if (strcmp(arg, "--abs-syms") == 0) {
+				show_absolute_syms = 1;
+				continue;
+			}
+			if (strcmp(arg, "--abs-relocs") == 0) {
+				show_absolute_relocs = 1;
+				continue;
+			}
+			if (strcmp(arg, "--text") == 0) {
+				as_text = 1;
+				continue;
+			}
+			if (strcmp(arg, "--realmode") == 0) {
+				use_real_mode = 1;
+				continue;
+			}
+		}
+		else if (!fname) {
+			fname = arg;
+			continue;
+		}
+		usage();
+	}
+	if (!fname) {
+		usage();
+	}
+	regex_init(use_real_mode);
+	fp = fopen(fname, "r");
+	if (!fp) {
+		die("Cannot open %s: %s\n",
+			fname, strerror(errno));
+	}
+	read_ehdr(fp);
+	read_shdrs(fp);
+	read_strtabs(fp);
+	read_symtabs(fp);
+	read_relocs(fp);
+	if (show_absolute_syms) {
+		print_absolute_symbols();
+		return 0;
+	}
+	if (show_absolute_relocs) {
+		print_absolute_relocs();
+		return 0;
+	}
+	emit_relocs(as_text, use_real_mode);
+	return 0;
+}
diff --git a/scripts/Makefile b/scripts/Makefile
index a241359d2c82..36266665dbcb 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -15,7 +15,6 @@ hostprogs-$(CONFIG_LOGO)         += pnmtologo
 hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(CONFIG_IKCONFIG)     += bin2c
 hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
-hostprogs-$(CONFIG_X86)          += x86-relocs
 
 always		:= $(hostprogs-y) $(hostprogs-m)
 
diff --git a/scripts/x86-relocs.c b/scripts/x86-relocs.c
deleted file mode 100644
index 74e16bb15dc4..000000000000
--- a/scripts/x86-relocs.c
+++ /dev/null
@@ -1,804 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <elf.h>
-#include <byteswap.h>
-#define USE_BSD
-#include <endian.h>
-#include <regex.h>
-#include <tools/le_byteshift.h>
-
-static void die(char *fmt, ...);
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-static Elf32_Ehdr ehdr;
-static unsigned long reloc_count, reloc_idx;
-static unsigned long *relocs;
-static unsigned long reloc16_count, reloc16_idx;
-static unsigned long *relocs16;
-
-struct section {
-	Elf32_Shdr     shdr;
-	struct section *link;
-	Elf32_Sym      *symtab;
-	Elf32_Rel      *reltab;
-	char           *strtab;
-};
-static struct section *secs;
-
-enum symtype {
-	S_ABS,
-	S_REL,
-	S_SEG,
-	S_LIN,
-	S_NSYMTYPES
-};
-
-static const char * const sym_regex_kernel[S_NSYMTYPES] = {
-/*
- * Following symbols have been audited. There values are constant and do
- * not change if bzImage is loaded at a different physical address than
- * the address for which it has been compiled. Don't warn user about
- * absolute relocations present w.r.t these symbols.
- */
-	[S_ABS] =
-	"^(xen_irq_disable_direct_reloc$|"
-	"xen_save_fl_direct_reloc$|"
-	"VDSO|"
-	"__crc_)",
-
-/*
- * These symbols are known to be relative, even if the linker marks them
- * as absolute (typically defined outside any section in the linker script.)
- */
-	[S_REL] =
-	"^_end$",
-};
-
-
-static const char * const sym_regex_realmode[S_NSYMTYPES] = {
-/*
- * These symbols are known to be relative, even if the linker marks them
- * as absolute (typically defined outside any section in the linker script.)
- */
-	[S_REL] =
-	"^pa_",
-
-/*
- * These are 16-bit segment symbols when compiling 16-bit code.
- */
-	[S_SEG] =
-	"^real_mode_seg$",
-
-/*
- * These are offsets belonging to segments, as opposed to linear addresses,
- * when compiling 16-bit code.
- */
-	[S_LIN] =
-	"^pa_",
-};
-
-static const char * const *sym_regex;
-
-static regex_t sym_regex_c[S_NSYMTYPES];
-static int is_reloc(enum symtype type, const char *sym_name)
-{
-	return sym_regex[type] &&
-		!regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
-}
-
-static void regex_init(int use_real_mode)
-{
-        char errbuf[128];
-        int err;
-	int i;
-
-	if (use_real_mode)
-		sym_regex = sym_regex_realmode;
-	else
-		sym_regex = sym_regex_kernel;
-
-	for (i = 0; i < S_NSYMTYPES; i++) {
-		if (!sym_regex[i])
-			continue;
-
-		err = regcomp(&sym_regex_c[i], sym_regex[i],
-			      REG_EXTENDED|REG_NOSUB);
-
-		if (err) {
-			regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf);
-			die("%s", errbuf);
-		}
-        }
-}
-
-static void die(char *fmt, ...)
-{
-	va_list ap;
-	va_start(ap, fmt);
-	vfprintf(stderr, fmt, ap);
-	va_end(ap);
-	exit(1);
-}
-
-static const char *sym_type(unsigned type)
-{
-	static const char *type_name[] = {
-#define SYM_TYPE(X) [X] = #X
-		SYM_TYPE(STT_NOTYPE),
-		SYM_TYPE(STT_OBJECT),
-		SYM_TYPE(STT_FUNC),
-		SYM_TYPE(STT_SECTION),
-		SYM_TYPE(STT_FILE),
-		SYM_TYPE(STT_COMMON),
-		SYM_TYPE(STT_TLS),
-#undef SYM_TYPE
-	};
-	const char *name = "unknown sym type name";
-	if (type < ARRAY_SIZE(type_name)) {
-		name = type_name[type];
-	}
-	return name;
-}
-
-static const char *sym_bind(unsigned bind)
-{
-	static const char *bind_name[] = {
-#define SYM_BIND(X) [X] = #X
-		SYM_BIND(STB_LOCAL),
-		SYM_BIND(STB_GLOBAL),
-		SYM_BIND(STB_WEAK),
-#undef SYM_BIND
-	};
-	const char *name = "unknown sym bind name";
-	if (bind < ARRAY_SIZE(bind_name)) {
-		name = bind_name[bind];
-	}
-	return name;
-}
-
-static const char *sym_visibility(unsigned visibility)
-{
-	static const char *visibility_name[] = {
-#define SYM_VISIBILITY(X) [X] = #X
-		SYM_VISIBILITY(STV_DEFAULT),
-		SYM_VISIBILITY(STV_INTERNAL),
-		SYM_VISIBILITY(STV_HIDDEN),
-		SYM_VISIBILITY(STV_PROTECTED),
-#undef SYM_VISIBILITY
-	};
-	const char *name = "unknown sym visibility name";
-	if (visibility < ARRAY_SIZE(visibility_name)) {
-		name = visibility_name[visibility];
-	}
-	return name;
-}
-
-static const char *rel_type(unsigned type)
-{
-	static const char *type_name[] = {
-#define REL_TYPE(X) [X] = #X
-		REL_TYPE(R_386_NONE),
-		REL_TYPE(R_386_32),
-		REL_TYPE(R_386_PC32),
-		REL_TYPE(R_386_GOT32),
-		REL_TYPE(R_386_PLT32),
-		REL_TYPE(R_386_COPY),
-		REL_TYPE(R_386_GLOB_DAT),
-		REL_TYPE(R_386_JMP_SLOT),
-		REL_TYPE(R_386_RELATIVE),
-		REL_TYPE(R_386_GOTOFF),
-		REL_TYPE(R_386_GOTPC),
-		REL_TYPE(R_386_8),
-		REL_TYPE(R_386_PC8),
-		REL_TYPE(R_386_16),
-		REL_TYPE(R_386_PC16),
-#undef REL_TYPE
-	};
-	const char *name = "unknown type rel type name";
-	if (type < ARRAY_SIZE(type_name) && type_name[type]) {
-		name = type_name[type];
-	}
-	return name;
-}
-
-static const char *sec_name(unsigned shndx)
-{
-	const char *sec_strtab;
-	const char *name;
-	sec_strtab = secs[ehdr.e_shstrndx].strtab;
-	name = "<noname>";
-	if (shndx < ehdr.e_shnum) {
-		name = sec_strtab + secs[shndx].shdr.sh_name;
-	}
-	else if (shndx == SHN_ABS) {
-		name = "ABSOLUTE";
-	}
-	else if (shndx == SHN_COMMON) {
-		name = "COMMON";
-	}
-	return name;
-}
-
-static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
-{
-	const char *name;
-	name = "<noname>";
-	if (sym->st_name) {
-		name = sym_strtab + sym->st_name;
-	}
-	else {
-		name = sec_name(sym->st_shndx);
-	}
-	return name;
-}
-
-
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#endif
-#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#endif
-
-static uint16_t elf16_to_cpu(uint16_t val)
-{
-	return le16_to_cpu(val);
-}
-
-static uint32_t elf32_to_cpu(uint32_t val)
-{
-	return le32_to_cpu(val);
-}
-
-static void read_ehdr(FILE *fp)
-{
-	if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
-		die("Cannot read ELF header: %s\n",
-			strerror(errno));
-	}
-	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
-		die("No ELF magic\n");
-	}
-	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
-		die("Not a 32 bit executable\n");
-	}
-	if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
-		die("Not a LSB ELF executable\n");
-	}
-	if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
-		die("Unknown ELF version\n");
-	}
-	/* Convert the fields to native endian */
-	ehdr.e_type      = elf16_to_cpu(ehdr.e_type);
-	ehdr.e_machine   = elf16_to_cpu(ehdr.e_machine);
-	ehdr.e_version   = elf32_to_cpu(ehdr.e_version);
-	ehdr.e_entry     = elf32_to_cpu(ehdr.e_entry);
-	ehdr.e_phoff     = elf32_to_cpu(ehdr.e_phoff);
-	ehdr.e_shoff     = elf32_to_cpu(ehdr.e_shoff);
-	ehdr.e_flags     = elf32_to_cpu(ehdr.e_flags);
-	ehdr.e_ehsize    = elf16_to_cpu(ehdr.e_ehsize);
-	ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
-	ehdr.e_phnum     = elf16_to_cpu(ehdr.e_phnum);
-	ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
-	ehdr.e_shnum     = elf16_to_cpu(ehdr.e_shnum);
-	ehdr.e_shstrndx  = elf16_to_cpu(ehdr.e_shstrndx);
-
-	if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
-		die("Unsupported ELF header type\n");
-	}
-	if (ehdr.e_machine != EM_386) {
-		die("Not for x86\n");
-	}
-	if (ehdr.e_version != EV_CURRENT) {
-		die("Unknown ELF version\n");
-	}
-	if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
-		die("Bad Elf header size\n");
-	}
-	if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
-		die("Bad program header entry\n");
-	}
-	if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
-		die("Bad section header entry\n");
-	}
-	if (ehdr.e_shstrndx >= ehdr.e_shnum) {
-		die("String table index out of bounds\n");
-	}
-}
-
-static void read_shdrs(FILE *fp)
-{
-	int i;
-	Elf32_Shdr shdr;
-
-	secs = calloc(ehdr.e_shnum, sizeof(struct section));
-	if (!secs) {
-		die("Unable to allocate %d section headers\n",
-		    ehdr.e_shnum);
-	}
-	if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
-		die("Seek to %d failed: %s\n",
-			ehdr.e_shoff, strerror(errno));
-	}
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (fread(&shdr, sizeof shdr, 1, fp) != 1)
-			die("Cannot read ELF section headers %d/%d: %s\n",
-			    i, ehdr.e_shnum, strerror(errno));
-		sec->shdr.sh_name      = elf32_to_cpu(shdr.sh_name);
-		sec->shdr.sh_type      = elf32_to_cpu(shdr.sh_type);
-		sec->shdr.sh_flags     = elf32_to_cpu(shdr.sh_flags);
-		sec->shdr.sh_addr      = elf32_to_cpu(shdr.sh_addr);
-		sec->shdr.sh_offset    = elf32_to_cpu(shdr.sh_offset);
-		sec->shdr.sh_size      = elf32_to_cpu(shdr.sh_size);
-		sec->shdr.sh_link      = elf32_to_cpu(shdr.sh_link);
-		sec->shdr.sh_info      = elf32_to_cpu(shdr.sh_info);
-		sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
-		sec->shdr.sh_entsize   = elf32_to_cpu(shdr.sh_entsize);
-		if (sec->shdr.sh_link < ehdr.e_shnum)
-			sec->link = &secs[sec->shdr.sh_link];
-	}
-
-}
-
-static void read_strtabs(FILE *fp)
-{
-	int i;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_STRTAB) {
-			continue;
-		}
-		sec->strtab = malloc(sec->shdr.sh_size);
-		if (!sec->strtab) {
-			die("malloc of %d bytes for strtab failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-	}
-}
-
-static void read_symtabs(FILE *fp)
-{
-	int i,j;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_SYMTAB) {
-			continue;
-		}
-		sec->symtab = malloc(sec->shdr.sh_size);
-		if (!sec->symtab) {
-			die("malloc of %d bytes for symtab failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
-			Elf32_Sym *sym = &sec->symtab[j];
-			sym->st_name  = elf32_to_cpu(sym->st_name);
-			sym->st_value = elf32_to_cpu(sym->st_value);
-			sym->st_size  = elf32_to_cpu(sym->st_size);
-			sym->st_shndx = elf16_to_cpu(sym->st_shndx);
-		}
-	}
-}
-
-
-static void read_relocs(FILE *fp)
-{
-	int i,j;
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec->reltab = malloc(sec->shdr.sh_size);
-		if (!sec->reltab) {
-			die("malloc of %d bytes for relocs failed\n",
-				sec->shdr.sh_size);
-		}
-		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
-			die("Seek to %d failed: %s\n",
-				sec->shdr.sh_offset, strerror(errno));
-		}
-		if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
-		    != sec->shdr.sh_size) {
-			die("Cannot read symbol table: %s\n",
-				strerror(errno));
-		}
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel = &sec->reltab[j];
-			rel->r_offset = elf32_to_cpu(rel->r_offset);
-			rel->r_info   = elf32_to_cpu(rel->r_info);
-		}
-	}
-}
-
-
-static void print_absolute_symbols(void)
-{
-	int i;
-	printf("Absolute symbols\n");
-	printf(" Num:    Value Size  Type       Bind        Visibility  Name\n");
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		char *sym_strtab;
-		int j;
-
-		if (sec->shdr.sh_type != SHT_SYMTAB) {
-			continue;
-		}
-		sym_strtab = sec->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
-			Elf32_Sym *sym;
-			const char *name;
-			sym = &sec->symtab[j];
-			name = sym_name(sym_strtab, sym);
-			if (sym->st_shndx != SHN_ABS) {
-				continue;
-			}
-			printf("%5d %08x %5d %10s %10s %12s %s\n",
-				j, sym->st_value, sym->st_size,
-				sym_type(ELF32_ST_TYPE(sym->st_info)),
-				sym_bind(ELF32_ST_BIND(sym->st_info)),
-				sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
-				name);
-		}
-	}
-	printf("\n");
-}
-
-static void print_absolute_relocs(void)
-{
-	int i, printed = 0;
-
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		struct section *sec = &secs[i];
-		struct section *sec_applies, *sec_symtab;
-		char *sym_strtab;
-		Elf32_Sym *sh_symtab;
-		int j;
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec_symtab  = sec->link;
-		sec_applies = &secs[sec->shdr.sh_info];
-		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
-			continue;
-		}
-		sh_symtab  = sec_symtab->symtab;
-		sym_strtab = sec_symtab->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel;
-			Elf32_Sym *sym;
-			const char *name;
-			rel = &sec->reltab[j];
-			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
-			name = sym_name(sym_strtab, sym);
-			if (sym->st_shndx != SHN_ABS) {
-				continue;
-			}
-
-			/* Absolute symbols are not relocated if bzImage is
-			 * loaded at a non-compiled address. Display a warning
-			 * to user at compile time about the absolute
-			 * relocations present.
-			 *
-			 * User need to audit the code to make sure
-			 * some symbols which should have been section
-			 * relative have not become absolute because of some
-			 * linker optimization or wrong programming usage.
-			 *
-			 * Before warning check if this absolute symbol
-			 * relocation is harmless.
-			 */
-			if (is_reloc(S_ABS, name) || is_reloc(S_REL, name))
-				continue;
-
-			if (!printed) {
-				printf("WARNING: Absolute relocations"
-					" present\n");
-				printf("Offset     Info     Type     Sym.Value "
-					"Sym.Name\n");
-				printed = 1;
-			}
-
-			printf("%08x %08x %10s %08x  %s\n",
-				rel->r_offset,
-				rel->r_info,
-				rel_type(ELF32_R_TYPE(rel->r_info)),
-				sym->st_value,
-				name);
-		}
-	}
-
-	if (printed)
-		printf("\n");
-}
-
-static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
-			int use_real_mode)
-{
-	int i;
-	/* Walk through the relocations */
-	for (i = 0; i < ehdr.e_shnum; i++) {
-		char *sym_strtab;
-		Elf32_Sym *sh_symtab;
-		struct section *sec_applies, *sec_symtab;
-		int j;
-		struct section *sec = &secs[i];
-
-		if (sec->shdr.sh_type != SHT_REL) {
-			continue;
-		}
-		sec_symtab  = sec->link;
-		sec_applies = &secs[sec->shdr.sh_info];
-		if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
-			continue;
-		}
-		sh_symtab = sec_symtab->symtab;
-		sym_strtab = sec_symtab->link->strtab;
-		for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
-			Elf32_Rel *rel;
-			Elf32_Sym *sym;
-			unsigned r_type;
-			const char *symname;
-			rel = &sec->reltab[j];
-			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
-			r_type = ELF32_R_TYPE(rel->r_info);
-
-			switch (r_type) {
-			case R_386_NONE:
-			case R_386_PC32:
-			case R_386_PC16:
-			case R_386_PC8:
-				/*
-				 * NONE can be ignored and and PC relative
-				 * relocations don't need to be adjusted.
-				 */
-				break;
-
-			case R_386_16:
-				symname = sym_name(sym_strtab, sym);
-				if (!use_real_mode)
-					goto bad;
-				if (sym->st_shndx == SHN_ABS) {
-					if (is_reloc(S_ABS, symname))
-						break;
-					else if (!is_reloc(S_SEG, symname))
-						goto bad;
-				} else {
-					if (is_reloc(S_LIN, symname))
-						goto bad;
-					else
-						break;
-				}
-				visit(rel, sym);
-				break;
-
-			case R_386_32:
-				symname = sym_name(sym_strtab, sym);
-				if (sym->st_shndx == SHN_ABS) {
-					if (is_reloc(S_ABS, symname))
-						break;
-					else if (!is_reloc(S_REL, symname))
-						goto bad;
-				} else {
-					if (use_real_mode &&
-					    !is_reloc(S_LIN, symname))
-						break;
-				}
-				visit(rel, sym);
-				break;
-			default:
-				die("Unsupported relocation type: %s (%d)\n",
-				    rel_type(r_type), r_type);
-				break;
-			bad:
-				symname = sym_name(sym_strtab, sym);
-				die("Invalid %s relocation: %s\n",
-				    rel_type(r_type), symname);
-			}
-		}
-	}
-}
-
-static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
-	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
-		reloc16_count++;
-	else
-		reloc_count++;
-}
-
-static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
-	/* Remember the address that needs to be adjusted. */
-	if (ELF32_R_TYPE(rel->r_info) == R_386_16)
-		relocs16[reloc16_idx++] = rel->r_offset;
-	else
-		relocs[reloc_idx++] = rel->r_offset;
-}
-
-static int cmp_relocs(const void *va, const void *vb)
-{
-	const unsigned long *a, *b;
-	a = va; b = vb;
-	return (*a == *b)? 0 : (*a > *b)? 1 : -1;
-}
-
-static int write32(unsigned int v, FILE *f)
-{
-	unsigned char buf[4];
-
-	put_unaligned_le32(v, buf);
-	return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
-}
-
-static void emit_relocs(int as_text, int use_real_mode)
-{
-	int i;
-	/* Count how many relocations I have and allocate space for them. */
-	reloc_count = 0;
-	walk_relocs(count_reloc, use_real_mode);
-	relocs = malloc(reloc_count * sizeof(relocs[0]));
-	if (!relocs) {
-		die("malloc of %d entries for relocs failed\n",
-			reloc_count);
-	}
-
-	relocs16 = malloc(reloc16_count * sizeof(relocs[0]));
-	if (!relocs16) {
-		die("malloc of %d entries for relocs16 failed\n",
-			reloc16_count);
-	}
-	/* Collect up the relocations */
-	reloc_idx = 0;
-	walk_relocs(collect_reloc, use_real_mode);
-
-	if (reloc16_count && !use_real_mode)
-		die("Segment relocations found but --realmode not specified\n");
-
-	/* Order the relocations for more efficient processing */
-	qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
-	qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs);
-
-	/* Print the relocations */
-	if (as_text) {
-		/* Print the relocations in a form suitable that
-		 * gas will like.
-		 */
-		printf(".section \".data.reloc\",\"a\"\n");
-		printf(".balign 4\n");
-		if (use_real_mode) {
-			printf("\t.long %lu\n", reloc16_count);
-			for (i = 0; i < reloc16_count; i++)
-				printf("\t.long 0x%08lx\n", relocs16[i]);
-			printf("\t.long %lu\n", reloc_count);
-			for (i = 0; i < reloc_count; i++) {
-				printf("\t.long 0x%08lx\n", relocs[i]);
-			}
-		} else {
-			/* Print a stop */
-			printf("\t.long 0x%08lx\n", (unsigned long)0);
-			for (i = 0; i < reloc_count; i++) {
-				printf("\t.long 0x%08lx\n", relocs[i]);
-			}
-		}
-
-		printf("\n");
-	}
-	else {
-		if (use_real_mode) {
-			write32(reloc16_count, stdout);
-			for (i = 0; i < reloc16_count; i++)
-				write32(relocs16[i], stdout);
-			write32(reloc_count, stdout);
-
-			/* Now print each relocation */
-			for (i = 0; i < reloc_count; i++)
-				write32(relocs[i], stdout);
-		} else {
-			/* Print a stop */
-			write32(0, stdout);
-
-			/* Now print each relocation */
-			for (i = 0; i < reloc_count; i++) {
-				write32(relocs[i], stdout);
-			}
-		}
-	}
-}
-
-static void usage(void)
-{
-	die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n");
-}
-
-int main(int argc, char **argv)
-{
-	int show_absolute_syms, show_absolute_relocs;
-	int as_text, use_real_mode;
-	const char *fname;
-	FILE *fp;
-	int i;
-
-	show_absolute_syms = 0;
-	show_absolute_relocs = 0;
-	as_text = 0;
-	use_real_mode = 0;
-	fname = NULL;
-	for (i = 1; i < argc; i++) {
-		char *arg = argv[i];
-		if (*arg == '-') {
-			if (strcmp(arg, "--abs-syms") == 0) {
-				show_absolute_syms = 1;
-				continue;
-			}
-			if (strcmp(arg, "--abs-relocs") == 0) {
-				show_absolute_relocs = 1;
-				continue;
-			}
-			if (strcmp(arg, "--text") == 0) {
-				as_text = 1;
-				continue;
-			}
-			if (strcmp(arg, "--realmode") == 0) {
-				use_real_mode = 1;
-				continue;
-			}
-		}
-		else if (!fname) {
-			fname = arg;
-			continue;
-		}
-		usage();
-	}
-	if (!fname) {
-		usage();
-	}
-	regex_init(use_real_mode);
-	fp = fopen(fname, "r");
-	if (!fp) {
-		die("Cannot open %s: %s\n",
-			fname, strerror(errno));
-	}
-	read_ehdr(fp);
-	read_shdrs(fp);
-	read_strtabs(fp);
-	read_symtabs(fp);
-	read_relocs(fp);
-	if (show_absolute_syms) {
-		print_absolute_symbols();
-		return 0;
-	}
-	if (show_absolute_relocs) {
-		print_absolute_relocs();
-		return 0;
-	}
-	emit_relocs(as_text, use_real_mode);
-	return 0;
-}
-- 
cgit v1.2.3


From bf8b88e97716feb750c3d34492f00d9c085e1183 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:45 +0300
Subject: x86, realmode: fixes compilation issue in tboot.c

Fixed include path of wakeup.h in tboot.c.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-23-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/tboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 65adda4fde93..f84fe00fad48 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -44,7 +44,7 @@
 #include <asm/e820.h>
 #include <asm/io.h>
 
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
-- 
cgit v1.2.3


From cda846f101fb1396b6924f1d9b68ac3d42de5403 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Tue, 8 May 2012 21:22:46 +0300
Subject: x86, realmode: read cr4 and EFER from kernel for 64-bit trampoline

This patch changes 64-bit trampoline so that CR4 and
EFER are provided by the kernel instead of using fixed
values.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/processor.h         |  7 ++++++-
 arch/x86/include/asm/realmode.h          |  8 ++++++--
 arch/x86/kernel/realmode.c               |  8 ++++++++
 arch/x86/kernel/setup.c                  |  2 ++
 arch/x86/realmode/rm/header.S            |  1 +
 arch/x86/realmode/rm/trampoline_64.S     | 32 +++++++-------------------------
 arch/x86/realmode/rm/trampoline_common.S | 19 +++++++++++++++++++
 7 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa7dcceb6c0..404583ccf0cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -544,13 +544,16 @@ static inline void load_sp0(struct tss_struct *tss,
  * enable), so that any CPU's that boot up
  * after us can get the correct flags.
  */
-extern unsigned long		mmu_cr4_features;
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
 
 static inline void set_in_cr4(unsigned long mask)
 {
 	unsigned long cr4;
 
 	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
 	cr4 = read_cr4();
 	cr4 |= mask;
 	write_cr4(cr4);
@@ -561,6 +564,8 @@ static inline void clear_in_cr4(unsigned long mask)
 	unsigned long cr4;
 
 	mmu_cr4_features &= ~mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
 	cr4 = read_cr4();
 	cr4 &= ~mask;
 	write_cr4(cr4);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 1421eed1c8e8..937dc6071d76 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -24,18 +24,22 @@ struct real_mode_header {
 #ifdef CONFIG_X86_32
 	u32	machine_real_restart_asm;
 #endif
-} __attribute__((__packed__));
+};
 
 /* This must match data at trampoline_32/64.S */
 struct trampoline_header {
 #ifdef CONFIG_X86_32
 	u32 start;
+	u16 gdt_pad;
 	u16 gdt_limit;
 	u32 gdt_base;
 #else
 	u64 start;
+	u32 cr4;
+	u32 efer_low;
+	u32 efer_high;
 #endif
-} __attribute__((__packed__));
+};
 
 extern struct real_mode_header *real_mode_header;
 extern unsigned char real_mode_blob_end[];
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index 712fba8fd774..66ac276cf361 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -6,6 +6,7 @@
 #include <asm/realmode.h>
 
 struct real_mode_header *real_mode_header;
+u32 *trampoline_cr4_features;
 
 void __init setup_real_mode(void)
 {
@@ -64,7 +65,14 @@ void __init setup_real_mode(void)
 	trampoline_header->gdt_limit = __BOOT_DS + 7;
 	trampoline_header->gdt_base = __pa(boot_gdt);
 #else
+	if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low,
+		       &trampoline_header->efer_high))
+		BUG();
+
 	trampoline_header->start = (u64) secondary_startup_64;
+	trampoline_cr4_features = &trampoline_header->cr4;
+	*trampoline_cr4_features = read_cr4();
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
 	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7a14fece9cfc..efcf305210a4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -975,6 +975,8 @@ void __init setup_arch(char **cmdline_p)
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
 		mmu_cr4_features = read_cr4();
+		if (trampoline_cr4_features)
+			*trampoline_cr4_features = mmu_cr4_features;
 	}
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index b4c32632bf16..4612d5382791 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -9,6 +9,7 @@
 
 	.section ".header", "a"
 
+	.balign	16
 GLOBAL(real_mode_header)
 	.long	pa_text_start
 	.long	pa_ro_end
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 3f7293239365..66e26f088288 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -34,9 +34,9 @@
 #include "realmode.h"
 
 	.text
-	.balign PAGE_SIZE
 	.code16
 
+	.balign	PAGE_SIZE
 ENTRY(trampoline_start)
 	cli			# We should be safe anyway
 	wbinvd
@@ -65,8 +65,8 @@ ENTRY(trampoline_start)
 	 * to 32 bit.
 	 */
 
-	lidtl	tidt	# load idt with 0, 0
-	lgdtl	tgdt	# load gdt with whatever is appropriate
+	lidtl	tr_idt	# load idt with 0, 0
+	lgdtl	tr_gdt	# load gdt with whatever is appropriate
 
 	movw	$__KERNEL_DS, %dx	# Data segment descriptor
 
@@ -93,16 +93,17 @@ ENTRY(startup_32)
 	movl	%edx, %fs
 	movl	%edx, %gs
 
-	movl	$X86_CR4_PAE, %eax
+	movl	pa_tr_cr4, %eax
 	movl	%eax, %cr4		# Enable PAE mode
 
 	# Setup trampoline 4 level pagetables
 	movl	$pa_trampoline_pgd, %eax
 	movl	%eax, %cr3
 
+	# Set up EFER
+	movl	pa_tr_efer, %eax
+	movl	pa_tr_efer + 4, %edx
 	movl	$MSR_EFER, %ecx
-	movl	$((1 << _EFER_LME) | (1 << _EFER_NX)), %eax	# Enable Long Mode
-	xorl	%edx, %edx
 	wrmsr
 
 	# Enable paging and in turn activate Long Mode
@@ -124,23 +125,4 @@ ENTRY(startup_64)
 	# Now jump into the kernel using virtual addresses
 	jmpq	*tr_start(%rip)
 
-	.section ".rodata","a"
-	.balign	16
-tidt:
-	.word	0			# idt limit = 0
-	.word	0, 0			# idt base = 0L
-
-	# Duplicate the global descriptor table
-	# so the kernel can live anywhere
-	.balign 16
-	.globl tgdt
-tgdt:
-	.short	tgdt_end - tgdt - 1	# gdt limit
-	.long	pa_tgdt
-	.short	0
-	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
-	.quad	0x00af9b000000ffff	# __KERNEL_CS
-	.quad	0x00cf93000000ffff	# __KERNEL_DS
-tgdt_end:
-
 #include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
index c3f951c468c5..cac444b942f8 100644
--- a/arch/x86/realmode/rm/trampoline_common.S
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -1,5 +1,20 @@
 	.section ".rodata","a"
 
+#ifdef CONFIG_X86_64
+	# Duplicate the global descriptor table
+	# so the kernel can live anywhere
+	.balign	16
+	.globl tr_gdt
+tr_gdt:
+	.short	tr_gdt_end - tr_gdt - 1	# gdt limit
+	.long	pa_tr_gdt
+	.short	0
+	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
+	.quad	0x00af9b000000ffff	# __KERNEL_CS
+	.quad	0x00cf93000000ffff	# __KERNEL_DS
+tr_gdt_end:
+#endif
+
 	.balign	4
 tr_idt: .fill 1, 6, 0
 
@@ -8,12 +23,16 @@ tr_idt: .fill 1, 6, 0
 	.balign	4
 GLOBAL(trampoline_status)	.space	4
 
+	.balign	8
 GLOBAL(trampoline_header)
 #ifdef CONFIG_X86_32
 	tr_start:		.space	4
+	tr_gdt_pad:		.space	2
 	tr_gdt:			.space	6
 #else
 	tr_start:		.space	8
+	GLOBAL(tr_cr4)		.space	4
+	GLOBAL(tr_efer)		.space	8
 #endif
 END(trampoline_header)
 
-- 
cgit v1.2.3


From 1f0459780c28491c480f7098f3ece79334ccae0a Mon Sep 17 00:00:00 2001
From: Philipp Hahn <hahn@univention.de>
Date: Wed, 2 May 2012 18:09:35 +0200
Subject: atomic64_32.h: fix parameter naming mismatch

The doc string doesn't match the parameter name, fix
@p -> @v
@ptr -> @v
@n -> @i

Signed-off-by: Philipp Hahn <hahn@univention.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/atomic64_32.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 198119910da5..b154de75c90c 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -63,7 +63,7 @@ ATOMIC64_DECL(add_unless);
 
 /**
  * atomic64_cmpxchg - cmpxchg atomic64 variable
- * @p: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  * @o: expected value
  * @n: new value
  *
@@ -98,7 +98,7 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n)
 /**
  * atomic64_set - set atomic64 variable
  * @v: pointer to type atomic64_t
- * @n: value to assign
+ * @i: value to assign
  *
  * Atomically sets the value of @v to @n.
  */
@@ -200,7 +200,7 @@ static inline long long atomic64_sub(long long i, atomic64_t *v)
  * atomic64_sub_and_test - subtract value from variable and test result
  * @i: integer value to subtract
  * @v: pointer to type atomic64_t
-  *
+ *
  * Atomically subtracts @i from @v and returns
  * true if the result is zero, or false for all
  * other cases.
@@ -224,9 +224,9 @@ static inline void atomic64_inc(atomic64_t *v)
 
 /**
  * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically decrements @ptr by 1.
+ * Atomically decrements @v by 1.
  */
 static inline void atomic64_dec(atomic64_t *v)
 {
-- 
cgit v1.2.3


From 57da8b960b9a25646a8ddb5a9c1d0b5978e69bec Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Wed, 9 May 2012 08:47:37 +0100
Subject: x86: Avoid double stack traces with show_regs()

What was called show_registers() so far already showed a stack
trace for kernel faults, and kernel_stack_pointer() isn't even
valid to be used for faults from user mode, hence it was
pointless for show_regs() to call show_trace() after
show_registers().

Simply rename show_registers() to show_regs() and eliminate
the old definition.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/4FAA3D3902000078000826E1@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/kdebug.h  | 1 -
 arch/x86/kernel/dumpstack.c    | 2 +-
 arch/x86/kernel/dumpstack_32.c | 2 +-
 arch/x86/kernel/dumpstack_64.c | 2 +-
 arch/x86/kernel/kprobes.c      | 4 ++--
 arch/x86/kernel/nmi.c          | 2 +-
 arch/x86/kernel/process.c      | 6 ------
 7 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index d73f1571bde7..2c37aadcbc35 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -24,7 +24,6 @@ enum die_val {
 extern void printk_address(unsigned long address, int reliable);
 extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 		       unsigned long *sp, unsigned long bp);
 extern void __show_regs(struct pt_regs *regs, int all);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1b81839b6c88..40989da4bb22 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
-	show_registers(regs);
+	show_regs(regs);
 #ifdef CONFIG_X86_32
 	if (user_mode_vm(regs)) {
 		sp = regs->sp;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 88ec9129271d..e0b1d783daab 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 }
 
 
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
 {
 	int i;
 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd6e1f0..791b76122aa8 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
 {
 	int i;
 	unsigned long sp;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e213fc8408d2..e2f751efb7b1 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 			       "current sp %p does not match saved sp %p\n",
 			       stack_addr(regs), kcb->jprobe_saved_sp);
 			printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
-			show_registers(saved_regs);
+			show_regs(saved_regs);
 			printk(KERN_ERR "Current registers\n");
-			show_registers(regs);
+			show_regs(regs);
 			BUG();
 		}
 		*regs = kcb->jprobe_saved_regs;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf319165..03c134544966 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -244,7 +244,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	pr_emerg(
 	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
 		 reason, smp_processor_id());
-	show_registers(regs);
+	show_regs(regs);
 
 	if (panic_on_io_nmi)
 		panic("NMI IOCK error: Not continuing");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..856d5bcae5b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -105,12 +105,6 @@ void exit_thread(void)
 	}
 }
 
-void show_regs(struct pt_regs *regs)
-{
-	show_registers(regs);
-	show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
-}
-
 void show_regs_common(void)
 {
 	const char *vendor, *product, *board;
-- 
cgit v1.2.3


From 94c0dd3278dd3eae52eabf0fb77d472d0dd3e373 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 18 Apr 2012 19:04:17 +0200
Subject: x86/numa: Allow specifying node_distance() for numa=fake

Allows emulating more interesting NUMA configurations like a quad
socket AMD Magny-Cour:

 "numa=fake=8:10,16,16,22,16,22,16,22,
              16,10,22,16,22,16,22,16,
              16,22,10,16,16,22,16,22,
              22,16,16,10,22,16,22,16,
              16,22,16,22,10,16,16,22,
              22,16,22,16,16,10,22,16,
              16,22,16,22,16,22,10,16,
              22,16,22,16,22,16,16,10"

Which has a non-fully-connected topology.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/n/tip-e1136ef7kdffj7yf9tjhydln@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/numa_emulation.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 53489ff6bf82..871dd8868170 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	} else {
 		unsigned long n;
 
-		n = simple_strtoul(emu_cmdline, NULL, 0);
+		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
 		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
 	}
+	if (*emu_cmdline == ':')
+		emu_cmdline++;
 
 	if (ret < 0)
 		goto no_emu;
@@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 			int physj = emu_nid_to_phys[j];
 			int dist;
 
-			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+			if (get_option(&emu_cmdline, &dist) == 2)
+				;
+			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
 				dist = physi == physj ?
 					LOCAL_DISTANCE : REMOTE_DISTANCE;
 			else
-- 
cgit v1.2.3


From 0acbb440f06302058e1515861dd534594521e892 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 18 Apr 2012 19:04:09 +0200
Subject: x86/numa: Hard partition cpu topology masks on node boundaries

When using numa=fake= you can get weird topologies where LLCs can span
nodes and other such nonsense. Cure this by hard partitioning these
masks on node boundaries.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/n/tip-di5vwjm96q5vrb76opwuflwx@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..edfd03a9e390 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -337,6 +337,11 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		for_each_cpu(i, cpu_sibling_setup_mask) {
 			struct cpuinfo_x86 *o = &cpu_data(i);
 
+#ifdef CONFIG_NUMA_EMU
+			if (cpu_to_node(cpu) != cpu_to_node(i))
+				continue;
+#endif
+
 			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
 				if (c->phys_proc_id == o->phys_proc_id &&
 				    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
@@ -360,11 +365,17 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
+#ifdef CONFIG_NUMA_EMU
+		if (cpu_to_node(cpu) != cpu_to_node(i))
+			continue;
+#endif
+
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
 			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
 		}
+
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_core_mask(i));
-- 
cgit v1.2.3


From ad7687dde8780a0d618a3e3b5a62bb383696fc22 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 9 May 2012 13:31:47 +0200
Subject: x86/numa: Check for nonsensical topologies on real hw as well

Instead of only checking nonsensical topologies on numa-emu, do it
on real hardware as well, and print a warning.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/n/tip-re15l0jqjtpz709oxozt2zoh@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index edfd03a9e390..7c53d96d44ab 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -337,10 +337,10 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		for_each_cpu(i, cpu_sibling_setup_mask) {
 			struct cpuinfo_x86 *o = &cpu_data(i);
 
-#ifdef CONFIG_NUMA_EMU
-			if (cpu_to_node(cpu) != cpu_to_node(i))
+			if (cpu_to_node(cpu) != cpu_to_node(i)) {
+				WARN_ONCE(1, "sched: CPU #%d's thread-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i));
 				continue;
-#endif
+			}
 
 			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
 				if (c->phys_proc_id == o->phys_proc_id &&
@@ -365,10 +365,10 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
-#ifdef CONFIG_NUMA_EMU
-		if (cpu_to_node(cpu) != cpu_to_node(i))
+		if (cpu_to_node(cpu) != cpu_to_node(i)) {
+			WARN_ONCE(1, "sched: CPU #%d's core-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i));
 			continue;
-#endif
+		}
 
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-- 
cgit v1.2.3


From cb83b629bae0327cf9f44f096adc38d150ceb913 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 17 Apr 2012 15:49:36 +0200
Subject: sched/numa: Rewrite the CONFIG_NUMA sched domain support

The current code groups up to 16 nodes in a level and then puts an
ALLNODES domain spanning the entire tree on top of that. This doesn't
reflect the numa topology and esp for the smaller not-fully-connected
machines out there today this might make a difference.

Therefore, build a proper numa topology based on node_distance().

Since there's no fixed numa layers anymore, the static SD_NODE_INIT
and SD_ALLNODES_INIT aren't usable anymore, the new code tries to
construct something similar and scales some values either on the
number of cpus in the domain and/or the node_distance() ratio.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: linux-alpha@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: sparclinux@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86@kernel.org
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Greg Pearson <greg.pearson@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: bob.picco@oracle.com
Cc: chris.mason@oracle.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-r74n3n8hhuc2ynbrnp3vt954@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/include/asm/topology.h           |  25 ---
 arch/mips/include/asm/mach-ip27/topology.h |  17 --
 arch/powerpc/include/asm/topology.h        |  36 ----
 arch/sh/include/asm/topology.h             |  25 ---
 arch/sparc/include/asm/topology_64.h       |  19 --
 arch/tile/include/asm/topology.h           |  26 ---
 arch/x86/include/asm/topology.h            |  38 ----
 include/linux/topology.h                   |  37 ----
 kernel/sched/core.c                        | 280 +++++++++++++++++++----------
 9 files changed, 185 insertions(+), 318 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 09f646753d1a..a2496e449b75 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -70,31 +70,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 8*(min(num_online_cpus(), 32U)), \
-	.busy_factor		= 64,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_FORK	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 1b1a7d1632b9..b2cf641f206f 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 #define node_distance(from, to)	(__node_distances[(from)][(to)])
 
-/* sched_domains SD_NODE_INIT for SGI IP27 machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.flags			= SD_LOAD_BALANCE |	\
-				  SD_BALANCE_EXEC,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_MACH_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c97185885c6d..852ed1b384f6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -18,12 +18,6 @@ struct device_node;
  */
 #define RECLAIM_DISTANCE 10
 
-/*
- * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
- * POWER7 boxes which have a maximum of 32 nodes.
- */
-#define SD_NODES_PER_DOMAIN 32
-
 #include <asm/mmzone.h>
 
 static inline int cpu_to_node(int cpu)
@@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 				 cpu_all_mask :				\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
-/* sched_domains SD_NODE_INIT for PPC64 machines */
-#define SD_NODE_INIT (struct sched_domain) {				\
-	.min_interval		= 8,					\
-	.max_interval		= 32,					\
-	.busy_factor		= 32,					\
-	.imbalance_pct		= 125,					\
-	.cache_nice_tries	= 1,					\
-	.busy_idx		= 3,					\
-	.idle_idx		= 1,					\
-	.newidle_idx		= 0,					\
-	.wake_idx		= 0,					\
-	.forkexec_idx		= 0,					\
-									\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 0*SD_BALANCE_NEWIDLE			\
-				| 1*SD_BALANCE_EXEC			\
-				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 1*SD_WAKE_AFFINE			\
-				| 0*SD_PREFER_LOCAL			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_POWERSAVINGS_BALANCE		\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 1*SD_SERIALIZE			\
-				| 0*SD_PREFER_SIBLING			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 1,					\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 88e734069fa6..b0a282d65f6a 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -3,31 +3,6 @@
 
 #ifdef CONFIG_NUMA
 
-/* sched_domains SD_NODE_INIT for sh machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #define cpu_to_node(cpu)	((void)(cpu),0)
 #define parent_node(node)	((void)(node),0)
 
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 8b9c556d630b..1754390a426f 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	 cpu_all_mask : \
 	 cpumask_of_node(pcibus_to_node(bus)))
 
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0, 			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 #else /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 6fdd0c860193..7a7ce390534f 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 	.balance_interval	= 32,					\
 }
 
-/* sched_domains SD_NODE_INIT for TILE architecture */
-#define SD_NODE_INIT (struct sched_domain) {				\
-	.min_interval		= 16,					\
-	.max_interval		= 512,					\
-	.busy_factor		= 32,					\
-	.imbalance_pct		= 125,					\
-	.cache_nice_tries	= 1,					\
-	.busy_idx		= 3,					\
-	.idle_idx		= 1,					\
-	.newidle_idx		= 2,					\
-	.wake_idx		= 1,					\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
-				| 1*SD_BALANCE_EXEC			\
-				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
-				| 0*SD_PREFER_LOCAL			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 1*SD_SERIALIZE			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 128,					\
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae37ada..095b21507b6a 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
 
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
-#ifdef CONFIG_X86_32
-# define SD_CACHE_NICE_TRIES	1
-# define SD_IDLE_IDX		1
-#else
-# define SD_CACHE_NICE_TRIES	2
-# define SD_IDLE_IDX		2
-#endif
-
-/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {				\
-	.min_interval		= 8,					\
-	.max_interval		= 32,					\
-	.busy_factor		= 32,					\
-	.imbalance_pct		= 125,					\
-	.cache_nice_tries	= SD_CACHE_NICE_TRIES,			\
-	.busy_idx		= 3,					\
-	.idle_idx		= SD_IDLE_IDX,				\
-	.newidle_idx		= 0,					\
-	.wake_idx		= 0,					\
-	.forkexec_idx		= 0,					\
-									\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
-				| 1*SD_BALANCE_EXEC			\
-				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 1*SD_WAKE_AFFINE			\
-				| 0*SD_PREFER_LOCAL			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_POWERSAVINGS_BALANCE		\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 1*SD_SERIALIZE			\
-				| 0*SD_PREFER_SIBLING			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 1,					\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e26db031303b..4f59bf36f0af 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -70,7 +70,6 @@ int arch_update_cpu_topology(void);
  * Below are the 3 major initializers used in building sched_domains:
  * SD_SIBLING_INIT, for SMT domains
  * SD_CPU_INIT, for SMP domains
- * SD_NODE_INIT, for NUMA domains
  *
  * Any architecture that cares to do any tuning to these values should do so
  * by defining their own arch-specific initializer in include/asm/topology.h.
@@ -176,48 +175,12 @@ int arch_update_cpu_topology(void);
 }
 #endif
 
-/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {			\
-	.min_interval		= 64,					\
-	.max_interval		= 64*num_online_cpus(),			\
-	.busy_factor		= 128,					\
-	.imbalance_pct		= 133,					\
-	.cache_nice_tries	= 1,					\
-	.busy_idx		= 3,					\
-	.idle_idx		= 3,					\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
-				| 0*SD_BALANCE_EXEC			\
-				| 0*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_POWERSAVINGS_BALANCE		\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 1*SD_SERIALIZE			\
-				| 0*SD_PREFER_SIBLING			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 64,					\
-}
-
-#ifndef SD_NODES_PER_DOMAIN
-#define SD_NODES_PER_DOMAIN 16
-#endif
-
 #ifdef CONFIG_SCHED_BOOK
 #ifndef SD_BOOK_INIT
 #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
 #endif
 #endif /* CONFIG_SCHED_BOOK */
 
-#ifdef CONFIG_NUMA
-#ifndef SD_NODE_INIT
-#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
-#endif
-
-#endif /* CONFIG_NUMA */
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DECLARE_PER_CPU(int, numa_node);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6001e5c3b4e4..b4f2096980a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5560,7 +5560,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
@@ -5898,92 +5899,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5935,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
+	int		    numa_level;
 	struct sd_data      data;
 };
 
@@ -6213,10 +6129,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6338,15 +6250,191 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline unsigned long numa_scale(unsigned long x, int level)
+{
+	return x * sched_domains_numa_distance[level] / sched_domains_numa_scale;
+}
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 100 + numa_scale(25, level),
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_POWERSAVINGS_BALANCE
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_nume_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(cpu_to_node(j), k) >
+						sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
@@ -6840,6 +6928,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
-- 
cgit v1.2.3


From c75841a398d667d9968245b9519d93cedbfb4780 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:07 +0200
Subject: perf/x86-ibs: Fix update of period

The last sw period was not correctly updated on overflow and thus led
to wrong distribution of events. We always need to properly initialize
data.period in struct perf_sample_data.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-2-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 8ff74d439041..c8f69bea6624 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -386,7 +386,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	if (!(*buf++ & perf_ibs->valid_mask))
 		return 0;
 
+	/*
+	 * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not
+	 * supported in all cpus. As this triggered an interrupt, we
+	 * set the current count to the max count.
+	 */
+	config = ibs_data.regs[0];
+	if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) {
+		config &= ~IBS_OP_CUR_CNT;
+		config |= (config & IBS_OP_MAX_CNT) << 36;
+	}
+
+	perf_ibs_event_update(perf_ibs, event, config);
 	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
 	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
 		ibs_data.caps = ibs_caps;
 		size = 1;
@@ -405,19 +419,6 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 
 	regs = *iregs; /* XXX: update ip from ibs sample */
 
-	/*
-	 * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not
-	 * supported in all cpus. As this triggered an interrupt, we
-	 * set the current count to the max count.
-	 */
-	config = ibs_data.regs[0];
-	if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) {
-		config &= ~IBS_OP_CUR_CNT;
-		config |= (config & IBS_OP_MAX_CNT) << 36;
-	}
-
-	perf_ibs_event_update(perf_ibs, event, config);
-
 	overflow = perf_ibs_set_period(perf_ibs, hwc, &config);
 	reenable = !(overflow && perf_event_overflow(event, &data, &regs));
 	config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0);
-- 
cgit v1.2.3


From fd0d000b2c34aa43d4e92dcf0dfaeda7e123008a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:08 +0200
Subject: perf: Pass last sampling period to perf_sample_data_init()

We always need to pass the last sample period to
perf_sample_data_init(), otherwise the event distribution will be
wrong. Thus, modifiyng the function interface with the required period
as argument. So basically a pattern like this:

        perf_sample_data_init(&data, ~0ULL);
        data.period = event->hw.last_period;

will now be like that:

        perf_sample_data_init(&data, ~0ULL, event->hw.last_period);

Avoids unininitialized data.period and simplifies code.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-3-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/perf_event.c            | 3 +--
 arch/arm/kernel/perf_event_v6.c           | 4 +---
 arch/arm/kernel/perf_event_v7.c           | 4 +---
 arch/arm/kernel/perf_event_xscale.c       | 8 ++------
 arch/mips/kernel/perf_event_mipsxx.c      | 2 +-
 arch/powerpc/perf/core-book3s.c           | 3 +--
 arch/powerpc/perf/core-fsl-emb.c          | 3 +--
 arch/sparc/kernel/perf_event.c            | 4 +---
 arch/x86/kernel/cpu/perf_event.c          | 4 +---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c  | 3 +--
 arch/x86/kernel/cpu/perf_event_intel.c    | 4 +---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++----
 arch/x86/kernel/cpu/perf_event_p4.c       | 6 +++---
 include/linux/perf_event.h                | 5 ++++-
 kernel/events/core.c                      | 9 ++++-----
 15 files changed, 25 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 0dae252f7a33..d821b17047e0 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -824,7 +824,6 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 
 	idx = la_ptr;
 
-	perf_sample_data_init(&data, 0);
 	for (j = 0; j < cpuc->n_events; j++) {
 		if (cpuc->current_idx[j] == idx)
 			break;
@@ -848,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 
 	hwc = &event->hw;
 	alpha_perf_event_update(event, hwc, idx, alpha_pmu->pmc_max_period[idx]+1);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, hwc->last_period);
 
 	if (alpha_perf_event_set_period(event, hwc, idx)) {
 		if (perf_event_overflow(event, &data, regs)) {
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index b78af0cc6ef3..ab627a740fa3 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -489,8 +489,6 @@ armv6pmu_handle_irq(int irq_num,
 	 */
 	armv6_pmcr_write(pmcr);
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
@@ -509,7 +507,7 @@ armv6pmu_handle_irq(int irq_num,
 
 		hwc = &event->hw;
 		armpmu_event_update(event, hwc, idx);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 00755d82e2f2..d3c536068162 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -1077,8 +1077,6 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 */
 	regs = get_irq_regs();
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
@@ -1097,7 +1095,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 
 		hwc = &event->hw;
 		armpmu_event_update(event, hwc, idx);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 71a21e6712f5..e34e7254e652 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -248,8 +248,6 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
 
 	regs = get_irq_regs();
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
@@ -263,7 +261,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
 
 		hwc = &event->hw;
 		armpmu_event_update(event, hwc, idx);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
@@ -588,8 +586,6 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
 
 	regs = get_irq_regs();
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
@@ -603,7 +599,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
 
 		hwc = &event->hw;
 		armpmu_event_update(event, hwc, idx);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 		if (!armpmu_event_set_period(event, hwc, idx))
 			continue;
 
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 811084f4e422..ab73fa2fb9b5 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1325,7 +1325,7 @@ static int mipsxx_pmu_handle_shared_irq(void)
 
 	regs = get_irq_regs();
 
-	perf_sample_data_init(&data, 0);
+	perf_sample_data_init(&data, 0, 0);
 
 	switch (counters) {
 #define HANDLE_COUNTER(n)						\
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 02aee03e713c..8f84bcba18da 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1299,8 +1299,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	if (record) {
 		struct perf_sample_data data;
 
-		perf_sample_data_init(&data, ~0ULL);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
 		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
 			perf_get_data_addr(regs, &data.addr);
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index 0a6d2a9d569c..106c53354675 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -613,8 +613,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	if (record) {
 		struct perf_sample_data data;
 
-		perf_sample_data_init(&data, 0);
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (perf_event_overflow(event, &data, regs))
 			fsl_emb_pmu_stop(event, 0);
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 28559ce5eeb5..5713957dcb8a 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1296,8 +1296,6 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	/* If the PMU has the TOE IRQ enable bits, we need to do a
@@ -1321,7 +1319,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 		if (val & (1ULL << 31))
 			continue;
 
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 		if (!sparc_perf_event_set_period(event, hwc, idx))
 			continue;
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e33e9cf160eb..e049d6da0183 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1183,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	/*
@@ -1219,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 		 * event overflow
 		 */
 		handled++;
-		data.period	= event->hw.last_period;
+		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (!x86_perf_event_set_period(event))
 			continue;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index c8f69bea6624..2317228b5299 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -398,8 +398,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	}
 
 	perf_ibs_event_update(perf_ibs, event, config);
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, hwc->last_period);
 
 	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
 		ibs_data.caps = ibs_caps;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 26b3e2fef104..166546ec6aef 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	u64 status;
 	int handled;
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	/*
@@ -1082,7 +1080,7 @@ again:
 		if (!intel_pmu_save_and_restart(event))
 			continue;
 
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (has_branch_stack(event))
 			data.br_stack = &cpuc->lbr_stack;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 7f64df19e7dd..5a3edc27f6e5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
 
 	ds->bts_index = ds->bts_buffer_base;
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs.ip     = 0;
 
 	/*
@@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	if (!intel_pmu_save_and_restart(event))
 		return;
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, event->hw.last_period);
 
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a2dfacfd7103..47124a73dd73 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	perf_sample_data_init(&data, 0);
-
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		handled += overflow;
 
 		/* event overflow for sure */
-		data.period = event->hw.last_period;
+		perf_sample_data_init(&data, 0, hwc->last_period);
 
 		if (!x86_perf_event_set_period(event))
 			continue;
+
+
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ddbb6a901f65..f32578634d9d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1132,11 +1132,14 @@ struct perf_sample_data {
 	struct perf_branch_stack	*br_stack;
 };
 
-static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
+static inline void perf_sample_data_init(struct perf_sample_data *data,
+					 u64 addr, u64 period)
 {
+	/* remaining struct members initialized in perf_prepare_sample() */
 	data->addr = addr;
 	data->raw  = NULL;
 	data->br_stack = NULL;
+	data->period	= period;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9789a56b7d54..00c58df9f4e2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 	if (rctx < 0)
 		return;
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
 
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr);
+	perf_sample_data_init(&data, addr, 0);
 	data.raw = &raw;
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
 	struct perf_sample_data sample;
 	struct pt_regs *regs = data;
 
-	perf_sample_data_init(&sample, bp->attr.bp_addr);
+	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
 
 	if (!bp->hw.state && !perf_exclude_event(bp, regs))
 		perf_swevent_event(bp, 1, &sample, regs);
@@ -5344,8 +5344,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 
 	event->pmu->read(event);
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
+	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
 
 	if (regs && !perf_exclude_event(event, regs)) {
-- 
cgit v1.2.3


From 7bf352384fda3f678a283928c6c5b2cd9da877e4 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:09 +0200
Subject: perf/x86-ibs: Enable ibs op micro-ops counting mode

Allow enabling ibs op micro-ops counting mode.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-4-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 2317228b5299..ebf169fe40ef 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -468,6 +468,8 @@ static __init int perf_event_ibs_init(void)
 		return -ENODEV;	/* ibs not supported by the cpu */
 
 	perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+	if (ibs_caps & IBS_CAPS_OPCNT)
+		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
 	perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
 	register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
 	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
-- 
cgit v1.2.3


From 6accb9cf76080422d400a641d9068b6b2a2c216f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:10 +0200
Subject: perf/x86-ibs: Fix frequency profiling

Fixing profiling at a fixed frequency, in this case the freq value and
sample period was setup incorrectly. Since sampling periods are
adjusted we also allow periods that have lower 4 bits set.

Another fix is the setup of the hw counter: If we modify
hwc->sample_period, we also need to update hwc->last_period and
hwc->period_left.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-5-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index ebf169fe40ef..bc401bd9f14a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -162,9 +162,16 @@ static int perf_ibs_init(struct perf_event *event)
 		if (config & perf_ibs->cnt_mask)
 			/* raw max_cnt may not be set */
 			return -EINVAL;
-		if (hwc->sample_period & 0x0f)
-			/* lower 4 bits can not be set in ibs max cnt */
+		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+			/*
+			 * lower 4 bits can not be set in ibs max cnt,
+			 * but allowing it in case we adjust the
+			 * sample period to set a frequency.
+			 */
 			return -EINVAL;
+		hwc->sample_period &= ~0x0FULL;
+		if (!hwc->sample_period)
+			hwc->sample_period = 0x10;
 	} else {
 		max_cnt = config & perf_ibs->cnt_mask;
 		config &= ~perf_ibs->cnt_mask;
@@ -175,6 +182,13 @@ static int perf_ibs_init(struct perf_event *event)
 	if (!hwc->sample_period)
 		return -EINVAL;
 
+	/*
+	 * If we modify hwc->sample_period, we also need to update
+	 * hwc->last_period and hwc->period_left.
+	 */
+	hwc->last_period = hwc->sample_period;
+	local64_set(&hwc->period_left, hwc->sample_period);
+
 	hwc->config_base = perf_ibs->msr;
 	hwc->config = config;
 
-- 
cgit v1.2.3


From d47e8238cd76f1ffa7c8cd30e08b8e9074fd597e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:11 +0200
Subject: perf/x86-ibs: Take instruction pointer from ibs sample

Each IBS sample contains a linear address of the instruction that
caused the sample to trigger. This address is more precise than the
rip that was taken from the interrupt handler's stack. Update the rip
with that address. We use this in the next patch to implement
precise-event sampling on AMD systems using IBS.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-6-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/perf_event.h        |  6 ++--
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 48 +++++++++++++++++++++-----------
 2 files changed, 35 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8a3c75d824b7..4e40a64315c9 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -158,6 +158,7 @@ struct x86_pmu_capability {
 #define IBS_CAPS_OPCNT			(1U<<4)
 #define IBS_CAPS_BRNTRGT		(1U<<5)
 #define IBS_CAPS_OPCNTEXT		(1U<<6)
+#define IBS_CAPS_RIPINVALIDCHK		(1U<<7)
 
 #define IBS_CAPS_DEFAULT		(IBS_CAPS_AVAIL		\
 					 | IBS_CAPS_FETCHSAM	\
@@ -170,14 +171,14 @@ struct x86_pmu_capability {
 #define IBSCTL_LVT_OFFSET_VALID		(1ULL<<8)
 #define IBSCTL_LVT_OFFSET_MASK		0x0F
 
-/* IbsFetchCtl bits/masks */
+/* ibs fetch bits/masks */
 #define IBS_FETCH_RAND_EN	(1ULL<<57)
 #define IBS_FETCH_VAL		(1ULL<<49)
 #define IBS_FETCH_ENABLE	(1ULL<<48)
 #define IBS_FETCH_CNT		0xFFFF0000ULL
 #define IBS_FETCH_MAX_CNT	0x0000FFFFULL
 
-/* IbsOpCtl bits */
+/* ibs op bits/masks */
 /* lower 4 bits of the current count are ignored: */
 #define IBS_OP_CUR_CNT		(0xFFFF0ULL<<32)
 #define IBS_OP_CNT_CTL		(1ULL<<19)
@@ -185,6 +186,7 @@ struct x86_pmu_capability {
 #define IBS_OP_ENABLE		(1ULL<<17)
 #define IBS_OP_MAX_CNT		0x0000FFFFULL
 #define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
+#define IBS_RIP_INVALID		(1ULL<<38)
 
 extern u32 get_ibs_caps(void);
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index bc401bd9f14a..cc1f3293d6c2 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,6 +9,7 @@
 #include <linux/perf_event.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/ptrace.h>
 
 #include <asm/apic.h>
 
@@ -382,7 +383,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	struct perf_raw_record raw;
 	struct pt_regs regs;
 	struct perf_ibs_data ibs_data;
-	int offset, size, overflow, reenable;
+	int offset, size, check_rip, offset_max, throttle = 0;
 	unsigned int msr;
 	u64 *buf, config;
 
@@ -413,28 +414,41 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 
 	perf_ibs_event_update(perf_ibs, event, config);
 	perf_sample_data_init(&data, 0, hwc->last_period);
+	if (!perf_ibs_set_period(perf_ibs, hwc, &config))
+		goto out;	/* no sw counter overflow */
+
+	ibs_data.caps = ibs_caps;
+	size = 1;
+	offset = 1;
+	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+	if (event->attr.sample_type & PERF_SAMPLE_RAW)
+		offset_max = perf_ibs->offset_max;
+	else if (check_rip)
+		offset_max = 2;
+	else
+		offset_max = 1;
+	do {
+		rdmsrl(msr + offset, *buf++);
+		size++;
+		offset = find_next_bit(perf_ibs->offset_mask,
+				       perf_ibs->offset_max,
+				       offset + 1);
+	} while (offset < offset_max);
+	ibs_data.size = sizeof(u64) * size;
+
+	regs = *iregs;
+	if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID))
+		instruction_pointer_set(&regs, ibs_data.regs[1]);
 
 	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		ibs_data.caps = ibs_caps;
-		size = 1;
-		offset = 1;
-		do {
-		    rdmsrl(msr + offset, *buf++);
-		    size++;
-		    offset = find_next_bit(perf_ibs->offset_mask,
-					   perf_ibs->offset_max,
-					   offset + 1);
-		} while (offset < perf_ibs->offset_max);
-		raw.size = sizeof(u32) + sizeof(u64) * size;
+		raw.size = sizeof(u32) + ibs_data.size;
 		raw.data = ibs_data.data;
 		data.raw = &raw;
 	}
 
-	regs = *iregs; /* XXX: update ip from ibs sample */
-
-	overflow = perf_ibs_set_period(perf_ibs, hwc, &config);
-	reenable = !(overflow && perf_event_overflow(event, &data, &regs));
-	config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0);
+	throttle = perf_event_overflow(event, &data, &regs);
+out:
+	config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask);
 	perf_ibs_enable_event(hwc, config);
 
 	perf_event_update_userpage(event);
-- 
cgit v1.2.3


From 450bbd493d436f9eadd1b7828158f37559f26674 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 12 Mar 2012 12:54:32 +0100
Subject: perf/x86-ibs: Precise event sampling with IBS for AMD CPUs

This patch adds support for precise event sampling with IBS. There are
two counting modes to count either cycles or micro-ops. If the
corresponding performance counter events (hw events) are setup with
the precise flag set, the request is redirected to the ibs pmu:

 perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
 perf record -a -e r076:p ...          # same as -e cpu-cycles:p
 perf record -a -e r0C1:p ...          # use ibs op counting micro-ops

Each ibs sample contains a linear address that points to the
instruction that was causing the sample to trigger. With ibs we have
skid 0. Thus, ibs supports precise levels 1 and 2. Samples are marked
with the PERF_EFLAGS_EXACT flag set. In rare cases the rip is invalid
when IBS was not able to record the rip correctly. Then the
PERF_EFLAGS_EXACT flag is cleared and the rip is taken from pt_regs.

V2:
* don't drop samples in precise level 2 if rip is invalid, instead
  support the PERF_EFLAGS_EXACT flag

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120502103309.GP18810@erda.amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd.c     |  7 ++-
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 73 ++++++++++++++++++++++++++++++--
 2 files changed, 76 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 589286f28877..65652265fffd 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)
 
 static int amd_pmu_hw_config(struct perf_event *event)
 {
-	int ret = x86_pmu_hw_config(event);
+	int ret;
 
+	/* pass precise event sampling to ibs: */
+	if (event->attr.precise_ip && get_ibs_caps())
+		return -ENOENT;
+
+	ret = x86_pmu_hw_config(event);
 	if (ret)
 		return ret;
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index cc1f3293d6c2..34dfa853f6df 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -145,17 +145,80 @@ static struct perf_ibs *get_ibs_pmu(int type)
 	return NULL;
 }
 
+/*
+ * Use IBS for precise event sampling:
+ *
+ *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
+ *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+	switch (event->attr.precise_ip) {
+	case 0:
+		return -ENOENT;
+	case 1:
+	case 2:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	switch (event->attr.type) {
+	case PERF_TYPE_HARDWARE:
+		switch (event->attr.config) {
+		case PERF_COUNT_HW_CPU_CYCLES:
+			*config = 0;
+			return 0;
+		}
+		break;
+	case PERF_TYPE_RAW:
+		switch (event->attr.config) {
+		case 0x0076:
+			*config = 0;
+			return 0;
+		case 0x00C1:
+			*config = IBS_OP_CNT_CTL;
+			return 0;
+		}
+		break;
+	default:
+		return -ENOENT;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static int perf_ibs_init(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs;
 	u64 max_cnt, config;
+	int ret;
 
 	perf_ibs = get_ibs_pmu(event->attr.type);
-	if (!perf_ibs)
+	if (perf_ibs) {
+		config = event->attr.config;
+	} else {
+		perf_ibs = &perf_ibs_op;
+		ret = perf_ibs_precise_event(event, &config);
+		if (ret)
+			return ret;
+	}
+
+	if (event->pmu != &perf_ibs->pmu)
 		return -ENOENT;
 
-	config = event->attr.config;
 	if (config & ~perf_ibs->config_mask)
 		return -EINVAL;
 
@@ -437,8 +500,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	ibs_data.size = sizeof(u64) * size;
 
 	regs = *iregs;
-	if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID))
+	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+	} else {
 		instruction_pointer_set(&regs, ibs_data.regs[1]);
+		regs.flags |= PERF_EFLAGS_EXACT;
+	}
 
 	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
 		raw.size = sizeof(u32) + ibs_data.size;
-- 
cgit v1.2.3


From 98112d2e957e0d348f06d8a40f2f720204a70b55 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:13 +0200
Subject: perf/x86-ibs: Rename some variables

Simple patch that just renames some variables for better
understanding.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-8-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 34dfa853f6df..29a1bffe1dfb 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -62,7 +62,7 @@ struct perf_ibs_data {
 };
 
 static int
-perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count)
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
 {
 	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
@@ -91,7 +91,7 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count)
 	if (left > max)
 		left = max;
 
-	*count = (u64)left;
+	*hw_period = (u64)left;
 
 	return overflow;
 }
@@ -262,13 +262,13 @@ static int perf_ibs_init(struct perf_event *event)
 static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
 			       struct hw_perf_event *hwc, u64 *period)
 {
-	int ret;
+	int overflow;
 
 	/* ignore lower 4 bits in min count: */
-	ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
 	local64_set(&hwc->prev_count, 0);
 
-	return ret;
+	return overflow;
 }
 
 static u64 get_ibs_fetch_count(u64 config)
-- 
cgit v1.2.3


From fc006cf7cc7471e1bdf34e40111971e03622af6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:14 +0200
Subject: perf/x86-ibs: Trigger overflow if remaining period is too small

There are cases where the remaining period is smaller than the minimal
possible value. In this case the counter is restarted with the minimal
period. This is of no use as the interrupt handler will trigger
immediately again and most likely hits itself. This biases the
results.

So, if the remaining period is within the min range, we better do not
restart the counter and instead trigger the overflow.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-9-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 29a1bffe1dfb..3e32908292a7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -78,16 +78,13 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio
 		overflow = 1;
 	}
 
-	if (unlikely(left <= 0)) {
+	if (unlikely(left < (s64)min)) {
 		left += period;
 		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		overflow = 1;
 	}
 
-	if (unlikely(left < min))
-		left = min;
-
 	if (left > max)
 		left = max;
 
-- 
cgit v1.2.3


From 7caaf4d8241feecafb87919402b0a6dbb1b71d9e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:15 +0200
Subject: perf/x86-ibs: Extend hw period that triggers overflow

If the last hw period is too short we might hit the irq handler which
biases the results. Thus try to have a max last period that triggers
the sw overflow.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-10-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3e32908292a7..cb51a3e55870 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -85,8 +85,19 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio
 		overflow = 1;
 	}
 
-	if (left > max)
-		left = max;
+	/*
+	 * If the hw period that triggers the sw overflow is too short
+	 * we might hit the irq handler. This biases the results.
+	 * Thus we shorten the next-to-last period and set the last
+	 * period to the max period.
+	 */
+	if (left > max) {
+		left -= max;
+		if (left > max)
+			left = max;
+		else if (left < min)
+			left = min;
+	}
 
 	*hw_period = (u64)left;
 
-- 
cgit v1.2.3


From c9574fe0bdb9ac9a2698e02a712088ce8431e9f8 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:16 +0200
Subject: perf/x86-ibs: Implement workaround for IBS erratum #420

When disabling ibs there might be the case where hardware continuously
generates interrupts. This is described in erratum #420 (Instruction-
Based Sampling Engine May Generate Interrupt that Cannot Be Cleared).
To avoid this we must clear the counter mask first and then clear the
enable bit. This patch implements this.

See Revision Guide for AMD Family 10h Processors, Publication #41322.

Note: We now keep track of the last read ibs config value which is
then used to disable ibs. To update the config value we pass now a
pointer to the functions reading it.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-11-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 62 ++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index cb51a3e55870..b14e71127c82 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -291,20 +291,36 @@ static u64 get_ibs_op_count(u64 config)
 
 static void
 perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
-		      u64 config)
+		      u64 *config)
 {
-	u64 count = perf_ibs->get_count(config);
+	u64 count = perf_ibs->get_count(*config);
 
 	while (!perf_event_try_update(event, count, 20)) {
-		rdmsrl(event->hw.config_base, config);
-		count = perf_ibs->get_count(config);
+		rdmsrl(event->hw.config_base, *config);
+		count = perf_ibs->get_count(*config);
 	}
 }
 
-/* Note: The enable mask must be encoded in the config argument. */
-static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config)
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+					 struct hw_perf_event *hwc, u64 config)
 {
-	wrmsrl(hwc->config_base, hwc->config | config);
+	wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+					  struct hw_perf_event *hwc, u64 config)
+{
+	config &= ~perf_ibs->cnt_mask;
+	wrmsrl(hwc->config_base, config);
+	config &= ~perf_ibs->enable_mask;
+	wrmsrl(hwc->config_base, config);
 }
 
 /*
@@ -318,7 +334,7 @@ static void perf_ibs_start(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-	u64 config;
+	u64 period;
 
 	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
 		return;
@@ -326,10 +342,9 @@ static void perf_ibs_start(struct perf_event *event, int flags)
 	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
 	hwc->state = 0;
 
-	perf_ibs_set_period(perf_ibs, hwc, &config);
-	config = (config >> 4) | perf_ibs->enable_mask;
+	perf_ibs_set_period(perf_ibs, hwc, &period);
 	set_bit(IBS_STARTED, pcpu->state);
-	perf_ibs_enable_event(hwc, config);
+	perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 
 	perf_event_update_userpage(event);
 }
@@ -339,7 +354,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-	u64 val;
+	u64 config;
 	int stopping;
 
 	stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
@@ -347,12 +362,11 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
 		return;
 
-	rdmsrl(hwc->config_base, val);
+	rdmsrl(hwc->config_base, config);
 
 	if (stopping) {
 		set_bit(IBS_STOPPING, pcpu->state);
-		val &= ~perf_ibs->enable_mask;
-		wrmsrl(hwc->config_base, val);
+		perf_ibs_disable_event(perf_ibs, hwc, config);
 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 		hwc->state |= PERF_HES_STOPPED;
 	}
@@ -360,7 +374,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
 
-	perf_ibs_event_update(perf_ibs, event, val);
+	perf_ibs_event_update(perf_ibs, event, &config);
 	hwc->state |= PERF_HES_UPTODATE;
 }
 
@@ -456,7 +470,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	struct perf_ibs_data ibs_data;
 	int offset, size, check_rip, offset_max, throttle = 0;
 	unsigned int msr;
-	u64 *buf, config;
+	u64 *buf, *config, period;
 
 	if (!test_bit(IBS_STARTED, pcpu->state)) {
 		/* Catch spurious interrupts after stopping IBS: */
@@ -477,15 +491,15 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	 * supported in all cpus. As this triggered an interrupt, we
 	 * set the current count to the max count.
 	 */
-	config = ibs_data.regs[0];
+	config = &ibs_data.regs[0];
 	if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) {
-		config &= ~IBS_OP_CUR_CNT;
-		config |= (config & IBS_OP_MAX_CNT) << 36;
+		*config &= ~IBS_OP_CUR_CNT;
+		*config |= (*config & IBS_OP_MAX_CNT) << 36;
 	}
 
 	perf_ibs_event_update(perf_ibs, event, config);
 	perf_sample_data_init(&data, 0, hwc->last_period);
-	if (!perf_ibs_set_period(perf_ibs, hwc, &config))
+	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
 		goto out;	/* no sw counter overflow */
 
 	ibs_data.caps = ibs_caps;
@@ -523,8 +537,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 
 	throttle = perf_event_overflow(event, &data, &regs);
 out:
-	config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask);
-	perf_ibs_enable_event(hwc, config);
+	if (throttle)
+		perf_ibs_disable_event(perf_ibs, hwc, *config);
+	else
+		perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
 
 	perf_event_update_userpage(event);
 
-- 
cgit v1.2.3


From fc5fb2b5e1874e5894e2ac503bfb744220db89a1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:17 +0200
Subject: perf/x86-ibs: Catch spurious interrupts after stopping IBS

After disabling IBS there could be still incomming NMIs with samples
that even have the valid bit cleared. Mark all this NMIs as handled to
avoid spurious interrupt messages.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-12-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index b14e71127c82..5a9f95b5cc26 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -473,11 +473,13 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	u64 *buf, *config, period;
 
 	if (!test_bit(IBS_STARTED, pcpu->state)) {
-		/* Catch spurious interrupts after stopping IBS: */
-		if (!test_and_clear_bit(IBS_STOPPING, pcpu->state))
-			return 0;
-		rdmsrl(perf_ibs->msr, *ibs_data.regs);
-		return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0;
+		/*
+		 * Catch spurious interrupts after stopping IBS: After
+		 * disabling IBS there could be still incomming NMIs
+		 * with samples that even have the valid bit cleared.
+		 * Mark all this NMIs as handled.
+		 */
+		return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
 	}
 
 	msr = hwc->config_base;
-- 
cgit v1.2.3


From 8b1e13638d465863572c8207a5cfceeef0cf0441 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 2 Apr 2012 20:19:18 +0200
Subject: perf/x86-ibs: Fix usage of IBS op current count

The value of IbsOpCurCnt rolls over when it reaches IbsOpMaxCnt. Thus,
it is reset to zero by hardware. To get the correct count we need to
add the max count to it in case we received an ibs sample (valid bit
set).

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1333390758-10893-13-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 33 ++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 5a9f95b5cc26..da9bcdcd9856 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -286,7 +286,15 @@ static u64 get_ibs_fetch_count(u64 config)
 
 static u64 get_ibs_op_count(u64 config)
 {
-	return (config & IBS_OP_CUR_CNT) >> 32;
+	u64 count = 0;
+
+	if (config & IBS_OP_VAL)
+		count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+	if (ibs_caps & IBS_CAPS_RDWROPCNT)
+		count += (config & IBS_OP_CUR_CNT) >> 32;
+
+	return count;
 }
 
 static void
@@ -295,7 +303,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
 {
 	u64 count = perf_ibs->get_count(*config);
 
-	while (!perf_event_try_update(event, count, 20)) {
+	/*
+	 * Set width to 64 since we do not overflow on max width but
+	 * instead on max count. In perf_ibs_set_period() we clear
+	 * prev count manually on overflow.
+	 */
+	while (!perf_event_try_update(event, count, 64)) {
 		rdmsrl(event->hw.config_base, *config);
 		count = perf_ibs->get_count(*config);
 	}
@@ -374,6 +387,12 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
 
+	/*
+	 * Clear valid bit to not count rollovers on update, rollovers
+	 * are only updated in the irq handler.
+	 */
+	config &= ~perf_ibs->valid_mask;
+
 	perf_ibs_event_update(perf_ibs, event, &config);
 	hwc->state |= PERF_HES_UPTODATE;
 }
@@ -488,17 +507,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	if (!(*buf++ & perf_ibs->valid_mask))
 		return 0;
 
-	/*
-	 * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not
-	 * supported in all cpus. As this triggered an interrupt, we
-	 * set the current count to the max count.
-	 */
 	config = &ibs_data.regs[0];
-	if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) {
-		*config &= ~IBS_OP_CUR_CNT;
-		*config |= (*config & IBS_OP_MAX_CNT) << 36;
-	}
-
 	perf_ibs_event_update(perf_ibs, event, config);
 	perf_sample_data_init(&data, 0, hwc->last_period);
 	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
-- 
cgit v1.2.3


From 35bdd29095ad614c5fb4a934bfd4f57a94dfd395 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Thu, 12 Apr 2012 10:48:44 +0200
Subject: mfd: Add driver for STA2X11 MFD block

This also introduces <asm/sta2x11.h> to export a function that is in
the base sta2x11 support patches. The header will increase with other
prototypes and constants over time.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Acked-by: Giancarlo Asnaghi <giancarlo.asnaghi@st.com>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 arch/x86/include/asm/sta2x11.h  |  12 ++
 drivers/mfd/Kconfig             |   5 +
 drivers/mfd/Makefile            |   1 +
 drivers/mfd/sta2x11-mfd.c       | 467 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/sta2x11-mfd.h | 324 ++++++++++++++++++++++++++++
 5 files changed, 809 insertions(+)
 create mode 100644 arch/x86/include/asm/sta2x11.h
 create mode 100644 drivers/mfd/sta2x11-mfd.c
 create mode 100644 include/linux/mfd/sta2x11-mfd.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 000000000000..e9d32df89ccc
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,12 @@
+/*
+ * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
+ */
+#ifndef __ASM_STA2X11_H
+#define __ASM_STA2X11_H
+
+#include <linux/pci.h>
+
+/* This needs to be called from the MFD to configure its sub-devices */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
+
+#endif /* __ASM_STA2X11_H */
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index ef86a741b7e2..48eed22c65a5 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -906,6 +906,11 @@ config MFD_RC5T583
 	  Additional drivers must be enabled in order to use the
 	  different functionality of the device.
 
+config MFD_STA2X11
+	bool "STA2X11 multi function device support"
+	depends on STA2X11
+	select MFD_CORE
+
 config MFD_ANATOP
 	bool "Support for Freescale i.MX on-chip ANATOP controller"
 	depends on SOC_IMX6Q
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 5dd6be7aa350..0dc55cbefa09 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_MFD_DAVINCI_VOICECODEC)	+= davinci_voicecodec.o
 obj-$(CONFIG_MFD_DM355EVM_MSP)	+= dm355evm_msp.o
 obj-$(CONFIG_MFD_TI_SSP)	+= ti-ssp.o
 
+obj-$(CONFIG_MFD_STA2X11)	+= sta2x11-mfd.o
 obj-$(CONFIG_MFD_STMPE)		+= stmpe.o
 obj-$(CONFIG_STMPE_I2C)		+= stmpe-i2c.o
 obj-$(CONFIG_STMPE_SPI)		+= stmpe-spi.o
diff --git a/drivers/mfd/sta2x11-mfd.c b/drivers/mfd/sta2x11-mfd.c
new file mode 100644
index 000000000000..d31fed07aefb
--- /dev/null
+++ b/drivers/mfd/sta2x11-mfd.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/sta2x11-mfd.h>
+
+#include <asm/sta2x11.h>
+
+/* This describes STA2X11 MFD chip for us, we may have several */
+struct sta2x11_mfd {
+	struct sta2x11_instance *instance;
+	spinlock_t lock;
+	struct list_head list;
+	void __iomem *sctl_regs;
+	void __iomem *apbreg_regs;
+};
+
+static LIST_HEAD(sta2x11_mfd_list);
+
+/* Three functions to act on the list */
+static struct sta2x11_mfd *sta2x11_mfd_find(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+	struct sta2x11_mfd *mfd;
+
+	if (!pdev && !list_empty(&sta2x11_mfd_list)) {
+		pr_warning("%s: Unspecified device, "
+			    "using first instance\n", __func__);
+		return list_entry(sta2x11_mfd_list.next,
+				  struct sta2x11_mfd, list);
+	}
+
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return NULL;
+	list_for_each_entry(mfd, &sta2x11_mfd_list, list) {
+		if (mfd->instance == instance)
+			return mfd;
+	}
+	return NULL;
+}
+
+static int __devinit sta2x11_mfd_add(struct pci_dev *pdev, gfp_t flags)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	struct sta2x11_instance *instance;
+
+	if (mfd)
+		return -EBUSY;
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return -EINVAL;
+	mfd = kzalloc(sizeof(*mfd), flags);
+	if (!mfd)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&mfd->list);
+	spin_lock_init(&mfd->lock);
+	mfd->instance = instance;
+	list_add(&mfd->list, &sta2x11_mfd_list);
+	return 0;
+}
+
+static int __devexit mfd_remove(struct pci_dev *pdev)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+
+	if (!mfd)
+		return -ENODEV;
+	list_del(&mfd->list);
+	kfree(mfd);
+	return 0;
+}
+
+/* These two functions are exported and are not expected to fail */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access sctl regs\n");
+		return 0;
+	}
+	if (!mfd->sctl_regs) {
+		dev_warn(&pdev->dev, ": system ctl not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->sctl_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->sctl_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_sctl_mask);
+
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access apb regs\n");
+		return 0;
+	}
+	if (!mfd->apbreg_regs) {
+		dev_warn(&pdev->dev, ": apb bridge not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->apbreg_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->apbreg_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_apbreg_mask);
+
+/* Two debugfs files, for our registers (FIXME: one instance only) */
+#define REG(regname) {.name = #regname, .offset = SCTL_ ## regname}
+static struct debugfs_reg32 sta2x11_sctl_regs[] = {
+	REG(SCCTL), REG(ARMCFG), REG(SCPLLCTL), REG(SCPLLFCTRL),
+	REG(SCRESFRACT), REG(SCRESCTRL1), REG(SCRESXTRL2), REG(SCPEREN0),
+	REG(SCPEREN1), REG(SCPEREN2), REG(SCGRST), REG(SCPCIPMCR1),
+	REG(SCPCIPMCR2), REG(SCPCIPMSR1), REG(SCPCIPMSR2), REG(SCPCIPMSR3),
+	REG(SCINTREN), REG(SCRISR), REG(SCCLKSTAT0), REG(SCCLKSTAT1),
+	REG(SCCLKSTAT2), REG(SCRSTSTA),
+};
+#undef REG
+
+static struct debugfs_regset32 sctl_regset = {
+	.regs = sta2x11_sctl_regs,
+	.nregs = ARRAY_SIZE(sta2x11_sctl_regs),
+};
+
+#define REG(regname) {.name = #regname, .offset = regname}
+static struct debugfs_reg32 sta2x11_apbreg_regs[] = {
+	REG(APBREG_BSR), REG(APBREG_PAER), REG(APBREG_PWAC), REG(APBREG_PRAC),
+	REG(APBREG_PCG), REG(APBREG_PUR), REG(APBREG_EMU_PCG),
+};
+#undef REG
+
+static struct debugfs_regset32 apbreg_regset = {
+	.regs = sta2x11_apbreg_regs,
+	.nregs = ARRAY_SIZE(sta2x11_apbreg_regs),
+};
+
+static struct dentry *sta2x11_sctl_debugfs;
+static struct dentry *sta2x11_apbreg_debugfs;
+
+/* Probe for the two platform devices */
+static int sta2x11_sctl_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-sctl"))
+		return -EBUSY;
+
+	mfd->sctl_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->sctl_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	sctl_regset.base = mfd->sctl_regs;
+	sta2x11_sctl_debugfs = debugfs_create_regset32("sta2x11-sctl",
+						  S_IFREG | S_IRUGO,
+						  NULL, &sctl_regset);
+	return 0;
+}
+
+static int sta2x11_apbreg_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	dev_dbg(&dev->dev, "%s: pdata is %p\n", __func__, pdev);
+	dev_dbg(&dev->dev, "%s: *pdata is %p\n", __func__, *pdev);
+
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-apbreg"))
+		return -EBUSY;
+
+	mfd->apbreg_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->apbreg_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	dev_dbg(&dev->dev, "%s: regbase %p\n", __func__, mfd->apbreg_regs);
+
+	apbreg_regset.base = mfd->apbreg_regs;
+	sta2x11_apbreg_debugfs = debugfs_create_regset32("sta2x11-apbreg",
+						  S_IFREG | S_IRUGO,
+						  NULL, &apbreg_regset);
+	return 0;
+}
+
+/* The two platform drivers */
+static struct platform_driver sta2x11_sctl_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-sctl",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_sctl_probe,
+};
+
+static int __init sta2x11_sctl_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_sctl_platform_driver);
+}
+
+static struct platform_driver sta2x11_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-apbreg",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_apbreg_probe,
+};
+
+static int __init sta2x11_apbreg_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_platform_driver);
+}
+
+/*
+ * What follows is the PCI device that hosts the above two pdevs.
+ * Each logic block is 4kB and they are all consecutive: we use this info.
+ */
+
+/* Bar 0 */
+enum bar0_cells {
+	STA2X11_GPIO_0 = 0,
+	STA2X11_GPIO_1,
+	STA2X11_GPIO_2,
+	STA2X11_GPIO_3,
+	STA2X11_SCTL,
+	STA2X11_SCR,
+	STA2X11_TIME,
+};
+/* Bar 1 */
+enum bar1_cells {
+	STA2X11_APBREG = 0,
+};
+#define CELL_4K(_name, _cell) { \
+		.name = _name, \
+		.start = _cell * 4096, .end = _cell * 4096 + 4095, \
+		.flags = IORESOURCE_MEM, \
+		}
+
+static const __devinitconst struct resource gpio_resources[] = {
+	{
+		.name = "sta2x11_gpio", /* 4 consecutive cells, 1 driver */
+		.start = 0,
+		.end = (4 * 4096) - 1,
+		.flags = IORESOURCE_MEM,
+	}
+};
+static const __devinitconst struct resource sctl_resources[] = {
+	CELL_4K("sta2x11-sctl", STA2X11_SCTL),
+};
+static const __devinitconst struct resource scr_resources[] = {
+	CELL_4K("sta2x11-scr", STA2X11_SCR),
+};
+static const __devinitconst struct resource time_resources[] = {
+	CELL_4K("sta2x11-time", STA2X11_TIME),
+};
+
+static const __devinitconst struct resource apbreg_resources[] = {
+	CELL_4K("sta2x11-apbreg", STA2X11_APBREG),
+};
+
+#define DEV(_name, _r) \
+	{ .name = _name, .num_resources = ARRAY_SIZE(_r), .resources = _r, }
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar0[] = {
+	DEV("sta2x11-gpio", gpio_resources), /* offset 0: we add pdata later */
+	DEV("sta2x11-sctl", sctl_resources),
+	DEV("sta2x11-scr", scr_resources),
+	DEV("sta2x11-time", time_resources),
+};
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar1[] = {
+	DEV("sta2x11-apbreg", apbreg_resources),
+};
+
+static int sta2x11_mfd_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+	return 0;
+}
+
+static int sta2x11_mfd_resume(struct pci_dev *pdev)
+{
+	int err;
+
+	pci_set_power_state(pdev, 0);
+	err = pci_enable_device(pdev);
+	if (err)
+		return err;
+	pci_restore_state(pdev);
+
+	return 0;
+}
+
+static int __devinit sta2x11_mfd_probe(struct pci_dev *pdev,
+				       const struct pci_device_id *pci_id)
+{
+	int err, i;
+	struct sta2x11_gpio_pdata *gpio_data;
+
+	dev_info(&pdev->dev, "%s\n", __func__);
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Can't enable device.\n");
+		return err;
+	}
+
+	err = pci_enable_msi(pdev);
+	if (err)
+		dev_info(&pdev->dev, "Enable msi failed\n");
+
+	/* Read gpio config data as pci device's platform data */
+	gpio_data = dev_get_platdata(&pdev->dev);
+	if (!gpio_data)
+		dev_warn(&pdev->dev, "no gpio configuration\n");
+
+	dev_dbg(&pdev->dev, "%s, gpio_data = %p (%p)\n", __func__,
+		gpio_data, &gpio_data);
+	dev_dbg(&pdev->dev, "%s, pdev = %p (%p)\n", __func__,
+		pdev, &pdev);
+
+	/* platform data is the pci device for all of them */
+	for (i = 0; i < ARRAY_SIZE(sta2x11_mfd_bar0); i++) {
+		sta2x11_mfd_bar0[i].pdata_size = sizeof(pdev);
+		sta2x11_mfd_bar0[i].platform_data = &pdev;
+	}
+	sta2x11_mfd_bar1[0].pdata_size = sizeof(pdev);
+	sta2x11_mfd_bar1[0].platform_data = &pdev;
+
+	/* Record this pdev before mfd_add_devices: their probe looks for it */
+	sta2x11_mfd_add(pdev, GFP_ATOMIC);
+
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar0,
+			      ARRAY_SIZE(sta2x11_mfd_bar0),
+			      &pdev->resource[0],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[0] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar1,
+			      ARRAY_SIZE(sta2x11_mfd_bar1),
+			      &pdev->resource[1],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[1] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	return 0;
+
+err_disable:
+	mfd_remove_devices(&pdev->dev);
+	pci_disable_device(pdev);
+	pci_disable_msi(pdev);
+	return err;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(sta2x11_mfd_tbl) = {
+	{PCI_DEVICE(PCI_VENDOR_ID_STMICRO, PCI_DEVICE_ID_STMICRO_GPIO)},
+	{0,},
+};
+
+static struct pci_driver sta2x11_mfd_driver = {
+	.name =		"sta2x11-mfd",
+	.id_table =	sta2x11_mfd_tbl,
+	.probe =	sta2x11_mfd_probe,
+	.suspend =	sta2x11_mfd_suspend,
+	.resume =	sta2x11_mfd_resume,
+};
+
+static int __init sta2x11_mfd_init(void)
+{
+	pr_info("%s\n", __func__);
+	return pci_register_driver(&sta2x11_mfd_driver);
+}
+
+/*
+ * All of this must be ready before "normal" devices like MMCI appear.
+ * But MFD (the pci device) can't be too early. The following choice
+ * prepares platform drivers very early and probe the PCI device later,
+ * but before other PCI devices.
+ */
+subsys_initcall(sta2x11_apbreg_init);
+subsys_initcall(sta2x11_sctl_init);
+rootfs_initcall(sta2x11_mfd_init);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wind River");
+MODULE_DESCRIPTION("STA2x11 mfd for GPIO, SCTL and APBREG");
+MODULE_DEVICE_TABLE(pci, sta2x11_mfd_tbl);
diff --git a/include/linux/mfd/sta2x11-mfd.h b/include/linux/mfd/sta2x11-mfd.h
new file mode 100644
index 000000000000..d179227e866f
--- /dev/null
+++ b/include/linux/mfd/sta2x11-mfd.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The STMicroelectronics ConneXt (STA2X11) chip has several unrelated
+ * functions in one PCI endpoint functions. This driver simply
+ * registers the platform devices in this iomemregion and exports a few
+ * functions to access common registers
+ */
+
+#ifndef __STA2X11_MFD_H
+#define __STA2X11_MFD_H
+#include <linux/types.h>
+#include <linux/pci.h>
+
+/*
+ * The MFD PCI block includes the GPIO peripherals and other register blocks.
+ * For GPIO, we have 32*4 bits (I use "gsta" for "gpio sta2x11".)
+ */
+#define GSTA_GPIO_PER_BLOCK	32
+#define GSTA_NR_BLOCKS		4
+#define GSTA_NR_GPIO		(GSTA_GPIO_PER_BLOCK * GSTA_NR_BLOCKS)
+
+/* Pinconfig is set by the board definition: altfunc, pull-up, pull-down */
+struct sta2x11_gpio_pdata {
+	unsigned pinconfig[GSTA_NR_GPIO];
+};
+
+/* Macros below lifted from sh_pfc.h, with minor differences */
+#define PINMUX_TYPE_NONE		0
+#define PINMUX_TYPE_FUNCTION		1
+#define PINMUX_TYPE_OUTPUT_LOW		2
+#define PINMUX_TYPE_OUTPUT_HIGH		3
+#define PINMUX_TYPE_INPUT		4
+#define PINMUX_TYPE_INPUT_PULLUP	5
+#define PINMUX_TYPE_INPUT_PULLDOWN	6
+
+/* Give names to GPIO pins, like PXA does, taken from the manual */
+#define STA2X11_GPIO0			0
+#define STA2X11_GPIO1			1
+#define STA2X11_GPIO2			2
+#define STA2X11_GPIO3			3
+#define STA2X11_GPIO4			4
+#define STA2X11_GPIO5			5
+#define STA2X11_GPIO6			6
+#define STA2X11_GPIO7			7
+#define STA2X11_GPIO8_RGBOUT_RED7	8
+#define STA2X11_GPIO9_RGBOUT_RED6	9
+#define STA2X11_GPIO10_RGBOUT_RED5	10
+#define STA2X11_GPIO11_RGBOUT_RED4	11
+#define STA2X11_GPIO12_RGBOUT_RED3	12
+#define STA2X11_GPIO13_RGBOUT_RED2	13
+#define STA2X11_GPIO14_RGBOUT_RED1	14
+#define STA2X11_GPIO15_RGBOUT_RED0	15
+#define STA2X11_GPIO16_RGBOUT_GREEN7	16
+#define STA2X11_GPIO17_RGBOUT_GREEN6	17
+#define STA2X11_GPIO18_RGBOUT_GREEN5	18
+#define STA2X11_GPIO19_RGBOUT_GREEN4	19
+#define STA2X11_GPIO20_RGBOUT_GREEN3	20
+#define STA2X11_GPIO21_RGBOUT_GREEN2	21
+#define STA2X11_GPIO22_RGBOUT_GREEN1	22
+#define STA2X11_GPIO23_RGBOUT_GREEN0	23
+#define STA2X11_GPIO24_RGBOUT_BLUE7	24
+#define STA2X11_GPIO25_RGBOUT_BLUE6	25
+#define STA2X11_GPIO26_RGBOUT_BLUE5	26
+#define STA2X11_GPIO27_RGBOUT_BLUE4	27
+#define STA2X11_GPIO28_RGBOUT_BLUE3	28
+#define STA2X11_GPIO29_RGBOUT_BLUE2	29
+#define STA2X11_GPIO30_RGBOUT_BLUE1	30
+#define STA2X11_GPIO31_RGBOUT_BLUE0	31
+#define STA2X11_GPIO32_RGBOUT_VSYNCH	32
+#define STA2X11_GPIO33_RGBOUT_HSYNCH	33
+#define STA2X11_GPIO34_RGBOUT_DEN	34
+#define STA2X11_GPIO35_ETH_CRS_DV	35
+#define STA2X11_GPIO36_ETH_TXD1		36
+#define STA2X11_GPIO37_ETH_TXD0		37
+#define STA2X11_GPIO38_ETH_TX_EN	38
+#define STA2X11_GPIO39_MDIO		39
+#define STA2X11_GPIO40_ETH_REF_CLK	40
+#define STA2X11_GPIO41_ETH_RXD1		41
+#define STA2X11_GPIO42_ETH_RXD0		42
+#define STA2X11_GPIO43_MDC		43
+#define STA2X11_GPIO44_CAN_TX		44
+#define STA2X11_GPIO45_CAN_RX		45
+#define STA2X11_GPIO46_MLB_DAT		46
+#define STA2X11_GPIO47_MLB_SIG		47
+#define STA2X11_GPIO48_SPI0_CLK		48
+#define STA2X11_GPIO49_SPI0_TXD		49
+#define STA2X11_GPIO50_SPI0_RXD		50
+#define STA2X11_GPIO51_SPI0_FRM		51
+#define STA2X11_GPIO52_SPI1_CLK		52
+#define STA2X11_GPIO53_SPI1_TXD		53
+#define STA2X11_GPIO54_SPI1_RXD		54
+#define STA2X11_GPIO55_SPI1_FRM		55
+#define STA2X11_GPIO56_SPI2_CLK		56
+#define STA2X11_GPIO57_SPI2_TXD		57
+#define STA2X11_GPIO58_SPI2_RXD		58
+#define STA2X11_GPIO59_SPI2_FRM		59
+#define STA2X11_GPIO60_I2C0_SCL		60
+#define STA2X11_GPIO61_I2C0_SDA		61
+#define STA2X11_GPIO62_I2C1_SCL		62
+#define STA2X11_GPIO63_I2C1_SDA		63
+#define STA2X11_GPIO64_I2C2_SCL		64
+#define STA2X11_GPIO65_I2C2_SDA		65
+#define STA2X11_GPIO66_I2C3_SCL		66
+#define STA2X11_GPIO67_I2C3_SDA		67
+#define STA2X11_GPIO68_MSP0_RCK		68
+#define STA2X11_GPIO69_MSP0_RXD		69
+#define STA2X11_GPIO70_MSP0_RFS		70
+#define STA2X11_GPIO71_MSP0_TCK		71
+#define STA2X11_GPIO72_MSP0_TXD		72
+#define STA2X11_GPIO73_MSP0_TFS		73
+#define STA2X11_GPIO74_MSP0_SCK		74
+#define STA2X11_GPIO75_MSP1_CK		75
+#define STA2X11_GPIO76_MSP1_RXD		76
+#define STA2X11_GPIO77_MSP1_FS		77
+#define STA2X11_GPIO78_MSP1_TXD		78
+#define STA2X11_GPIO79_MSP2_CK		79
+#define STA2X11_GPIO80_MSP2_RXD		80
+#define STA2X11_GPIO81_MSP2_FS		81
+#define STA2X11_GPIO82_MSP2_TXD		82
+#define STA2X11_GPIO83_MSP3_CK		83
+#define STA2X11_GPIO84_MSP3_RXD		84
+#define STA2X11_GPIO85_MSP3_FS		85
+#define STA2X11_GPIO86_MSP3_TXD		86
+#define STA2X11_GPIO87_MSP4_CK		87
+#define STA2X11_GPIO88_MSP4_RXD		88
+#define STA2X11_GPIO89_MSP4_FS		89
+#define STA2X11_GPIO90_MSP4_TXD		90
+#define STA2X11_GPIO91_MSP5_CK		91
+#define STA2X11_GPIO92_MSP5_RXD		92
+#define STA2X11_GPIO93_MSP5_FS		93
+#define STA2X11_GPIO94_MSP5_TXD		94
+#define STA2X11_GPIO95_SDIO3_DAT3	95
+#define STA2X11_GPIO96_SDIO3_DAT2	96
+#define STA2X11_GPIO97_SDIO3_DAT1	97
+#define STA2X11_GPIO98_SDIO3_DAT0	98
+#define STA2X11_GPIO99_SDIO3_CLK	99
+#define STA2X11_GPIO100_SDIO3_CMD	100
+#define STA2X11_GPIO101			101
+#define STA2X11_GPIO102			102
+#define STA2X11_GPIO103			103
+#define STA2X11_GPIO104			104
+#define STA2X11_GPIO105_SDIO2_DAT3	105
+#define STA2X11_GPIO106_SDIO2_DAT2	106
+#define STA2X11_GPIO107_SDIO2_DAT1	107
+#define STA2X11_GPIO108_SDIO2_DAT0	108
+#define STA2X11_GPIO109_SDIO2_CLK	109
+#define STA2X11_GPIO110_SDIO2_CMD	110
+#define STA2X11_GPIO111			111
+#define STA2X11_GPIO112			112
+#define STA2X11_GPIO113			113
+#define STA2X11_GPIO114			114
+#define STA2X11_GPIO115_SDIO1_DAT3	115
+#define STA2X11_GPIO116_SDIO1_DAT2	116
+#define STA2X11_GPIO117_SDIO1_DAT1	117
+#define STA2X11_GPIO118_SDIO1_DAT0	118
+#define STA2X11_GPIO119_SDIO1_CLK	119
+#define STA2X11_GPIO120_SDIO1_CMD	120
+#define STA2X11_GPIO121			121
+#define STA2X11_GPIO122			122
+#define STA2X11_GPIO123			123
+#define STA2X11_GPIO124			124
+#define STA2X11_GPIO125_UART2_TXD	125
+#define STA2X11_GPIO126_UART2_RXD	126
+#define STA2X11_GPIO127_UART3_TXD	127
+
+/*
+ * The APB bridge has its own registers, needed by our users as well.
+ * They are accessed with the following read/mask/write function.
+ */
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+/* CAN and MLB */
+#define APBREG_BSR	0x00	/* Bridge Status Reg */
+#define APBREG_PAER	0x08	/* Peripherals Address Error Reg */
+#define APBREG_PWAC	0x20	/* Peripheral Write Access Control reg */
+#define APBREG_PRAC	0x40	/* Peripheral Read Access Control reg */
+#define APBREG_PCG	0x60	/* Peripheral Clock Gating Reg */
+#define APBREG_PUR	0x80	/* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG	0xA0	/* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_CAN	(1 << 1)
+#define APBREG_MLB	(1 << 3)
+
+/* SARAC */
+#define APBREG_BSR_SARAC     0x100 /* Bridge Status Reg */
+#define APBREG_PAER_SARAC    0x108 /* Peripherals Address Error Reg */
+#define APBREG_PWAC_SARAC    0x120 /* Peripheral Write Access Control reg */
+#define APBREG_PRAC_SARAC    0x140 /* Peripheral Read Access Control reg */
+#define APBREG_PCG_SARAC     0x160 /* Peripheral Clock Gating Reg */
+#define APBREG_PUR_SARAC     0x180 /* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG_SARAC 0x1A0 /* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_SARAC	(1 << 2)
+
+/*
+ * The system controller has its own registers. Some of these are accessed
+ * by out users as well, using the following read/mask/write/function
+ */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+#define SCTL_SCCTL		0x00	/* System controller control register */
+#define SCTL_ARMCFG		0x04	/* ARM configuration register */
+#define SCTL_SCPLLCTL		0x08	/* PLL control status register */
+#define SCTL_SCPLLFCTRL		0x0c	/* PLL frequency control register */
+#define SCTL_SCRESFRACT		0x10	/* PLL fractional input register */
+#define SCTL_SCRESCTRL1		0x14	/* Peripheral reset control 1 */
+#define SCTL_SCRESXTRL2		0x18	/* Peripheral reset control 2 */
+#define SCTL_SCPEREN0		0x1c	/* Peripheral clock enable register 0 */
+#define SCTL_SCPEREN1		0x20	/* Peripheral clock enable register 1 */
+#define SCTL_SCPEREN2		0x24	/* Peripheral clock enable register 2 */
+#define SCTL_SCGRST		0x28	/* Peripheral global reset */
+#define SCTL_SCPCIPMCR1		0x30	/* PCI power management control 1 */
+#define SCTL_SCPCIPMCR2		0x34	/* PCI power management control 2 */
+#define SCTL_SCPCIPMSR1		0x38	/* PCI power management status 1 */
+#define SCTL_SCPCIPMSR2		0x3c	/* PCI power management status 2 */
+#define SCTL_SCPCIPMSR3		0x40	/* PCI power management status 3 */
+#define SCTL_SCINTREN		0x44	/* Interrupt enable */
+#define SCTL_SCRISR		0x48	/* RAW interrupt status */
+#define SCTL_SCCLKSTAT0		0x4c	/* Peripheral clocks status 0 */
+#define SCTL_SCCLKSTAT1		0x50	/* Peripheral clocks status 1 */
+#define SCTL_SCCLKSTAT2		0x54	/* Peripheral clocks status 2 */
+#define SCTL_SCRSTSTA		0x58	/* Reset status register */
+
+#define SCTL_SCRESCTRL1_USB_PHY_POR	(1 << 0)
+#define SCTL_SCRESCTRL1_USB_OTG	(1 << 1)
+#define SCTL_SCRESCTRL1_USB_HRST	(1 << 2)
+#define SCTL_SCRESCTRL1_USB_PHY_HOST	(1 << 3)
+#define SCTL_SCRESCTRL1_SATAII	(1 << 4)
+#define SCTL_SCRESCTRL1_VIP		(1 << 5)
+#define SCTL_SCRESCTRL1_PER_MMC0	(1 << 6)
+#define SCTL_SCRESCTRL1_PER_MMC1	(1 << 7)
+#define SCTL_SCRESCTRL1_PER_GPIO0	(1 << 8)
+#define SCTL_SCRESCTRL1_PER_GPIO1	(1 << 9)
+#define SCTL_SCRESCTRL1_PER_GPIO2	(1 << 10)
+#define SCTL_SCRESCTRL1_PER_GPIO3	(1 << 11)
+#define SCTL_SCRESCTRL1_PER_MTU0	(1 << 12)
+#define SCTL_SCRESCTRL1_KER_SPI0	(1 << 13)
+#define SCTL_SCRESCTRL1_KER_SPI1	(1 << 14)
+#define SCTL_SCRESCTRL1_KER_SPI2	(1 << 15)
+#define SCTL_SCRESCTRL1_KER_MCI0	(1 << 16)
+#define SCTL_SCRESCTRL1_KER_MCI1	(1 << 17)
+#define SCTL_SCRESCTRL1_PRE_HSI2C0	(1 << 18)
+#define SCTL_SCRESCTRL1_PER_HSI2C1	(1 << 19)
+#define SCTL_SCRESCTRL1_PER_HSI2C2	(1 << 20)
+#define SCTL_SCRESCTRL1_PER_HSI2C3	(1 << 21)
+#define SCTL_SCRESCTRL1_PER_MSP0	(1 << 22)
+#define SCTL_SCRESCTRL1_PER_MSP1	(1 << 23)
+#define SCTL_SCRESCTRL1_PER_MSP2	(1 << 24)
+#define SCTL_SCRESCTRL1_PER_MSP3	(1 << 25)
+#define SCTL_SCRESCTRL1_PER_MSP4	(1 << 26)
+#define SCTL_SCRESCTRL1_PER_MSP5	(1 << 27)
+#define SCTL_SCRESCTRL1_PER_MMC	(1 << 28)
+#define SCTL_SCRESCTRL1_KER_MSP0	(1 << 29)
+#define SCTL_SCRESCTRL1_KER_MSP1	(1 << 30)
+#define SCTL_SCRESCTRL1_KER_MSP2	(1 << 31)
+
+#define SCTL_SCPEREN0_UART0		(1 << 0)
+#define SCTL_SCPEREN0_UART1		(1 << 1)
+#define SCTL_SCPEREN0_UART2		(1 << 2)
+#define SCTL_SCPEREN0_UART3		(1 << 3)
+#define SCTL_SCPEREN0_MSP0		(1 << 4)
+#define SCTL_SCPEREN0_MSP1		(1 << 5)
+#define SCTL_SCPEREN0_MSP2		(1 << 6)
+#define SCTL_SCPEREN0_MSP3		(1 << 7)
+#define SCTL_SCPEREN0_MSP4		(1 << 8)
+#define SCTL_SCPEREN0_MSP5		(1 << 9)
+#define SCTL_SCPEREN0_SPI0		(1 << 10)
+#define SCTL_SCPEREN0_SPI1		(1 << 11)
+#define SCTL_SCPEREN0_SPI2		(1 << 12)
+#define SCTL_SCPEREN0_I2C0		(1 << 13)
+#define SCTL_SCPEREN0_I2C1		(1 << 14)
+#define SCTL_SCPEREN0_I2C2		(1 << 15)
+#define SCTL_SCPEREN0_I2C3		(1 << 16)
+#define SCTL_SCPEREN0_SVDO_LVDS		(1 << 17)
+#define SCTL_SCPEREN0_USB_HOST		(1 << 18)
+#define SCTL_SCPEREN0_USB_OTG		(1 << 19)
+#define SCTL_SCPEREN0_MCI0		(1 << 20)
+#define SCTL_SCPEREN0_MCI1		(1 << 21)
+#define SCTL_SCPEREN0_MCI2		(1 << 22)
+#define SCTL_SCPEREN0_MCI3		(1 << 23)
+#define SCTL_SCPEREN0_SATA		(1 << 24)
+#define SCTL_SCPEREN0_ETHERNET		(1 << 25)
+#define SCTL_SCPEREN0_VIC		(1 << 26)
+#define SCTL_SCPEREN0_DMA_AUDIO		(1 << 27)
+#define SCTL_SCPEREN0_DMA_SOC		(1 << 28)
+#define SCTL_SCPEREN0_RAM		(1 << 29)
+#define SCTL_SCPEREN0_VIP		(1 << 30)
+#define SCTL_SCPEREN0_ARM		(1 << 31)
+
+#define SCTL_SCPEREN1_UART0		(1 << 0)
+#define SCTL_SCPEREN1_UART1		(1 << 1)
+#define SCTL_SCPEREN1_UART2		(1 << 2)
+#define SCTL_SCPEREN1_UART3		(1 << 3)
+#define SCTL_SCPEREN1_MSP0		(1 << 4)
+#define SCTL_SCPEREN1_MSP1		(1 << 5)
+#define SCTL_SCPEREN1_MSP2		(1 << 6)
+#define SCTL_SCPEREN1_MSP3		(1 << 7)
+#define SCTL_SCPEREN1_MSP4		(1 << 8)
+#define SCTL_SCPEREN1_MSP5		(1 << 9)
+#define SCTL_SCPEREN1_SPI0		(1 << 10)
+#define SCTL_SCPEREN1_SPI1		(1 << 11)
+#define SCTL_SCPEREN1_SPI2		(1 << 12)
+#define SCTL_SCPEREN1_I2C0		(1 << 13)
+#define SCTL_SCPEREN1_I2C1		(1 << 14)
+#define SCTL_SCPEREN1_I2C2		(1 << 15)
+#define SCTL_SCPEREN1_I2C3		(1 << 16)
+#define SCTL_SCPEREN1_USB_PHY		(1 << 17)
+
+#endif /* __STA2X11_MFD_H */
-- 
cgit v1.2.3


From c5403aed044e23f8d1ecdf05d0ff120314186527 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Wed, 9 May 2012 23:25:06 +0300
Subject: x86, realmode: build fix: remove duplicate build

Real-mode binary was built twice. This patch fixes
the issue by making realmode.relocs as target for
realmode.bin.

[ hpa: removed the direct dependency on realmode.relocs in
  arch/x86/realmode/Makefile ]

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/realmode/Makefile    | 5 +----
 arch/x86/realmode/rm/Makefile | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index f22a4f8d99d6..a05b3aca64ad 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -11,10 +11,7 @@ subdir- := rm
 
 obj-y += rmpiggy.o
 
-$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin
+$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin
 
 $(obj)/rm/realmode.bin: FORCE
 	$(Q)$(MAKE) $(build)=$(obj)/rm $@
-
-$(obj)/rm/realmode.relocs: FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index de40bc44b92f..1c1d3d3bbee4 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -48,7 +48,7 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
 
 OBJCOPYFLAGS_realmode.bin := -O binary
 
-$(obj)/realmode.bin: $(obj)/realmode.elf
+$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
 	$(call if_changed,objcopy)
 
 quiet_cmd_relocs = RELOCS  $@
-- 
cgit v1.2.3


From 0f6f11eb00830fa691c16084048f53d83c5c3a5d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 9 May 2012 14:53:01 -0700
Subject: x86, realmode: Make sure all generated files are listed in targets

Kbuild expects all generated files to be listed in the targets
variable.  If it isn't, weird things happen.

Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/realmode/rm/Makefile | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 1c1d3d3bbee4..5b84a2d30888 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,13 +7,7 @@
 #
 #
 
-always := realmode.bin
-
-realmode-y			+= header.o
-realmode-y			+= trampoline_$(BITS).o
-realmode-y			+= stack.o
-realmode-$(CONFIG_X86_32)	+= reboot_32.o
-realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)
+always := realmode.bin realmode.relocs
 
 wakeup-objs	:= wakeup_asm.o wakemain.o video-mode.o
 wakeup-objs	+= copy.o bioscall.o regs.o
@@ -25,6 +19,12 @@ wakeup-objs	+= video-vga.o
 wakeup-objs	+= video-vesa.o
 wakeup-objs	+= video-bios.o
 
+realmode-y			+= header.o
+realmode-y			+= trampoline_$(BITS).o
+realmode-y			+= stack.o
+realmode-$(CONFIG_X86_32)	+= reboot_32.o
+realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)
+
 targets	+= $(realmode-y)
 
 REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
@@ -35,24 +35,30 @@ quiet_cmd_pasyms = PASYMS  $@
       cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
 		   sed $(sed-pasyms) | sort | uniq > $@
 
+targets += pasyms.h
 $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
 	$(call if_changed,pasyms)
 
+targets += realmode.lds
 $(obj)/realmode.lds: $(obj)/pasyms.h
 
 LDFLAGS_realmode.elf := --emit-relocs -T
 CPPFLAGS_realmode.lds += -P -C -I$(obj)
 
+targets += realmode.elf
 $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
 	$(call if_changed,ld)
 
 OBJCOPYFLAGS_realmode.bin := -O binary
 
+targets += realmode.bin
 $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
 	$(call if_changed,objcopy)
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
+
+targets += realmode.relocs
 $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
 	$(call if_changed,relocs)
 
-- 
cgit v1.2.3


From 34d0b02e08470c56a411ba6da1f377bc6da02826 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Thu, 10 May 2012 10:11:38 +0300
Subject: x86, realmode: Fix no cache bits test in reboot_32.S

Before the new real-mode code infrastructure %edx was
used for testing CD and NW bits with andl in order to
decide whether to flush the processor caches or not.
The value of cr0 was also stored in %eax, which was
later used to set cr0 after masking out lower byte
(except TS bit) in order to enter real-mode.

In the new real-mode code infrastructure we wanted to
keep input parameter in %eax so we are using %edx for
both cr0 cases. This has caused regression since andl
overwrites the value of %edx.

This patch fixes the issue by replacing andl with testl,
which is essentially andl without writing result to the
register.

Special thanks to Paolo Bonzini for noting this and
proposing a fix.

Reported-and-tested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1336633898-23743-1-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/realmode/rm/reboot_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index 8d9bfd13a93e..114044876b3d 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -76,7 +76,7 @@ machine_real_restart_asm16:
 	movl	%edx, %cr0
 	movl	%ecx, %cr3
 	movl	%cr0, %edx
-	andl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
+	testl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
 	jz	2f
 	wbinvd
 2:
-- 
cgit v1.2.3


From 7563bbf89d065a2c3f05059ecbcc805645edcc62 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Sun, 15 Apr 2012 10:52:54 +0100
Subject: gpiolib/arches: Centralise bolierplate asm/gpio.h

Rather than requiring architectures that use gpiolib but don't have any
need to define anything custom to copy an asm/gpio.h provide a Kconfig
symbol which architectures must select in order to include gpio.h and
for other architectures just provide the trivial implementation directly.

This makes it much easier to do gpiolib updates and is also a step towards
making gpiolib APIs available on every architecture.

For architectures with existing boilerplate code leave a stub header in
place which warns on direct inclusion of asm/gpio.h and includes
linux/gpio.h to catch code that's doing this.  Direct inclusion of
asm/gpio.h has long been deprecated.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Jonas Bonn <jonas@southpole.se>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/alpha/include/asm/gpio.h      | 59 +++-----------------------------
 arch/arm/Kconfig                   |  1 +
 arch/avr32/Kconfig                 |  1 +
 arch/blackfin/Kconfig              |  1 +
 arch/ia64/include/asm/gpio.h       | 59 +++-----------------------------
 arch/m68k/Kconfig.cpu              |  1 +
 arch/microblaze/include/asm/gpio.h | 57 +++----------------------------
 arch/mips/Kconfig                  |  1 +
 arch/openrisc/include/asm/gpio.h   | 69 +++-----------------------------------
 arch/powerpc/include/asm/gpio.h    | 57 +++----------------------------
 arch/sh/Kconfig                    |  1 +
 arch/sparc/include/asm/gpio.h      | 40 +++-------------------
 arch/unicore32/Kconfig             |  1 +
 arch/x86/include/asm/gpio.h        | 57 +++----------------------------
 arch/xtensa/include/asm/gpio.h     | 60 +++------------------------------
 drivers/gpio/Kconfig               |  8 +++++
 include/linux/gpio.h               | 34 +++++++++++++++++++
 17 files changed, 81 insertions(+), 426 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/gpio.h b/arch/alpha/include/asm/gpio.h
index 7dc6a6343c06..b3799d88ffcf 100644
--- a/arch/alpha/include/asm/gpio.h
+++ b/arch/alpha/include/asm/gpio.h
@@ -1,55 +1,4 @@
-/*
- * Generic GPIO API implementation for Alpha.
- *
- * A stright copy of that for PowerPC which was:
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_ALPHA_GPIO_H
-#define _ASM_ALPHA_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * We don't (yet) implement inlined/rapid versions for on-chip gpios.
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_ALPHA_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 36586dba6fa6..777025e773d9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,6 +1,7 @@
 config ARM
 	bool
 	default y
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select HAVE_AOUT
 	select HAVE_DMA_API_DEBUG
 	select HAVE_IDE if PCI || ISA || PCMCIA
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 3dea7231f637..859b2de4a624 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -11,6 +11,7 @@ config AVR32
 	select GENERIC_ATOMIC64
 	select HARDIRQS_SW_RESEND
 	select GENERIC_IRQ_SHOW
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 373a6902d8fa..bf3d80f9738b 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -31,6 +31,7 @@ config BLACKFIN
 	select HAVE_KERNEL_LZO if RAMKERNEL
 	select HAVE_OPROFILE
 	select HAVE_PERF_EVENTS
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_ATOMIC64
diff --git a/arch/ia64/include/asm/gpio.h b/arch/ia64/include/asm/gpio.h
index 590a20debc4e..b3799d88ffcf 100644
--- a/arch/ia64/include/asm/gpio.h
+++ b/arch/ia64/include/asm/gpio.h
@@ -1,55 +1,4 @@
-/*
- * Generic GPIO API implementation for IA-64.
- *
- * A stright copy of that for PowerPC which was:
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_IA64_GPIO_H
-#define _ASM_IA64_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * We don't (yet) implement inlined/rapid versions for on-chip gpios.
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_IA64_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 8a9c767125a4..8941af1d3ad2 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -24,6 +24,7 @@ config COLDFIRE
 	bool "Coldfire CPU family support"
 	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_MULDIV64
 	select GENERIC_CSUM
diff --git a/arch/microblaze/include/asm/gpio.h b/arch/microblaze/include/asm/gpio.h
index 2b2c18be71c6..b3799d88ffcf 100644
--- a/arch/microblaze/include/asm/gpio.h
+++ b/arch/microblaze/include/asm/gpio.h
@@ -1,53 +1,4 @@
-/*
- * Generic GPIO API implementation for PowerPC.
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_MICROBLAZE_GPIO_H
-#define _ASM_MICROBLAZE_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * We don't (yet) implement inlined/rapid versions for on-chip gpios.
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_MICROBLAZE_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ce30e2f91d77..63321b283fe4 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -8,6 +8,7 @@ config MIPS
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_ARCH_KGDB
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/openrisc/include/asm/gpio.h b/arch/openrisc/include/asm/gpio.h
index 0b0d174f47cd..b3799d88ffcf 100644
--- a/arch/openrisc/include/asm/gpio.h
+++ b/arch/openrisc/include/asm/gpio.h
@@ -1,65 +1,4 @@
-/*
- * OpenRISC Linux
- *
- * Linux architectural port borrowing liberally from similar works of
- * others.  All original copyrights apply as per the original source
- * declaration.
- *
- * OpenRISC implementation:
- * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
- * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
- * et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef __ASM_OPENRISC_GPIO_H
-#define __ASM_OPENRISC_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * OpenRISC (or1k) does not have on-chip GPIO's so there is not really
- * any standardized implementation that makes sense here.  If passing
- * through gpiolib becomes a bottleneck then it may make sense, on a
- * case-by-case basis, to implement these inlined/rapid versions.
- *
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-/*
- * Not implemented, yet.
- */
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return -ENOSYS;
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* __ASM_OPENRISC_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/powerpc/include/asm/gpio.h b/arch/powerpc/include/asm/gpio.h
index 38762edb5e58..b3799d88ffcf 100644
--- a/arch/powerpc/include/asm/gpio.h
+++ b/arch/powerpc/include/asm/gpio.h
@@ -1,53 +1,4 @@
-/*
- * Generic GPIO API implementation for PowerPC.
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef __ASM_POWERPC_GPIO_H
-#define __ASM_POWERPC_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * We don't (yet) implement inlined/rapid versions for on-chip gpios.
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* __ASM_POWERPC_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index ff9e033ce626..c40b29ac3644 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -13,6 +13,7 @@ config SUPERH
 	select HAVE_DMA_ATTRS
 	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
diff --git a/arch/sparc/include/asm/gpio.h b/arch/sparc/include/asm/gpio.h
index a0e3ac0af599..b3799d88ffcf 100644
--- a/arch/sparc/include/asm/gpio.h
+++ b/arch/sparc/include/asm/gpio.h
@@ -1,36 +1,4 @@
-#ifndef __ASM_SPARC_GPIO_H
-#define __ASM_SPARC_GPIO_H
-
-#include <linux/errno.h>
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return -ENOSYS;
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* __ASM_SPARC_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index eeb8054c7cd8..7ff6d10c0be2 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -8,6 +8,7 @@ config UNICORE32
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
+	select ARCH_HAVE_CUSTOM_GPIO_H
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 91d915a65259..b3799d88ffcf 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -1,53 +1,4 @@
-/*
- * Generic GPIO API implementation for x86.
- *
- * Derived from the generic GPIO API for powerpc:
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_X86_GPIO_H
-#define _ASM_X86_GPIO_H
-
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_X86_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/arch/xtensa/include/asm/gpio.h b/arch/xtensa/include/asm/gpio.h
index a8c9fc46c790..b3799d88ffcf 100644
--- a/arch/xtensa/include/asm/gpio.h
+++ b/arch/xtensa/include/asm/gpio.h
@@ -1,56 +1,4 @@
-/*
- * Generic GPIO API implementation for xtensa.
- *
- * Stolen from x86, which is derived from the generic GPIO API for powerpc:
- *
- * Copyright (c) 2007-2008  MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_XTENSA_GPIO_H
-#define _ASM_XTENSA_GPIO_H
-
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
-	return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-	__gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-	return __gpio_cansleep(gpio);
-}
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-	return __gpio_to_irq(gpio);
-}
-
-/*
- * Not implemented, yet.
- */
-static inline int irq_to_gpio(unsigned int irq)
-{
-	return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_XTENSA_GPIO_H */
+#ifndef __LINUX_GPIO_H
+#warning Include linux/gpio.h instead of asm/gpio.h
+#include <linux/gpio.h>
+#endif
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 5169a99e9f61..25535ebf4f90 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -2,6 +2,14 @@
 # GPIO infrastructure and drivers
 #
 
+config ARCH_HAVE_CUSTOM_GPIO_H
+	bool
+	help
+	  Selecting this config option from the architecture Kconfig allows
+	  the architecture to provide a custom asm/gpio.h implementation
+	  overriding the default implementations.  New uses of this are
+	  strongly discouraged.
+
 config ARCH_WANT_OPTIONAL_GPIOLIB
 	bool
 	help
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index d1890d46b6ce..7a8816a1a0d8 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_GPIO_H
 #define __LINUX_GPIO_H
 
+#include <linux/errno.h>
+
 /* see Documentation/gpio.txt */
 
 /* make these flag values available regardless of GPIO kconfig options */
@@ -38,7 +40,39 @@ struct gpio {
 };
 
 #ifdef CONFIG_GENERIC_GPIO
+
+#ifdef CONFIG_ARCH_HAVE_CUSTOM_GPIO_H
 #include <asm/gpio.h>
+#else
+
+#include <asm-generic/gpio.h>
+
+static inline int gpio_get_value(unsigned int gpio)
+{
+	return __gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned int gpio, int value)
+{
+	__gpio_set_value(gpio, value);
+}
+
+static inline int gpio_cansleep(unsigned int gpio)
+{
+	return __gpio_cansleep(gpio);
+}
+
+static inline int gpio_to_irq(unsigned int gpio)
+{
+	return __gpio_to_irq(gpio);
+}
+
+static inline int irq_to_gpio(unsigned int irq)
+{
+	return -EINVAL;
+}
+
+#endif
 
 #else
 
-- 
cgit v1.2.3


From 5f3fbc342f408199e5cbb4b3dc220569147a99a7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 14 May 2012 14:58:58 +0800
Subject: KVM: VMX: unlike vmcs on fail path

fix:

[ 1529.577273] Call Trace:
[ 1529.577289]  [<ffffffffa060d58f>] kvm_arch_hardware_disable+0x13/0x30 [kvm]
[ 1529.577302]  [<ffffffffa05fa2d4>] hardware_disable_nolock+0x35/0x39 [kvm]
[ 1529.577311]  [<ffffffffa05fa29f>] ? cpumask_clear_cpu.constprop.31+0x13/0x13 [kvm]
[ 1529.577315]  [<ffffffff81096ba8>] on_each_cpu+0x44/0x84
[ 1529.577326]  [<ffffffffa05f98b5>] hardware_disable_all_nolock+0x34/0x36 [kvm]
[ 1529.577335]  [<ffffffffa05f98e2>] hardware_disable_all+0x2b/0x39 [kvm]
[ 1529.577349]  [<ffffffffa05fafe5>] kvm_put_kvm+0xed/0x10f [kvm]
[ 1529.577358]  [<ffffffffa05fb3d7>] kvm_vm_release+0x22/0x28 [kvm]

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61ebdb6390ee..3062ea95266e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6350,7 +6350,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->loaded_vmcs->vmcs);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu:
-- 
cgit v1.2.3


From d54e4237bcbb400fda11c902fd538aa0b4805720 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 7 May 2012 12:12:25 +0200
Subject: KVM: x86 emulator: convert bsf/bsr instructions to
 emulate_2op_SrcV_nobyte()

The instruction emulation for bsrw is broken in KVM because
the code always uses bsr with 32 or 64 bit operand size for
emulation. Fix that by using emulate_2op_SrcV_nobyte() macro
to use guest operand size for emulation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 26 ++------------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7fd25763b0e0..f95d242ee9f7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3133,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
 
 static int em_bsf(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsf %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsf");
 	return X86EMUL_CONTINUE;
 }
 
 static int em_bsr(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsr %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsr");
 	return X86EMUL_CONTINUE;
 }
 
-- 
cgit v1.2.3


From 5d2b86d90f7cc4a41316cef3d41560da6141f45c Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 11 May 2012 14:41:13 -0400
Subject: Revert "x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus"

This reverts commit 3603a2512f9e69dc87914ba922eb4a0812b21cd6.

Originally I wanted a better hammer to shutdown cpus during
panic. However, this really steps on the toes of various
spinlocks in the panic path.  Sometimes it is easier to wait for
the IRQ to become re-enabled to indictate the cpu left the
critical region and then shutdown the cpu.

The next patch moves the NMI addition after the IRQ part.  To
make it easier to see the logic of everything, revert this patch
and apply the next simpler patch.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1336761675-24296-2-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smp.c | 59 ++-------------------------------------------------
 1 file changed, 2 insertions(+), 57 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481cab..6d20f523bc4e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,7 +29,6 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
-#include <asm/nmi.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -149,60 +148,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
-static atomic_t stopping_cpu = ATOMIC_INIT(-1);
-
-static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
-{
-	/* We are registered on stopping cpu too, avoid spurious NMI */
-	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
-		return NMI_HANDLED;
-
-	stop_this_cpu(NULL);
-
-	return NMI_HANDLED;
-}
-
-static void native_nmi_stop_other_cpus(int wait)
-{
-	unsigned long flags;
-	unsigned long timeout;
-
-	if (reboot_force)
-		return;
-
-	/*
-	 * Use an own vector here because smp_call_function
-	 * does lots of things not suitable in a panic situation.
-	 */
-	if (num_online_cpus() > 1) {
-		/* did someone beat us here? */
-		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
-			return;
-
-		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
-					 NMI_FLAG_FIRST, "smp_stop"))
-			/* Note: we ignore failures here */
-			return;
-
-		/* sync above data before sending NMI */
-		wmb();
-
-		apic->send_IPI_allbutself(NMI_VECTOR);
-
-		/*
-		 * Don't wait longer than a second if the caller
-		 * didn't ask us to wait.
-		 */
-		timeout = USEC_PER_SEC;
-		while (num_online_cpus() > 1 && (wait || timeout--))
-			udelay(1);
-	}
-
-	local_irq_save(flags);
-	disable_local_APIC();
-	local_irq_restore(flags);
-}
-
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -215,7 +160,7 @@ asmlinkage void smp_reboot_interrupt(void)
 	irq_exit();
 }
 
-static void native_irq_stop_other_cpus(int wait)
+static void native_stop_other_cpus(int wait)
 {
 	unsigned long flags;
 	unsigned long timeout;
@@ -298,7 +243,7 @@ struct smp_ops smp_ops = {
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
 	.smp_cpus_done		= native_smp_cpus_done,
 
-	.stop_other_cpus	= native_nmi_stop_other_cpus,
+	.stop_other_cpus	= native_stop_other_cpus,
 	.smp_send_reschedule	= native_smp_send_reschedule,
 
 	.cpu_up			= native_cpu_up,
-- 
cgit v1.2.3


From 7d007d21e539dbecb6942c5734e6649f720982cf Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 11 May 2012 14:41:14 -0400
Subject: x86/reboot: Use NMI to assist in shutting down if IRQ fails

For v3.3, I added code to use the NMI to stop other cpus in the
panic case.  The idea was to make sure all cpus on the system
were definitely halted to help serialize the panic path to
execute the rest of the code on a single cpu.

The main problem it was trying to solve was how to stop a cpu
that was spinning with its irqs disabled.  A IPI irq would be
stuck and couldn't get in there, but an NMI could.

Things were great until we had another conversation about some
pstore changes.  Because some of the backend pstore still uses
spinlocks to protect the device access, things could get ugly if
a panic happened and we were stuck spinning on a lock.

Now with the NMI shutting down cpus, we could assume no other
cpus were running and just bust the spin lock and proceed.

The counter argument was, well if you do that the backend could
be in a screwed up state and you might not be able to save
anything as a result. If we could have just given the cpu a
little more time to finish things, we could have grabbed the
spin lock cleanly and everything would have been fine.

Well, how do give a cpu a 'little more time' in the panic case?
For the most part you can't without spinning on the lock and
even in that case, how long do you spin for?

So instead of making it ugly in the pstore code, just mimic the
idea that stop_machine had, which is block on an IRQ IPI until
the remote cpu has re-enabled interrupts and left the critical
region.  Which is what happens now using REBOOT_IRQ.

Then leave the NMI case for those cpus that are truly stuck
after a short time.  This leaves the current behaviour alone and
just handle a corner case.  Most systems should never have to
enter the NMI code and if they do, print out a message in case
the NMI itself causes another issue.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1336761675-24296-3-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smp.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 6d20f523bc4e..228e7405511a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -108,6 +109,8 @@
  *	about nothing of note with C stepping upwards.
  */
 
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
 /*
  * this function sends a 'reschedule' IPI to another CPU.
  * it goes straight through and wastes no time serializing
@@ -148,6 +151,17 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	/* We are registered on stopping cpu too, avoid spurious NMI */
+	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+		return NMI_HANDLED;
+
+	stop_this_cpu(NULL);
+
+	return NMI_HANDLED;
+}
+
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -171,13 +185,25 @@ static void native_stop_other_cpus(int wait)
 	/*
 	 * Use an own vector here because smp_call_function
 	 * does lots of things not suitable in a panic situation.
-	 * On most systems we could also use an NMI here,
-	 * but there are a few systems around where NMI
-	 * is problematic so stay with an non NMI for now
-	 * (this implies we cannot stop CPUs spinning with irq off
-	 * currently)
+	 */
+
+	/*
+	 * We start by using the REBOOT_VECTOR irq.
+	 * The irq is treated as a sync point to allow critical
+	 * regions of code on other cpus to release their spin locks
+	 * and re-enable irqs.  Jumping straight to an NMI might
+	 * accidentally cause deadlocks with further shutdown/panic
+	 * code.  By syncing, we give the cpus up to one second to
+	 * finish their work before we force them off with the NMI.
 	 */
 	if (num_online_cpus() > 1) {
+		/* did someone beat us here? */
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+			return;
+
+		/* sync above data before sending IRQ */
+		wmb();
+
 		apic->send_IPI_allbutself(REBOOT_VECTOR);
 
 		/*
@@ -188,7 +214,32 @@ static void native_stop_other_cpus(int wait)
 		while (num_online_cpus() > 1 && (wait || timeout--))
 			udelay(1);
 	}
+	
+	/* if the REBOOT_VECTOR didn't work, try with the NMI */
+	if ((num_online_cpus() > 1))  {
+		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+					 NMI_FLAG_FIRST, "smp_stop"))
+			/* Note: we ignore failures here */
+			/* Hope the REBOOT_IRQ is good enough */
+			goto finish;
+
+		/* sync above data before sending IRQ */
+		wmb();
+
+		pr_emerg("Shutting down cpus with NMI\n");
+
+		apic->send_IPI_allbutself(NMI_VECTOR);
+
+		/*
+		 * Don't wait longer than a 10 ms if the caller
+		 * didn't ask us to wait.
+		 */
+		timeout = USEC_PER_MSEC * 10;
+		while (num_online_cpus() > 1 && (wait || timeout--))
+			udelay(1);
+	}
 
+finish:
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From 3aac27aba79b7c52e709ef6de0f7d8139caedc01 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 11 May 2012 14:41:15 -0400
Subject: x86/reboot: Update nonmi_ipi parameter

Update the nonmi_ipi parameter to reflect the simple change
instead of the previous complicated one.  There should be less
of a need to use it but there may still be corner cases on older
hardware that stumble into NMI issues.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1336761675-24296-4-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smp.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 228e7405511a..48d2b7ded422 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -110,6 +110,7 @@
  */
 
 static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
 
 /*
  * this function sends a 'reschedule' IPI to another CPU.
@@ -216,7 +217,7 @@ static void native_stop_other_cpus(int wait)
 	}
 	
 	/* if the REBOOT_VECTOR didn't work, try with the NMI */
-	if ((num_online_cpus() > 1))  {
+	if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi))  {
 		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
 					 NMI_FLAG_FIRST, "smp_stop"))
 			/* Note: we ignore failures here */
@@ -245,11 +246,6 @@ finish:
 	local_irq_restore(flags);
 }
 
-static void native_smp_disable_nmi_ipi(void)
-{
-	smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
-}
-
 /*
  * Reschedule call back.
  */
@@ -283,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 
 static int __init nonmi_ipi_setup(char *str)
 {
-        native_smp_disable_nmi_ipi();
-        return 1;
+	smp_no_nmi_ipi = true;
+	return 1;
 }
 
 __setup("nonmi_ipi", nonmi_ipi_setup);
-- 
cgit v1.2.3


From 978da300c7a65494692b329a6a4cbf364afc37c5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 11 May 2012 11:44:59 +0200
Subject: perf/x86/ibs: Fix undefined reference to `get_ibs_caps'

Fixing i386 allnoconfig built errors:

 arch/x86/built-in.o: In function `amd_pmu_hw_config':
 perf_event_amd.c:(.text+0xc3e1): undefined reference to `get_ibs_caps'

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/perf_event.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 4e40a64315c9..588f52ea810e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -188,7 +188,11 @@ struct x86_pmu_capability {
 #define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
 #define IBS_RIP_INVALID		(1ULL<<38)
 
+#ifdef CONFIG_X86_LOCAL_APIC
 extern u32 get_ibs_caps(void);
+#else
+static inline u32 get_ibs_caps(void) { return 0; }
+#endif
 
 #ifdef CONFIG_PERF_EVENTS
 extern void perf_events_lapic_init(void);
-- 
cgit v1.2.3


From ead91d4b8c3b1fb08a73aaa4a191230ecf717ee0 Mon Sep 17 00:00:00 2001
From: Shai Fultheim <shai@scalemp.com>
Date: Mon, 16 Apr 2012 10:39:35 +0300
Subject: x86/vsmp: Fix number of CPUs when vsmp is disabled

In case CONFIG_X86_VSMP is not set, limit the number of CPUs to
the number of CPUs of the first board.

Also make CONFIG_X86_VSMP depend on CONFIG_SMP, as there's
little point in having a vsmp machine with a single CPU.

Signed-off-by: Shai Fultheim <shai@scalemp.com>
[ido@wizery.com: rebased, fixed minor coding-style issues]
Signed-off-by: Ido Yariv <ido@wizery.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig          |  1 +
 arch/x86/kernel/vsmp_64.c | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9ed801abaf9..d2599a0ea208 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -376,6 +376,7 @@ config X86_VSMP
 	select PARAVIRT
 	depends on X86_64 && PCI
 	depends on X86_EXTENDED_PLATFORM
+	depends on SMP
 	---help---
 	  Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
 	  supposed to run on these EM64T-based machines.  Only choose this option
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd483..8eeb55a551b4 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
+#include <linux/smp.h>
 
 #include <asm/apic.h>
 #include <asm/pci-direct.h>
@@ -22,6 +23,8 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
 #if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
  * Interrupt control on vSMPowered systems:
@@ -149,12 +152,49 @@ int is_vsmp_box(void)
 	return 0;
 }
 #endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+	void __iomem *address;
+	unsigned int cfg, topology, node_shift, maxcpus;
+
+	/*
+	 * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+	 * ones present in the first board, unless explicitly overridden by
+	 * setup_max_cpus
+	 */
+	if (setup_max_cpus != NR_CPUS)
+		return;
+
+	/* Read the vSMP Foundation topology register */
+	cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+	address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+	if (WARN_ON(!address))
+		return;
+
+	topology = readl(address);
+	node_shift = (topology >> 16) & 0x7;
+	if (!node_shift)
+		/* The value 0 should be decoded as 8 */
+		node_shift = 8;
+	maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+	pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+		maxcpus);
+	setup_max_cpus = maxcpus;
+	early_iounmap(address, 4);
+#endif
+}
+
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
 	if (!is_vsmp_box())
 		return;
 
+	vsmp_cap_cpus();
+
 	set_vsmp_pv_ops();
 	return;
 }
-- 
cgit v1.2.3


From 316ad248307fba13be40f01e92a22b89457c32bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 May 2012 13:05:59 +0200
Subject: sched/x86: Rewrite set_cpu_sibling_map()

Commit ad7687dde ("x86/numa: Check for nonsensical topologies on real
hw as well") is broken in that the condition can trigger for valid
setups but only changes the end result for invalid setups with no real
means of discerning between those.

Rewrite set_cpu_sibling_map() to make the code clearer and make sure
to only warn when the check changes the end result.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-klcwahu3gx467uhfiqjyhdcs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 112 +++++++++++++++++++++++++++-------------------
 1 file changed, 66 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c53d96d44ab..e84c1bbea339 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -315,70 +315,90 @@ void __cpuinit smp_store_cpu_info(int id)
 		identify_secondary_cpu(c);
 }
 
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 {
-	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+		"[node: %d != %d]. Ignoring dependency.\n",
+		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
 
+#define link_mask(_m, c1, c2)						\
+do {									\
+	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));			\
+	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));			\
+} while (0)
 
-void __cpuinit set_cpu_sibling_map(int cpu)
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
-	int i;
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
-	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+		if (c->phys_proc_id == o->phys_proc_id &&
+		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+		    c->compute_unit_id == o->compute_unit_id)
+			return topology_sane(c, o, "smt");
 
-	if (smp_num_siblings > 1) {
-		for_each_cpu(i, cpu_sibling_setup_mask) {
-			struct cpuinfo_x86 *o = &cpu_data(i);
+	} else if (c->phys_proc_id == o->phys_proc_id &&
+		   c->cpu_core_id == o->cpu_core_id) {
+		return topology_sane(c, o, "smt");
+	}
 
-			if (cpu_to_node(cpu) != cpu_to_node(i)) {
-				WARN_ONCE(1, "sched: CPU #%d's thread-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i));
-				continue;
-			}
+	return false;
+}
 
-			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-				if (c->phys_proc_id == o->phys_proc_id &&
-				    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-				    c->compute_unit_id == o->compute_unit_id)
-					link_thread_siblings(cpu, i);
-			} else if (c->phys_proc_id == o->phys_proc_id &&
-				   c->cpu_core_id == o->cpu_core_id) {
-				link_thread_siblings(cpu, i);
-			}
-		}
-	} else {
-		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-	}
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+		return topology_sane(c, o, "llc");
+
+	return false;
+}
+
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id)
+		return topology_sane(c, o, "mc");
+
+	return false;
+}
+
+void __cpuinit set_cpu_sibling_map(int cpu)
+{
+	bool has_mc = boot_cpu_data.x86_max_cores > 1;
+	bool has_smt = smp_num_siblings > 1;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct cpuinfo_x86 *o;
+	int i;
 
-	cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
-	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+	if (!has_smt && !has_mc) {
+		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
-		if (cpu_to_node(cpu) != cpu_to_node(i)) {
-			WARN_ONCE(1, "sched: CPU #%d's core-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i));
-			continue;
-		}
+		o = &cpu_data(i);
 
-		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-		}
+		if ((i == cpu) || (has_smt && match_smt(c, o)))
+			link_mask(sibling, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_llc(c, o)))
+			link_mask(llc_shared, cpu, i);
+
+		if ((i == cpu) || (has_mc && match_mc(c, o))) {
+			link_mask(core, cpu, i);
 
-		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-			cpumask_set_cpu(i, cpu_core_mask(cpu));
-			cpumask_set_cpu(cpu, cpu_core_mask(i));
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */
-- 
cgit v1.2.3


From c6ae41e7d469f00d9c92a2b2887c7235d121c009 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Fri, 11 May 2012 15:35:27 +0800
Subject: x86: replace percpu_xxx funcs with this_cpu_xxx

Since percpu_xxx() serial functions are duplicated with this_cpu_xxx().
Removing percpu_xxx() definition and replacing them by this_cpu_xxx()
in code. There is no function change in this patch, just preparation for
later percpu_xxx serial function removing.

On x86 machine the this_cpu_xxx() serial functions are same as
__this_cpu_xxx() without no unnecessary premmpt enable/disable.

Thanks for Stephen Rothwell, he found and fixed a i386 build error in
the patch.

Also thanks for Andrew Morton, he kept updating the patchset in Linus'
tree.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Acked-by: Christoph Lameter <cl@gentwo.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/compat.h         |  2 +-
 arch/x86/include/asm/current.h        |  2 +-
 arch/x86/include/asm/desc.h           |  1 +
 arch/x86/include/asm/fpu-internal.h   |  6 +++---
 arch/x86/include/asm/hardirq.h        |  9 +++++----
 arch/x86/include/asm/irq_regs.h       |  4 ++--
 arch/x86/include/asm/mmu_context.h    | 12 ++++++------
 arch/x86/include/asm/percpu.h         |  8 ++++----
 arch/x86/include/asm/smp.h            |  4 ++--
 arch/x86/include/asm/stackprotector.h |  4 ++--
 arch/x86/include/asm/thread_info.h    |  2 +-
 arch/x86/include/asm/tlbflush.h       |  4 ++--
 arch/x86/kernel/cpu/common.c          |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c      |  4 ++--
 arch/x86/kernel/i387.c                |  2 +-
 arch/x86/kernel/nmi_selftest.c        |  1 +
 arch/x86/kernel/paravirt.c            | 12 ++++++------
 arch/x86/kernel/process.c             |  2 +-
 arch/x86/kernel/process_32.c          |  2 +-
 arch/x86/kernel/process_64.c          | 10 +++++-----
 arch/x86/mm/tlb.c                     | 10 +++++-----
 include/linux/topology.h              |  4 ++--
 22 files changed, 55 insertions(+), 52 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index d6805798d6fc..fedf32b73e65 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -229,7 +229,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 		sp = task_pt_regs(current)->sp;
 	} else {
 		/* -128 for the x32 ABI redzone */
-		sp = percpu_read(old_rsp) - 128;
+		sp = this_cpu_read(old_rsp) - 128;
 	}
 
 	return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 4d447b732d82..9476c04ee635 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task);
 
 static __always_inline struct task_struct *get_current(void)
 {
-	return percpu_read_stable(current_task);
+	return this_cpu_read_stable(current_task);
 }
 
 #define current get_current()
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e95822d683f4..8bf1c06070d5 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -6,6 +6,7 @@
 #include <asm/mmu.h>
 
 #include <linux/smp.h>
+#include <linux/percpu.h>
 
 static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
 {
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 4fa88154e4de..75f4c6d6a331 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -290,14 +290,14 @@ static inline int __thread_has_fpu(struct task_struct *tsk)
 static inline void __thread_clear_has_fpu(struct task_struct *tsk)
 {
 	tsk->thread.fpu.has_fpu = 0;
-	percpu_write(fpu_owner_task, NULL);
+	this_cpu_write(fpu_owner_task, NULL);
 }
 
 /* Must be paired with a 'clts' before! */
 static inline void __thread_set_has_fpu(struct task_struct *tsk)
 {
 	tsk->thread.fpu.has_fpu = 1;
-	percpu_write(fpu_owner_task, tsk);
+	this_cpu_write(fpu_owner_task, tsk);
 }
 
 /*
@@ -344,7 +344,7 @@ typedef struct { int preload; } fpu_switch_t;
  */
 static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
 {
-	return new == percpu_read_stable(fpu_owner_task) &&
+	return new == this_cpu_read_stable(fpu_owner_task) &&
 		cpu == new->thread.fpu.last_cpu;
 }
 
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 382f75d735f3..d3895dbf4ddb 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,14 +35,15 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 #define __ARCH_IRQ_STAT
 
-#define inc_irq_stat(member)	percpu_inc(irq_stat.member)
+#define inc_irq_stat(member)	this_cpu_inc(irq_stat.member)
 
-#define local_softirq_pending()	percpu_read(irq_stat.__softirq_pending)
+#define local_softirq_pending()	this_cpu_read(irq_stat.__softirq_pending)
 
 #define __ARCH_SET_SOFTIRQ_PENDING
 
-#define set_softirq_pending(x)	percpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x)	percpu_or(irq_stat.__softirq_pending, (x))
+#define set_softirq_pending(x)	\
+		this_cpu_write(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x)	this_cpu_or(irq_stat.__softirq_pending, (x))
 
 extern void ack_bad_irq(unsigned int irq);
 
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 77843225b7ea..d82250b1debb 100644
--- a/arch/x86/include/asm/irq_regs.h
+++ b/arch/x86/include/asm/irq_regs.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);
 
 static inline struct pt_regs *get_irq_regs(void)
 {
-	return percpu_read(irq_regs);
+	return this_cpu_read(irq_regs);
 }
 
 static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
 	struct pt_regs *old_regs;
 
 	old_regs = get_irq_regs();
-	percpu_write(irq_regs, new_regs);
+	this_cpu_write(irq_regs, new_regs);
 
 	return old_regs;
 }
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 69021528b43c..cdbf36776106 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -25,8 +25,8 @@ void destroy_context(struct mm_struct *mm);
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
-	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-		percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
 #endif
 }
 
@@ -37,8 +37,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	if (likely(prev != next)) {
 #ifdef CONFIG_SMP
-		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-		percpu_write(cpu_tlbstate.active_mm, next);
+		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
@@ -56,8 +56,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	}
 #ifdef CONFIG_SMP
 	else {
-		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
 
 		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
 			/* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 7a11910a63c4..967ee3be5c0a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -46,7 +46,7 @@
 
 #ifdef CONFIG_SMP
 #define __percpu_prefix		"%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset		percpu_read(this_cpu_off)
+#define __my_cpu_offset		this_cpu_read(this_cpu_off)
 
 /*
  * Compared to the generic __my_cpu_offset version, the following
@@ -352,15 +352,15 @@ do {									\
 
 /*
  * percpu_read() makes gcc load the percpu variable every time it is
- * accessed while percpu_read_stable() allows the value to be cached.
- * percpu_read_stable() is more efficient and can be used if its value
+ * accessed while this_cpu_read_stable() allows the value to be cached.
+ * this_cpu_read_stable() is more efficient and can be used if its value
  * is guaranteed to be valid across cpus.  The current users include
  * get_current() and get_thread_info() both of which are actually
  * per-thread variables implemented as per-cpu variables and thus
  * stable for the duration of the respective task.
  */
 #define percpu_read(var)		percpu_from_op("mov", var, "m" (var))
-#define percpu_read_stable(var)		percpu_from_op("mov", var, "p" (&(var)))
+#define this_cpu_read_stable(var)	percpu_from_op("mov", var, "p" (&(var)))
 #define percpu_write(var, val)		percpu_to_op("mov", var, val)
 #define percpu_add(var, val)		percpu_add_op(var, val)
 #define percpu_sub(var, val)		percpu_add_op(var, -(val))
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 0434c400287c..e276f6bb6524 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -188,11 +188,11 @@ extern unsigned disabled_cpus __cpuinitdata;
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (percpu_read(cpu_number))
+#define raw_smp_processor_id() (this_cpu_read(cpu_number))
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() (percpu_read(cpu_number))
+#define raw_smp_processor_id() (this_cpu_read(cpu_number))
 
 #define stack_smp_processor_id()					\
 ({								\
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index b5d9533d2c38..6a998598f172 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -75,9 +75,9 @@ static __always_inline void boot_init_stack_canary(void)
 
 	current->stack_canary = canary;
 #ifdef CONFIG_X86_64
-	percpu_write(irq_stack_union.stack_canary, canary);
+	this_cpu_write(irq_stack_union.stack_canary, canary);
 #else
-	percpu_write(stack_canary.canary, canary);
+	this_cpu_write(stack_canary.canary, canary);
 #endif
 }
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6df8ccd715..f67fd89c874b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -222,7 +222,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
 static inline struct thread_info *current_thread_info(void)
 {
 	struct thread_info *ti;
-	ti = (void *)(percpu_read_stable(kernel_stack) +
+	ti = (void *)(this_cpu_read_stable(kernel_stack) +
 		      KERNEL_STACK_OFFSET - THREAD_SIZE);
 	return ti;
 }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c0e108e08079..1620d23f14d7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,8 +156,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
 static inline void reset_lazy_tlbstate(void)
 {
-	percpu_write(cpu_tlbstate.state, 0);
-	percpu_write(cpu_tlbstate.active_mm, &init_mm);
+	this_cpu_write(cpu_tlbstate.state, 0);
+	this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
 }
 
 #endif	/* SMP */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cf79302198a6..82f29e70d058 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && percpu_read(numa_node) == 0 &&
+	if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
 	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
 		set_numa_node(early_cpu_to_node(cpu));
 #endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09c087d..c0276d5d9bd4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	percpu_inc(mce_poll_count);
+	this_cpu_inc(mce_poll_count);
 
 	mce_gather_info(&m, NULL);
 
@@ -1015,7 +1015,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
-	percpu_inc(mce_exception_count);
+	this_cpu_inc(mce_exception_count);
 
 	if (!banks)
 		goto out;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 2d6e6498c176..f250431fb505 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -88,7 +88,7 @@ void kernel_fpu_begin(void)
 		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
 	} else {
-		percpu_write(fpu_owner_task, NULL);
+		this_cpu_write(fpu_owner_task, NULL);
 		clts();
 	}
 }
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 2c39dcd510fa..ff3698625081 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -13,6 +13,7 @@
 #include <linux/cpumask.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 #include <asm/nmi.h>
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab137605e694..9ce885996fd7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+	BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
 
-	percpu_write(paravirt_lazy_mode, mode);
+	this_cpu_write(paravirt_lazy_mode, mode);
 }
 
 static void leave_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
+	BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
 
-	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+	this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
 {
 	BUG_ON(preemptible());
 
-	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+	if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
 		arch_leave_lazy_mmu_mode();
 		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
 	}
@@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 	if (in_interrupt())
 		return PARAVIRT_LAZY_NONE;
 
-	return percpu_read(paravirt_lazy_mode);
+	return this_cpu_read(paravirt_lazy_mode);
 }
 
 void arch_flush_lazy_mmu_mode(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..857adffb7080 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,7 +377,7 @@ static inline void play_dead(void)
 #ifdef CONFIG_X86_64
 void enter_idle(void)
 {
-	percpu_write(is_idle, 1);
+	this_cpu_write(is_idle, 1);
 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ae6847303e26..01d8d40ccaf6 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -302,7 +302,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	switch_fpu_finish(next_p, fpu);
 
-	percpu_write(current_task, next_p);
+	this_cpu_write(current_task, next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 43d8b48b23e6..28e810255a0a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -237,7 +237,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
-	percpu_write(old_rsp, new_sp);
+	this_cpu_write(old_rsp, new_sp);
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
@@ -359,11 +359,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
-	prev->usersp = percpu_read(old_rsp);
-	percpu_write(old_rsp, next->usersp);
-	percpu_write(current_task, next_p);
+	prev->usersp = this_cpu_read(old_rsp);
+	this_cpu_write(old_rsp, next->usersp);
+	this_cpu_write(current_task, next_p);
 
-	percpu_write(kernel_stack,
+	this_cpu_write(kernel_stack,
 		  (unsigned long)task_stack_page(next_p) +
 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d6c0418c3e47..3804471db104 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -61,10 +61,10 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
  */
 void leave_mm(int cpu)
 {
-	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
 		BUG();
 	cpumask_clear_cpu(cpu,
-			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
+			  mm_cpumask(this_cpu_read(cpu_tlbstate.active_mm)));
 	load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -152,8 +152,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 		 * BUG();
 		 */
 
-	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
-		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
+		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 			if (f->flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
@@ -322,7 +322,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 static void do_flush_tlb_all(void *info)
 {
 	__flush_tlb_all();
-	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(smp_processor_id());
 }
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e26db031303b..9dc427cdb6ff 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -239,7 +239,7 @@ static inline int cpu_to_node(int cpu)
 #ifndef set_numa_node
 static inline void set_numa_node(int node)
 {
-	percpu_write(numa_node, node);
+	this_cpu_write(numa_node, node);
 }
 #endif
 
@@ -274,7 +274,7 @@ DECLARE_PER_CPU(int, _numa_mem_);
 #ifndef set_numa_mem
 static inline void set_numa_mem(int node)
 {
-	percpu_write(_numa_mem_, node);
+	this_cpu_write(_numa_mem_, node);
 }
 #endif
 
-- 
cgit v1.2.3


From 641b695c2f11397bd307ea689d4d3f128360ce49 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Mon, 14 May 2012 14:15:32 -0700
Subject: percpu: remove percpu_xxx() functions

Remove percpu_xxx serial functions, all of them were replaced by
this_cpu_xxx or __this_cpu_xxx serial functions

Signed-off-by: Alex Shi <alex.shi@intel.com>
Acked-by: Christoph Lameter <cl@gentwo.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/percpu.h | 16 +++++--------
 include/linux/percpu.h        | 54 -------------------------------------------
 2 files changed, 6 insertions(+), 64 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 967ee3be5c0a..d9b8e3f7f42a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -351,7 +351,7 @@ do {									\
 })
 
 /*
- * percpu_read() makes gcc load the percpu variable every time it is
+ * this_cpu_read() makes gcc load the percpu variable every time it is
  * accessed while this_cpu_read_stable() allows the value to be cached.
  * this_cpu_read_stable() is more efficient and can be used if its value
  * is guaranteed to be valid across cpus.  The current users include
@@ -359,15 +359,7 @@ do {									\
  * per-thread variables implemented as per-cpu variables and thus
  * stable for the duration of the respective task.
  */
-#define percpu_read(var)		percpu_from_op("mov", var, "m" (var))
 #define this_cpu_read_stable(var)	percpu_from_op("mov", var, "p" (&(var)))
-#define percpu_write(var, val)		percpu_to_op("mov", var, val)
-#define percpu_add(var, val)		percpu_add_op(var, val)
-#define percpu_sub(var, val)		percpu_add_op(var, -(val))
-#define percpu_and(var, val)		percpu_to_op("and", var, val)
-#define percpu_or(var, val)		percpu_to_op("or", var, val)
-#define percpu_xor(var, val)		percpu_to_op("xor", var, val)
-#define percpu_inc(var)		percpu_unary_op("inc", var)
 
 #define __this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define __this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
@@ -512,7 +504,11 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
 {
 	unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
 
-	return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+#ifdef CONFIG_X86_64
+	return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0;
+#else
+	return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0;
+#endif
 }
 
 static inline int x86_this_cpu_variable_test_bit(int nr,
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 21638ae14e07..2b9f82c037c9 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -165,60 +165,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 #define alloc_percpu(type)	\
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type))
 
-/*
- * Optional methods for optimized non-lvalue per-cpu variable access.
- *
- * @var can be a percpu variable or a field of it and its size should
- * equal char, int or long.  percpu_read() evaluates to a lvalue and
- * all others to void.
- *
- * These operations are guaranteed to be atomic.
- * The generic versions disable interrupts.  Archs are
- * encouraged to implement single-instruction alternatives which don't
- * require protection.
- */
-#ifndef percpu_read
-# define percpu_read(var)						\
-  ({									\
-	typeof(var) *pr_ptr__ = &(var);					\
-	typeof(var) pr_ret__;						\
-	pr_ret__ = get_cpu_var(*pr_ptr__);				\
-	put_cpu_var(*pr_ptr__);						\
-	pr_ret__;							\
-  })
-#endif
-
-#define __percpu_generic_to_op(var, val, op)				\
-do {									\
-	typeof(var) *pgto_ptr__ = &(var);				\
-	get_cpu_var(*pgto_ptr__) op val;				\
-	put_cpu_var(*pgto_ptr__);					\
-} while (0)
-
-#ifndef percpu_write
-# define percpu_write(var, val)		__percpu_generic_to_op(var, (val), =)
-#endif
-
-#ifndef percpu_add
-# define percpu_add(var, val)		__percpu_generic_to_op(var, (val), +=)
-#endif
-
-#ifndef percpu_sub
-# define percpu_sub(var, val)		__percpu_generic_to_op(var, (val), -=)
-#endif
-
-#ifndef percpu_and
-# define percpu_and(var, val)		__percpu_generic_to_op(var, (val), &=)
-#endif
-
-#ifndef percpu_or
-# define percpu_or(var, val)		__percpu_generic_to_op(var, (val), |=)
-#endif
-
-#ifndef percpu_xor
-# define percpu_xor(var, val)		__percpu_generic_to_op(var, (val), ^=)
-#endif
-
 /*
  * Branching function to split up a function into a set of functions that
  * are called for different scalar sizes of the objects handled.
-- 
cgit v1.2.3


From fa46ccb8eb960c62c1e5e3237085d4007788a345 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 11 May 2012 16:00:48 +0300
Subject: crypto: aesni-intel - use crypto_[un]register_algs

Combine all crypto_alg to be registered and use new crypto_[un]register_algs
functions. Simplifies init/exit code and reduce object size.

Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 727 ++++++++++++++++---------------------
 1 file changed, 305 insertions(+), 422 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index c799352e24fc..20c622016629 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -222,27 +222,6 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 	}
 }
 
-static struct crypto_alg aesni_alg = {
-	.cra_name		= "aes",
-	.cra_driver_name	= "aes-aesni",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
-	.cra_alignmask		= 0,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(aesni_alg.cra_list),
-	.cra_u	= {
-		.cipher	= {
-			.cia_min_keysize	= AES_MIN_KEY_SIZE,
-			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey		= aes_set_key,
-			.cia_encrypt		= aes_encrypt,
-			.cia_decrypt		= aes_decrypt
-		}
-	}
-};
-
 static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
 	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@@ -257,27 +236,6 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 	aesni_dec(ctx, dst, src);
 }
 
-static struct crypto_alg __aesni_alg = {
-	.cra_name		= "__aes-aesni",
-	.cra_driver_name	= "__driver-aes-aesni",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
-	.cra_alignmask		= 0,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(__aesni_alg.cra_list),
-	.cra_u	= {
-		.cipher	= {
-			.cia_min_keysize	= AES_MIN_KEY_SIZE,
-			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey		= aes_set_key,
-			.cia_encrypt		= __aes_encrypt,
-			.cia_decrypt		= __aes_decrypt
-		}
-	}
-};
-
 static int ecb_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -326,28 +284,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
 	return err;
 }
 
-static struct crypto_alg blk_ecb_alg = {
-	.cra_name		= "__ecb-aes-aesni",
-	.cra_driver_name	= "__driver-ecb-aes-aesni",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.setkey		= aes_set_key,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-};
-
 static int cbc_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -396,28 +332,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
 	return err;
 }
 
-static struct crypto_alg blk_cbc_alg = {
-	.cra_name		= "__cbc-aes-aesni",
-	.cra_driver_name	= "__driver-cbc-aes-aesni",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.setkey		= aes_set_key,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-};
-
 #ifdef CONFIG_X86_64
 static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
 			    struct blkcipher_walk *walk)
@@ -461,29 +375,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
 
 	return err;
 }
-
-static struct crypto_alg blk_ctr_alg = {
-	.cra_name		= "__ctr-aes-aesni",
-	.cra_driver_name	= "__driver-ctr-aes-aesni",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= aes_set_key,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-};
 #endif
 
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
@@ -572,30 +463,6 @@ static int ablk_ecb_init(struct crypto_tfm *tfm)
 	return 0;
 }
 
-static struct crypto_alg ablk_ecb_alg = {
-	.cra_name		= "ecb(aes)",
-	.cra_driver_name	= "ecb-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
-	.cra_init		= ablk_ecb_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-};
-
 static int ablk_cbc_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -607,31 +474,6 @@ static int ablk_cbc_init(struct crypto_tfm *tfm)
 	return 0;
 }
 
-static struct crypto_alg ablk_cbc_alg = {
-	.cra_name		= "cbc(aes)",
-	.cra_driver_name	= "cbc-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
-	.cra_init		= ablk_cbc_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-};
-
 #ifdef CONFIG_X86_64
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
@@ -644,32 +486,6 @@ static int ablk_ctr_init(struct crypto_tfm *tfm)
 	return 0;
 }
 
-static struct crypto_alg ablk_ctr_alg = {
-	.cra_name		= "ctr(aes)",
-	.cra_driver_name	= "ctr-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
-	.cra_init		= ablk_ctr_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_encrypt,
-			.geniv		= "chainiv",
-		},
-	},
-};
-
 #ifdef HAS_CTR
 static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
 {
@@ -682,32 +498,6 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
 	ablk_init_common(tfm, cryptd_tfm);
 	return 0;
 }
-
-static struct crypto_alg ablk_rfc3686_ctr_alg = {
-	.cra_name		= "rfc3686(ctr(aes))",
-	.cra_driver_name	= "rfc3686-ctr-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
-	.cra_init		= ablk_rfc3686_ctr_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
-			.max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
-			.ivsize	     = CTR_RFC3686_IV_SIZE,
-			.setkey	     = ablk_set_key,
-			.encrypt     = ablk_encrypt,
-			.decrypt     = ablk_decrypt,
-			.geniv	     = "seqiv",
-		},
-	},
-};
 #endif
 #endif
 
@@ -723,31 +513,6 @@ static int ablk_lrw_init(struct crypto_tfm *tfm)
 	ablk_init_common(tfm, cryptd_tfm);
 	return 0;
 }
-
-static struct crypto_alg ablk_lrw_alg = {
-	.cra_name		= "lrw(aes)",
-	.cra_driver_name	= "lrw-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
-	.cra_init		= ablk_lrw_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-};
 #endif
 
 #ifdef HAS_PCBC
@@ -762,31 +527,6 @@ static int ablk_pcbc_init(struct crypto_tfm *tfm)
 	ablk_init_common(tfm, cryptd_tfm);
 	return 0;
 }
-
-static struct crypto_alg ablk_pcbc_alg = {
-	.cra_name		= "pcbc(aes)",
-	.cra_driver_name	= "pcbc-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
-	.cra_init		= ablk_pcbc_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= AES_MIN_KEY_SIZE,
-			.max_keysize	= AES_MAX_KEY_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-};
 #endif
 
 #ifdef HAS_XTS
@@ -801,31 +541,6 @@ static int ablk_xts_init(struct crypto_tfm *tfm)
 	ablk_init_common(tfm, cryptd_tfm);
 	return 0;
 }
-
-static struct crypto_alg ablk_xts_alg = {
-	.cra_name		= "xts(aes)",
-	.cra_driver_name	= "xts-aes-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
-	.cra_init		= ablk_xts_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= 2 * AES_MIN_KEY_SIZE,
-			.max_keysize	= 2 * AES_MAX_KEY_SIZE,
-			.ivsize		= AES_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-};
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1050,32 +765,6 @@ static int rfc4106_decrypt(struct aead_request *req)
 	}
 }
 
-static struct crypto_alg rfc4106_alg = {
-	.cra_name = "rfc4106(gcm(aes))",
-	.cra_driver_name = "rfc4106-gcm-aesni",
-	.cra_priority = 400,
-	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
-	.cra_blocksize = 1,
-	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
-	.cra_alignmask = 0,
-	.cra_type = &crypto_nivaead_type,
-	.cra_module = THIS_MODULE,
-	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
-	.cra_init = rfc4106_init,
-	.cra_exit = rfc4106_exit,
-	.cra_u = {
-		.aead = {
-			.setkey = rfc4106_set_key,
-			.setauthsize = rfc4106_set_authsize,
-			.encrypt = rfc4106_encrypt,
-			.decrypt = rfc4106_decrypt,
-			.geniv = "seqiv",
-			.ivsize = 8,
-			.maxauthsize = 16,
-		},
-	},
-};
-
 static int __driver_rfc4106_encrypt(struct aead_request *req)
 {
 	u8 one_entry_in_sg = 0;
@@ -1233,26 +922,316 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	}
 	return retval;
 }
+#endif
 
-static struct crypto_alg __rfc4106_alg = {
-	.cra_name		= "__gcm-aes-aesni",
-	.cra_driver_name	= "__driver-gcm-aes-aesni",
+static struct crypto_alg aesni_algs[] = { {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-aesni",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
+				  AESNI_ALIGN - 1,
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= aes_encrypt,
+			.cia_decrypt		= aes_decrypt
+		}
+	}
+}, {
+	.cra_name		= "__aes-aesni",
+	.cra_driver_name	= "__driver-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
+				  AESNI_ALIGN - 1,
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= __aes_encrypt,
+			.cia_decrypt		= __aes_decrypt
+		}
+	}
+}, {
+	.cra_name		= "__ecb-aes-aesni",
+	.cra_driver_name	= "__driver-ecb-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
+				  AESNI_ALIGN - 1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-aes-aesni",
+	.cra_driver_name	= "__driver-cbc-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
+				  AESNI_ALIGN - 1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_ecb_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_cbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+#ifdef CONFIG_X86_64
+}, {
+	.cra_name		= "__ctr-aes-aesni",
+	.cra_driver_name	= "__driver-ctr-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
+				  AESNI_ALIGN - 1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
 	.cra_priority		= 0,
 	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
 	.cra_blocksize		= 1,
-	.cra_ctxsize	= sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
+				  AESNI_ALIGN,
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_aead_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
 	.cra_u = {
 		.aead = {
 			.encrypt	= __driver_rfc4106_encrypt,
 			.decrypt	= __driver_rfc4106_decrypt,
 		},
 	},
-};
+}, {
+	.cra_name		= "rfc4106(gcm(aes))",
+	.cra_driver_name	= "rfc4106-gcm-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
+				  AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_nivaead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= rfc4106_init,
+	.cra_exit		= rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey		= rfc4106_set_key,
+			.setauthsize	= rfc4106_set_authsize,
+			.encrypt	= rfc4106_encrypt,
+			.decrypt	= rfc4106_decrypt,
+			.geniv		= "seqiv",
+			.ivsize		= 8,
+			.maxauthsize	= 16,
+		},
+	},
+#ifdef HAS_CTR
+}, {
+	.cra_name		= "rfc3686(ctr(aes))",
+	.cra_driver_name	= "rfc3686-ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_rfc3686_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE +
+				       CTR_RFC3686_NONCE_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE +
+				       CTR_RFC3686_NONCE_SIZE,
+			.ivsize	     = CTR_RFC3686_IV_SIZE,
+			.setkey	     = ablk_set_key,
+			.encrypt     = ablk_encrypt,
+			.decrypt     = ablk_decrypt,
+			.geniv	     = "seqiv",
+		},
+	},
+#endif
+#endif
+#ifdef HAS_LRW
+}, {
+	.cra_name		= "lrw(aes)",
+	.cra_driver_name	= "lrw-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_lrw_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+#endif
+#ifdef HAS_PCBC
+}, {
+	.cra_name		= "pcbc(aes)",
+	.cra_driver_name	= "pcbc-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_pcbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+#endif
+#ifdef HAS_XTS
+}, {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_xts_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+			.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
 #endif
+} };
 
 
 static const struct x86_cpu_id aesni_cpu_id[] = {
@@ -1263,120 +1242,24 @@ MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
 
 static int __init aesni_init(void)
 {
-	int err;
+	int err, i;
 
 	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
 
-	if ((err = crypto_fpu_init()))
-		goto fpu_err;
-	if ((err = crypto_register_alg(&aesni_alg)))
-		goto aes_err;
-	if ((err = crypto_register_alg(&__aesni_alg)))
-		goto __aes_err;
-	if ((err = crypto_register_alg(&blk_ecb_alg)))
-		goto blk_ecb_err;
-	if ((err = crypto_register_alg(&blk_cbc_alg)))
-		goto blk_cbc_err;
-	if ((err = crypto_register_alg(&ablk_ecb_alg)))
-		goto ablk_ecb_err;
-	if ((err = crypto_register_alg(&ablk_cbc_alg)))
-		goto ablk_cbc_err;
-#ifdef CONFIG_X86_64
-	if ((err = crypto_register_alg(&blk_ctr_alg)))
-		goto blk_ctr_err;
-	if ((err = crypto_register_alg(&ablk_ctr_alg)))
-		goto ablk_ctr_err;
-	if ((err = crypto_register_alg(&__rfc4106_alg)))
-		goto __aead_gcm_err;
-	if ((err = crypto_register_alg(&rfc4106_alg)))
-		goto aead_gcm_err;
-#ifdef HAS_CTR
-	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
-		goto ablk_rfc3686_ctr_err;
-#endif
-#endif
-#ifdef HAS_LRW
-	if ((err = crypto_register_alg(&ablk_lrw_alg)))
-		goto ablk_lrw_err;
-#endif
-#ifdef HAS_PCBC
-	if ((err = crypto_register_alg(&ablk_pcbc_alg)))
-		goto ablk_pcbc_err;
-#endif
-#ifdef HAS_XTS
-	if ((err = crypto_register_alg(&ablk_xts_alg)))
-		goto ablk_xts_err;
-#endif
-	return err;
+	err = crypto_fpu_init();
+	if (err)
+		return err;
 
-#ifdef HAS_XTS
-ablk_xts_err:
-#endif
-#ifdef HAS_PCBC
-	crypto_unregister_alg(&ablk_pcbc_alg);
-ablk_pcbc_err:
-#endif
-#ifdef HAS_LRW
-	crypto_unregister_alg(&ablk_lrw_alg);
-ablk_lrw_err:
-#endif
-#ifdef CONFIG_X86_64
-#ifdef HAS_CTR
-	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
-ablk_rfc3686_ctr_err:
-#endif
-	crypto_unregister_alg(&rfc4106_alg);
-aead_gcm_err:
-	crypto_unregister_alg(&__rfc4106_alg);
-__aead_gcm_err:
-	crypto_unregister_alg(&ablk_ctr_alg);
-ablk_ctr_err:
-	crypto_unregister_alg(&blk_ctr_alg);
-blk_ctr_err:
-#endif
-	crypto_unregister_alg(&ablk_cbc_alg);
-ablk_cbc_err:
-	crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
-	crypto_unregister_alg(&blk_cbc_alg);
-blk_cbc_err:
-	crypto_unregister_alg(&blk_ecb_alg);
-blk_ecb_err:
-	crypto_unregister_alg(&__aesni_alg);
-__aes_err:
-	crypto_unregister_alg(&aesni_alg);
-aes_err:
-fpu_err:
-	return err;
+	for (i = 0; i < ARRAY_SIZE(aesni_algs); i++)
+		INIT_LIST_HEAD(&aesni_algs[i].cra_list);
+
+	return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
 }
 
 static void __exit aesni_exit(void)
 {
-#ifdef HAS_XTS
-	crypto_unregister_alg(&ablk_xts_alg);
-#endif
-#ifdef HAS_PCBC
-	crypto_unregister_alg(&ablk_pcbc_alg);
-#endif
-#ifdef HAS_LRW
-	crypto_unregister_alg(&ablk_lrw_alg);
-#endif
-#ifdef CONFIG_X86_64
-#ifdef HAS_CTR
-	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
-#endif
-	crypto_unregister_alg(&rfc4106_alg);
-	crypto_unregister_alg(&__rfc4106_alg);
-	crypto_unregister_alg(&ablk_ctr_alg);
-	crypto_unregister_alg(&blk_ctr_alg);
-#endif
-	crypto_unregister_alg(&ablk_cbc_alg);
-	crypto_unregister_alg(&ablk_ecb_alg);
-	crypto_unregister_alg(&blk_cbc_alg);
-	crypto_unregister_alg(&blk_ecb_alg);
-	crypto_unregister_alg(&__aesni_alg);
-	crypto_unregister_alg(&aesni_alg);
+	crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
 
 	crypto_fpu_exit();
 }
-- 
cgit v1.2.3


From ef45b834319f8a18f257a40ba4bce6b829ef1708 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 11 May 2012 16:00:54 +0300
Subject: crypto: aesni-intel - move more common code to ablk_init_common

ablk_*_init functions share more common code than what is currently in
ablk_init_common. Move all of the common code to ablk_init_common.

Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 70 ++++++++------------------------------
 1 file changed, 15 insertions(+), 55 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 20c622016629..ac7f5cd019e8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -442,61 +442,42 @@ static void ablk_exit(struct crypto_tfm *tfm)
 	cryptd_free_ablkcipher(ctx->cryptd_tfm);
 }
 
-static void ablk_init_common(struct crypto_tfm *tfm,
-			     struct cryptd_ablkcipher *cryptd_tfm)
+static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
 {
 	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
 
 	ctx->cryptd_tfm = cryptd_tfm;
 	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
 		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+
+	return 0;
 }
 
 static int ablk_ecb_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
 }
 
 static int ablk_cbc_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "__driver-cbc-aes-aesni");
 }
 
 #ifdef CONFIG_X86_64
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "__driver-ctr-aes-aesni");
 }
 
 #ifdef HAS_CTR
 static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher(
-		"rfc3686(__driver-ctr-aes-aesni)", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)");
 }
 #endif
 #endif
@@ -504,42 +485,21 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
 #ifdef HAS_LRW
 static int ablk_lrw_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
-					     0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "fpu(lrw(__driver-aes-aesni))");
 }
 #endif
 
 #ifdef HAS_PCBC
 static int ablk_pcbc_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
-					     0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))");
 }
 #endif
 
 #ifdef HAS_XTS
 static int ablk_xts_init(struct crypto_tfm *tfm)
 {
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
-					     0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
+	return ablk_init_common(tfm, "fpu(xts(__driver-aes-aesni))");
 }
 #endif
 
-- 
cgit v1.2.3


From a7c1938e22c02b008655524c766d185ae99d9d53 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Feb 2012 09:10:30 -0800
Subject: userns: Convert stat to return values mapped from kuids and kgids

- Store uids and gids with kuid_t and kgid_t in struct kstat
- Convert uid and gids to userspace usable values with
  from_kuid and from_kgid

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/arm/kernel/sys_oabi-compat.c |  4 ++--
 arch/parisc/hpux/fs.c             |  4 ++--
 arch/s390/kernel/compat_linux.c   |  4 ++--
 arch/sparc/kernel/sys_sparc32.c   |  4 ++--
 arch/x86/ia32/sys_ia32.c          |  4 ++--
 fs/compat.c                       |  4 ++--
 fs/stat.c                         | 12 ++++++------
 include/linux/stat.h              |  5 +++--
 8 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index af0aaebf4de6..3e94811690ce 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -124,8 +124,8 @@ static long cp_oldabi_stat64(struct kstat *stat,
 	tmp.__st_ino = stat->ino;
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = stat->nlink;
-	tmp.st_uid = stat->uid;
-	tmp.st_gid = stat->gid;
+	tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+	tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
 	tmp.st_rdev = huge_encode_dev(stat->rdev);
 	tmp.st_size = stat->size;
 	tmp.st_blocks = stat->blocks;
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 0dc8543acb4f..c71eb6c79897 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -159,8 +159,8 @@ static int cp_hpux_stat(struct kstat *stat, struct hpux_stat64 __user *statbuf)
 	tmp.st_ino = stat->ino;
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = stat->nlink;
-	tmp.st_uid = stat->uid;
-	tmp.st_gid = stat->gid;
+	tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+	tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
 	tmp.st_rdev = new_encode_dev(stat->rdev);
 	tmp.st_size = stat->size;
 	tmp.st_atime = stat->atime.tv_sec;
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index f0273ed760ef..65426525d9f2 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -547,8 +547,8 @@ static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat)
 	tmp.__st_ino = (u32)stat->ino;
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = (unsigned int)stat->nlink;
-	tmp.st_uid = stat->uid;
-	tmp.st_gid = stat->gid;
+	tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+	tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
 	tmp.st_rdev = huge_encode_dev(stat->rdev);
 	tmp.st_size = stat->size;
 	tmp.st_blksize = (u32)stat->blksize;
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index 29c478ffed91..f7392336961f 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -139,8 +139,8 @@ static int cp_compat_stat64(struct kstat *stat,
 	err |= put_user(stat->ino, &statbuf->st_ino);
 	err |= put_user(stat->mode, &statbuf->st_mode);
 	err |= put_user(stat->nlink, &statbuf->st_nlink);
-	err |= put_user(stat->uid, &statbuf->st_uid);
-	err |= put_user(stat->gid, &statbuf->st_gid);
+	err |= put_user(from_kuid_munged(current_user_ns(), stat->uid), &statbuf->st_uid);
+	err |= put_user(from_kgid_munged(current_user_ns(), stat->gid), &statbuf->st_gid);
 	err |= put_user(huge_encode_dev(stat->rdev), &statbuf->st_rdev);
 	err |= put_user(0, (unsigned long __user *) &statbuf->__pad3[0]);
 	err |= put_user(stat->size, &statbuf->st_size);
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index aec2202a596c..d5c820a54590 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -71,8 +71,8 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 {
 	typeof(ubuf->st_uid) uid = 0;
 	typeof(ubuf->st_gid) gid = 0;
-	SET_UID(uid, stat->uid);
-	SET_GID(gid, stat->gid);
+	SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
 	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
 	    __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
 	    __put_user(stat->ino, &ubuf->__st_ino) ||
diff --git a/fs/compat.c b/fs/compat.c
index f2944ace7a7b..0781e619a62a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	tmp.st_nlink = stat->nlink;
 	if (tmp.st_nlink != stat->nlink)
 		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, stat->uid);
-	SET_GID(tmp.st_gid, stat->gid);
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
 	tmp.st_rdev = old_encode_dev(stat->rdev);
 	if ((u64) stat->size > MAX_NON_LFS)
 		return -EOVERFLOW;
diff --git a/fs/stat.c b/fs/stat.c
index c733dc5753ae..31acca5f5a0c 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -137,8 +137,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	tmp.st_nlink = stat->nlink;
 	if (tmp.st_nlink != stat->nlink)
 		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, stat->uid);
-	SET_GID(tmp.st_gid, stat->gid);
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
 	tmp.st_rdev = old_encode_dev(stat->rdev);
 #if BITS_PER_LONG == 32
 	if (stat->size > MAX_NON_LFS)
@@ -215,8 +215,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	tmp.st_nlink = stat->nlink;
 	if (tmp.st_nlink != stat->nlink)
 		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, stat->uid);
-	SET_GID(tmp.st_gid, stat->gid);
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
 #if BITS_PER_LONG == 32
 	tmp.st_rdev = old_encode_dev(stat->rdev);
 #else
@@ -350,8 +350,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 #endif
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = stat->nlink;
-	tmp.st_uid = stat->uid;
-	tmp.st_gid = stat->gid;
+	tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+	tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
 	tmp.st_atime = stat->atime.tv_sec;
 	tmp.st_atime_nsec = stat->atime.tv_nsec;
 	tmp.st_mtime = stat->mtime.tv_sec;
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 611c398dab72..46132409a3f7 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -58,14 +58,15 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
+#include <linux/uidgid.h>
 
 struct kstat {
 	u64		ino;
 	dev_t		dev;
 	umode_t		mode;
 	unsigned int	nlink;
-	uid_t		uid;
-	gid_t		gid;
+	kuid_t		uid;
+	kgid_t		gid;
 	dev_t		rdev;
 	loff_t		size;
 	struct timespec  atime;
-- 
cgit v1.2.3


From 5abe68e493e5aea1ccfc384092f8e98a542b336a Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkhan@gmail.com>
Date: Sun, 6 May 2012 11:55:08 -0600
Subject: x86: kernel/check.c simple_strtoul cleanup

Change set_corruption_check() and set_corruption_check_period()
in kernel/check.c to call kstrtoul() instead of calling
obsoleted simple_strtoul().

Signed-off-by: Shuah Khan <shuahkhan@gmail.com>
Link: http://lkml.kernel.org/r/1336326908.2897.12.camel@lorien2
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/check.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8ddc..e2dbcb7dabdd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
 
 static __init int set_corruption_check(char *arg)
 {
-	char *end;
+	ssize_t ret;
+	unsigned long val;
 
-	memory_corruption_check = simple_strtol(arg, &end, 10);
+	ret = kstrtoul(arg, 10, &val);
+	if (ret)
+		return ret;
 
-	return (*end == 0) ? 0 : -EINVAL;
+	memory_corruption_check = val;
+	return 0;
 }
 early_param("memory_corruption_check", set_corruption_check);
 
 static __init int set_corruption_check_period(char *arg)
 {
-	char *end;
+	ssize_t ret;
+	unsigned long val;
 
-	corruption_check_period = simple_strtoul(arg, &end, 10);
+	ret = kstrtoul(arg, 10, &val);
+	if (ret)
+		return ret;
 
-	return (*end == 0) ? 0 : -EINVAL;
+	corruption_check_period = val;
+	return 0;
 }
 early_param("memory_corruption_check_period", set_corruption_check_period);
 
-- 
cgit v1.2.3


From 363f7ce3250aafdaab43011c7dc40158ea571e6b Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkhan@gmail.com>
Date: Sun, 6 May 2012 11:58:04 -0600
Subject: x86: kernel/dumpstack.c simple_strtoul cleanup

Change kstack_setup() and code_bytes_setup() in kernel/dumpstack.c
to call kstrtoul() instead of calling obsoleted simple_strtoul().

Signed-off-by: Shuah Khan <shuahkhan@gmail.com>
Link: http://lkml.kernel.org/r/1336327084.2897.15.camel@lorien2
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/dumpstack.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1b81839b6c88..b154f6d99058 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)
 
 static int __init kstack_setup(char *s)
 {
+	ssize_t ret;
+	unsigned long val;
+
 	if (!s)
 		return -EINVAL;
-	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+	ret = kstrtoul(s, 0, &val);
+	if (ret)
+		return ret;
+	kstack_depth_to_print = val;
 	return 0;
 }
 early_param("kstack", kstack_setup);
 
 static int __init code_bytes_setup(char *s)
 {
-	code_bytes = simple_strtoul(s, NULL, 0);
+	ssize_t ret;
+	unsigned long val;
+
+	if (!s)
+		return -EINVAL;
+
+	ret = kstrtoul(s, 0, &val);
+	if (ret)
+		return ret;
+
+	code_bytes = val;
 	if (code_bytes > 8192)
 		code_bytes = 8192;
 
-- 
cgit v1.2.3


From 867aae6ebe593db73fb8a676475ee20227292cfe Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 15 May 2012 17:01:09 -0600
Subject: x86/PCI: only check for spinlock being held in SMP kernels

spin_is_locked() is always false on UP kernels: spin_lock_irqsave() does no
locking, so we can't tell whether the lock is held or not.  Therefore,
this warning is only valid for SMP kernels.

CC: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/i386.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 831971e731f7..dd8ca6f7223b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -57,7 +57,7 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
 {
 	struct pcibios_fwaddrmap *map;
 
-	WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock));
+	WARN_ON_SMP(!spin_is_locked(&pcibios_fwaddrmap_lock));
 
 	list_for_each_entry(map, &pcibios_fwaddrmappings, list)
 		if (map->dev == dev)
-- 
cgit v1.2.3


From 6cf20beec4b91c240cf759b4db72669e251f1fc4 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 14 May 2012 17:00:40 +0100
Subject: x86/vga: set the default device from the fixup.

Since Matthew's efi/vga changes on non-EFI machines we were failing
to tell the vgaarb/switcheroo what the default device was, this
sets the default device in the quirk if none has been set before.

This fixes the switcheroo on my T410s.

Cc: Matthew Garrett <mjg@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/pci/fixup.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index d0e6e403b4f6..01635537d72e 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -7,6 +7,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <asm/pci_x86.h>
+#include <asm/vga.h>
 
 static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 {
@@ -348,6 +349,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
 	if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
 		pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
 		dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
+		if (!vga_default_device())
+			vga_set_default_device(pdev);
 	}
 }
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
-- 
cgit v1.2.3


From 512d5649e8dc3ed36f2ebf0818da64a4d4c2544a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 13 May 2012 19:53:23 +0300
Subject: KVM: VMX: Fix %ds/%es clobber

The vmx exit code unconditionally restores %ds and %es to __USER_DS.  This
can override the user's values, since %ds and %es are not saved and restored
in x86_64 syscalls.  In practice, this isn't dangerous since nobody uses
segment registers in long mode, least of all programs that use KVM.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3062ea95266e..f2ee016e1004 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6102,7 +6102,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u16 _ds, _es;
 
+	savesegment(ds, _ds);
+	savesegment(es, _es);
 	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 		if (vmcs12->idt_vectoring_info_field &
@@ -6263,7 +6266,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+	loadsegment(ds, _ds);
+	loadsegment(es, _es);
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-- 
cgit v1.2.3


From b2da15ac26a0c00fc0d399a2bc5cf3c4e15f0b4f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 13 May 2012 19:53:24 +0300
Subject: KVM: VMX: Optimize %ds, %es reload

On x86_64, we can defer %ds and %es reload to the heavyweight context switch,
since nothing in the lightweight paths uses the host %ds or %es (they are
ignored by the processor).  Furthermore we can avoid the load if the segments
are null, by letting the hardware load the null segments for us.  This is the
expected case.

On i386, we could avoid the reload entirely, since the entry.S paths take care
of reload, except for the SYSEXIT path which leaves %ds and %es set to __USER_DS.
So we set them to the same values as well.

Saves about 70 cycles out of 1600 (around 4%; noisy measurements).

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f2ee016e1004..32eb58866292 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -393,6 +393,9 @@ struct vcpu_vmx {
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+		u16           ds_sel, es_sel;
+#endif
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
 	} host_state;
@@ -1417,6 +1420,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 		vmx->host_state.gs_ldt_reload_needed = 1;
 	}
 
+#ifdef CONFIG_X86_64
+	savesegment(ds, vmx->host_state.ds_sel);
+	savesegment(es, vmx->host_state.es_sel);
+#endif
+
 #ifdef CONFIG_X86_64
 	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
 	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
@@ -1457,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	}
 	if (vmx->host_state.fs_reload_needed)
 		loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
+		loadsegment(ds, vmx->host_state.ds_sel);
+		loadsegment(es, vmx->host_state.es_sel);
+	}
+#else
+	/*
+	 * The sysexit path does not restore ds/es, so we must set them to
+	 * a reasonable value ourselves.
+	 */
+	loadsegment(ds, __USER_DS);
+	loadsegment(es, __USER_DS);
+#endif
 	reload_tss();
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -3640,8 +3661,18 @@ static void vmx_set_constant_host_state(void)
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+#ifdef CONFIG_X86_64
+	/*
+	 * Load null selectors, so we can avoid reloading them in
+	 * __vmx_load_host_state(), in case userspace uses the null selectors
+	 * too (the expected case).
+	 */
+	vmcs_write16(HOST_DS_SELECTOR, 0);
+	vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
 	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#endif
 	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
@@ -6102,10 +6133,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u16 _ds, _es;
 
-	savesegment(ds, _ds);
-	savesegment(es, _es);
 	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 		if (vmcs12->idt_vectoring_info_field &
@@ -6266,8 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	loadsegment(ds, _ds);
-	loadsegment(es, _es);
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-- 
cgit v1.2.3


From c142786c6291189b5c85f53d91743e1eefbd8fe0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 14 May 2012 15:44:06 +0300
Subject: KVM: MMU: Don't use RCU for lockless shadow walking

Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts during lockless shadow walk to force the freer
(kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
processor with interrupts enabled.

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
kvm_flush_remote_tlbs() from avoiding the IPI.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 ---
 arch/x86/kvm/mmu.c              | 73 ++++++++++++++++-------------------------
 include/linux/kvm_host.h        |  3 +-
 3 files changed, 31 insertions(+), 49 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e39bc7e36f..64c8989263f6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -240,8 +240,6 @@ struct kvm_mmu_page {
 #endif
 
 	int write_flooding_count;
-
-	struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -540,8 +538,6 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
-	atomic_t reader_counter;
-
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf60434..72102e0ab7cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-	rcu_read_lock();
-	atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-	/* Increase the counter before walking shadow page table */
-	smp_mb__after_atomic_inc();
+	/*
+	 * Prevent page table teardown by making any free-er wait during
+	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
+	 */
+	local_irq_disable();
+	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+	/*
+	 * Make sure a following spte read is not reordered ahead of the write
+	 * to vcpu->mode.
+	 */
+	smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-	/* Decrease the counter after walking shadow page table finished */
-	smp_mb__before_atomic_dec();
-	atomic_dec(&vcpu->kvm->arch.reader_counter);
-	rcu_read_unlock();
+	/*
+	 * Make sure the write to vcpu->mode is not reordered in front of
+	 * reads to sptes.  If it does, kvm_commit_zap_page() can see us
+	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+	 */
+	smp_mb();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *sp;
-
-	list_for_each_entry(sp, invalid_list, link)
-		kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
-	struct kvm_mmu_page *next, *sp;
-
-	sp = container_of(head, struct kvm_mmu_page, rcu);
-	while (sp) {
-		if (!list_empty(&sp->link))
-			next = list_first_entry(&sp->link,
-				      struct kvm_mmu_page, link);
-		else
-			next = NULL;
-		kvm_mmu_free_page(sp);
-		sp = next;
-	}
-}
-
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	if (list_empty(invalid_list))
 		return;
 
-	kvm_flush_remote_tlbs(kvm);
-
-	if (atomic_read(&kvm->arch.reader_counter)) {
-		kvm_mmu_isolate_pages(invalid_list);
-		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
-		list_del_init(invalid_list);
+	/*
+	 * wmb: make sure everyone sees our modifications to the page tables
+	 * rmb: make sure we see changes to vcpu->mode
+	 */
+	smp_mb();
 
-		trace_kvm_mmu_delay_free_pages(sp);
-		call_rcu(&sp->rcu, free_pages_rcu);
-		return;
-	}
+	/*
+	 * Wait for all vcpus to exit guest mode and/or lockless shadow
+	 * page table walks.
+	 */
+	kvm_flush_remote_tlbs(kvm);
 
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
-
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cae342d29d1b..c4464356b35b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
-	EXITING_GUEST_MODE
+	EXITING_GUEST_MODE,
+	READING_SHADOW_PAGE_TABLES,
 };
 
 /*
-- 
cgit v1.2.3


From 796038799a72adb279d785c9154df6eeb98b6e8d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 16 May 2012 13:22:41 -0700
Subject: x86, realmode: Mask out EFER.LMA when saving trampoline EFER

Some AMD processors apparently #GP(0) if EFER.LMA is set in WRMSR,
rather than ignoring it.  Thus, we need to mask it out.

Reported-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Borislav Petkov <bp@alien8.de>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com
---
 arch/x86/kernel/realmode.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
index 66ac276cf361..099277984b80 100644
--- a/arch/x86/kernel/realmode.c
+++ b/arch/x86/kernel/realmode.c
@@ -22,6 +22,7 @@ void __init setup_real_mode(void)
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 #ifdef CONFIG_X86_64
 	u64 *trampoline_pgd;
+	u32 efer_low, efer_high;
 #endif
 
 	/* Has to be in very low memory so we can execute real-mode AP code. */
@@ -65,9 +66,13 @@ void __init setup_real_mode(void)
 	trampoline_header->gdt_limit = __BOOT_DS + 7;
 	trampoline_header->gdt_base = __pa(boot_gdt);
 #else
-	if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low,
-		       &trampoline_header->efer_high))
-		BUG();
+	/*
+	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
+	 * so we need to mask it out.
+	 */
+	rdmsr(MSR_EFER, efer_low, efer_high);
+	trampoline_header->efer_low  = efer_low & ~EFER_LMA;
+	trampoline_header->efer_high = efer_high;
 
 	trampoline_header->start = (u64) secondary_startup_64;
 	trampoline_cr4_features = &trampoline_header->cr4;
-- 
cgit v1.2.3


From 51edbe6a2f47c78c6c6e529999ee0a044fe59a89 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 16 May 2012 13:44:10 -0700
Subject: x86, realmode: Move not-common bits out of trampoline_common.S

Move the bits that aren't actually common out of trampoline_common.S
and into the arch-specific files.  Furthermore, make sure the page
directory is first in the .bss section for trampoline_64.S in order to
not waste an entire page of memory.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
---
 arch/x86/realmode/rm/trampoline_32.S     |  8 ++++++++
 arch/x86/realmode/rm/trampoline_64.S     | 25 +++++++++++++++++++++++
 arch/x86/realmode/rm/trampoline_common.S | 35 --------------------------------
 3 files changed, 33 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
index 6fc064b4d2b9..c1b2791183e7 100644
--- a/arch/x86/realmode/rm/trampoline_32.S
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -63,4 +63,12 @@ ENTRY(trampoline_start)
 ENTRY(startup_32)			# note: also used from wakeup_asm.S
 	jmp	*%eax
 
+	.bss
+	.balign 8
+GLOBAL(trampoline_header)
+	tr_start:		.space	4
+	tr_gdt_pad:		.space	2
+	tr_gdt:			.space	6
+END(trampoline_header)
+	
 #include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 66e26f088288..1b9e1bc1ac5e 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -125,4 +125,29 @@ ENTRY(startup_64)
 	# Now jump into the kernel using virtual addresses
 	jmpq	*tr_start(%rip)
 
+	.section ".rodata","a"
+	# Duplicate the global descriptor table
+	# so the kernel can live anywhere
+	.balign	16
+	.globl tr_gdt
+tr_gdt:
+	.short	tr_gdt_end - tr_gdt - 1	# gdt limit
+	.long	pa_tr_gdt
+	.short	0
+	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
+	.quad	0x00af9b000000ffff	# __KERNEL_CS
+	.quad	0x00cf93000000ffff	# __KERNEL_DS
+tr_gdt_end:
+
+	.bss
+	.balign	PAGE_SIZE
+GLOBAL(trampoline_pgd)		.space	PAGE_SIZE
+
+	.balign	8
+GLOBAL(trampoline_header)
+	tr_start:		.space	8
+	GLOBAL(tr_cr4)		.space	4
+	GLOBAL(tr_efer)		.space	8
+END(trampoline_header)
+
 #include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
index cac444b942f8..b1ecdb9692ad 100644
--- a/arch/x86/realmode/rm/trampoline_common.S
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -1,42 +1,7 @@
 	.section ".rodata","a"
-
-#ifdef CONFIG_X86_64
-	# Duplicate the global descriptor table
-	# so the kernel can live anywhere
 	.balign	16
-	.globl tr_gdt
-tr_gdt:
-	.short	tr_gdt_end - tr_gdt - 1	# gdt limit
-	.long	pa_tr_gdt
-	.short	0
-	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
-	.quad	0x00af9b000000ffff	# __KERNEL_CS
-	.quad	0x00cf93000000ffff	# __KERNEL_DS
-tr_gdt_end:
-#endif
-
-	.balign	4
 tr_idt: .fill 1, 6, 0
 
 	.bss
-
 	.balign	4
 GLOBAL(trampoline_status)	.space	4
-
-	.balign	8
-GLOBAL(trampoline_header)
-#ifdef CONFIG_X86_32
-	tr_start:		.space	4
-	tr_gdt_pad:		.space	2
-	tr_gdt:			.space	6
-#else
-	tr_start:		.space	8
-	GLOBAL(tr_cr4)		.space	4
-	GLOBAL(tr_efer)		.space	8
-#endif
-END(trampoline_header)
-
-#ifdef CONFIG_X86_64
-	.balign	PAGE_SIZE
-GLOBAL(trampoline_pgd)		.space	PAGE_SIZE
-#endif
-- 
cgit v1.2.3


From 137127018812ec7fcccb9843156cfc0b5cfa31d5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 16 May 2012 13:49:10 -0700
Subject: x86, realmode: Move kernel/realmode.c to realmode/init.c

Keep all the realmode code together, including initialization (only
the rm/ subdirectory is actually built as real-mode code, anyway.)

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
---
 arch/x86/kernel/Makefile   |   1 -
 arch/x86/kernel/realmode.c | 116 ---------------------------------------------
 arch/x86/realmode/Makefile |   1 +
 arch/x86/realmode/init.c   | 116 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+), 117 deletions(-)
 delete mode 100644 arch/x86/kernel/realmode.c
 create mode 100644 arch/x86/realmode/init.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4a20f4441ffe..08484332f329 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
-obj-y				+= realmode.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c
deleted file mode 100644
index 099277984b80..000000000000
--- a/arch/x86/kernel/realmode.c
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-#include <asm/realmode.h>
-
-struct real_mode_header *real_mode_header;
-u32 *trampoline_cr4_features;
-
-void __init setup_real_mode(void)
-{
-	phys_addr_t mem;
-	u16 real_mode_seg;
-	u32 *rel;
-	u32 count;
-	u32 *ptr;
-	u16 *seg;
-	int i;
-	unsigned char *base;
-	struct trampoline_header *trampoline_header;
-	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
-#ifdef CONFIG_X86_64
-	u64 *trampoline_pgd;
-	u32 efer_low, efer_high;
-#endif
-
-	/* Has to be in very low memory so we can execute real-mode AP code. */
-	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
-
-	base = __va(mem);
-	memblock_reserve(mem, size);
-	real_mode_header = (struct real_mode_header *) base;
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       base, (unsigned long long)mem, size);
-
-	memcpy(base, real_mode_blob, size);
-
-	real_mode_seg = __pa(base) >> 4;
-	rel = (u32 *) real_mode_relocs;
-
-	/* 16-bit segment relocations. */
-	count = rel[0];
-	rel = &rel[1];
-	for (i = 0; i < count; i++) {
-		seg = (u16 *) (base + rel[i]);
-		*seg = real_mode_seg;
-	}
-
-	/* 32-bit linear relocations. */
-	count = rel[i];
-	rel =  &rel[i + 1];
-	for (i = 0; i < count; i++) {
-		ptr = (u32 *) (base + rel[i]);
-		*ptr += __pa(base);
-	}
-
-	/* Must be perfomed *after* relocation. */
-	trampoline_header = (struct trampoline_header *)
-		__va(real_mode_header->trampoline_header);
-
-#ifdef CONFIG_X86_32
-	trampoline_header->start = __pa(startup_32_smp);
-	trampoline_header->gdt_limit = __BOOT_DS + 7;
-	trampoline_header->gdt_base = __pa(boot_gdt);
-#else
-	/*
-	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
-	 * so we need to mask it out.
-	 */
-	rdmsr(MSR_EFER, efer_low, efer_high);
-	trampoline_header->efer_low  = efer_low & ~EFER_LMA;
-	trampoline_header->efer_high = efer_high;
-
-	trampoline_header->start = (u64) secondary_startup_64;
-	trampoline_cr4_features = &trampoline_header->cr4;
-	*trampoline_cr4_features = read_cr4();
-
-	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
-	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
-	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
-#endif
-}
-
-/*
- * set_real_mode_permissions() gets called very early, to guarantee the
- * availability of low memory.  This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function.  Thus, we use an arch_initcall instead.
- */
-static int __init set_real_mode_permissions(void)
-{
-	unsigned char *base = (unsigned char *) real_mode_header;
-	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
-
-	size_t ro_size =
-		PAGE_ALIGN(real_mode_header->ro_end) -
-		__pa(base);
-
-	size_t text_size =
-		PAGE_ALIGN(real_mode_header->ro_end) -
-		real_mode_header->text_start;
-
-	unsigned long text_start =
-		(unsigned long) __va(real_mode_header->text_start);
-
-	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
-	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
-	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
-
-	return 0;
-}
-
-arch_initcall(set_real_mode_permissions);
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index a05b3aca64ad..94f7fbe97b08 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -9,6 +9,7 @@
 
 subdir- := rm
 
+obj-y += init.o
 obj-y += rmpiggy.o
 
 $(obj)/rmpiggy.o: $(obj)/rm/realmode.bin
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
new file mode 100644
index 000000000000..099277984b80
--- /dev/null
+++ b/arch/x86/realmode/init.c
@@ -0,0 +1,116 @@
+#include <linux/io.h>
+#include <linux/memblock.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/realmode.h>
+
+struct real_mode_header *real_mode_header;
+u32 *trampoline_cr4_features;
+
+void __init setup_real_mode(void)
+{
+	phys_addr_t mem;
+	u16 real_mode_seg;
+	u32 *rel;
+	u32 count;
+	u32 *ptr;
+	u16 *seg;
+	int i;
+	unsigned char *base;
+	struct trampoline_header *trampoline_header;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+#ifdef CONFIG_X86_64
+	u64 *trampoline_pgd;
+	u32 efer_low, efer_high;
+#endif
+
+	/* Has to be in very low memory so we can execute real-mode AP code. */
+	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+	if (!mem)
+		panic("Cannot allocate trampoline\n");
+
+	base = __va(mem);
+	memblock_reserve(mem, size);
+	real_mode_header = (struct real_mode_header *) base;
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       base, (unsigned long long)mem, size);
+
+	memcpy(base, real_mode_blob, size);
+
+	real_mode_seg = __pa(base) >> 4;
+	rel = (u32 *) real_mode_relocs;
+
+	/* 16-bit segment relocations. */
+	count = rel[0];
+	rel = &rel[1];
+	for (i = 0; i < count; i++) {
+		seg = (u16 *) (base + rel[i]);
+		*seg = real_mode_seg;
+	}
+
+	/* 32-bit linear relocations. */
+	count = rel[i];
+	rel =  &rel[i + 1];
+	for (i = 0; i < count; i++) {
+		ptr = (u32 *) (base + rel[i]);
+		*ptr += __pa(base);
+	}
+
+	/* Must be perfomed *after* relocation. */
+	trampoline_header = (struct trampoline_header *)
+		__va(real_mode_header->trampoline_header);
+
+#ifdef CONFIG_X86_32
+	trampoline_header->start = __pa(startup_32_smp);
+	trampoline_header->gdt_limit = __BOOT_DS + 7;
+	trampoline_header->gdt_base = __pa(boot_gdt);
+#else
+	/*
+	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
+	 * so we need to mask it out.
+	 */
+	rdmsr(MSR_EFER, efer_low, efer_high);
+	trampoline_header->efer_low  = efer_low & ~EFER_LMA;
+	trampoline_header->efer_high = efer_high;
+
+	trampoline_header->start = (u64) secondary_startup_64;
+	trampoline_cr4_features = &trampoline_header->cr4;
+	*trampoline_cr4_features = read_cr4();
+
+	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
+	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+#endif
+}
+
+/*
+ * set_real_mode_permissions() gets called very early, to guarantee the
+ * availability of low memory.  This is before the proper kernel page
+ * tables are set up, so we cannot set page permissions in that
+ * function.  Thus, we use an arch_initcall instead.
+ */
+static int __init set_real_mode_permissions(void)
+{
+	unsigned char *base = (unsigned char *) real_mode_header;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+	size_t ro_size =
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		__pa(base);
+
+	size_t text_size =
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		real_mode_header->text_start;
+
+	unsigned long text_start =
+		(unsigned long) __va(real_mode_header->text_start);
+
+	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
+	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
+	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+
+	return 0;
+}
+
+arch_initcall(set_real_mode_permissions);
-- 
cgit v1.2.3


From ab7b64e9ee1e930ffe9d7f5b5eebe618a3b3a03b Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Wed, 16 May 2012 13:43:26 -0400
Subject: x86: Don't continue booting if we can't load the specified initrd

If we've determined we can't do what the user asked, trying to do
something else isn't going to make the user's life better.

Without this the screen scrolls a bit and then you get a panic
anyway, and it's nice not to have so much scroll after the real
problem in bug reports.

Link: http://lkml.kernel.org/r/1337190206-12121-1-git-send-email-pjones@redhat.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/setup.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..37ef1169ffde 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -393,10 +393,9 @@ static void __init reserve_initrd(void)
 	initrd_start = 0;
 
 	if (ramdisk_size >= (end_of_lowmem>>1)) {
-		memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
-		printk(KERN_ERR "initrd too large to handle, "
-		       "disabling initrd\n");
-		return;
+		panic("initrd too large to handle, "
+		       "disabling initrd (%lld needed, %lld available)\n",
+		       ramdisk_size, end_of_lowmem>>1);
 	}
 
 	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
-- 
cgit v1.2.3


From 638d957b51c88852de72f15f7cd588d125e97dab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 16 May 2012 14:02:05 -0700
Subject: x86, realmode: Change EFER to a single u64 field

Change EFER to be a single u64 field instead of two u32 fields; change
the order to maintain alignment.  Note that on x86-64 cr4 is really
also a 64-bit quantity, although we can only set the low 32 bits from
the trampoline code since it is still executing in 32-bit mode at that
point.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
---
 arch/x86/include/asm/realmode.h      | 3 +--
 arch/x86/realmode/init.c             | 7 +++----
 arch/x86/realmode/rm/trampoline_64.S | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 937dc6071d76..fce3f4ae5bd6 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -35,9 +35,8 @@ struct trampoline_header {
 	u32 gdt_base;
 #else
 	u64 start;
+	u64 efer;
 	u32 cr4;
-	u32 efer_low;
-	u32 efer_high;
 #endif
 };
 
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 099277984b80..cbca565af5bd 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -22,7 +22,7 @@ void __init setup_real_mode(void)
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 #ifdef CONFIG_X86_64
 	u64 *trampoline_pgd;
-	u32 efer_low, efer_high;
+	u64 efer;
 #endif
 
 	/* Has to be in very low memory so we can execute real-mode AP code. */
@@ -70,9 +70,8 @@ void __init setup_real_mode(void)
 	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
 	 * so we need to mask it out.
 	 */
-	rdmsr(MSR_EFER, efer_low, efer_high);
-	trampoline_header->efer_low  = efer_low & ~EFER_LMA;
-	trampoline_header->efer_high = efer_high;
+	rdmsrl(MSR_EFER, efer);
+	trampoline_header->efer = efer & ~EFER_LMA;
 
 	trampoline_header->start = (u64) secondary_startup_64;
 	trampoline_cr4_features = &trampoline_header->cr4;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 1b9e1bc1ac5e..bb360dc39d21 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -146,8 +146,8 @@ GLOBAL(trampoline_pgd)		.space	PAGE_SIZE
 	.balign	8
 GLOBAL(trampoline_header)
 	tr_start:		.space	8
-	GLOBAL(tr_cr4)		.space	4
 	GLOBAL(tr_efer)		.space	8
+	GLOBAL(tr_cr4)		.space	4
 END(trampoline_header)
 
 #include "trampoline_common.S"
-- 
cgit v1.2.3


From d8368af8b46b904def42a0f341d2f4f29001fa77 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 14 May 2012 18:07:56 +0300
Subject: KVM: Fix mmu_reload() clash with nested vmx event injection

Currently the inject_pending_event() call during guest entry happens after
kvm_mmu_reload().  This is for historical reasons - we used to
inject_pending_event() in atomic context, while kvm_mmu_reload() needs task
context.

A problem is that nested vmx can cause the mmu context to be reset, if event
injection is intercepted and causes a #VMEXIT instead (the #VMEXIT resets
CR0/CR3/CR4).  If this happens, we end up with invalid root_hpa, and since
kvm_mmu_reload() has already run, no one will fix it and we end up entering
the guest this way.

Fix by reordering event injection to be before kvm_mmu_reload().  Use
->cancel_injection() to undo if kvm_mmu_reload() fails.

https://bugzilla.kernel.org/show_bug.cgi?id=42980

Reported-by: Luke-Jr <luke-jr+linuxbugs@utopios.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4de705cdcafd..b78f89d34242 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5279,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_deliver_pmi(vcpu);
 	}
 
-	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
-		goto out;
-
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 		inject_pending_event(vcpu);
 
@@ -5298,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r)) {
+		kvm_x86_ops->cancel_injection(vcpu);
+		goto out;
+	}
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-- 
cgit v1.2.3


From 55ccf3fe3f9a3441731aa79cf42a628fc4ecace9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 16 May 2012 15:03:51 -0700
Subject: fork: move the real prepare_to_copy() users to arch_dup_task_struct()

Historical prepare_to_copy() is mostly a no-op, duplicated for majority of
the architectures and the rest following the x86 model of flushing the extended
register state like fpu there.

Remove it and use the arch_dup_task_struct() instead.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: James E.J. Bottomley <jejb@parisc-linux.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Chen Liqin <liqin.chen@sunplusct.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/alpha/include/asm/processor.h      |  3 ---
 arch/arm/include/asm/processor.h        |  3 ---
 arch/avr32/include/asm/processor.h      |  3 ---
 arch/blackfin/include/asm/processor.h   |  2 --
 arch/c6x/include/asm/processor.h        |  3 ---
 arch/cris/include/asm/processor.h       |  4 ----
 arch/frv/include/asm/processor.h        |  2 --
 arch/frv/kernel/process.c               | 11 -----------
 arch/h8300/include/asm/processor.h      |  2 --
 arch/hexagon/include/asm/processor.h    |  7 -------
 arch/ia64/include/asm/processor.h       |  3 ---
 arch/m32r/include/asm/processor.h       |  2 --
 arch/m68k/include/asm/processor.h       |  3 ---
 arch/microblaze/include/asm/processor.h |  1 -
 arch/mips/include/asm/processor.h       |  3 ---
 arch/mn10300/include/asm/processor.h    |  3 ---
 arch/mn10300/kernel/process.c           | 10 ++++++----
 arch/openrisc/include/asm/processor.h   |  4 ----
 arch/parisc/include/asm/processor.h     |  3 ---
 arch/powerpc/include/asm/processor.h    |  3 ---
 arch/powerpc/kernel/process.c           | 19 +++++++++++--------
 arch/s390/include/asm/processor.h       |  3 ---
 arch/score/include/asm/processor.h      |  1 -
 arch/sh/include/asm/processor_32.h      |  3 ---
 arch/sh/include/asm/processor_64.h      |  1 -
 arch/sh/kernel/process.c                |  7 +++++++
 arch/sh/kernel/process_32.c             |  9 ---------
 arch/sparc/include/asm/processor_32.h   |  3 ---
 arch/sparc/include/asm/processor_64.h   |  3 ---
 arch/tile/include/asm/processor.h       |  3 ---
 arch/um/include/asm/processor-generic.h |  5 -----
 arch/unicore32/include/asm/processor.h  |  3 ---
 arch/x86/include/asm/processor.h        |  3 ---
 arch/x86/kernel/process.c               |  6 ++++++
 arch/x86/kernel/process_32.c            |  9 ---------
 arch/x86/kernel/process_64.c            |  9 ---------
 arch/xtensa/include/asm/processor.h     |  3 ---
 arch/xtensa/kernel/process.c            |  9 ++++++---
 kernel/fork.c                           |  2 --
 39 files changed, 36 insertions(+), 140 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h
index 94afe5859301..e37b887b3d9f 100644
--- a/arch/alpha/include/asm/processor.h
+++ b/arch/alpha/include/asm/processor.h
@@ -49,9 +49,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long);
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 /* Create a kernel thread without removing it from tasklists.  */
 extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 5ac8d3d3e025..6354224f980d 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -77,9 +77,6 @@ struct task_struct;
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 unsigned long get_wchan(struct task_struct *p);
 
 #if __LINUX_ARM_ARCH__ == 6 || defined(CONFIG_ARM_ERRATA_754327)
diff --git a/arch/avr32/include/asm/processor.h b/arch/avr32/include/asm/processor.h
index 108502bc6770..87d8baccc60e 100644
--- a/arch/avr32/include/asm/processor.h
+++ b/arch/avr32/include/asm/processor.h
@@ -145,9 +145,6 @@ extern void release_thread(struct task_struct *);
 /* Create a kernel thread without removing it from tasklists */
 extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk) do { } while(0)
-
 /* Return saved PC of a blocked thread */
 #define thread_saved_pc(tsk)    ((tsk)->thread.cpu_context.pc)
 
diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h
index 8af7772e84cc..4ef7cfe43ceb 100644
--- a/arch/blackfin/include/asm/processor.h
+++ b/arch/blackfin/include/asm/processor.h
@@ -75,8 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern int kernel_thread(int (*fn) (void *), void *arg, unsigned long flags);
 
 /*
diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h
index 3ff7fab956ba..c50af7ef1c96 100644
--- a/arch/c6x/include/asm/processor.h
+++ b/arch/c6x/include/asm/processor.h
@@ -92,9 +92,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 #define copy_segments(tsk, mm)		do { } while (0)
diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h
index 4210d72a6667..37f522feabc1 100644
--- a/arch/cris/include/asm/processor.h
+++ b/arch/cris/include/asm/processor.h
@@ -50,10 +50,6 @@ struct task_struct;
 #define task_pt_regs(task) user_regs(task_thread_info(task))
 #define current_regs() task_pt_regs(current)
 
-static inline void prepare_to_copy(struct task_struct *tsk)
-{
-}
-
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index 81c2e271d620..9c768170b304 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -103,8 +103,6 @@ do {							\
 	__frame->sp	= (_usp);			\
 } while(0)
 
-extern void prepare_to_copy(struct task_struct *tsk);
-
 /* Free all resources held by a thread. */
 static inline void release_thread(struct task_struct *dead_task)
 {
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index d4de48bd5efe..9f3dfadee09e 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -180,17 +180,6 @@ asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
 	return do_fork(clone_flags, newsp, __frame, 0, parent_tidptr, child_tidptr);
 } /* end sys_clone() */
 
-/*****************************************************************************/
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-	//unlazy_fpu(tsk);
-} /* end prepare_to_copy() */
-
-/*****************************************************************************/
 /*
  * set up the kernel stack and exception frames for a new process
  */
diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h
index 61fabf1788c6..4c9f6f87b617 100644
--- a/arch/h8300/include/asm/processor.h
+++ b/arch/h8300/include/asm/processor.h
@@ -109,8 +109,6 @@ static inline void release_thread(struct task_struct *dead_task)
 
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
-#define prepare_to_copy(tsk)	do { } while (0)
-
 /*
  * Free current thread data structures etc..
  */
diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h
index 20c5ddabbd8b..e8ea459002a4 100644
--- a/arch/hexagon/include/asm/processor.h
+++ b/arch/hexagon/include/asm/processor.h
@@ -58,13 +58,6 @@ struct thread_struct {
 
 #define cpu_relax() __vmyield()
 
-/*
- * "Unlazying all lazy status" occurs here.
- */
-static inline void prepare_to_copy(struct task_struct *tsk)
-{
-}
-
 /*
  * Decides where the kernel will search for a free chunk of vm space during
  * mmaps.
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index 483f6c6a4238..efcca1b601c5 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -343,9 +343,6 @@ struct task_struct;
  */
 #define release_thread(dead_task)
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 /*
  * This is the mechanism for creating a new kernel thread.
  *
diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h
index e1f46d757460..da17253b5735 100644
--- a/arch/m32r/include/asm/processor.h
+++ b/arch/m32r/include/asm/processor.h
@@ -118,8 +118,6 @@ struct mm_struct;
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-#define prepare_to_copy(tsk)	do { } while (0)
-
 /*
  * create a kernel thread without removing it from tasklists
  */
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index 46460fa15d5c..f17c42aff7ff 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -153,9 +153,6 @@ static inline void release_thread(struct task_struct *dead_task)
 {
 }
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 /*
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index bffb54527299..af2bb9652392 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -23,7 +23,6 @@ extern const struct seq_operations cpuinfo_op;
 
 # define cpu_relax()		barrier()
 # define cpu_sleep()		do {} while (0)
-# define prepare_to_copy(tsk)	do {} while (0)
 
 #define task_pt_regs(tsk) \
 		(((struct pt_regs *)(THREAD_SIZE + task_stack_page(tsk))) - 1)
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index 20e9dcf42b27..5e33fabe354d 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -310,9 +310,6 @@ struct task_struct;
 /* Free all resources held by a thread. */
 #define release_thread(thread) do { } while(0)
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index f7b3c9ab2cb5..247928c9f549 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -139,9 +139,6 @@ static inline void start_thread(struct pt_regs *regs,
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
 /*
  * create a kernel thread without removing it from tasklists
  */
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 14707f25153b..7dab0cd36466 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -208,12 +208,14 @@ void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
 }
 
 /*
- * this gets called before we allocate a new thread and copy the current task
- * into it so that we can store lazy state into memory
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
  */
-void prepare_to_copy(struct task_struct *tsk)
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	unlazy_fpu(tsk);
+	unlazy_fpu(src);
+	*dst = *src;
+	return 0;
 }
 
 /*
diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h
index f7516fa78b58..30462f1fe959 100644
--- a/arch/openrisc/include/asm/processor.h
+++ b/arch/openrisc/include/asm/processor.h
@@ -72,10 +72,6 @@ struct thread_struct {
 #define task_pt_regs(task) user_regs(task_thread_info(task))
 #define current_regs() user_regs(current_thread_info())
 
-extern inline void prepare_to_copy(struct task_struct *tsk)
-{
-}
-
 #define INIT_SP         (sizeof(init_stack) + (unsigned long) &init_stack)
 
 #define INIT_THREAD  { }
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index acdf4cad6125..0e8b7b8ce8a2 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -328,9 +328,6 @@ struct mm_struct;
 extern void release_thread(struct task_struct *);
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern void map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm);
 
 extern unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 8e2d0371fe1e..854f899d0c34 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -74,9 +74,6 @@ struct task_struct;
 void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
 void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
 /* Create a new kernel thread. */
 extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4937c9690090..bc129f24e11f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -711,18 +711,21 @@ release_thread(struct task_struct *t)
 }
 
 /*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
+ * this gets called so that we can store coprocessor state into memory and
+ * copy the current task into the new thread.
  */
-void prepare_to_copy(struct task_struct *tsk)
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
-	flush_spe_to_thread(current);
+	flush_fp_to_thread(src);
+	flush_altivec_to_thread(src);
+	flush_vsx_to_thread(src);
+	flush_spe_to_thread(src);
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-	flush_ptrace_hw_breakpoint(tsk);
+	flush_ptrace_hw_breakpoint(src);
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
+	*dst = *src;
+	return 0;
 }
 
 /*
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index d499b30ea487..6cbf31311673 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -141,9 +141,6 @@ struct seq_file;
 extern void release_thread(struct task_struct *);
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/score/include/asm/processor.h b/arch/score/include/asm/processor.h
index 7e22f216d771..ab3aceb54209 100644
--- a/arch/score/include/asm/processor.h
+++ b/arch/score/include/asm/processor.h
@@ -26,7 +26,6 @@ extern unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()		barrier()
 #define release_thread(thread)	do {} while (0)
-#define prepare_to_copy(tsk)	do {} while (0)
 
 /*
  * User space process size: 2GB. This is hardcoded into a few places,
diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h
index 900f8d72ffe2..b6311fd2d066 100644
--- a/arch/sh/include/asm/processor_32.h
+++ b/arch/sh/include/asm/processor_32.h
@@ -126,9 +126,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned lo
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-void prepare_to_copy(struct task_struct *tsk);
-
 /*
  * create a kernel thread without removing it from tasklists
  */
diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h
index e25c4c7d6b63..fe99afecfbc5 100644
--- a/arch/sh/include/asm/processor_64.h
+++ b/arch/sh/include/asm/processor_64.h
@@ -172,7 +172,6 @@ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 #define copy_segments(p, mm)	do { } while (0)
 #define release_segments(mm)	do { } while (0)
 #define forget_segments()	do { } while (0)
-#define prepare_to_copy(tsk)	do { } while (0)
 /*
  * FPU lazy state save handling.
  */
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 325f98b1736d..2bde59eae10d 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -6,8 +6,15 @@
 struct kmem_cache *task_xstate_cachep = NULL;
 unsigned int xstate_size;
 
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+#ifdef CONFIG_SUPERH32
+	unlazy_fpu(src, task_pt_regs(src));
+#endif
 	*dst = *src;
 
 	if (src->thread.xstate) {
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 94273aaf78c1..dee596113cc1 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -155,15 +155,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 }
 EXPORT_SYMBOL(dump_fpu);
 
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-	unlazy_fpu(tsk, task_pt_regs(tsk));
-}
-
 asmlinkage void ret_from_fork(void);
 
 int copy_thread(unsigned long clone_flags, unsigned long usp,
diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h
index 09521c6a5edb..c9c760fdc713 100644
--- a/arch/sparc/include/asm/processor_32.h
+++ b/arch/sparc/include/asm/processor_32.h
@@ -109,9 +109,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc,
 #define release_thread(tsk)		do { } while(0)
 extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern unsigned long get_wchan(struct task_struct *);
 
 #define task_pt_regs(tsk) ((tsk)->thread.kregs)
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index e713db249931..67df5cc10011 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -186,9 +186,6 @@ do { \
 /* Free all resources held by a thread. */
 #define release_thread(tsk)		do { } while (0)
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 extern unsigned long get_wchan(struct task_struct *task);
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 34c1e01ffb5e..15cd8a4a06ce 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -210,9 +210,6 @@ static inline void release_thread(struct task_struct *dead_task)
 	/* Nothing for now */
 }
 
-/* Prepare to copy thread state - unlazy all lazy status. */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
 extern int do_work_pending(struct pt_regs *regs, u32 flags);
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index 98d01bc4fa92..63b716052d20 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -76,11 +76,6 @@ static inline void release_thread(struct task_struct *task)
 
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
-static inline void prepare_to_copy(struct task_struct *tsk)
-{
-}
-
-
 extern unsigned long thread_saved_pc(struct task_struct *t);
 
 static inline void mm_copy_segments(struct mm_struct *from_mm,
diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h
index f0d780a51f9b..14382cb09657 100644
--- a/arch/unicore32/include/asm/processor.h
+++ b/arch/unicore32/include/asm/processor.h
@@ -68,9 +68,6 @@ struct task_struct;
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy status */
-#define prepare_to_copy(tsk)	do { } while (0)
-
 unsigned long get_wchan(struct task_struct *p);
 
 #define cpu_relax()			barrier()
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4fa7dcceb6c0..97fe04318e95 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -579,9 +579,6 @@ extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
-/* Prepare to copy thread state - unlazy all lazy state */
-extern void prepare_to_copy(struct task_struct *tsk);
-
 unsigned long get_wchan(struct task_struct *p);
 
 /*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..b7e1e0e53987 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,10 +47,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 struct kmem_cache *task_xstate_cachep;
 EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	int ret;
 
+	unlazy_fpu(src);
+
 	*dst = *src;
 	if (fpu_allocated(&src->thread.fpu)) {
 		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ae6847303e26..2aa57dc909d6 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task)
 	release_vm86_irqs(dead_task);
 }
 
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-	unlazy_fpu(tsk);
-}
-
 int copy_thread(unsigned long clone_flags, unsigned long sp,
 	unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 43d8b48b23e6..c4c0645a4011 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 	return get_desc_base(&t->thread.tls_array[tls]);
 }
 
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-	unlazy_fpu(tsk);
-}
-
 int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h
index 3acb26e8dead..5c371d8d4528 100644
--- a/arch/xtensa/include/asm/processor.h
+++ b/arch/xtensa/include/asm/processor.h
@@ -168,9 +168,6 @@ struct mm_struct;
 /* Free all resources held by a thread. */
 #define release_thread(thread) do { } while(0)
 
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct*);
-
 /* Create a kernel thread without removing it from tasklists */
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 6a2d6edf8f72..9b306e550e3f 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -140,13 +140,16 @@ void flush_thread(void)
 }
 
 /*
- * This is called before the thread is copied. 
+ * this gets called so that we can store coprocessor state into memory and
+ * copy the current task into the new thread.
  */
-void prepare_to_copy(struct task_struct *tsk)
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 #if XTENSA_HAVE_COPROCESSORS
-	coprocessor_flush_all(task_thread_info(tsk));
+	coprocessor_flush_all(task_thread_info(src));
 #endif
+	*dst = *src;
+	return 0;
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 687a15d56243..7aed746ff47c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -261,8 +261,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	int node = tsk_fork_get_node(orig);
 	int err;
 
-	prepare_to_copy(orig);
-
 	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
-- 
cgit v1.2.3


From d75f1b391f5ef73016d14bc6f7e4725820ebaa5b Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 16 May 2012 15:03:53 -0700
Subject: x86, xsave: remove thread_has_fpu() bug check in
 __sanitize_i387_state()

Code paths like fork(), exit() and signal handling flush the fpu
state explicitly to the structures in memory.

BUG_ON() in __sanitize_i387_state() is checking that the fpu state
is not live any more. But for preempt kernels, task can be scheduled
out and in at any place and the preload_fpu logic during context switch
can make the fpu registers live again.

For example, consider a 64-bit Task which uses fpu frequently and as such
you will find its fpu_counter mostly non-zero. During its time slice, kernel
used fpu by doing kernel_fpu_begin/kernel_fpu_end(). After this, in the same
scheduling slice, task-A got a signal to handle. Then during the signal
setup path we got preempted when we are just before the sanitize_i387_state()
in arch/x86/kernel/xsave.c:save_i387_xstate(). And when we come back we
will have the fpu registers live that can hit the bug_on.

Similarly during core dump, other threads can context-switch in and out
(because of spurious wakeups while waiting for the coredump to finish in
 kernel/exit.c:exit_mm()) and the main thread dumping core can run into this
bug when it finds some other thread with its fpu registers live on some other cpu.

So remove the paranoid check for now, even though it caught a bug in the
multi-threaded core dump case (fixed in the previous patch).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1336692811-30576-3-git-send-email-suresh.b.siddha@intel.com
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index e62728e30b01..bd18149b2b0f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(__thread_has_fpu(tsk));
-
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
 	/*
-- 
cgit v1.2.3


From 1dcc8d7ba235a316a056f993e88f0d18b92c60d9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 16 May 2012 15:03:54 -0700
Subject: x86, fpu: drop the fpu state during thread exit

There is no need to save any active fpu state to the task structure
memory if the task is dead. Just drop the state instead.

For example, this saved some 1770 xsave's during the system boot
of a two socket Xeon system.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1336692811-30576-4-git-send-email-suresh.b.siddha@intel.com
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/process.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b7e1e0e53987..1219fe2be8f3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -87,6 +87,16 @@ void arch_task_cache_init(void)
 				  SLAB_PANIC | SLAB_NOTRACK, NULL);
 }
 
+static inline void drop_fpu(struct task_struct *tsk)
+{
+	/*
+	 * Forget coprocessor state..
+	 */
+	tsk->fpu_counter = 0;
+	clear_fpu(tsk);
+	clear_used_math();
+}
+
 /*
  * Free current thread data structures etc..
  */
@@ -109,6 +119,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
+
+	drop_fpu(me);
 }
 
 void show_regs(struct pt_regs *regs)
@@ -149,12 +161,7 @@ void flush_thread(void)
 
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-	/*
-	 * Forget coprocessor state..
-	 */
-	tsk->fpu_counter = 0;
-	clear_fpu(tsk);
-	clear_used_math();
+	drop_fpu(tsk);
 }
 
 static void hard_disable_TSC(void)
-- 
cgit v1.2.3


From e4f5d5440bb860a3e8942ca8f7277a7f31798965 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Apr 2012 09:13:18 -0400
Subject: ftrace/x86: Have x86 ftrace use the ftrace_modify_all_code()

To remove duplicate code, have the ftrace arch_ftrace_update_code()
use the generic ftrace_modify_all_code(). This requires that the
default ftrace_replace_code() becomes a weak function so that an
arch may override it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 15 ++-------------
 include/linux/ftrace.h   |  1 +
 kernel/trace/ftrace.c    |  4 ++--
 3 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4243e8bbdcb1..32ff36596ab1 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -435,7 +435,7 @@ static void run_sync(void)
 		local_irq_disable();
 }
 
-static void ftrace_replace_code(int enable)
+void ftrace_replace_code(int enable)
 {
 	struct ftrace_rec_iter *iter;
 	struct dyn_ftrace *rec;
@@ -493,18 +493,7 @@ void arch_ftrace_update_code(int command)
 {
 	modifying_ftrace_code++;
 
-	if (command & FTRACE_UPDATE_CALLS)
-		ftrace_replace_code(1);
-	else if (command & FTRACE_DISABLE_CALLS)
-		ftrace_replace_code(0);
-
-	if (command & FTRACE_UPDATE_TRACE_FUNC)
-		ftrace_update_ftrace_func(ftrace_trace_function);
-
-	if (command & FTRACE_START_FUNC_RET)
-		ftrace_enable_ftrace_graph_caller();
-	else if (command & FTRACE_STOP_FUNC_RET)
-		ftrace_disable_ftrace_graph_caller();
+	ftrace_modify_all_code(command);
 
 	modifying_ftrace_code--;
 }
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index cd72ace7ade3..55e6d63d46d0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -314,6 +314,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_dyn_arch_init(void *data);
+extern void ftrace_replace_code(int enable);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3c345825cc23..a008663d86c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1683,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	return -1; /* unknow ftrace bug */
 }
 
-static void ftrace_replace_code(int update)
+void __weak ftrace_replace_code(int enable)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
@@ -1693,7 +1693,7 @@ static void ftrace_replace_code(int update)
 		return;
 
 	do_for_each_ftrace_rec(pg, rec) {
-		failed = __ftrace_replace_code(rec, update);
+		failed = __ftrace_replace_code(rec, enable);
 		if (failed) {
 			ftrace_bug(failed, rec->ip);
 			/* Stop processing */
-- 
cgit v1.2.3


From db2e034d2c55e1f273ed820cc3edcdbc73d0292c Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 17 May 2012 08:31:29 +0100
Subject: x86/vga: fix build with efi disabled.

Reported by sfr on -next merge.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/pci/fixup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 01635537d72e..82487d3d5879 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,8 +6,8 @@
 #include <linux/dmi.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/vgaarb.h>
 #include <asm/pci_x86.h>
-#include <asm/vga.h>
 
 static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 {
-- 
cgit v1.2.3


From 8e7fbcbc22c12414bcc9dfdd683637f58fb32759 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Jan 2012 11:28:35 +0100
Subject: sched: Remove stale power aware scheduling remnants and dysfunctional
 knobs

It's been broken forever (i.e. it's not scheduling in a power
aware fashion), as reported by Suresh and others sending
patches, and nobody cares enough to fix it properly ...
so remove it to make space free for something better.

There's various problems with the code as it stands today, first
and foremost the user interface which is bound to topology
levels and has multiple values per level. This results in a
state explosion which the administrator or distro needs to
master and almost nobody does.

Furthermore large configuration state spaces aren't good, it
means the thing doesn't just work right because it's either
under so many impossibe to meet constraints, or even if
there's an achievable state workloads have to be aware of
it precisely and can never meet it for dynamic workloads.

So pushing this kind of decision to user-space was a bad idea
even with a single knob - it's exponentially worse with knobs
on every node of the topology.

There is a proposal to replace the user interface with a single
3 state knob:

 sched_balance_policy := { performance, power, auto }

where 'auto' would be the preferred default which looks at things
like Battery/AC mode and possible cpufreq state or whatever the hw
exposes to show us power use expectations - but there's been no
progress on it in the past many months.

Aside from that, the actual implementation of the various knobs
is known to be broken. There have been sporadic attempts at
fixing things but these always stop short of reaching a mergable
state.

Therefore this wholesale removal with the hopes of spurring
people who care to come forward once again and work on a
coherent replacement.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1326104915.2442.53.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |  25 --
 Documentation/scheduler/sched-domains.txt          |   4 -
 arch/x86/kernel/smpboot.c                          |   3 +-
 drivers/base/cpu.c                                 |   4 -
 include/linux/cpu.h                                |   2 -
 include/linux/sched.h                              |  47 ----
 include/linux/topology.h                           |   5 -
 kernel/sched/core.c                                |  94 -------
 kernel/sched/fair.c                                | 275 +--------------------
 tools/power/cpupower/man/cpupower-set.1            |   9 -
 tools/power/cpupower/utils/helpers/sysfs.c         |  35 +--
 11 files changed, 5 insertions(+), 498 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index e7be75b96e4b..5dab36448b44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -9,31 +9,6 @@ Description:
 
 		/sys/devices/system/cpu/cpu#/
 
-What:		/sys/devices/system/cpu/sched_mc_power_savings
-		/sys/devices/system/cpu/sched_smt_power_savings
-Date:		June 2006
-Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
-Description:	Discover and adjust the kernel's multi-core scheduler support.
-
-		Possible values are:
-
-		0 - No power saving load balance (default value)
-		1 - Fill one thread/core/package first for long running threads
-		2 - Also bias task wakeups to semi-idle cpu package for power
-		    savings
-
-		sched_mc_power_savings is dependent upon SCHED_MC, which is
-		itself architecture dependent.
-
-		sched_smt_power_savings is dependent upon SCHED_SMT, which
-		is itself architecture dependent.
-
-		The two files are independent of each other. It is possible
-		that one file may be present without the other.
-
-		Introduced by git commit 5c45bf27.
-
-
 What:		/sys/devices/system/cpu/kernel_max
 		/sys/devices/system/cpu/offline
 		/sys/devices/system/cpu/online
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index b7ee379b651b..443f0c76bab4 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,6 @@ The implementor should read comments in include/linux/sched.h:
 struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
 the specifics and what to tune.
 
-For SMT, the architecture must define CONFIG_SCHED_SMT and provide a
-cpumask_t cpu_sibling_map[NR_CPUS], where cpu_sibling_map[i] is the mask of
-all "i"'s siblings as well as "i" itself.
-
 Architectures may retain the regular override the default SD_*_INIT flags
 while using the generic domain builder in kernel/sched.c if they wish to
 retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e84c1bbea339..256c20cc5e96 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -429,8 +429,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	 * For perf, we return last level cache shared map.
 	 * And for power savings, we return cpu_core_map
 	 */
-	if ((sched_mc_power_savings || sched_smt_power_savings) &&
-	    !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+	if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
 		return cpu_core_mask(cpu);
 	else
 		return cpu_llc_shared_mask(cpu);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index adf937bf4091..63452943abd1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -330,8 +330,4 @@ void __init cpu_dev_init(void)
 		panic("Failed to register CPU subsystem");
 
 	cpu_dev_register_generic();
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	sched_create_sysfs_power_savings_entries(cpu_subsys.dev_root);
-#endif
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ee28844ae68e..7230bb59a06f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -36,8 +36,6 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
 extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
 extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
-extern int sched_create_sysfs_power_savings_entries(struct device *dev);
-
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
 extern ssize_t arch_cpu_probe(const char *, size_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a559bf0622f..3d644809c9db 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -855,61 +855,14 @@ enum cpu_idle_type {
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
 #define SD_PREFER_LOCAL		0x0040  /* Prefer to keep tasks local to this domain */
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
-#define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 
-enum powersavings_balance_level {
-	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
-	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
-					 * first for long running threads
-					 */
-	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
-					 * cpu package for power savings
-					 */
-	MAX_POWERSAVINGS_BALANCE_LEVELS
-};
-
-extern int sched_mc_power_savings, sched_smt_power_savings;
-
-static inline int sd_balance_for_mc_power(void)
-{
-	if (sched_smt_power_savings)
-		return SD_POWERSAVINGS_BALANCE;
-
-	if (!sched_mc_power_savings)
-		return SD_PREFER_SIBLING;
-
-	return 0;
-}
-
-static inline int sd_balance_for_package_power(void)
-{
-	if (sched_mc_power_savings | sched_smt_power_savings)
-		return SD_POWERSAVINGS_BALANCE;
-
-	return SD_PREFER_SIBLING;
-}
-
 extern int __weak arch_sd_sibiling_asym_packing(void);
 
-/*
- * Optimise SD flags for power savings:
- * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
- * Keep default SD flags if sched_{smt,mc}_power_saving=0
- */
-
-static inline int sd_power_saving_flags(void)
-{
-	if (sched_mc_power_savings | sched_smt_power_savings)
-		return SD_BALANCE_NEWIDLE;
-
-	return 0;
-}
-
 struct sched_group_power {
 	atomic_t ref;
 	/*
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4f59bf36f0af..09558d1daacd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -98,7 +98,6 @@ int arch_update_cpu_topology(void);
 				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
 				| 1*SD_SHARE_CPUPOWER			\
-				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
@@ -134,8 +133,6 @@ int arch_update_cpu_topology(void);
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
-				| sd_balance_for_mc_power()		\
-				| sd_power_saving_flags()		\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
@@ -167,8 +164,6 @@ int arch_update_cpu_topology(void);
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
-				| sd_balance_for_package_power()	\
-				| sd_power_saving_flags()		\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd314d7cd9f8..24ca677b5457 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5929,8 +5929,6 @@ static const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
@@ -6322,7 +6320,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 					| 0*SD_WAKE_AFFINE
 					| 0*SD_PREFER_LOCAL
 					| 0*SD_SHARE_CPUPOWER
-					| 0*SD_POWERSAVINGS_BALANCE
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 1*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
@@ -6819,97 +6816,6 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-	get_online_cpus();
-
-	/* Destroy domains first to force the rebuild */
-	partition_sched_domains(0, NULL, NULL);
-
-	rebuild_sched_domains();
-	put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-	unsigned int level = 0;
-
-	if (sscanf(buf, "%u", &level) != 1)
-		return -EINVAL;
-
-	/*
-	 * level is always be positive so don't check for
-	 * level < POWERSAVINGS_BALANCE_NONE which is 0
-	 * What happens on 0 or 1 byte write,
-	 * need to check for count as well?
-	 */
-
-	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-		return -EINVAL;
-
-	if (smt)
-		sched_smt_power_savings = level;
-	else
-		sched_mc_power_savings = level;
-
-	reinit_sched_domains();
-
-	return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-		   sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-					    struct device_attribute *attr,
-					    char *buf)
-{
-	return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					     const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-		   sched_smt_power_savings_show,
-		   sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-	int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-	if (smt_capable())
-		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-	if (!err && mc_capable())
-		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-	return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b42f4487329..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 		 * If power savings logic is enabled for a domain, see if we
 		 * are not overloaded, if so, don't balance wider.
 		 */
-		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+		if (tmp->flags & (SD_PREFER_LOCAL)) {
 			unsigned long power = 0;
 			unsigned long nr_running = 0;
 			unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 
 			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
-			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-				nr_running /= 2;
-
 			if (nr_running < capacity)
 				want_sd = 0;
 		}
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
 	unsigned int  busiest_group_weight;
 
 	int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance; /* Is powersave balance needed for this sd */
-	struct sched_group *group_min; /* Least loaded group in sd */
-	struct sched_group *group_leader; /* Group which relieves group_min */
-	unsigned long min_load_per_task; /* load_per_task in group_min */
-	unsigned long leader_nr_running; /* Nr running of group_leader */
-	unsigned long min_nr_running; /* Nr running of group_min */
-#endif
 };
 
 /*
@@ -3486,147 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 	return load_idx;
 }
 
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	/*
-	 * Busy processors will not participate in power savings
-	 * balance.
-	 */
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		sds->power_savings_balance = 0;
-	else {
-		sds->power_savings_balance = 1;
-		sds->min_nr_running = ULONG_MAX;
-		sds->leader_nr_running = 0;
-	}
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * 		load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
-	if (!sds->power_savings_balance)
-		return;
-
-	/*
-	 * If the local group is idle or completely loaded
-	 * no need to do power savings balance at this domain
-	 */
-	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-				!sds->this_nr_running))
-		sds->power_savings_balance = 0;
-
-	/*
-	 * If a group is already running at full capacity or idle,
-	 * don't include that group in power savings calculations
-	 */
-	if (!sds->power_savings_balance ||
-		sgs->sum_nr_running >= sgs->group_capacity ||
-		!sgs->sum_nr_running)
-		return;
-
-	/*
-	 * Calculate the group which has the least non-idle load.
-	 * This is the group from where we need to pick up the load
-	 * for saving power
-	 */
-	if ((sgs->sum_nr_running < sds->min_nr_running) ||
-	    (sgs->sum_nr_running == sds->min_nr_running &&
-	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-		sds->group_min = group;
-		sds->min_nr_running = sgs->sum_nr_running;
-		sds->min_load_per_task = sgs->sum_weighted_load /
-						sgs->sum_nr_running;
-	}
-
-	/*
-	 * Calculate the group which is almost near its
-	 * capacity but still has some space to pick up some load
-	 * from other group and save more power
-	 */
-	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-		return;
-
-	if (sgs->sum_nr_running > sds->leader_nr_running ||
-	    (sgs->sum_nr_running == sds->leader_nr_running &&
-	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-		sds->group_leader = group;
-		sds->leader_nr_running = sgs->sum_nr_running;
-	}
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @env: load balance environment
- * @sds: Variable containing the statistics of the sched_domain
- *	under consideration.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-	if (!sds->power_savings_balance)
-		return 0;
-
-	if (sds->this != sds->group_leader ||
-			sds->group_leader == sds->group_min)
-		return 0;
-
-	env->imbalance = sds->min_load_per_task;
-	sds->busiest = sds->group_min;
-
-	return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-	return;
-}
-
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-	return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
 	return SCHED_POWER_SCALE;
@@ -3932,7 +3780,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
-	init_sd_power_savings_stats(env->sd, sds, env->idle);
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
@@ -3981,7 +3828,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 }
@@ -4276,12 +4122,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-	/*
-	 * There is no obvious imbalance. But check if we can do some balancing
-	 * to save power.
-	 */
-	if (check_power_save_busiest_group(env, &sds))
-		return sds.busiest;
 ret:
 	env->imbalance = 0;
 	return NULL;
@@ -4359,28 +4199,6 @@ static int need_active_balance(struct lb_env *env)
 		 */
 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
 			return 1;
-
-		/*
-		 * The only task running in a non-idle cpu can be moved to this
-		 * cpu in an attempt to completely freeup the other CPU
-		 * package.
-		 *
-		 * The package power saving logic comes from
-		 * find_busiest_group(). If there are no imbalance, then
-		 * f_b_g() will return NULL. However when sched_mc={1,2} then
-		 * f_b_g() will select a group from which a running task may be
-		 * pulled to this cpu in order to make the other package idle.
-		 * If there is no opportunity to make a package idle and if
-		 * there are no imbalance, then f_b_g() will return NULL and no
-		 * action will be taken in load_balance_newidle().
-		 *
-		 * Under normal task pull operation due to imbalance, there
-		 * will be more than one task in the source run queue and
-		 * move_tasks() will succeed.  ld_moved will be true and this
-		 * active balance code will not be triggered.
-		 */
-		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-			return 0;
 	}
 
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4700,104 +4518,15 @@ static struct {
 	unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:	The cpu whose lowest level of sched domain is to
- *		be returned.
- * @flag:	The flag to check for the lowest sched_domain
- *		for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-	struct sched_domain *sd;
-
-	for_each_domain(cpu, sd)
-		if (sd->flags & flag)
-			break;
-
-	return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:	The cpu whose domains we're iterating over.
- * @sd:		variable holding the value of the power_savings_sd
- *		for cpu.
- * @flag:	The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-	for (sd = lowest_flag_domain(cpu, flag); \
-		(sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:	The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:	Returns the id of the idle load balancer if it exists,
- *		Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
-	struct sched_group *ilbg;
-	struct sched_domain *sd;
 
-	/*
-	 * Have idle load balancer selection from semi-idle packages only
-	 * when power-aware load balancing is enabled
-	 */
-	if (!(sched_smt_power_savings || sched_mc_power_savings))
-		goto out_done;
-
-	/*
-	 * Optimize for the case when we have no idle CPUs or only one
-	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
-	 */
-	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
-		goto out_done;
-
-	rcu_read_lock();
-	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-		ilbg = sd->groups;
-
-		do {
-			if (ilbg->group_weight !=
-				atomic_read(&ilbg->sgp->nr_busy_cpus)) {
-				ilb = cpumask_first_and(nohz.idle_cpus_mask,
-							sched_group_cpus(ilbg));
-				goto unlock;
-			}
-
-			ilbg = ilbg->next;
-
-		} while (ilbg != sd->groups);
-	}
-unlock:
-	rcu_read_unlock();
-
-out_done:
 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 		return ilb;
 
 	return nr_cpu_ids;
 }
-#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-	return nr_cpu_ids;
-}
-#endif
 
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1
index c4954a9fe4e7..9dbd536518ab 100644
--- a/tools/power/cpupower/man/cpupower-set.1
+++ b/tools/power/cpupower/man/cpupower-set.1
@@ -85,15 +85,6 @@ Possible values are:
 savings
 .RE
 
-sched_mc_power_savings is dependent upon SCHED_MC, which is
-itself architecture dependent.
-
-sched_smt_power_savings is dependent upon SCHED_SMT, which
-is itself architecture dependent.
-
-The two files are independent of each other. It is possible
-that one file may be present without the other.
-
 .SH "SEE ALSO"
 cpupower-info(1), cpupower-monitor(1), powertop(1)
 .PP
diff --git a/tools/power/cpupower/utils/helpers/sysfs.c b/tools/power/cpupower/utils/helpers/sysfs.c
index c6343024a611..96e28c124b5c 100644
--- a/tools/power/cpupower/utils/helpers/sysfs.c
+++ b/tools/power/cpupower/utils/helpers/sysfs.c
@@ -362,22 +362,7 @@ char *sysfs_get_cpuidle_driver(void)
  */
 int sysfs_get_sched(const char *smt_mc)
 {
-	unsigned long value;
-	char linebuf[MAX_LINE_LEN];
-	char *endp;
-	char path[SYSFS_PATH_MAX];
-
-	if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc))
-		return -EINVAL;
-
-	snprintf(path, sizeof(path),
-		PATH_TO_CPU "sched_%s_power_savings", smt_mc);
-	if (sysfs_read_file(path, linebuf, MAX_LINE_LEN) == 0)
-		return -1;
-	value = strtoul(linebuf, &endp, 0);
-	if (endp == linebuf || errno == ERANGE)
-		return -1;
-	return value;
+	return -ENODEV;
 }
 
 /*
@@ -388,21 +373,5 @@ int sysfs_get_sched(const char *smt_mc)
  */
 int sysfs_set_sched(const char *smt_mc, int val)
 {
-	char linebuf[MAX_LINE_LEN];
-	char path[SYSFS_PATH_MAX];
-	struct stat statbuf;
-
-	if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc))
-		return -EINVAL;
-
-	snprintf(path, sizeof(path),
-		PATH_TO_CPU "sched_%s_power_savings", smt_mc);
-	sprintf(linebuf, "%d", val);
-
-	if (stat(path, &statbuf) != 0)
-		return -ENODEV;
-
-	if (sysfs_write_file(path, linebuf, MAX_LINE_LEN) == 0)
-		return -1;
-	return 0;
+	return -ENODEV;
 }
-- 
cgit v1.2.3


From 80b3e557371205566a71e569fbfcce5b11f92dbe Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 15 May 2012 18:44:15 +0100
Subject: x86: Fix boot on Twinhead H12Y

Despite lots of investigation into why this is needed we don't
know or have an elegant cure. The only answer found on this
laptop is to mark a problem region as used so that Linux doesn't
put anything there.

Currently all the users add reserve= command lines and anyone
not knowing this needs to find the magic page that documents it.
Automate it instead.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Tested-and-bugfixed-by: Arne Fitzenreiter <arne@fitzenreiter.de>
Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=10231
Link: http://lkml.kernel.org/r/20120515174347.5109.94551.stgit@bluebook
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/pci/fixup.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index d0e6e403b4f6..5dd467bd6121 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -519,3 +519,20 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
 	}
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+
+/*
+ * Twinhead H12Y needs us to block out a region otherwise we map devices
+ * there and any access kills the box.
+ *
+ *   See: https://bugzilla.kernel.org/show_bug.cgi?id=10231
+ *
+ * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor)
+ */
+static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev)
+{
+        if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) {
+                pr_info("Reserving memory on Twinhead H12Y\n");
+                request_mem_region(0xFFB00000, 0x100000, "twinhead");
+        }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
-- 
cgit v1.2.3


From bb8187d35f820671d6dd76700d77a6b55f95e2c5 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 17 May 2012 19:06:13 -0400
Subject: MCA: delete all remaining traces of microchannel bus support.

Hardware with MCA bus is limited to 386 and 486 class machines
that are now 20+ years old and typically with less than 32MB
of memory.  A quick search on the internet, and you see that
even the MCA hobbyist/enthusiast community has lost interest
in the early 2000 era and never really even moved ahead from
the 2.4 kernels to the 2.6 series.

This deletes anything remaining related to CONFIG_MCA from core
kernel code and from the x86 architecture.  There is no point in
carrying this any further into the future.

One complication to watch for is inadvertently scooping up
stuff relating to machine check, since there is overlap in
the TLA name space (e.g. arch/x86/boot/mca.c).

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: James Bottomley <JBottomley@Parallels.com>
Cc: x86@kernel.org
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 Documentation/00-INDEX                |   2 -
 Documentation/DocBook/Makefile        |   2 +-
 Documentation/DocBook/kernel-api.tmpl |  13 -
 Documentation/DocBook/mcabook.tmpl    | 107 --------
 Documentation/devices.txt             |   8 +-
 Documentation/eisa.txt                |   2 +-
 Documentation/kernel-parameters.txt   |   1 -
 Documentation/mca.txt                 | 313 ----------------------
 MAINTAINERS                           |  13 -
 arch/frv/include/asm/processor.h      |   1 -
 arch/x86/Kconfig                      |  10 -
 arch/x86/include/asm/mca.h            |  43 ---
 arch/x86/include/asm/mca_dma.h        | 201 --------------
 arch/x86/include/asm/mpspec.h         |   2 +-
 arch/x86/include/asm/mpspec_def.h     |   3 +-
 arch/x86/kernel/Makefile              |   1 -
 arch/x86/kernel/acpi/boot.c           |   2 +-
 arch/x86/kernel/apic/io_apic.c        |  17 +-
 arch/x86/kernel/mca_32.c              | 476 ----------------------------------
 arch/x86/kernel/mpparse.c             |  11 +-
 arch/x86/kernel/nmi.c                 |  12 -
 arch/x86/kernel/setup.c               |   8 -
 arch/x86/kernel/time.c                |   6 -
 arch/x86/kernel/traps.c               |   4 -
 drivers/Makefile                      |   1 -
 drivers/mca/Kconfig                   |  14 -
 drivers/mca/Makefile                  |   7 -
 drivers/mca/mca-bus.c                 | 169 ------------
 drivers/mca/mca-device.c              | 218 ----------------
 drivers/mca/mca-driver.c              |  63 -----
 drivers/mca/mca-legacy.c              | 329 -----------------------
 drivers/mca/mca-proc.c                | 249 ------------------
 drivers/message/i2o/i2o_proc.c        |  13 -
 include/linux/i2o-dev.h               |   2 +-
 include/linux/mca-legacy.h            |  66 -----
 include/linux/mca.h                   | 148 -----------
 scripts/kconfig/mconf.c               |   2 +-
 scripts/kconfig/nconf.c               |   2 +-
 38 files changed, 15 insertions(+), 2526 deletions(-)
 delete mode 100644 Documentation/DocBook/mcabook.tmpl
 delete mode 100644 Documentation/mca.txt
 delete mode 100644 arch/x86/include/asm/mca.h
 delete mode 100644 arch/x86/include/asm/mca_dma.h
 delete mode 100644 arch/x86/kernel/mca_32.c
 delete mode 100644 drivers/mca/Kconfig
 delete mode 100644 drivers/mca/Makefile
 delete mode 100644 drivers/mca/mca-bus.c
 delete mode 100644 drivers/mca/mca-device.c
 delete mode 100644 drivers/mca/mca-driver.c
 delete mode 100644 drivers/mca/mca-legacy.c
 delete mode 100644 drivers/mca/mca-proc.c
 delete mode 100644 include/linux/mca-legacy.h
 delete mode 100644 include/linux/mca.h

(limited to 'arch/x86')

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 2214f123a976..49c051380daf 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -218,8 +218,6 @@ m68k/
 	- directory with info about Linux on Motorola 68k architecture.
 magic-number.txt
 	- list of magic numbers used to mark/protect kernel data structures.
-mca.txt
-	- info on supporting Micro Channel Architecture (e.g. PS/2) systems.
 md.txt
 	- info on boot arguments for the multiple devices driver.
 memory-barriers.txt
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 66725a3d30dc..bc3d9f8c0a90 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -6,7 +6,7 @@
 # To add a new book the only step required is to add the book to the
 # list of DOCBOOKS.
 
-DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
+DOCBOOKS := z8530book.xml device-drivers.xml \
 	    kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
 	    writing_usb_driver.xml networking.xml \
 	    kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 7160652a8736..00687ee9d363 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -212,19 +212,6 @@ X!Edrivers/pci/hotplug.c
      <sect1><title>PCI Hotplug Support Library</title>
 !Edrivers/pci/hotplug/pci_hotplug_core.c
      </sect1>
-     <sect1><title>MCA Architecture</title>
-	<sect2><title>MCA Device Functions</title>
-           <para>
-              Refer to the file arch/x86/kernel/mca_32.c for more information.
-           </para>
-<!-- FIXME: Removed for now since no structured comments in source
-X!Earch/x86/kernel/mca_32.c
--->
-	</sect2>
-	<sect2><title>MCA Bus DMA</title>
-!Iarch/x86/include/asm/mca_dma.h
-	</sect2>
-     </sect1>
   </chapter>
 
   <chapter id="firmware">
diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
deleted file mode 100644
index 467ccac6ec50..000000000000
--- a/Documentation/DocBook/mcabook.tmpl
+++ /dev/null
@@ -1,107 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="MCAGuide">
- <bookinfo>
-  <title>MCA Driver Programming Interface</title>
-  
-  <authorgroup>
-   <author>
-    <firstname>Alan</firstname>
-    <surname>Cox</surname>
-    <affiliation>
-     <address>
-      <email>alan@lxorguk.ukuu.org.uk</email>
-     </address>
-    </affiliation>
-   </author>
-   <author>
-    <firstname>David</firstname>
-    <surname>Weinehall</surname>
-   </author>
-   <author>
-    <firstname>Chris</firstname>
-    <surname>Beauregard</surname>
-   </author>
-  </authorgroup>
-
-  <copyright>
-   <year>2000</year>
-   <holder>Alan Cox</holder>
-   <holder>David Weinehall</holder>
-   <holder>Chris Beauregard</holder>
-  </copyright>
-
-  <legalnotice>
-   <para>
-     This documentation is free software; you can redistribute
-     it and/or modify it under the terms of the GNU General Public
-     License as published by the Free Software Foundation; either
-     version 2 of the License, or (at your option) any later
-     version.
-   </para>
-      
-   <para>
-     This program is distributed in the hope that it will be
-     useful, but WITHOUT ANY WARRANTY; without even the implied
-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-     See the GNU General Public License for more details.
-   </para>
-      
-   <para>
-     You should have received a copy of the GNU General Public
-     License along with this program; if not, write to the Free
-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
-     MA 02111-1307 USA
-   </para>
-      
-   <para>
-     For more details see the file COPYING in the source
-     distribution of Linux.
-   </para>
-  </legalnotice>
- </bookinfo>
-
-<toc></toc>
-
-  <chapter id="intro">
-      <title>Introduction</title>
-  <para>
-	The MCA bus functions provide a generalised interface to find MCA
-	bus cards, to claim them for a driver, and to read and manipulate POS 
-	registers without being aware of the motherboard internals or 
-	certain deep magic specific to onboard devices.
-  </para>
-  <para>
-	The basic interface to the MCA bus devices is the slot. Each slot
-	is numbered and virtual slot numbers are assigned to the internal
-	devices. Using a pci_dev as other busses do does not really make
-	sense in the MCA context as the MCA bus resources require card
-	specific interpretation.
-  </para>
-  <para>
-	Finally the MCA bus functions provide a parallel set of DMA
-	functions mimicing the ISA bus DMA functions as closely as possible,
-	although also supporting the additional DMA functionality on the
-	MCA bus controllers.
-  </para>
-  </chapter>
-  <chapter id="bugs">
-     <title>Known Bugs And Assumptions</title>
-  <para>
-	None.	
-  </para>
-  </chapter>
-
-  <chapter id="pubfunctions">
-     <title>Public Functions Provided</title>
-!Edrivers/mca/mca-legacy.c
-  </chapter>
-
-  <chapter id="dmafunctions">
-     <title>DMA Functions Provided</title>
-!Iarch/x86/include/asm/mca_dma.h
-  </chapter>
-
-</book>
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 00383186d8fb..c162be1c3234 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -846,13 +846,7 @@ Your cooperation is appreciated.
 		    ...
 		 31 = /dev/tap15	16th Ethertap device
 
- 36 block	MCA ESDI hard disk
-		  0 = /dev/eda		First ESDI disk whole disk
-		 64 = /dev/edb		Second ESDI disk whole disk
-		    ...
-
-		Partitions are handled in the same way as IDE disks
-		(see major number 3).
+ 36 block	OBSOLETE (was MCA ESDI hard disk)
 
  37 char	IDE tape
 		  0 = /dev/ht0		First IDE tape
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
index 38cf0c7b559f..a55e4910924e 100644
--- a/Documentation/eisa.txt
+++ b/Documentation/eisa.txt
@@ -179,7 +179,7 @@ CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
 
 Converting an EISA driver to the new API mostly involves *deleting*
 code (since probing is now in the core EISA code). Unfortunately, most
-drivers share their probing routine between ISA, MCA and EISA. Special
+drivers share their probing routine between ISA, and EISA. Special
 care must be taken when ripping out the EISA code, so other busses
 won't suffer from these surgical strikes...
 
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5a8b71..38cad53620cc 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -70,7 +70,6 @@ parameter is applicable:
 	M68k	M68k architecture is enabled.
 			These options have more detailed description inside of
 			Documentation/m68k/kernel-options.txt.
-	MCA	MCA bus support is enabled.
 	MDA	MDA console support is enabled.
 	MIPS	MIPS architecture is enabled.
 	MOUSE	Appropriate mouse support is enabled.
diff --git a/Documentation/mca.txt b/Documentation/mca.txt
deleted file mode 100644
index dfd130c2207d..000000000000
--- a/Documentation/mca.txt
+++ /dev/null
@@ -1,313 +0,0 @@
-i386 Micro Channel Architecture Support
-=======================================
-
-MCA support is enabled using the CONFIG_MCA define.  A machine with a MCA
-bus will have the kernel variable MCA_bus set, assuming the BIOS feature
-bits are set properly (see arch/i386/boot/setup.S for information on
-how this detection is done).
-
-Adapter Detection
-=================
-
-The ideal MCA adapter detection is done through the use of the
-Programmable Option Select registers.  Generic functions for doing
-this have been added in include/linux/mca.h and arch/x86/kernel/mca_32.c.
-Everything needed to detect adapters and read (and write) configuration
-information is there.  A number of MCA-specific drivers already use
-this.  The typical probe code looks like the following:
-
-	#include <linux/mca.h>
-
-	unsigned char pos2, pos3, pos4, pos5;
-	struct net_device* dev;
-	int slot;
-
-	if( MCA_bus ) {
-		slot = mca_find_adapter( ADAPTER_ID, 0 );
-		if( slot == MCA_NOTFOUND ) {
-			return -ENODEV;
-		}
-		/* optional - see below */
-		mca_set_adapter_name( slot, "adapter name & description" );
-		mca_set_adapter_procfn( slot, dev_getinfo, dev );
-
-		/* read the POS registers.  Most devices only use 2 and 3 */
-		pos2 = mca_read_stored_pos( slot, 2 );
-		pos3 = mca_read_stored_pos( slot, 3 );
-		pos4 = mca_read_stored_pos( slot, 4 );
-		pos5 = mca_read_stored_pos( slot, 5 );
-	} else {
-		return -ENODEV;
-	}
-
-	/* extract configuration from pos[2345] and set everything up */
-
-Loadable modules should modify this to test that the specified IRQ and
-IO ports (plus whatever other stuff) match.  See 3c523.c for example
-code (actually, smc-mca.c has a slightly more complex example that can
-handle a list of adapter ids).
-
-Keep in mind that devices should never directly access the POS registers
-(via inb(), outb(), etc).  While it's generally safe, there is a small
-potential for blowing up hardware when it's done at the wrong time.
-Furthermore, accessing a POS register disables a device temporarily.
-This is usually okay during startup, but do _you_ want to rely on it?
-During initial configuration, mca_init() reads all the POS registers
-into memory.  mca_read_stored_pos() accesses that data.  mca_read_pos()
-and mca_write_pos() are also available for (safer) direct POS access,
-but their use is _highly_ discouraged.  mca_write_pos() is particularly
-dangerous, as it is possible for adapters to be put in inconsistent
-states (i.e. sharing IO address, etc) and may result in crashes, toasted
-hardware, and blindness.
-
-User level drivers (such as the AGX X server) can use /proc/mca/pos to
-find adapters (see below).
-
-Some MCA adapters can also be detected via the usual ISA-style device
-probing (many SCSI adapters, for example).  This sort of thing is highly
-discouraged.  Perfectly good information is available telling you what's
-there, so there's no excuse for messing with random IO ports.  However,
-we MCA people still appreciate any ISA-style driver that will work with
-our hardware.  You take what you can get...
-
-Level-Triggered Interrupts
-==========================
-
-Because MCA uses level-triggered interrupts, a few problems arise with
-what might best be described as the ISA mindset and its effects on
-drivers.  These sorts of problems are expected to become less common as
-more people use shared IRQs on PCI machines.
-
-In general, an interrupt must be acknowledged not only at the ICU (which
-is done automagically by the kernel), but at the device level.  In
-particular, IRQ 0 must be reset after a timer interrupt (now done in
-arch/x86/kernel/time.c) or the first timer interrupt hangs the system.
-There were also problems with the 1.3.x floppy drivers, but that seems
-to have been fixed.
-
-IRQs are also shareable, and most MCA-specific devices should be coded
-with shared IRQs in mind.
-
-/proc/mca
-=========
-
-/proc/mca is a directory containing various files for adapters and
-other stuff.
-
-	/proc/mca/pos		Straight listing of POS registers
-	/proc/mca/slot[1-8]	Information on adapter in specific slot
-	/proc/mca/video		Same for integrated video
-	/proc/mca/scsi		Same for integrated SCSI
-	/proc/mca/machine	Machine information
-
-See Appendix A for a sample.
-
-Device drivers can easily add their own information function for
-specific slots (including integrated ones) via the
-mca_set_adapter_procfn() call.  Drivers that support this are ESDI, IBM
-SCSI, and 3c523.  If a device is also a module, make sure that the proc
-function is removed in the module cleanup.  This will require storing
-the slot information in a private structure somewhere.  See the 3c523
-driver for details.
-
-Your typical proc function will look something like this:
-
-	static int
-	dev_getinfo( char* buf, int slot, void* d ) {
-		struct net_device* dev = (struct net_device*) d;
-		int len = 0;
-
-		len += sprintf( buf+len, "Device: %s\n", dev->name );
-		len += sprintf( buf+len, "IRQ: %d\n", dev->irq );
-		len += sprintf( buf+len, "IO Port: %#lx-%#lx\n", ... );
-		...
-
-		return len;
-	}
-
-Some of the standard MCA information will already be printed, so don't
-bother repeating it.  Don't try putting in more than 3K of information.
-
-Enable this function with:
-	mca_set_adapter_procfn( slot, dev_getinfo, dev );
-
-Disable it with:
-	mca_set_adapter_procfn( slot, NULL, NULL );
-
-It is also recommended that, even if you don't write a proc function, to
-set the name of the adapter (i.e. "PS/2 ESDI Controller") via
-mca_set_adapter_name( int slot, char* name ).
-
-MCA Device Drivers
-==================
-
-Currently, there are a number of MCA-specific device drivers.
-
-1) PS/2 SCSI
-	drivers/scsi/ibmmca.c
-	drivers/scsi/ibmmca.h
-   The driver for the IBM SCSI subsystem.  Includes both integrated
-   controllers and adapter cards.  May require command-line arg
-   "ibmmcascsi=io_port" to force detection of an adapter.  If you have a
-   machine with a front-panel display (i.e. model 95), you can use
-   "ibmmcascsi=display" to enable a drive activity indicator.
-
-2) 3c523
-	drivers/net/3c523.c
-	drivers/net/3c523.h
-   3Com 3c523 Etherlink/MC ethernet driver.
-
-3) SMC Ultra/MCA and IBM Adapter/A
-	drivers/net/smc-mca.c
-	drivers/net/smc-mca.h
-	Driver for the MCA version of the SMC Ultra and various other
-	OEM'ed and work-alike cards (Elite, Adapter/A, etc).
-
-4) NE/2
-	driver/net/ne2.c
-	driver/net/ne2.h
-	The NE/2 is the MCA version of the NE2000.  This may not work
-	with clones that have a different adapter id than the original
-	NE/2.
-
-5) Future Domain MCS-600/700, OEM'd IBM Fast SCSI Adapter/A and
-   Reply Sound Blaster/SCSI (SCSI part)
-	Better support for these cards than the driver for ISA.
-   Supports multiple cards with IRQ sharing.
-
-Also added boot time option of scsi-probe, which can do reordering of
-SCSI host adapters. This will direct the kernel on the order which
-SCSI adapter should be detected. Example:
-  scsi-probe=ibmmca,fd_mcs,adaptec1542,buslogic
-
-The serial drivers were modified to support the extended IO port range
-of the typical MCA system (also #ifdef CONFIG_MCA).
-
-The following devices work with existing drivers:
-1) Token-ring
-2) Future Domain SCSI (MCS-600, MCS-700, not MCS-350, OEM'ed IBM SCSI)
-3) Adaptec 1640 SCSI (using the aha1542 driver)
-4) Bustek/Buslogic SCSI (various)
-5) Probably all Arcnet cards.
-6) Some, possibly all, MCA IDE controllers.
-7) 3Com 3c529 (MCA version of 3c509) (patched)
-
-8) Intel EtherExpressMC  (patched version)
-   You need to have CONFIG_MCA defined to have EtherExpressMC support.
-9) Reply Sound Blaster/SCSI (SB part) (patched version)
-
-Bugs & Other Weirdness
-======================
-
-NMIs tend to occur with MCA machines because of various hardware
-weirdness, bus timeouts, and many other non-critical things.  Some basic
-code to handle them (inspired by the NetBSD MCA code) has been added to
-detect the guilty device, but it's pretty incomplete.  If NMIs are a
-persistent problem (on some model 70 or 80s, they occur every couple
-shell commands), the CONFIG_IGNORE_NMI flag will take care of that.
-
-Various Pentium machines have had serious problems with the FPU test in
-bugs.h.  Basically, the machine hangs after the HLT test.  This occurs,
-as far as we know, on the Pentium-equipped 85s, 95s, and some PC Servers.
-The PCI/MCA PC 750s are fine as far as I can tell.  The ``mca-pentium''
-boot-prompt flag will disable the FPU bug check if this is a problem
-with your machine.
-
-The model 80 has a raft of problems that are just too weird and unique
-to get into here.  Some people have no trouble while others have nothing
-but problems.  I'd suspect some problems are related to the age of the
-average 80 and accompanying hardware deterioration, although others
-are definitely design problems with the hardware.  Among the problems
-include SCSI controller problems, ESDI controller problems, and serious
-screw-ups in the floppy controller.  Oh, and the parallel port is also
-pretty flaky.  There were about 5 or 6 different model 80 motherboards
-produced to fix various obscure problems.  As far as I know, it's pretty
-much impossible to tell which bugs a particular model 80 has (other than
-triggering them, that is).
-
-Drivers are required for some MCA memory adapters.  If you're suddenly
-short a few megs of RAM, this might be the reason.  The (I think) Enhanced
-Memory Adapter commonly found on the model 70 is one.  There's a very
-alpha driver floating around, but it's pretty ugly (disassembled from
-the DOS driver, actually).  See the MCA Linux web page (URL below)
-for more current memory info.
-
-The Thinkpad 700 and 720 will work, but various components are either
-non-functional, flaky, or we don't know anything about them.  The
-graphics controller is supposed to be some WD, but we can't get things
-working properly.  The PCMCIA slots don't seem to work.  Ditto for APM.
-The serial ports work, but detection seems to be flaky.
-
-Credits
-=======
-A whole pile of people have contributed to the MCA code.  I'd include
-their names here, but I don't have a list handy.  Check the MCA Linux
-home page (URL below) for a perpetually out-of-date list.
-
-=====================================================================
-MCA Linux Home Page: http://www.dgmicro.com/mca/
-
-Christophe Beauregard
-chrisb@truespectra.com
-cpbeaure@calum.csclub.uwaterloo.ca
-
-=====================================================================
-Appendix A: Sample /proc/mca
-
-This is from my model 8595.  Slot 1 contains the standard IBM SCSI
-adapter, slot 3 is an Adaptec AHA-1640, slot 5 is a XGA-1 video adapter,
-and slot 7 is the 3c523 Etherlink/MC.
-
-/proc/mca/machine:
-Model Id: 0xf8
-Submodel Id: 0x14
-BIOS Revision: 0x5
-
-/proc/mca/pos:
-Slot 1: ff 8e f1 fc a0 ff ff ff  IBM SCSI Adapter w/Cache
-Slot 2: ff ff ff ff ff ff ff ff  
-Slot 3: 1f 0f 81 3b bf b6 ff ff  
-Slot 4: ff ff ff ff ff ff ff ff  
-Slot 5: db 8f 1d 5e fd c0 00 00  
-Slot 6: ff ff ff ff ff ff ff ff  
-Slot 7: 42 60 ff 08 ff ff ff ff  3Com 3c523 Etherlink/MC
-Slot 8: ff ff ff ff ff ff ff ff  
-Video : ff ff ff ff ff ff ff ff  
-SCSI  : ff ff ff ff ff ff ff ff  
-
-/proc/mca/slot1:
-Slot: 1
-Adapter Name: IBM SCSI Adapter w/Cache
-Id: 8eff
-Enabled: Yes
-POS: ff 8e f1 fc a0 ff ff ff 
-Subsystem PUN: 7
-Detected at boot: Yes
-
-/proc/mca/slot3:
-Slot: 3
-Adapter Name: Unknown
-Id: 0f1f
-Enabled: Yes
-POS: 1f 0f 81 3b bf b6 ff ff 
-
-/proc/mca/slot5:
-Slot: 5
-Adapter Name: Unknown
-Id: 8fdb
-Enabled: Yes
-POS: db 8f 1d 5e fd c0 00 00 
-
-/proc/mca/slot7:
-Slot: 7
-Adapter Name: 3Com 3c523 Etherlink/MC
-Id: 6042
-Enabled: Yes
-POS: 42 60 ff 08 ff ff ff ff 
-Revision: 0xe
-IRQ: 9
-IO Address: 0x3300-0x3308
-Memory: 0xd8000-0xdbfff
-Transceiver: External
-Device: eth0
-Hardware Address: 02 60 8c 45 c4 2a
diff --git a/MAINTAINERS b/MAINTAINERS
index 490dd6e640ac..9fa728b53d4e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3316,12 +3316,6 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git
 S:	Maintained
 F:	arch/ia64/
 
-IBM MCA SCSI SUBSYSTEM DRIVER
-M:	Michael Lang <langa2@kph.uni-mainz.de>
-W:	http://www.uni-mainz.de/~langm000/linux.html
-S:	Maintained
-F:	drivers/scsi/ibmmca.c
-
 IBM Power Linux RAID adapter
 M:	Brian King <brking@us.ibm.com>
 S:	Supported
@@ -4418,13 +4412,6 @@ T:	git git://git.monstr.eu/linux-2.6-microblaze.git
 S:	Supported
 F:	arch/microblaze/
 
-MICROCHANNEL ARCHITECTURE (MCA)
-M:	James Bottomley <James.Bottomley@HansenPartnership.com>
-S:	Maintained
-F:	Documentation/mca.txt
-F:	drivers/mca/
-F:	include/linux/mca*
-
 MICROTEK X6 SCANNER
 M:	Oliver Neukum <oliver@neukum.name>
 S:	Maintained
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index 81c2e271d620..4a53811cd4cd 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -54,7 +54,6 @@ extern struct cpuinfo_frv __nongprelbss boot_cpu_data;
  * Bus types
  */
 #define EISA_bus 0
-#define MCA_bus 0
 
 struct thread_struct {
 	struct pt_regs		*frame;		/* [GR28] exception frame ptr for this thread */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9866b0b77d8..9137057152c3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2023,16 +2023,6 @@ config EISA
 
 source "drivers/eisa/Kconfig"
 
-config MCA
-	bool "MCA support"
-	---help---
-	  MicroChannel Architecture is found in some IBM PS/2 machines and
-	  laptops.  It is a bus system similar to PCI or ISA. See
-	  <file:Documentation/mca.txt> (and especially the web page given
-	  there) before attempting to build an MCA bus kernel.
-
-source "drivers/mca/Kconfig"
-
 config SCx200
 	tristate "NatSemi SCx200 support"
 	---help---
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
deleted file mode 100644
index eedbb6cc1efb..000000000000
--- a/arch/x86/include/asm/mca.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/* Platform specific MCA defines */
-#ifndef _ASM_X86_MCA_H
-#define _ASM_X86_MCA_H
-
-/* Maximal number of MCA slots - actually, some machines have less, but
- * they all have sufficient number of POS registers to cover 8.
- */
-#define MCA_MAX_SLOT_NR  8
-
-/* Most machines have only one MCA bus.  The only multiple bus machines
- * I know have at most two */
-#define MAX_MCA_BUSSES 2
-
-#define MCA_PRIMARY_BUS		0
-#define MCA_SECONDARY_BUS	1
-
-/* Dummy slot numbers on primary MCA for integrated functions */
-#define MCA_INTEGSCSI	(MCA_MAX_SLOT_NR)
-#define MCA_INTEGVIDEO	(MCA_MAX_SLOT_NR+1)
-#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
-
-/* Dummy POS values for integrated functions */
-#define MCA_DUMMY_POS_START	0x10000
-#define MCA_INTEGSCSI_POS	(MCA_DUMMY_POS_START+1)
-#define MCA_INTEGVIDEO_POS	(MCA_DUMMY_POS_START+2)
-#define MCA_MOTHERBOARD_POS	(MCA_DUMMY_POS_START+3)
-
-/* MCA registers */
-
-#define MCA_MOTHERBOARD_SETUP_REG	0x94
-#define MCA_ADAPTER_SETUP_REG		0x96
-#define MCA_POS_REG(n)			(0x100+(n))
-
-#define MCA_ENABLED	0x01	/* POS 2, set if adapter enabled */
-
-/* Max number of adapters, including both slots and various integrated
- * things.
- */
-#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
-
-#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
deleted file mode 100644
index 45271aef82dd..000000000000
--- a/arch/x86/include/asm/mca_dma.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef _ASM_X86_MCA_DMA_H
-#define _ASM_X86_MCA_DMA_H
-
-#include <asm/io.h>
-#include <linux/ioport.h>
-
-/*
- * Microchannel specific DMA stuff.  DMA on an MCA machine is fairly similar to
- *   standard PC dma, but it certainly has its quirks.  DMA register addresses
- *   are in a different place and there are some added functions.  Most of this
- *   should be pretty obvious on inspection.  Note that the user must divide
- *   count by 2 when using 16-bit dma; that is not handled by these functions.
- *
- * Ramen Noodles are yummy.
- *
- *  1998 Tymm Twillman <tymm@computer.org>
- */
-
-/*
- * Registers that are used by the DMA controller; FN is the function register
- *   (tell the controller what to do) and EXE is the execution register (how
- *   to do it)
- */
-
-#define MCA_DMA_REG_FN  0x18
-#define MCA_DMA_REG_EXE 0x1A
-
-/*
- * Functions that the DMA controller can do
- */
-
-#define MCA_DMA_FN_SET_IO       0x00
-#define MCA_DMA_FN_SET_ADDR     0x20
-#define MCA_DMA_FN_GET_ADDR     0x30
-#define MCA_DMA_FN_SET_COUNT    0x40
-#define MCA_DMA_FN_GET_COUNT    0x50
-#define MCA_DMA_FN_GET_STATUS   0x60
-#define MCA_DMA_FN_SET_MODE     0x70
-#define MCA_DMA_FN_SET_ARBUS    0x80
-#define MCA_DMA_FN_MASK         0x90
-#define MCA_DMA_FN_RESET_MASK   0xA0
-#define MCA_DMA_FN_MASTER_CLEAR 0xD0
-
-/*
- * Modes (used by setting MCA_DMA_FN_MODE in the function register)
- *
- * Note that the MODE_READ is read from memory (write to device), and
- *   MODE_WRITE is vice-versa.
- */
-
-#define MCA_DMA_MODE_XFER  0x04  /* read by default */
-#define MCA_DMA_MODE_READ  0x04  /* same as XFER */
-#define MCA_DMA_MODE_WRITE 0x08  /* OR with MODE_XFER to use */
-#define MCA_DMA_MODE_IO    0x01  /* DMA from IO register */
-#define MCA_DMA_MODE_16    0x40  /* 16 bit xfers */
-
-
-/**
- *	mca_enable_dma	-	channel to enable DMA on
- *	@dmanr: DMA channel
- *
- *	Enable the MCA bus DMA on a channel. This can be called from
- *	IRQ context.
- */
-
-static inline void mca_enable_dma(unsigned int dmanr)
-{
-	outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
-}
-
-/**
- *	mca_disble_dma	-	channel to disable DMA on
- *	@dmanr: DMA channel
- *
- *	Enable the MCA bus DMA on a channel. This can be called from
- *	IRQ context.
- */
-
-static inline void mca_disable_dma(unsigned int dmanr)
-{
-	outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
-}
-
-/**
- *	mca_set_dma_addr -	load a 24bit DMA address
- *	@dmanr: DMA channel
- *	@a: 24bit bus address
- *
- *	Load the address register in the DMA controller. This has a 24bit
- *	limitation (16Mb).
- */
-
-static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
-{
-	outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
-	outb(a & 0xff, MCA_DMA_REG_EXE);
-	outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
-	outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- *	mca_get_dma_addr -	load a 24bit DMA address
- *	@dmanr: DMA channel
- *
- *	Read the address register in the DMA controller. This has a 24bit
- *	limitation (16Mb). The return is a bus address.
- */
-
-static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
-{
-	unsigned int addr;
-
-	outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
-	addr = inb(MCA_DMA_REG_EXE);
-	addr |= inb(MCA_DMA_REG_EXE) << 8;
-	addr |= inb(MCA_DMA_REG_EXE) << 16;
-
-	return addr;
-}
-
-/**
- *	mca_set_dma_count -	load a 16bit transfer count
- *	@dmanr: DMA channel
- *	@count: count
- *
- *	Set the DMA count for this channel. This can be up to 64Kbytes.
- *	Setting a count of zero will not do what you expect.
- */
-
-static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
-{
-	count--;  /* transfers one more than count -- correct for this */
-
-	outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
-	outb(count & 0xff, MCA_DMA_REG_EXE);
-	outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- *	mca_get_dma_residue -	get the remaining bytes to transfer
- *	@dmanr: DMA channel
- *
- *	This function returns the number of bytes left to transfer
- *	on this DMA channel.
- */
-
-static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
-{
-	unsigned short count;
-
-	outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
-	count = 1 + inb(MCA_DMA_REG_EXE);
-	count += inb(MCA_DMA_REG_EXE) << 8;
-
-	return count;
-}
-
-/**
- *	mca_set_dma_io -	set the port for an I/O transfer
- *	@dmanr: DMA channel
- *	@io_addr: an I/O port number
- *
- *	Unlike the ISA bus DMA controllers the DMA on MCA bus can transfer
- *	with an I/O port target.
- */
-
-static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
-{
-	/*
-	 * DMA from a port address -- set the io address
-	 */
-
-	outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
-	outb(io_addr & 0xff, MCA_DMA_REG_EXE);
-	outb((io_addr >>  8) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- *	mca_set_dma_mode -	set the DMA mode
- *	@dmanr: DMA channel
- *	@mode: mode to set
- *
- *	The DMA controller supports several modes. The mode values you can
- *	set are-
- *
- *	%MCA_DMA_MODE_READ when reading from the DMA device.
- *
- *	%MCA_DMA_MODE_WRITE to writing to the DMA device.
- *
- *	%MCA_DMA_MODE_IO to do DMA to or from an I/O port.
- *
- *	%MCA_DMA_MODE_16 to do 16bit transfers.
- */
-
-static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
-{
-	outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
-	outb(mode, MCA_DMA_REG_EXE);
-}
-
-#endif /* _ASM_X86_MCA_DMA_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 9c7d95f6174b..3e2f42a4b872 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -40,7 +40,7 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 
 #endif /* CONFIG_X86_64 */
 
-#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
+#ifdef CONFIG_EISA
 extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
 
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index c0a955a9a087..b31f8c098271 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -84,7 +84,7 @@ struct mpc_bus {
 #define BUSTYPE_EISA	"EISA"
 #define BUSTYPE_ISA	"ISA"
 #define BUSTYPE_INTERN	"INTERN"	/* Internal BUS */
-#define BUSTYPE_MCA	"MCA"
+#define BUSTYPE_MCA	"MCA"		/* Obsolete */
 #define BUSTYPE_VL	"VL"		/* Local bus */
 #define BUSTYPE_PCI	"PCI"
 #define BUSTYPE_PCMCIA	"PCMCIA"
@@ -169,6 +169,5 @@ enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,
 	MP_BUS_PCI,
-	MP_BUS_MCA,
 };
 #endif /* _ASM_X86_MPSPEC_DEF_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 532d2e090e6f..7d1569947204 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -49,7 +49,6 @@ obj-y				+= cpu/
 obj-y				+= acpi/
 obj-y				+= reboot.o
 obj-$(CONFIG_X86_32)		+= reboot_32.o
-obj-$(CONFIG_MCA)		+= mca_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_PCI)		+= early-quirks.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a415b1f44365..f564b189de1a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	int i;
 	struct mpc_intsrc mp_irq;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
 	/*
 	 * Fabricate the legacy ISA bus (bus #31).
 	 */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80a..675e9045a3c5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -142,7 +142,7 @@ int mp_irq_entries;
 /* GSI interrupts */
 static int nr_irqs_gsi = NR_IRQS_LEGACY;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
 
@@ -875,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	return -1;
 }
 
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 /*
  * EISA Edge/Level control register, ELCR
  */
@@ -912,12 +912,6 @@ static int EISA_ELCR(unsigned int irq)
 #define default_PCI_trigger(idx)	(1)
 #define default_PCI_polarity(idx)	(1)
 
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx)	(1)
-#define default_MCA_polarity(idx)	default_ISA_polarity(idx)
-
 static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
@@ -975,7 +969,7 @@ static int irq_trigger(int idx)
 				trigger = default_ISA_trigger(idx);
 			else
 				trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 			switch (mp_bus_id_to_type[bus]) {
 				case MP_BUS_ISA: /* ISA pin */
 				{
@@ -992,11 +986,6 @@ static int irq_trigger(int idx)
 					/* set before the switch */
 					break;
 				}
-				case MP_BUS_MCA: /* MCA pin */
-				{
-					trigger = default_MCA_trigger(idx);
-					break;
-				}
 				default:
 				{
 					printk(KERN_WARNING "broken BIOS!!\n");
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 7eb1e2b97827..000000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- *  Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- *	Chris Beauregard July 28th, 1996
- *	- Fixed up integrated SCSI detection
- *
- *	Chris Beauregard August 3rd, 1996
- *	- Made mca_info local
- *	- Made integrated registers accessible through standard function calls
- *	- Added name field
- *	- More sanity checking
- *
- *	Chris Beauregard August 9th, 1996
- *	- Rewrote /proc/mca
- *
- *	Chris Beauregard January 7th, 1997
- *	- Added basic NMI-processing
- *	- Added more information to mca_info structure
- *
- *	David Weinehall October 12th, 1998
- *	- Made a lot of cleaning up in the source
- *	- Added use of save_flags / restore_flags
- *	- Added the 'driver_loaded' flag in MCA_adapter
- *	- Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- *	David Weinehall March 24th, 1999
- *	- Fixed the output of 'Driver Installed' in /proc/mca/pos
- *	- Made the Integrated Video & SCSI show up even if they have id 0000
- *
- *	Alexander Viro November 9th, 1999
- *	- Switched to regular procfs methods
- *
- *	Alfred Arnold & David Weinehall August 23rd, 2000
- *	- Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
-	mca_dev->status = MCA_ADAPTER_NONE;
-
-	mca_dev->pos_id = mca_dev->pos[0]
-		+ (mca_dev->pos[1] << 8);
-
-	if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
-		/*
-		 * id = 0x0000 usually indicates hardware failure,
-		 * however, ZP Gu (zpg@castle.net> reports that his 9556
-		 * has 0x0000 as id and everything still works. There
-		 * also seem to be an adapter with id = 0x0000; the
-		 * NCR Parallel Bus Memory Card. Until this is confirmed,
-		 * however, this code will stay.
-		 */
-
-		mca_dev->status = MCA_ADAPTER_ERROR;
-
-		return;
-	} else if (mca_dev->pos_id != 0xffff) {
-
-		/*
-		 * 0xffff usually indicates that there's no adapter,
-		 * however, some integrated adapters may have 0xffff as
-		 * their id and still be valid. Examples are on-board
-		 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
-		 * and possibly also the 95 ULTIMEDIA.
-		 */
-
-		mca_dev->status = MCA_ADAPTER_NORMAL;
-	}
-
-	if ((mca_dev->pos_id == 0xffff ||
-	    mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
-		int j;
-
-		for (j = 2; j < 8; j++) {
-			if (mca_dev->pos[j] != 0xff) {
-				mca_dev->status = MCA_ADAPTER_NORMAL;
-				break;
-			}
-		}
-	}
-
-	if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
-		/* enabled bit is in POS 2 */
-
-		mca_dev->status = MCA_ADAPTER_DISABLED;
-	}
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
-	{ .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
-	{ .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
-	{ .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
-	{ .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
-	{ .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
-	{ .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
-	{ .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES	ARRAY_SIZE(mca_standard_resources)
-
-/*
- *	mca_read_and_store_pos - read the POS registers into a memory buffer
- *      @pos: a char pointer to 8 bytes, contains the POS register value on
- *            successful return
- *
- *	Returns 1 if a card actually exists (i.e. the pos isn't
- *	all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
-	int j;
-	int found = 0;
-
-	for (j = 0; j < 8; j++) {
-		pos[j] = inb_p(MCA_POS_REG(j));
-		if (pos[j] != 0xff) {
-			/* 0xff all across means no device. 0x00 means
-			 * something's broken, but a device is
-			 * probably there.  However, if you get 0x00
-			 * from a motherboard register it won't matter
-			 * what we find.  For the record, on the
-			 * 57SLC, the integrated SCSI adapter has
-			 * 0xffff for the adapter ID, but nonzero for
-			 * other registers.  */
-
-			found = 1;
-		}
-	}
-	return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
-	unsigned char byte;
-	unsigned long flags;
-
-	if (reg < 0 || reg >= 8)
-		return 0;
-
-	spin_lock_irqsave(&mca_lock, flags);
-	if (mca_dev->pos_register) {
-		/* Disable adapter setup, enable motherboard setup */
-
-		outb_p(0, MCA_ADAPTER_SETUP_REG);
-		outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
-		byte = inb_p(MCA_POS_REG(reg));
-		outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-	} else {
-
-		/* Make sure motherboard setup is off */
-
-		outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-		/* Read the appropriate register */
-
-		outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
-		byte = inb_p(MCA_POS_REG(reg));
-		outb_p(0, MCA_ADAPTER_SETUP_REG);
-	}
-	spin_unlock_irqrestore(&mca_lock, flags);
-
-	mca_dev->pos[reg] = byte;
-
-	return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
-			     unsigned char byte)
-{
-	unsigned long flags;
-
-	if (reg < 0 || reg >= 8)
-		return;
-
-	spin_lock_irqsave(&mca_lock, flags);
-
-	/* Make sure motherboard setup is off */
-
-	outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-	/* Read in the appropriate register */
-
-	outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
-	outb_p(byte, MCA_POS_REG(reg));
-	outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-	spin_unlock_irqrestore(&mca_lock, flags);
-
-	/* Update the global register list, while we have the byte */
-
-	mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
-	return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
-	return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
-	return mem;
-}
-
-
-static int __init mca_init(void)
-{
-	unsigned int i, j;
-	struct mca_device *mca_dev;
-	unsigned char pos[8];
-	short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
-	struct mca_bus *bus;
-
-	/*
-	 * WARNING: Be careful when making changes here. Putting an adapter
-	 * and the motherboard simultaneously into setup mode may result in
-	 * damage to chips (according to The Indispensable PC Hardware Book
-	 * by Hans-Peter Messmer). Also, we disable system interrupts (so
-	 * that we are not disturbed in the middle of this).
-	 */
-
-	/* Make sure the MCA bus is present */
-
-	if (mca_system_init()) {
-		printk(KERN_ERR "MCA bus system initialisation failed\n");
-		return -ENODEV;
-	}
-
-	if (!MCA_bus)
-		return -ENODEV;
-
-	printk(KERN_INFO "Micro Channel bus detected.\n");
-
-	/* All MCA systems have at least a primary bus */
-	bus = mca_attach_bus(MCA_PRIMARY_BUS);
-	if (!bus)
-		goto out_nomem;
-	bus->default_dma_mask = 0xffffffffLL;
-	bus->f.mca_write_pos = mca_pc_write_pos;
-	bus->f.mca_read_pos = mca_pc_read_pos;
-	bus->f.mca_transform_irq = mca_dummy_transform_irq;
-	bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
-	bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
-	/* get the motherboard device */
-	mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
-	if (unlikely(!mca_dev))
-		goto out_nomem;
-
-	/*
-	 * We do not expect many MCA interrupts during initialization,
-	 * but let us be safe:
-	 */
-	spin_lock_irq(&mca_lock);
-
-	/* Make sure adapter setup is off */
-
-	outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-	/* Read motherboard POS registers */
-
-	mca_dev->pos_register = 0x7f;
-	outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-	mca_dev->name[0] = 0;
-	mca_read_and_store_pos(mca_dev->pos);
-	mca_configure_adapter_status(mca_dev);
-	/* fake POS and slot for a motherboard */
-	mca_dev->pos_id = MCA_MOTHERBOARD_POS;
-	mca_dev->slot = MCA_MOTHERBOARD;
-	mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
-	mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-	if (unlikely(!mca_dev))
-		goto out_unlock_nomem;
-
-	/* Put motherboard into video setup mode, read integrated video
-	 * POS registers, and turn motherboard setup off.
-	 */
-
-	mca_dev->pos_register = 0xdf;
-	outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-	mca_dev->name[0] = 0;
-	mca_read_and_store_pos(mca_dev->pos);
-	mca_configure_adapter_status(mca_dev);
-	/* fake POS and slot for the integrated video */
-	mca_dev->pos_id = MCA_INTEGVIDEO_POS;
-	mca_dev->slot = MCA_INTEGVIDEO;
-	mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
-	/*
-	 * Put motherboard into scsi setup mode, read integrated scsi
-	 * POS registers, and turn motherboard setup off.
-	 *
-	 * It seems there are two possible SCSI registers. Martin says that
-	 * for the 56,57, 0xf7 is the one, but fails on the 76.
-	 * Alfredo (apena@vnet.ibm.com) says
-	 * 0xfd works on his machine. We'll try both of them. I figure it's
-	 * a good bet that only one could be valid at a time. This could
-	 * screw up though if one is used for something else on the other
-	 * machine.
-	 */
-
-	for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
-		outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
-		if (mca_read_and_store_pos(pos))
-			break;
-	}
-	if (which_scsi) {
-		/* found a scsi card */
-		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-		if (unlikely(!mca_dev))
-			goto out_unlock_nomem;
-
-		for (j = 0; j < 8; j++)
-			mca_dev->pos[j] = pos[j];
-
-		mca_configure_adapter_status(mca_dev);
-		/* fake POS and slot for integrated SCSI controller */
-		mca_dev->pos_id = MCA_INTEGSCSI_POS;
-		mca_dev->slot = MCA_INTEGSCSI;
-		mca_dev->pos_register = which_scsi;
-		mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-	}
-
-	/* Turn off motherboard setup */
-
-	outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
-	/*
-	 * Now loop over MCA slots: put each adapter into setup mode, and
-	 * read its POS registers. Then put adapter setup off.
-	 */
-
-	for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
-		outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
-		if (!mca_read_and_store_pos(pos))
-			continue;
-
-		mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
-		if (unlikely(!mca_dev))
-			goto out_unlock_nomem;
-
-		for (j = 0; j < 8; j++)
-			mca_dev->pos[j] = pos[j];
-
-		mca_dev->driver_loaded = 0;
-		mca_dev->slot = i;
-		mca_dev->pos_register = 0;
-		mca_configure_adapter_status(mca_dev);
-		mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-	}
-	outb_p(0, MCA_ADAPTER_SETUP_REG);
-
-	/* Enable interrupts and return memory start */
-	spin_unlock_irq(&mca_lock);
-
-	for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
-		request_resource(&ioport_resource, mca_standard_resources + i);
-
-	mca_do_proc_init();
-
-	return 0;
-
- out_unlock_nomem:
-	spin_unlock_irq(&mca_lock);
- out_nomem:
-	printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
-	return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
-	int slot = mca_dev->slot;
-
-	if (slot == MCA_INTEGSCSI) {
-		printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
-			mca_dev->name);
-	} else if (slot == MCA_INTEGVIDEO) {
-		printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
-			mca_dev->name);
-	} else if (slot == MCA_MOTHERBOARD) {
-		printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
-			mca_dev->name);
-	}
-
-	/* More info available in POS 6 and 7? */
-
-	if (check_flag) {
-		unsigned char pos6, pos7;
-
-		pos6 = mca_device_read_pos(mca_dev, 6);
-		pos7 = mca_device_read_pos(mca_dev, 7);
-
-		printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
-	}
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
-	struct mca_device *mca_dev = to_mca_device(dev);
-	unsigned char pos5;
-
-	pos5 = mca_device_read_pos(mca_dev, 5);
-
-	if (!(pos5 & 0x80)) {
-		/*
-		 *  Bit 7 of POS 5 is reset when this adapter has a hardware
-		 * error. Bit 7 it reset if there's error information
-		 * available in POS 6 and 7.
-		 */
-		mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
-		return 1;
-	}
-	return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
-	/*
-	 *  First try - scan the various adapters and see if a specific
-	 * adapter was responsible for the error.
-	 */
-	bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ca470e4c92dc..b02d4dd6b8a3 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -97,7 +97,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
 
 	set_bit(m->busid, mp_bus_not_pci);
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 		mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -105,12 +105,10 @@ static void __init MP_bus_info(struct mpc_bus *m)
 			x86_init.mpparse.mpc_oem_pci_bus(m);
 
 		clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
 		mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
 	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
 		mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
-	} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
-		mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
 #endif
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -368,9 +366,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
 	case 3:
 		memcpy(bus.bustype, "EISA  ", 6);
 		break;
-	case 4:
-	case 7:
-		memcpy(bus.bustype, "MCA   ", 6);
 	}
 	MP_bus_info(&bus);
 	if (mpc_default_type > 4) {
@@ -623,7 +618,7 @@ void __init default_find_smp_config(void)
 		return;
 	/*
 	 * If it is an SMP machine we should know now, unless the
-	 * configuration is in an EISA/MCA bus machine with an
+	 * configuration is in an EISA bus machine with an
 	 * extended bios data area.
 	 *
 	 * there is a real-mode segmented pointer pointing to the
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf319165..7b3fdfdabf94 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -19,8 +19,6 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 
-#include <linux/mca.h>
-
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
@@ -282,16 +280,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 
 	__this_cpu_add(nmi_stats.unknown, 1);
 
-#ifdef CONFIG_MCA
-	/*
-	 * Might actually be able to figure out what the guilty party
-	 * is:
-	 */
-	if (MCA_bus) {
-		mca_handle_nmi();
-		return;
-	}
-#endif
 	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
 		 reason, smp_processor_id());
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..879166402bf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,7 +34,6 @@
 #include <linux/memblock.h>
 #include <linux/seq_file.h>
 #include <linux/console.h>
-#include <linux/mca.h>
 #include <linux/root_dev.h>
 #include <linux/highmem.h>
 #include <linux/module.h>
@@ -179,12 +178,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
 /* common cpu data for all cpus */
 struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
 EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
-	MCA_bus = x;
-#endif
-}
 
 unsigned int def_to_bigsmp;
 
@@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p)
 	apm_info.bios = boot_params.apm_bios_info;
 	ist_info = boot_params.ist_info;
 	if (boot_params.sys_desc_table.length != 0) {
-		set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
 		machine_id = boot_params.sys_desc_table.table[0];
 		machine_submodel_id = boot_params.sys_desc_table.table[1];
 		BIOS_revision = boot_params.sys_desc_table.table[2];
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index c6eba2b42673..24d3c91e9812 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
 #include <linux/i8253.h>
 #include <linux/time.h>
 #include <linux/export.h>
-#include <linux/mca.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc);
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
 	global_clock_event->event_handler(global_clock_event);
-
-	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
-	if (MCA_bus)
-		outb_p(inb_p(0x61)| 0x80, 0x61);
-
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f16029..4754f510b360 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,6 @@
 #include <linux/eisa.h>
 #endif
 
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
diff --git a/drivers/Makefile b/drivers/Makefile
index 95952c82bf16..f9b82f2c7c47 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -92,7 +92,6 @@ obj-$(CONFIG_BT)		+= bluetooth/
 obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
-obj-$(CONFIG_MCA)		+= mca/
 obj-$(CONFIG_EISA)		+= eisa/
 obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
diff --git a/drivers/mca/Kconfig b/drivers/mca/Kconfig
deleted file mode 100644
index a7a0220ab4bd..000000000000
--- a/drivers/mca/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config MCA_LEGACY
-	bool "Legacy MCA API Support"
-	depends on MCA
-	help
-	  This compiles in support for the old slot based MCA API.  If you
-	  have an unconverted MCA driver, you will need to say Y here.  It
-	  is safe to say Y anyway.
-
-config MCA_PROC_FS
-	bool "Support for the mca entry in /proc"
-	depends on MCA_LEGACY && PROC_FS
-	help
-	  If you want the old style /proc/mca directory in addition to the
-	  new style sysfs say Y here.
diff --git a/drivers/mca/Makefile b/drivers/mca/Makefile
deleted file mode 100644
index 0794b122520e..000000000000
--- a/drivers/mca/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# Makefile for the Linux MCA bus support
-
-obj-y	:= mca-bus.o mca-device.o mca-driver.o
-
-obj-$(CONFIG_MCA_PROC_FS)	+= mca-proc.o
-obj-$(CONFIG_MCA_LEGACY)	+= mca-legacy.o
-
diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c
deleted file mode 100644
index ada5ebbaa255..000000000000
--- a/drivers/mca/mca-bus.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/*
- * MCA bus support functions for sysfs.
- *
- * (C) 2002 James Bottomley <James.Bottomley@HansenPartnership.com>
- *
-**-----------------------------------------------------------------------------
-**  
-**  This program is free software; you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation; either version 2 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program; if not, write to the Free Software
-**  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-**
-**-----------------------------------------------------------------------------
- */
-
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/mca.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-/* Very few machines have more than one MCA bus.  However, there are
- * those that do (Voyager 35xx/5xxx), so we do it this way for future
- * expansion.  None that I know have more than 2 */
-static struct mca_bus *mca_root_busses[MAX_MCA_BUSSES];
-
-#define MCA_DEVINFO(i,s) { .pos = i, .name = s }
-
-struct mca_device_info {
-	short pos_id;		/* the 2 byte pos id for this card */
-	char name[50];
-};
-
-static int mca_bus_match (struct device *dev, struct device_driver *drv)
-{
-	struct mca_device *mca_dev = to_mca_device (dev);
-	struct mca_driver *mca_drv = to_mca_driver (drv);
-	const unsigned short *mca_ids = mca_drv->id_table;
-	int i = 0;
-
-	if (mca_ids) {
-		for(i = 0; mca_ids[i]; i++) {
-			if (mca_ids[i] == mca_dev->pos_id) {
-				mca_dev->index = i;
-				return 1;
-			}
-		}
-	}
-	/* If the integrated id is present, treat it as though it were an
-	 * additional id in the id_table (it can't be because by definition,
-	 * integrated id's overflow a short */
-	if (mca_drv->integrated_id && mca_dev->pos_id ==
-	    mca_drv->integrated_id) {
-		mca_dev->index = i;
-		return 1;
-	}
-	return 0;
-}
-
-struct bus_type mca_bus_type = {
-	.name  = "MCA",
-	.match = mca_bus_match,
-};
-EXPORT_SYMBOL (mca_bus_type);
-
-static ssize_t mca_show_pos_id(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	/* four digits, \n and trailing \0 */
-	struct mca_device *mca_dev = to_mca_device(dev);
-	int len;
-
-	if(mca_dev->pos_id < MCA_DUMMY_POS_START)
-		len = sprintf(buf, "%04x\n", mca_dev->pos_id);
-	else
-		len = sprintf(buf, "none\n");
-	return len;
-}
-static ssize_t mca_show_pos(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	/* enough for 8 two byte hex chars plus space and new line */
-	int j, len=0;
-	struct mca_device *mca_dev = to_mca_device(dev);
-
-	for(j=0; j<8; j++)
-		len += sprintf(buf+len, "%02x ", mca_dev->pos[j]);
-	/* change last trailing space to new line */
-	buf[len-1] = '\n';
-	return len;
-}
-
-static DEVICE_ATTR(id, S_IRUGO, mca_show_pos_id, NULL);
-static DEVICE_ATTR(pos, S_IRUGO, mca_show_pos, NULL);
-
-int __init mca_register_device(int bus, struct mca_device *mca_dev)
-{
-	struct mca_bus *mca_bus = mca_root_busses[bus];
-	int rc;
-
-	mca_dev->dev.parent = &mca_bus->dev;
-	mca_dev->dev.bus = &mca_bus_type;
-	dev_set_name(&mca_dev->dev, "%02d:%02X", bus, mca_dev->slot);
-	mca_dev->dma_mask = mca_bus->default_dma_mask;
-	mca_dev->dev.dma_mask = &mca_dev->dma_mask;
-	mca_dev->dev.coherent_dma_mask = mca_dev->dma_mask;
-
-	rc = device_register(&mca_dev->dev);
-	if (rc)
-		goto err_out;
-
-	rc = device_create_file(&mca_dev->dev, &dev_attr_id);
-	if (rc) goto err_out_devreg;
-	rc = device_create_file(&mca_dev->dev, &dev_attr_pos);
-	if (rc) goto err_out_id;
-
-	return 1;
-
-err_out_id:
-	device_remove_file(&mca_dev->dev, &dev_attr_id);
-err_out_devreg:
-	device_unregister(&mca_dev->dev);
-err_out:
-	return 0;
-}
-
-/* */
-struct mca_bus * __devinit mca_attach_bus(int bus)
-{
-	struct mca_bus *mca_bus;
-
-	if (unlikely(mca_root_busses[bus] != NULL)) {
-		/* This should never happen, but just in case */
-		printk(KERN_EMERG "MCA tried to add already existing bus %d\n",
-		       bus);
-		dump_stack();
-		return NULL;
-	}
-
-	mca_bus = kzalloc(sizeof(struct mca_bus), GFP_KERNEL);
-	if (!mca_bus)
-		return NULL;
-
-	dev_set_name(&mca_bus->dev, "mca%d", bus);
-	sprintf(mca_bus->name,"Host %s MCA Bridge", bus ? "Secondary" : "Primary");
-	if (device_register(&mca_bus->dev)) {
-		kfree(mca_bus);
-		return NULL;
-	}
-
-	mca_root_busses[bus] = mca_bus;
-
-	return mca_bus;
-}
-
-int __init mca_system_init (void)
-{
-	return bus_register(&mca_bus_type);
-}
diff --git a/drivers/mca/mca-device.c b/drivers/mca/mca-device.c
deleted file mode 100644
index e7adf89fae41..000000000000
--- a/drivers/mca/mca-device.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/*
- * MCA device support functions
- *
- * These functions support the ongoing device access API.
- *
- * (C) 2002 James Bottomley <James.Bottomley@HansenPartnership.com>
- *
-**-----------------------------------------------------------------------------
-**  
-**  This program is free software; you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation; either version 2 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program; if not, write to the Free Software
-**  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-**
-**-----------------------------------------------------------------------------
- */
-
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/mca.h>
-#include <linux/string.h>
-
-/**
- *	mca_device_read_stored_pos - read POS register from stored data
- *	@mca_dev: device to read from
- *	@reg:  register to read from
- *
- *	Fetch a POS value that was stored at boot time by the kernel
- *	when it scanned the MCA space. The register value is returned.
- *	Missing or invalid registers report 0.
- */
-unsigned char mca_device_read_stored_pos(struct mca_device *mca_dev, int reg)
-{
-	if(reg < 0 || reg >= 8)
-		return 0;
-
-	return mca_dev->pos[reg];
-}
-EXPORT_SYMBOL(mca_device_read_stored_pos);
-
-/**
- *	mca_device_read_pos - read POS register from card
- *	@mca_dev: device to read from
- *	@reg:  register to read from
- *
- *	Fetch a POS value directly from the hardware to obtain the
- *	current value. This is much slower than
- *	mca_device_read_stored_pos and may not be invoked from
- *	interrupt context. It handles the deep magic required for
- *	onboard devices transparently.
- */
-unsigned char mca_device_read_pos(struct mca_device *mca_dev, int reg)
-{
-	struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent);
-
-	return mca_bus->f.mca_read_pos(mca_dev, reg);
-
-	return 	mca_dev->pos[reg];
-}
-EXPORT_SYMBOL(mca_device_read_pos);
-
-
-/**
- *	mca_device_write_pos - read POS register from card
- *	@mca_dev: device to write pos register to
- *	@reg:  register to write to
- *	@byte: byte to write to the POS registers
- *
- *	Store a POS value directly to the hardware. You should not
- *	normally need to use this function and should have a very good
- *	knowledge of MCA bus before you do so. Doing this wrongly can
- *	damage the hardware.
- *
- *	This function may not be used from interrupt context.
- *
- */
-void mca_device_write_pos(struct mca_device *mca_dev, int reg,
-			  unsigned char byte)
-{
-	struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent);
-
-	mca_bus->f.mca_write_pos(mca_dev, reg, byte);
-}
-EXPORT_SYMBOL(mca_device_write_pos);
-
-/**
- *	mca_device_transform_irq - transform the ADF obtained IRQ
- *	@mca_device: device whose irq needs transforming
- *	@irq: input irq from ADF
- *
- *	MCA Adapter Definition Files (ADF) contain irq, ioport, memory
- *	etc. definitions.  In systems with more than one bus, these need
- *	to be transformed through bus mapping functions to get the real
- *	system global quantities.
- *
- *	This function transforms the interrupt number and returns the
- *	transformed system global interrupt
- */
-int mca_device_transform_irq(struct mca_device *mca_dev, int irq)
-{
-	struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent);
-
-	return mca_bus->f.mca_transform_irq(mca_dev, irq);
-}
-EXPORT_SYMBOL(mca_device_transform_irq);
-
-/**
- *	mca_device_transform_ioport - transform the ADF obtained I/O port
- *	@mca_device: device whose port needs transforming
- *	@ioport: input I/O port from ADF
- *
- *	MCA Adapter Definition Files (ADF) contain irq, ioport, memory
- *	etc. definitions.  In systems with more than one bus, these need
- *	to be transformed through bus mapping functions to get the real
- *	system global quantities.
- *
- *	This function transforms the I/O port number and returns the
- *	transformed system global port number.
- *
- *	This transformation can be assumed to be linear for port ranges.
- */
-int mca_device_transform_ioport(struct mca_device *mca_dev, int port)
-{
-	struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent);
-
-	return mca_bus->f.mca_transform_ioport(mca_dev, port);
-}
-EXPORT_SYMBOL(mca_device_transform_ioport);
-
-/**
- *	mca_device_transform_memory - transform the ADF obtained memory
- *	@mca_device: device whose memory region needs transforming
- *	@mem: memory region start from ADF
- *
- *	MCA Adapter Definition Files (ADF) contain irq, ioport, memory
- *	etc. definitions.  In systems with more than one bus, these need
- *	to be transformed through bus mapping functions to get the real
- *	system global quantities.
- *
- *	This function transforms the memory region start and returns the
- *	transformed system global memory region (physical).
- *
- *	This transformation can be assumed to be linear for region ranges.
- */
-void *mca_device_transform_memory(struct mca_device *mca_dev, void *mem)
-{
-	struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent);
-
-	return mca_bus->f.mca_transform_memory(mca_dev, mem);
-}
-EXPORT_SYMBOL(mca_device_transform_memory);
-
-
-/**
- *	mca_device_claimed - check if claimed by driver
- *	@mca_dev:	device to check
- *
- *	Returns 1 if the slot has been claimed by a driver
- */
-
-int mca_device_claimed(struct mca_device *mca_dev)
-{
-	return mca_dev->driver_loaded;
-}
-EXPORT_SYMBOL(mca_device_claimed);
-
-/**
- *	mca_device_set_claim - set the claim value of the driver
- *	@mca_dev:	device to set value for
- *	@val:		claim value to set (1 claimed, 0 unclaimed)
- */
-void mca_device_set_claim(struct mca_device *mca_dev, int val)
-{
-	mca_dev->driver_loaded = val;
-}
-EXPORT_SYMBOL(mca_device_set_claim);
-
-/**
- *	mca_device_status - get the status of the device
- *	@mca_device:	device to get
- *
- *	returns an enumeration of the device status:
- *
- *	MCA_ADAPTER_NORMAL	adapter is OK.
- *	MCA_ADAPTER_NONE	no adapter at device (should never happen).
- *	MCA_ADAPTER_DISABLED	adapter is disabled.
- *	MCA_ADAPTER_ERROR	adapter cannot be initialised.
- */
-enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev)
-{
-	return mca_dev->status;
-}
-EXPORT_SYMBOL(mca_device_status);
-
-/**
- *	mca_device_set_name - set the name of the device
- *	@mca_device:	device to set the name of
- *	@name:		name to set
- */
-void mca_device_set_name(struct mca_device *mca_dev, const char *name)
-{
-	if(!mca_dev)
-		return;
-
-	strlcpy(mca_dev->name, name, sizeof(mca_dev->name));
-}
-EXPORT_SYMBOL(mca_device_set_name);
diff --git a/drivers/mca/mca-driver.c b/drivers/mca/mca-driver.c
deleted file mode 100644
index 32cd39bcc715..000000000000
--- a/drivers/mca/mca-driver.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/*
- * MCA driver support functions for sysfs.
- *
- * (C) 2002 James Bottomley <James.Bottomley@HansenPartnership.com>
- *
-**-----------------------------------------------------------------------------
-**  
-**  This program is free software; you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation; either version 2 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program; if not, write to the Free Software
-**  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-**
-**-----------------------------------------------------------------------------
- */
-
-#include <linux/device.h>
-#include <linux/mca.h>
-#include <linux/module.h>
-
-int mca_register_driver(struct mca_driver *mca_drv)
-{
-	int r;
-
-	if (MCA_bus) {
-		mca_drv->driver.bus = &mca_bus_type;
-		if ((r = driver_register(&mca_drv->driver)) < 0)
-			return r;
-		mca_drv->integrated_id = 0;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(mca_register_driver);
-
-int mca_register_driver_integrated(struct mca_driver *mca_driver,
-				   int integrated_id)
-{
-	int r = mca_register_driver(mca_driver);
-
-	if (!r)
-		mca_driver->integrated_id = integrated_id;
-
-	return r;
-}
-EXPORT_SYMBOL(mca_register_driver_integrated);
-
-void mca_unregister_driver(struct mca_driver *mca_drv)
-{
-	if (MCA_bus)
-		driver_unregister(&mca_drv->driver);
-}
-EXPORT_SYMBOL(mca_unregister_driver);
diff --git a/drivers/mca/mca-legacy.c b/drivers/mca/mca-legacy.c
deleted file mode 100644
index 494f0c2001f5..000000000000
--- a/drivers/mca/mca-legacy.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/*
- * MCA bus support functions for legacy (2.4) API.
- *
- * Legacy API means the API that operates in terms of MCA slot number
- *
- * (C) 2002 James Bottomley <James.Bottomley@HansenPartnership.com>
- *
-**-----------------------------------------------------------------------------
-**  
-**  This program is free software; you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation; either version 2 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program; if not, write to the Free Software
-**  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-**
-**-----------------------------------------------------------------------------
- */
-
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/mca-legacy.h>
-#include <asm/io.h>
-
-/* NOTE: This structure is stack allocated */
-struct mca_find_adapter_info {
-	int			id;
-	int			slot;
-	struct mca_device	*mca_dev;
-};
-
-/* The purpose of this iterator is to loop over all the devices and
- * find the one with the smallest slot number that's just greater than
- * or equal to the required slot with a matching id */
-static int mca_find_adapter_callback(struct device *dev, void *data)
-{
-	struct mca_find_adapter_info *info = data;
-	struct mca_device *mca_dev = to_mca_device(dev);
-
-	if(mca_dev->pos_id != info->id)
-		return 0;
-
-	if(mca_dev->slot < info->slot)
-		return 0;
-
-	if(!info->mca_dev || info->mca_dev->slot >= mca_dev->slot)
-		info->mca_dev = mca_dev;
-
-	return 0;
-}
-
-/**
- *	mca_find_adapter - scan for adapters
- *	@id:	MCA identification to search for
- *	@start:	starting slot
- *
- *	Search the MCA configuration for adapters matching the 16bit
- *	ID given. The first time it should be called with start as zero
- *	and then further calls made passing the return value of the
- *	previous call until %MCA_NOTFOUND is returned.
- *
- *	Disabled adapters are not reported.
- */
-
-int mca_find_adapter(int id, int start)
-{
-	struct mca_find_adapter_info info;
-
-	if(id == 0xffff)
-		return MCA_NOTFOUND;
-
-	info.slot = start;
-	info.id = id;
-	info.mca_dev = NULL;
-
-	for(;;) {
-		bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_adapter_callback);
-
-		if(info.mca_dev == NULL)
-			return MCA_NOTFOUND;
-
-		if(info.mca_dev->status != MCA_ADAPTER_DISABLED)
-			break;
-
-		/* OK, found adapter but it was disabled.  Go around
-		 * again, excluding the slot we just found */
-
-		info.slot = info.mca_dev->slot + 1;
-		info.mca_dev = NULL;
-	}
-		
-	return info.mca_dev->slot;
-}
-EXPORT_SYMBOL(mca_find_adapter);
-
-/*--------------------------------------------------------------------*/
-
-/**
- *	mca_find_unused_adapter - scan for unused adapters
- *	@id:	MCA identification to search for
- *	@start:	starting slot
- *
- *	Search the MCA configuration for adapters matching the 16bit
- *	ID given. The first time it should be called with start as zero
- *	and then further calls made passing the return value of the
- *	previous call until %MCA_NOTFOUND is returned.
- *
- *	Adapters that have been claimed by drivers and those that
- *	are disabled are not reported. This function thus allows a driver
- *	to scan for further cards when some may already be driven.
- */
-
-int mca_find_unused_adapter(int id, int start)
-{
-	struct mca_find_adapter_info info = { 0 };
-
-	if (!MCA_bus || id == 0xffff)
-		return MCA_NOTFOUND;
-
-	info.slot = start;
-	info.id = id;
-	info.mca_dev = NULL;
-
-	for(;;) {
-		bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_adapter_callback);
-
-		if(info.mca_dev == NULL)
-			return MCA_NOTFOUND;
-
-		if(info.mca_dev->status != MCA_ADAPTER_DISABLED
-		   && !info.mca_dev->driver_loaded)
-			break;
-
-		/* OK, found adapter but it was disabled or already in
-		 * use.  Go around again, excluding the slot we just
-		 * found */
-
-		info.slot = info.mca_dev->slot + 1;
-		info.mca_dev = NULL;
-	}
-		
-	return info.mca_dev->slot;
-}
-EXPORT_SYMBOL(mca_find_unused_adapter);
-
-/* NOTE: stack allocated structure */
-struct mca_find_device_by_slot_info {
-	int			slot;
-	struct mca_device 	*mca_dev;
-};
-
-static int mca_find_device_by_slot_callback(struct device *dev, void *data)
-{
-	struct mca_find_device_by_slot_info *info = data;
-	struct mca_device *mca_dev = to_mca_device(dev);
-
-	if(mca_dev->slot == info->slot)
-		info->mca_dev = mca_dev;
-
-	return 0;
-}
-
-struct mca_device *mca_find_device_by_slot(int slot)
-{
-	struct mca_find_device_by_slot_info info;
-
-	info.slot = slot;
-	info.mca_dev = NULL;
-
-	bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_device_by_slot_callback);
-
-	return info.mca_dev;
-}
-
-/**
- *	mca_read_stored_pos - read POS register from boot data
- *	@slot: slot number to read from
- *	@reg:  register to read from
- *
- *	Fetch a POS value that was stored at boot time by the kernel
- *	when it scanned the MCA space. The register value is returned.
- *	Missing or invalid registers report 0.
- */
-unsigned char mca_read_stored_pos(int slot, int reg)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return 0;
-
-	return mca_device_read_stored_pos(mca_dev, reg);
-}
-EXPORT_SYMBOL(mca_read_stored_pos);
-
-
-/**
- *	mca_read_pos - read POS register from card
- *	@slot: slot number to read from
- *	@reg:  register to read from
- *
- *	Fetch a POS value directly from the hardware to obtain the
- *	current value. This is much slower than mca_read_stored_pos and
- *	may not be invoked from interrupt context. It handles the
- *	deep magic required for onboard devices transparently.
- */
-
-unsigned char mca_read_pos(int slot, int reg)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return 0;
-
-	return mca_device_read_pos(mca_dev, reg);
-}
-EXPORT_SYMBOL(mca_read_pos);
-
-		
-/**
- *	mca_write_pos - read POS register from card
- *	@slot: slot number to read from
- *	@reg:  register to read from
- *	@byte: byte to write to the POS registers
- *
- *	Store a POS value directly from the hardware. You should not
- *	normally need to use this function and should have a very good
- *	knowledge of MCA bus before you do so. Doing this wrongly can
- *	damage the hardware.
- *
- *	This function may not be used from interrupt context.
- *
- *	Note that this a technically a Bad Thing, as IBM tech stuff says
- *	you should only set POS values through their utilities.
- *	However, some devices such as the 3c523 recommend that you write
- *	back some data to make sure the configuration is consistent.
- *	I'd say that IBM is right, but I like my drivers to work.
- *
- *	This function can't do checks to see if multiple devices end up
- *	with the same resources, so you might see magic smoke if someone
- *	screws up.
- */
-
-void mca_write_pos(int slot, int reg, unsigned char byte)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return;
-
-	mca_device_write_pos(mca_dev, reg, byte);
-}
-EXPORT_SYMBOL(mca_write_pos);
-
-/**
- *	mca_set_adapter_name - Set the description of the card
- *	@slot: slot to name
- *	@name: text string for the namen
- *
- *	This function sets the name reported via /proc for this
- *	adapter slot. This is for user information only. Setting a
- *	name deletes any previous name.
- */
-
-void mca_set_adapter_name(int slot, char* name)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return;
-
-	mca_device_set_name(mca_dev, name);
-}
-EXPORT_SYMBOL(mca_set_adapter_name);
-
-/**
- *	mca_mark_as_used - claim an MCA device
- *	@slot:	slot to claim
- *	FIXME:  should we make this threadsafe
- *
- *	Claim an MCA slot for a device driver. If the
- *	slot is already taken the function returns 1,
- *	if it is not taken it is claimed and 0 is
- *	returned.
- */
-
-int mca_mark_as_used(int slot)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		/* FIXME: this is actually a severe error */
-		return 1;
-
-	if(mca_device_claimed(mca_dev))
-		return 1;
-
-	mca_device_set_claim(mca_dev, 1);
-
-	return 0;
-}
-EXPORT_SYMBOL(mca_mark_as_used);
-
-/**
- *	mca_mark_as_unused - release an MCA device
- *	@slot:	slot to claim
- *
- *	Release the slot for other drives to use.
- */
-
-void mca_mark_as_unused(int slot)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return;
-
-	mca_device_set_claim(mca_dev, 0);
-}
-EXPORT_SYMBOL(mca_mark_as_unused);
-
diff --git a/drivers/mca/mca-proc.c b/drivers/mca/mca-proc.c
deleted file mode 100644
index 81ea0d377bf4..000000000000
--- a/drivers/mca/mca-proc.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/*
- * MCA bus support functions for the proc fs.
- *
- * NOTE: this code *requires* the legacy MCA api.
- *
- * Legacy API means the API that operates in terms of MCA slot number
- *
- * (C) 2002 James Bottomley <James.Bottomley@HansenPartnership.com>
- *
-**-----------------------------------------------------------------------------
-**  
-**  This program is free software; you can redistribute it and/or modify
-**  it under the terms of the GNU General Public License as published by
-**  the Free Software Foundation; either version 2 of the License, or
-**  (at your option) any later version.
-**
-**  This program is distributed in the hope that it will be useful,
-**  but WITHOUT ANY WARRANTY; without even the implied warranty of
-**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-**  GNU General Public License for more details.
-**
-**  You should have received a copy of the GNU General Public License
-**  along with this program; if not, write to the Free Software
-**  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-**
-**-----------------------------------------------------------------------------
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/mca.h>
-
-static int get_mca_info_helper(struct mca_device *mca_dev, char *page, int len)
-{
-	int j;
-
-	for(j=0; j<8; j++)
-		len += sprintf(page+len, "%02x ",
-			       mca_dev ? mca_dev->pos[j] : 0xff);
-	len += sprintf(page+len, " %s\n", mca_dev ? mca_dev->name : "");
-	return len;
-}
-
-static int get_mca_info(char *page, char **start, off_t off,
-			int count, int *eof, void *data)
-{
-	int i, len = 0;
-
-	if(MCA_bus) {
-		struct mca_device *mca_dev;
-		/* Format POS registers of eight MCA slots */
-
-		for(i=0; i<MCA_MAX_SLOT_NR; i++) {
-			mca_dev = mca_find_device_by_slot(i);
-
-			len += sprintf(page+len, "Slot %d: ", i+1);
-			len = get_mca_info_helper(mca_dev, page, len);
-		}
-
-		/* Format POS registers of integrated video subsystem */
-
-		mca_dev = mca_find_device_by_slot(MCA_INTEGVIDEO);
-		len += sprintf(page+len, "Video : ");
-		len = get_mca_info_helper(mca_dev, page, len);
-
-		/* Format POS registers of integrated SCSI subsystem */
-
-		mca_dev = mca_find_device_by_slot(MCA_INTEGSCSI);
-		len += sprintf(page+len, "SCSI  : ");
-		len = get_mca_info_helper(mca_dev, page, len);
-
-		/* Format POS registers of motherboard */
-
-		mca_dev = mca_find_device_by_slot(MCA_MOTHERBOARD);
-		len += sprintf(page+len, "Planar: ");
-		len = get_mca_info_helper(mca_dev, page, len);
-	} else {
-		/* Leave it empty if MCA not detected - this should *never*
-		 * happen!
-		 */
-	}
-
-	if (len <= off+count) *eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len>count) len = count;
-	if (len<0) len = 0;
-	return len;
-}
-
-/*--------------------------------------------------------------------*/
-
-static int mca_default_procfn(char* buf, struct mca_device *mca_dev)
-{
-	int len = 0, i;
-	int slot = mca_dev->slot;
-
-	/* Print out the basic information */
-
-	if(slot < MCA_MAX_SLOT_NR) {
-		len += sprintf(buf+len, "Slot: %d\n", slot+1);
-	} else if(slot == MCA_INTEGSCSI) {
-		len += sprintf(buf+len, "Integrated SCSI Adapter\n");
-	} else if(slot == MCA_INTEGVIDEO) {
-		len += sprintf(buf+len, "Integrated Video Adapter\n");
-	} else if(slot == MCA_MOTHERBOARD) {
-		len += sprintf(buf+len, "Motherboard\n");
-	}
-	if (mca_dev->name[0]) {
-
-		/* Drivers might register a name without /proc handler... */
-
-		len += sprintf(buf+len, "Adapter Name: %s\n",
-			       mca_dev->name);
-	} else {
-		len += sprintf(buf+len, "Adapter Name: Unknown\n");
-	}
-	len += sprintf(buf+len, "Id: %02x%02x\n",
-		mca_dev->pos[1], mca_dev->pos[0]);
-	len += sprintf(buf+len, "Enabled: %s\nPOS: ",
-		mca_device_status(mca_dev) == MCA_ADAPTER_NORMAL ?
-			"Yes" : "No");
-	for(i=0; i<8; i++) {
-		len += sprintf(buf+len, "%02x ", mca_dev->pos[i]);
-	}
-	len += sprintf(buf+len, "\nDriver Installed: %s",
-		mca_device_claimed(mca_dev) ? "Yes" : "No");
-	buf[len++] = '\n';
-	buf[len] = 0;
-
-	return len;
-} /* mca_default_procfn() */
-
-static int get_mca_machine_info(char* page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = 0;
-
-	len += sprintf(page+len, "Model Id: 0x%x\n", machine_id);
-	len += sprintf(page+len, "Submodel Id: 0x%x\n", machine_submodel_id);
-	len += sprintf(page+len, "BIOS Revision: 0x%x\n", BIOS_revision);
-
-	if (len <= off+count) *eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len>count) len = count;
-	if (len<0) len = 0;
-	return len;
-}
-
-static int mca_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct mca_device *mca_dev = (struct mca_device *)data;
-	int len = 0;
-
-	/* Get the standard info */
-
-	len = mca_default_procfn(page, mca_dev);
-
-	/* Do any device-specific processing, if there is any */
-
-	if(mca_dev->procfn) {
-		len += mca_dev->procfn(page+len, mca_dev->slot,
-				       mca_dev->proc_dev);
-	}
-	if (len <= off+count) *eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len>count) len = count;
-	if (len<0) len = 0;
-	return len;
-} /* mca_read_proc() */
-
-/*--------------------------------------------------------------------*/
-
-void __init mca_do_proc_init(void)
-{
-	int i;
-	struct proc_dir_entry *proc_mca;
-	struct proc_dir_entry* node = NULL;
-	struct mca_device *mca_dev;
-
-	proc_mca = proc_mkdir("mca", NULL);
-	create_proc_read_entry("pos",0,proc_mca,get_mca_info,NULL);
-	create_proc_read_entry("machine",0,proc_mca,get_mca_machine_info,NULL);
-
-	/* Initialize /proc/mca entries for existing adapters */
-
-	for(i = 0; i < MCA_NUMADAPTERS; i++) {
-		enum MCA_AdapterStatus status;
-		mca_dev = mca_find_device_by_slot(i);
-		if(!mca_dev)
-			continue;
-
-		mca_dev->procfn = NULL;
-
-		if(i < MCA_MAX_SLOT_NR) sprintf(mca_dev->procname,"slot%d", i+1);
-		else if(i == MCA_INTEGVIDEO) sprintf(mca_dev->procname,"video");
-		else if(i == MCA_INTEGSCSI) sprintf(mca_dev->procname,"scsi");
-		else if(i == MCA_MOTHERBOARD) sprintf(mca_dev->procname,"planar");
-
-		status = mca_device_status(mca_dev);
-		if (status != MCA_ADAPTER_NORMAL &&
-		    status != MCA_ADAPTER_DISABLED)
-			continue;
-
-		node = create_proc_read_entry(mca_dev->procname, 0, proc_mca,
-					      mca_read_proc, (void *)mca_dev);
-
-		if(node == NULL) {
-			printk("Failed to allocate memory for MCA proc-entries!");
-			return;
-		}
-	}
-
-} /* mca_do_proc_init() */
-
-/**
- *	mca_set_adapter_procfn - Set the /proc callback
- *	@slot: slot to configure
- *	@procfn: callback function to call for /proc
- *	@dev: device information passed to the callback
- *
- *	This sets up an information callback for /proc/mca/slot?.  The
- *	function is called with the buffer, slot, and device pointer (or
- *	some equally informative context information, or nothing, if you
- *	prefer), and is expected to put useful information into the
- *	buffer.  The adapter name, ID, and POS registers get printed
- *	before this is called though, so don't do it again.
- *
- *	This should be called with a %NULL @procfn when a module
- *	unregisters, thus preventing kernel crashes and other such
- *	nastiness.
- */
-
-void mca_set_adapter_procfn(int slot, MCA_ProcFn procfn, void* proc_dev)
-{
-	struct mca_device *mca_dev = mca_find_device_by_slot(slot);
-
-	if(!mca_dev)
-		return;
-
-	mca_dev->procfn = procfn;
-	mca_dev->proc_dev = proc_dev;
-}
-EXPORT_SYMBOL(mca_set_adapter_procfn);
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index 6d115c7208ab..506c36f6e1db 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -283,7 +283,6 @@ static char *bus_strings[] = {
 	"Local Bus",
 	"ISA",
 	"EISA",
-	"MCA",
 	"PCI",
 	"PCMCIA",
 	"NUBUS",
@@ -351,18 +350,6 @@ static int i2o_seq_show_hrt(struct seq_file *seq, void *v)
 					   EisaSlotNumber);
 				break;
 
-			case I2O_BUS_MCA:
-				seq_printf(seq, "     IOBase: %0#6x,",
-					   hrt->hrt_entry[i].bus.mca_bus.
-					   McaBaseIOPort);
-				seq_printf(seq, " MemoryBase: %0#10x,",
-					   hrt->hrt_entry[i].bus.mca_bus.
-					   McaBaseMemoryAddress);
-				seq_printf(seq, " Slot: %0#4x,",
-					   hrt->hrt_entry[i].bus.mca_bus.
-					   McaSlotNumber);
-				break;
-
 			case I2O_BUS_PCI:
 				seq_printf(seq, "     Bus: %0#4x",
 					   hrt->hrt_entry[i].bus.pci_bus.
diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index a0b23dd45239..a8093bfec3a6 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -124,7 +124,7 @@ typedef struct i2o_sg_io_hdr {
 #define I2O_BUS_LOCAL	0
 #define I2O_BUS_ISA	1
 #define I2O_BUS_EISA	2
-#define I2O_BUS_MCA	3
+/* was  I2O_BUS_MCA	3 */
 #define I2O_BUS_PCI	4
 #define I2O_BUS_PCMCIA	5
 #define I2O_BUS_NUBUS	6
diff --git a/include/linux/mca-legacy.h b/include/linux/mca-legacy.h
deleted file mode 100644
index 7a3aea845902..000000000000
--- a/include/linux/mca-legacy.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/* This is the function prototypes for the old legacy MCA interface
- *
- * Please move your driver to the new sysfs based one instead */
-
-#ifndef _LINUX_MCA_LEGACY_H
-#define _LINUX_MCA_LEGACY_H
-
-#include <linux/mca.h>
-
-#warning "MCA legacy - please move your driver to the new sysfs api"
-
-/* MCA_NOTFOUND is an error condition.  The other two indicate
- * motherboard POS registers contain the adapter.  They might be
- * returned by the mca_find_adapter() function, and can be used as
- * arguments to mca_read_stored_pos().  I'm not going to allow direct
- * access to the motherboard registers until we run across an adapter
- * that requires it.  We don't know enough about them to know if it's
- * safe.
- *
- * See Documentation/mca.txt or one of the existing drivers for
- * more information.
- */
-#define MCA_NOTFOUND	(-1)
-
-
-
-/* Returns the slot of the first enabled adapter matching id.  User can
- * specify a starting slot beyond zero, to deal with detecting multiple
- * devices.  Returns MCA_NOTFOUND if id not found.  Also checks the
- * integrated adapters.
- */
-extern int mca_find_adapter(int id, int start);
-extern int mca_find_unused_adapter(int id, int start);
-
-extern int mca_mark_as_used(int slot);
-extern void mca_mark_as_unused(int slot);
-
-/* gets a byte out of POS register (stored in memory) */
-extern unsigned char mca_read_stored_pos(int slot, int reg);
-
-/* This can be expanded later.  Right now, it gives us a way of
- * getting meaningful information into the MCA_info structure,
- * so we can have a more interesting /proc/mca.
- */
-extern void mca_set_adapter_name(int slot, char* name);
-
-/* These routines actually mess with the hardware POS registers.  They
- * temporarily disable the device (and interrupts), so make sure you know
- * what you're doing if you use them.  Furthermore, writing to a POS may
- * result in two devices trying to share a resource, which in turn can
- * result in multiple devices sharing memory spaces, IRQs, or even trashing
- * hardware.  YOU HAVE BEEN WARNED.
- *
- * You can only access slots with this.  Motherboard registers are off
- * limits.
- */
-
-/* read a byte from the specified POS register. */
-extern unsigned char mca_read_pos(int slot, int reg);
-
-/* write a byte to the specified POS register. */
-extern void mca_write_pos(int slot, int reg, unsigned char byte);
-
-#endif
diff --git a/include/linux/mca.h b/include/linux/mca.h
deleted file mode 100644
index 37972704617f..000000000000
--- a/include/linux/mca.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Header for Microchannel Architecture Bus
- * Written by Martin Kolinek, February 1996
- */
-
-#ifndef _LINUX_MCA_H
-#define _LINUX_MCA_H
-
-#include <linux/device.h>
-
-#ifdef CONFIG_MCA
-#include <asm/mca.h>
-
-extern int MCA_bus;
-#else
-#define MCA_bus 0
-#endif
-
-/* This sets up an information callback for /proc/mca/slot?.  The
- * function is called with the buffer, slot, and device pointer (or
- * some equally informative context information, or nothing, if you
- * prefer), and is expected to put useful information into the
- * buffer.  The adapter name, id, and POS registers get printed
- * before this is called though, so don't do it again.
- *
- * This should be called with a NULL procfn when a module
- * unregisters, thus preventing kernel crashes and other such
- * nastiness.
- */
-typedef int (*MCA_ProcFn)(char* buf, int slot, void* dev);
-
-/* Should only be called by the NMI interrupt handler, this will do some
- * fancy stuff to figure out what might have generated a NMI.
- */
-extern void mca_handle_nmi(void);
-
-enum MCA_AdapterStatus {
-	MCA_ADAPTER_NORMAL = 0,
-	MCA_ADAPTER_NONE = 1,
-	MCA_ADAPTER_DISABLED = 2,
-	MCA_ADAPTER_ERROR = 3
-};
-
-struct mca_device {
-	u64			dma_mask;
-	int			pos_id;
-	int			slot;
-
-	/* index into id_table, set by the bus match routine */
-	int			index;
-
-	/* is there a driver installed? 0 - No, 1 - Yes */
-	int			driver_loaded;
-	/* POS registers */
-	unsigned char		pos[8];
-	/* if a pseudo adapter of the motherboard, this is the motherboard
-	 * register value to use for setup cycles */
-	short			pos_register;
-	
-	enum MCA_AdapterStatus	status;
-#ifdef CONFIG_MCA_PROC_FS
-	/* name of the proc/mca file */
-	char			procname[8];
-	/* /proc info callback */
-	MCA_ProcFn		procfn;
-	/* device/context info for proc callback */
-	void			*proc_dev;
-#endif
-	struct device		dev;
-	char			name[32];
-};
-#define to_mca_device(mdev) container_of(mdev, struct mca_device, dev)
-
-struct mca_bus_accessor_functions {
-	unsigned char	(*mca_read_pos)(struct mca_device *, int reg);
-	void		(*mca_write_pos)(struct mca_device *, int reg,
-					 unsigned char byte);
-	int		(*mca_transform_irq)(struct mca_device *, int irq);
-	int		(*mca_transform_ioport)(struct mca_device *,
-						  int region);
-	void *		(*mca_transform_memory)(struct mca_device *,
-						void *memory);
-};
-
-struct mca_bus {
-	u64			default_dma_mask;
-	int			number;
-	struct mca_bus_accessor_functions f;
-	struct device		dev;
-	char			name[32];
-};
-#define to_mca_bus(mdev) container_of(mdev, struct mca_bus, dev)
-
-struct mca_driver {
-	const short		*id_table;
-	void			*driver_data;
-	int			integrated_id;
-	struct device_driver	driver;
-};
-#define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver)
-
-/* Ongoing supported API functions */
-extern struct mca_device *mca_find_device_by_slot(int slot);
-extern int mca_system_init(void);
-extern struct mca_bus *mca_attach_bus(int);
-
-extern unsigned char mca_device_read_stored_pos(struct mca_device *mca_dev,
-						int reg);
-extern unsigned char mca_device_read_pos(struct mca_device *mca_dev, int reg);
-extern void mca_device_write_pos(struct mca_device *mca_dev, int reg,
-				 unsigned char byte);
-extern int mca_device_transform_irq(struct mca_device *mca_dev, int irq);
-extern int mca_device_transform_ioport(struct mca_device *mca_dev, int port);
-extern void *mca_device_transform_memory(struct mca_device *mca_dev,
-					 void *mem);
-extern int mca_device_claimed(struct mca_device *mca_dev);
-extern void mca_device_set_claim(struct mca_device *mca_dev, int val);
-extern void mca_device_set_name(struct mca_device *mca_dev, const char *name);
-static inline char *mca_device_get_name(struct mca_device *mca_dev)
-{
-	return mca_dev ? mca_dev->name : NULL;
-}
-
-extern enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev);
-
-extern struct bus_type mca_bus_type;
-
-extern int mca_register_driver(struct mca_driver *drv);
-extern int mca_register_driver_integrated(struct mca_driver *, int);
-extern void mca_unregister_driver(struct mca_driver *drv);
-
-/* WARNING: only called by the boot time device setup */
-extern int mca_register_device(int bus, struct mca_device *mca_dev);
-
-#ifdef CONFIG_MCA_PROC_FS
-extern void mca_do_proc_init(void);
-extern void mca_set_adapter_procfn(int slot, MCA_ProcFn, void* dev);
-#else
-static inline void mca_do_proc_init(void)
-{
-}
-
-static inline void mca_set_adapter_procfn(int slot, MCA_ProcFn fn, void* dev)
-{
-}
-#endif
-
-#endif /* _LINUX_MCA_H */
diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c
index 2c6286c0bc1a..f606738d421d 100644
--- a/scripts/kconfig/mconf.c
+++ b/scripts/kconfig/mconf.c
@@ -240,7 +240,7 @@ search_help[] = N_(
 	"Defined at drivers/pci/Kconfig:47\n"
 	"Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n"
 	"Location:\n"
-	"  -> Bus options (PCI, PCMCIA, EISA, MCA, ISA)\n"
+	"  -> Bus options (PCI, PCMCIA, EISA, ISA)\n"
 	"    -> PCI support (PCI [=y])\n"
 	"      -> PCI access mode (<choice> [=y])\n"
 	"Selects: LIBCRC32\n"
diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c
index 73070cb0b6de..8c0eb65978c9 100644
--- a/scripts/kconfig/nconf.c
+++ b/scripts/kconfig/nconf.c
@@ -223,7 +223,7 @@ search_help[] = N_(
 "Defined at drivers/pci/Kconfig:47\n"
 "Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n"
 "Location:\n"
-"  -> Bus options (PCI, PCMCIA, EISA, MCA, ISA)\n"
+"  -> Bus options (PCI, PCMCIA, EISA, ISA)\n"
 "    -> PCI support (PCI [ = y])\n"
 "      -> PCI access mode (<choice> [ = y])\n"
 "Selects: LIBCRC32\n"
-- 
cgit v1.2.3


From bea3f8781e30d0abc0bd0da80aa528d44c71959e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 May 2012 00:24:09 -0700
Subject: x86, relocs: Workaround for binutils 2.22.52.0.1 section bug

GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from
section-relative to absolute if they are in a section of zero length.
This turns the symbols __init_begin and __init_end into absolute
symbols.  Let the relocs program know that those should be treated as
relative symbols.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: H.J. Lu <hjl.tools@gmail.com>
---
 arch/x86/tools/relocs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 74e16bb15dc4..4df285450e8c 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -56,7 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
  * as absolute (typically defined outside any section in the linker script.)
  */
 	[S_REL] =
-	"^_end$",
+	"^(__init_begin|__init_end|_end)$"
 };
 
 
-- 
cgit v1.2.3


From 87e4baacaeba098e217b14e9d2f6af428fe3f657 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 18 May 2012 09:34:45 +0200
Subject: x86/xen/apic: Add missing #include <xen/xen.h>

This file depends on <xen/xen.h>, but the dependency was hidden due
to: <asm/acpi.h> -> <asm/trampoline.h> -> <asm/io.h> -> <xen/xen.h>

With the removal of <asm/trampoline.h>, this exposed the missing

Cc: Len Brown <lenb@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/n/tip-7ccybvue6mw6wje3uxzzcglj@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/xen/apic.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 1913bf2d2a9c..ec57bd3818a4 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -1,9 +1,12 @@
 #include <linux/init.h>
+
 #include <asm/x86_init.h>
 #include <asm/apic.h>
-#include <xen/interface/physdev.h>
 #include <asm/xen/hypercall.h>
 
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+
 unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 {
 	struct physdev_apic apic_op;
-- 
cgit v1.2.3


From c8f64bf7df9e4858c051b6c1c09582359b97677d Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 16 May 2012 19:03:25 +0300
Subject: x86/apic: Fix typo EIO_ACK -> EOI_ACK and document it

Fix typo in the macro name and document the
reason it has this value. Update users.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: gleb@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/37867b31b9330690af2e60a2a7c4cb4b1b070caf.1337184153.git.mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apicdef.h         | 2 +-
 arch/x86/platform/visws/visws_quirks.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 134bba00df09..c46bb99d5fb2 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -37,7 +37,7 @@
 #define		APIC_ARBPRI_MASK	0xFFu
 #define	APIC_PROCPRI	0xA0
 #define	APIC_EOI	0xB0
-#define		APIC_EIO_ACK		0x0
+#define		APIC_EOI_ACK		0x0 /* Docs say 0 for future compat. */
 #define	APIC_RRR	0xC0
 #define	APIC_LDR	0xD0
 #define		APIC_LDR_MASK		(0xFFu << 24)
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index c7abf13a213f..94d8a39332ec 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -445,7 +445,7 @@ static void ack_cobalt_irq(struct irq_data *data)
 
 	spin_lock_irqsave(&cobalt_lock, flags);
 	disable_cobalt_irq(data);
-	apic_write(APIC_EOI, APIC_EIO_ACK);
+	apic_write(APIC_EOI, APIC_EOI_ACK);
 	spin_unlock_irqrestore(&cobalt_lock, flags);
 }
 
-- 
cgit v1.2.3


From 4ebcc243901c48ee3baba6bdf179c7315fa8806f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 16 May 2012 19:03:44 +0300
Subject: x86/apic: Use symbolic APIC_EOI_ACK

Use the symbol instead of hard-coded numbers,
now that the reason for the value is documented
where the constant is defined we don't need to
duplicate this explanation in code.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: gleb@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/ecbe4c79d69c172378e47e5a587ff5cd10293c9f.1337184153.git.mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d85410171260..a09e9ab0bbdf 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -463,9 +463,7 @@ static inline void ack_APIC_irq(void)
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
 	 */
-
-	/* Docs say use 0 for future compatibility */
-	apic_write(APIC_EOI, 0);
+	apic_write(APIC_EOI, APIC_EOI_ACK);
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
-- 
cgit v1.2.3


From 2a43195d831997551da93e6b3c22c965e93fe9cc Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 16 May 2012 19:03:52 +0300
Subject: x86/apic: Add apic->eoi_write() callback

Add eoi_write callback so that kvm can override
eoi accesses without touching the rest of the apic.
As a side-effect, this will enable a micro-optimization
for apics using msr.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: gleb@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/0df425d746c49ac2ecc405174df87752869629d2.1337184153.git.mst@redhat.com
[ tidied it up a bit ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h           | 16 +++++++++++++++-
 arch/x86/kernel/apic/apic_flat_64.c   |  2 ++
 arch/x86/kernel/apic/apic_noop.c      |  1 +
 arch/x86/kernel/apic/apic_numachip.c  |  1 +
 arch/x86/kernel/apic/bigsmp_32.c      |  1 +
 arch/x86/kernel/apic/es7000_32.c      |  2 ++
 arch/x86/kernel/apic/numaq_32.c       |  1 +
 arch/x86/kernel/apic/probe_32.c       |  1 +
 arch/x86/kernel/apic/summit_32.c      |  1 +
 arch/x86/kernel/apic/x2apic_cluster.c |  1 +
 arch/x86/kernel/apic/x2apic_phys.c    |  1 +
 arch/x86/kernel/apic/x2apic_uv_x.c    |  1 +
 12 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a09e9ab0bbdf..bf8d065dd977 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -351,6 +351,14 @@ struct apic {
 	/* apic ops */
 	u32 (*read)(u32 reg);
 	void (*write)(u32 reg, u32 v);
+	/*
+	 * ->eoi_write() has the same signature as ->write().
+	 *
+	 * Drivers can support both ->eoi_write() and ->write() by passing the same
+	 * callback value. Kernel can override ->eoi_write() and fall back
+	 * on write for EOI.
+	 */
+	void (*eoi_write)(u32 reg, u32 v);
 	u64 (*icr_read)(void);
 	void (*icr_write)(u32 low, u32 high);
 	void (*wait_icr_idle)(void);
@@ -426,6 +434,11 @@ static inline void apic_write(u32 reg, u32 val)
 	apic->write(reg, val);
 }
 
+static inline void apic_eoi(void)
+{
+	apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
+}
+
 static inline u64 apic_icr_read(void)
 {
 	return apic->icr_read();
@@ -450,6 +463,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
 
 static inline u32 apic_read(u32 reg) { return 0; }
 static inline void apic_write(u32 reg, u32 val) { }
+static inline void apic_eoi(void) { }
 static inline u64 apic_icr_read(void) { return 0; }
 static inline void apic_icr_write(u32 low, u32 high) { }
 static inline void apic_wait_icr_idle(void) { }
@@ -463,7 +477,7 @@ static inline void ack_APIC_irq(void)
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
 	 */
-	apic_write(APIC_EOI, APIC_EOI_ACK);
+	apic_eoi();
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 359b6899a36c..0e881c46e8c8 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -227,6 +227,7 @@ static struct apic apic_flat =  {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
@@ -386,6 +387,7 @@ static struct apic apic_physflat =  {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 634ae6cdd5c9..a6e4c6e06c08 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -181,6 +181,7 @@ struct apic apic_noop = {
 
 	.read				= noop_apic_read,
 	.write				= noop_apic_write,
+	.eoi_write			= noop_apic_write,
 	.icr_read			= noop_apic_icr_read,
 	.icr_write			= noop_apic_icr_write,
 	.wait_icr_idle			= noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 23e75422e013..6ec6d5d297c3 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0cdec7065aff..31fbdbfbf960 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e42d1d3b9134..db4ab1be3c79 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
@@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 00d2422ca7c9..f00a68cca37a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ff2c1b9aac4d..1b291da09e60 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -142,6 +142,7 @@ static struct apic apic_default = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index fea000b27f07..659897c00755 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -546,6 +546,7 @@ static struct apic apic_summit = {
 
 	.read				= native_apic_mem_read,
 	.write				= native_apic_mem_write,
+	.eoi_write			= native_apic_mem_write,
 	.icr_read			= native_apic_icr_read,
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 48f3103b3c93..a5baa785a251 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315f4227..834035666b8d 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 87bfa69e216e..5b0e3d0a3d2d 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
-- 
cgit v1.2.3


From 0ab711ae6ab0db7696b43c74f9ba9de4d7fc1deb Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 16 May 2012 19:03:58 +0300
Subject: x86/apic: Implement EIO micro-optimization

We know both register and value for eoi beforehand,
so there's no need to check it and no need to do math
to calculate the msr. Saves instructions/branches
on each EOI when using x2apic.

I looked at the objdump output to verify that the
generated code looks right and actually is shorter.

The real improvemements will be on the KVM guest side
though, those come in a later patch.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: gleb@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/e019d1a125316f10d3e3a4b2f6bda41473f4fb72.1337184153.git.mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h           | 5 +++++
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 arch/x86/kernel/apic/x2apic_phys.c    | 2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c    | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bf8d065dd977..eaff4790ed96 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -138,6 +138,11 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
 	wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
 }
 
+static inline void native_apic_msr_eoi_write(u32 reg, u32 v)
+{
+	wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0);
+}
+
 static inline u32 native_apic_msr_read(u32 reg)
 {
 	u64 msr;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5baa785a251..ff35cff0e1a7 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,7 +260,7 @@ static struct apic apic_x2apic_cluster = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
-	.eoi_write			= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_eoi_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 834035666b8d..c17e982db275 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -172,7 +172,7 @@ static struct apic apic_x2apic_phys = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
-	.eoi_write			= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_eoi_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 5b0e3d0a3d2d..c6d03f7a4401 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,7 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 
 	.read				= native_apic_msr_read,
 	.write				= native_apic_msr_write,
-	.eoi_write			= native_apic_msr_write,
+	.eoi_write			= native_apic_msr_eoi_write,
 	.icr_read			= native_x2apic_icr_read,
 	.icr_write			= native_x2apic_icr_write,
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
-- 
cgit v1.2.3


From 3e7f3db001de6133db1c385c92eec944409a8b4f Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Thu, 10 May 2012 18:01:59 +0800
Subject: x86/tlb: Clean up and unify TLB_FLUSH_ALL definition

Since sizeof(long) is 4 in x86_32 mode, and it's 8 in x86_64
mode, sizeof(long long) is also 8 byte in x86_64 mode.
use long mode can fit TLB_FLUSH_ALL defination here both in 32
or 64 bits mode.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/n/tip-evv5bekiipi2pmyzdsy8lkkw@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 169be8938b96..63af9098e6a5 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,11 +62,7 @@ static inline void __flush_tlb_one(unsigned long addr)
 		__flush_tlb();
 }
 
-#ifdef CONFIG_X86_32
-# define TLB_FLUSH_ALL	0xffffffff
-#else
-# define TLB_FLUSH_ALL	-1ULL
-#endif
+#define TLB_FLUSH_ALL	-1UL
 
 /*
  * TLB flushing:
-- 
cgit v1.2.3


From 20167d3421a089a1bf1bd680b150dc69c9506810 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Wed, 16 May 2012 14:06:26 +0100
Subject: x86-64: Fix accounting in kernel_physical_mapping_init()

When finding a present and acceptable 2M/1G mapping, the number
of pages mapped this way shouldn't be incremented (as it was
already incremented when the earlier part of the mapping was
established). Instead, last_map_addr needs to be updated in this
case.

Further, address increments were wrong in one place each in both
phys_pmd_init() and phys_pud_init() (lacking the aligning down
to the respective page boundary).

As we're now doing the same calculation several times, fold it
into a single instance using a local variable (matching how
kernel_physical_mapping_init() itself does it at the PGD level).

Observed during code inspection, not because of an actual
problem.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4FB3C27202000078000841A0@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init_64.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 436a0309db33..f9476a0f8cb6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -408,12 +408,12 @@ static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 	      unsigned long page_size_mask, pgprot_t prot)
 {
-	unsigned long pages = 0;
+	unsigned long pages = 0, next;
 	unsigned long last_map_addr = end;
 
 	int i = pmd_index(address);
 
-	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
+	for (; i < PTRS_PER_PMD; i++, address = next) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
@@ -427,6 +427,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			break;
 		}
 
+		next = (address & PMD_MASK) + PMD_SIZE;
+
 		if (pmd_val(*pmd)) {
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
@@ -450,7 +452,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			 * attributes.
 			 */
 			if (page_size_mask & (1 << PG_LEVEL_2M)) {
-				pages++;
+				last_map_addr = next;
 				continue;
 			}
 			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
@@ -463,7 +465,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 				pfn_pte(address >> PAGE_SHIFT,
 					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
-			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+			last_map_addr = next;
 			continue;
 		}
 
@@ -483,11 +485,11 @@ static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 			 unsigned long page_size_mask)
 {
-	unsigned long pages = 0;
+	unsigned long pages = 0, next;
 	unsigned long last_map_addr = end;
 	int i = pud_index(addr);
 
-	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+	for (; i < PTRS_PER_PUD; i++, addr = next) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
@@ -496,8 +498,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		if (addr >= end)
 			break;
 
-		if (!after_bootmem &&
-				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
+		next = (addr & PUD_MASK) + PUD_SIZE;
+
+		if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
 			set_pud(pud, __pud(0));
 			continue;
 		}
@@ -524,7 +527,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 			 * attributes.
 			 */
 			if (page_size_mask & (1 << PG_LEVEL_1G)) {
-				pages++;
+				last_map_addr = next;
 				continue;
 			}
 			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
@@ -536,7 +539,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 			spin_unlock(&init_mm.page_table_lock);
-			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+			last_map_addr = next;
 			continue;
 		}
 
-- 
cgit v1.2.3


From 5bcdf5e4fee3c45e1281c25e4941f2163cb28c65 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 18 May 2012 12:40:42 +0200
Subject: perf/x86: Update event scheduling constraints for AMD family 15h
 models

This update is for newer family 15h cpu models from 0x02 to 0x1f.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: stable@vger.kernel.org # v2.6.39+
Link: http://lkml.kernel.org/r/1337337642-1621-1-git-send-email-robert.richter@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 95e7fe1c5f0b..9edc786aef89 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -493,6 +493,7 @@ static __initconst const struct x86_pmu amd_pmu = {
  * 0x023	DE	PERF_CTL[2:0]
  * 0x02D	LS	PERF_CTL[3]
  * 0x02E	LS	PERF_CTL[3,0]
+ * 0x031	LS	PERF_CTL[2:0] (**)
  * 0x043	CU	PERF_CTL[2:0]
  * 0x045	CU	PERF_CTL[2:0]
  * 0x046	CU	PERF_CTL[2:0]
@@ -506,10 +507,12 @@ static __initconst const struct x86_pmu amd_pmu = {
  * 0x0DD	LS	PERF_CTL[5:0]
  * 0x0DE	LS	PERF_CTL[5:0]
  * 0x0DF	LS	PERF_CTL[5:0]
+ * 0x1C0	EX	PERF_CTL[5:3]
  * 0x1D6	EX	PERF_CTL[5:0]
  * 0x1D8	EX	PERF_CTL[5:0]
  *
- * (*) depending on the umask all FPU counters may be used
+ * (*)  depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
  */
 
 static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -559,6 +562,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
 			return &amd_f15_PMC3;
 		case 0x02E:
 			return &amd_f15_PMC30;
+		case 0x031:
+			if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+				return &amd_f15_PMC20;
+			return &emptyconstraint;
+		case 0x1C0:
+			return &amd_f15_PMC53;
 		default:
 			return &amd_f15_PMC50;
 		}
-- 
cgit v1.2.3


From c54a354c1835e7412a53458891b9ea05361b4e8a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 May 2012 08:31:44 -0700
Subject: x86, relocs: More relocations which may end up as absolute

GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from
section-relative to absolute if they are in a section of zero length.
This turns the symbols __init_begin and __init_end into absolute
symbols.  Let the relocs program know that those should be treated as
relative symbols.

This bug is exposed by checkin

433de739bbc2 x86, realmode: 16-bit real-mode code support for relocs tool

only in the sense that that checkin changes the relocs tool to report
an error instead of silently generating a kernel which is broken if
relocated.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: H.J. Lu <hjl.tools@gmail.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
---
 arch/x86/tools/relocs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 4df285450e8c..b49c2119295e 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -56,7 +56,11 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
  * as absolute (typically defined outside any section in the linker script.)
  */
 	[S_REL] =
-	"^(__init_begin|__init_end|_end)$"
+	"^(__init_(begin|end)|"
+	"__x86_cpu_dev_(start|end)|"
+	"(__parainstructions|__alt_instructions)(|_end)|"
+	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
+	"_end)$"
 };
 
 
-- 
cgit v1.2.3


From 8a3b947c40cb36100f316ac0d433f4ae554ee4cc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 May 2012 09:52:01 -0700
Subject: x86, relocs: When printing an error, say relative or absolute

When the relocs tool throws an error, let the error message say if it
is an absolute or relative symbol.  This should make it a lot more
clear what action the programmer needs to take.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/tools/relocs.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b49c2119295e..dce982d4bc31 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -570,10 +570,14 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
 			Elf32_Sym *sym;
 			unsigned r_type;
 			const char *symname;
+			int shn_abs;
+
 			rel = &sec->reltab[j];
 			sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
 			r_type = ELF32_R_TYPE(rel->r_info);
 
+			shn_abs = sym->st_shndx == SHN_ABS;
+
 			switch (r_type) {
 			case R_386_NONE:
 			case R_386_PC32:
@@ -589,7 +593,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
 				symname = sym_name(sym_strtab, sym);
 				if (!use_real_mode)
 					goto bad;
-				if (sym->st_shndx == SHN_ABS) {
+				if (shn_abs) {
 					if (is_reloc(S_ABS, symname))
 						break;
 					else if (!is_reloc(S_SEG, symname))
@@ -605,7 +609,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
 
 			case R_386_32:
 				symname = sym_name(sym_strtab, sym);
-				if (sym->st_shndx == SHN_ABS) {
+				if (shn_abs) {
 					if (is_reloc(S_ABS, symname))
 						break;
 					else if (!is_reloc(S_REL, symname))
@@ -623,7 +627,8 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym),
 				break;
 			bad:
 				symname = sym_name(sym_strtab, sym);
-				die("Invalid %s relocation: %s\n",
+				die("Invalid %s %s relocation: %s\n",
+				    shn_abs ? "absolute" : "relative",
 				    rel_type(r_type), symname);
 			}
 		}
-- 
cgit v1.2.3


From 61f5446169046c217a5479517edac3a890c3bee7 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 21 May 2012 00:02:45 -0700
Subject: x86, realmode: Move end signature into header.S

The end signature was defined in wakeup_asm.S as it originally came
from the ACPI wakeup code.  However, we rely on the existence of the
.signature section to expand .bss, otherwise we would have to include
code to explicitly zero the .bss depending on the configuration.
Since the expanded .bss is just in .init.data anyway, it's easier to
always have it expanded.

This fixes failures when compiled without CONFIG_ACPI_SLEEP.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
---
 arch/x86/realmode/rm/header.S     | 9 +++++++++
 arch/x86/realmode/rm/realmode.h   | 5 +++++
 arch/x86/realmode/rm/wakeup.h     | 1 -
 arch/x86/realmode/rm/wakeup_asm.S | 6 +-----
 4 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 4612d5382791..fadf48378ada 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -7,6 +7,8 @@
 #include <linux/linkage.h>
 #include <asm/page_types.h>
 
+#include "realmode.h"
+	
 	.section ".header", "a"
 
 	.balign	16
@@ -30,3 +32,10 @@ GLOBAL(real_mode_header)
 	.long	pa_machine_real_restart_asm
 #endif
 END(real_mode_header)
+
+	/* End signature, used to verify integrity */
+	.section ".signature","a"
+	.balign 4
+GLOBAL(end_signature)
+	.long	REALMODE_END_SIGNATURE
+END(end_signature)
diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
index 15ab6335f843..d74cff6350ed 100644
--- a/arch/x86/realmode/rm/realmode.h
+++ b/arch/x86/realmode/rm/realmode.h
@@ -13,4 +13,9 @@
 
 #endif /* __ASSEMBLY__ */
 
+/*
+ * Signature at the end of the realmode region
+ */
+#define REALMODE_END_SIGNATURE	0x65a22c82
+
 #endif /* ARCH_X86_REALMODE_RM_REALMODE_H */
diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h
index 2dfaf06b8af1..9317e0042f24 100644
--- a/arch/x86/realmode/rm/wakeup.h
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -33,7 +33,6 @@ extern struct wakeup_header wakeup_header;
 
 #define WAKEUP_HEADER_OFFSET	8
 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE	0x65a22c82
 
 /* Wakeup behavior bits */
 #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S
index 46108f05e04e..8905166b0bbb 100644
--- a/arch/x86/realmode/rm/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -85,7 +85,7 @@ ENTRY(wakeup_start)
 
 	/* Check we really have everything... */
 	movl	end_signature, %eax
-	cmpl	$WAKEUP_END_SIGNATURE, %eax
+	cmpl	$REALMODE_END_SIGNATURE, %eax
 	jne	bogus_real_magic
 
 	/* Call the C code */
@@ -175,7 +175,3 @@ GLOBAL(wakeup_idt)
 	.long	0		/* address */
 	.word	0
 END(wakeup_idt)
-
-	.section ".signature","a"
-end_signature:
-	.long	WAKEUP_END_SIGNATURE
-- 
cgit v1.2.3


From 74bc491795420254f8b9c782ec654c9ba005d3ac Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkhan@gmail.com>
Date: Sun, 20 May 2012 17:24:28 -0600
Subject: x86/pci-calgary_64.c: Remove obsoleted simple_strtoul() usage

Change calgary_parse_options() to call kstrtoul() instead of
calling obsoleted simple_strtoul().

Signed-off-by: Shuah Khan <shuahkhan@gmail.com>
Acked-by: Muli Ben-Yehuda <muli@cs.technion.ac.il>
Cc: jdmason@kudzu.us
Link: http://lkml.kernel.org/r/1337556268.3126.5.camel@lorien2
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/pci-calgary_64.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6ac5782f4d6b..dbbfb261e62c 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1479,8 +1479,9 @@ cleanup:
 static int __init calgary_parse_options(char *p)
 {
 	unsigned int bridge;
+	unsigned long val;
 	size_t len;
-	char* endp;
+	ssize_t ret;
 
 	while (*p) {
 		if (!strncmp(p, "64k", 3))
@@ -1511,10 +1512,11 @@ static int __init calgary_parse_options(char *p)
 				++p;
 			if (*p == '\0')
 				break;
-			bridge = simple_strtoul(p, &endp, 0);
-			if (p == endp)
+			ret = kstrtoul(p, 0, &val);
+			if (ret)
 				break;
 
+			bridge = val;
 			if (bridge < MAX_PHB_BUS_NUM) {
 				printk(KERN_INFO "Calgary: disabling "
 				       "translation for PHB %#x\n", bridge);
-- 
cgit v1.2.3


From bdebaf80a02b854381fe212e0dac13c8c8edac57 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 May 2012 16:45:44 +0000
Subject: x86: Use generic time config

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@glx-um.de>
Link: http://lkml.kernel.org/r/20120518163104.630579708@glx-um.de
Cc: x86@kernel.org
---
 arch/x86/Kconfig | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9866b0b77d8..3b0a9217836a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,13 @@ config X86
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS
+	select GENERIC_CMOS_UPDATE
+	select CLOCKSOURCE_WATCHDOG
+	select GENERIC_CLOCKEVENTS
+	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+	select GENERIC_TIME_VSYSCALL if X86_64
+	select KTIME_SCALAR if X86_32
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -96,23 +103,6 @@ config ARCH_DEFCONFIG
 	default "arch/x86/configs/i386_defconfig" if X86_32
 	default "arch/x86/configs/x86_64_defconfig" if X86_64
 
-config GENERIC_CMOS_UPDATE
-	def_bool y
-
-config CLOCKSOURCE_WATCHDOG
-	def_bool y
-
-config GENERIC_CLOCKEVENTS
-	def_bool y
-
-config ARCH_CLOCKSOURCE_DATA
-	def_bool y
-	depends on X86_64
-
-config GENERIC_CLOCKEVENTS_BROADCAST
-	def_bool y
-	depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
-
 config LOCKDEP_SUPPORT
 	def_bool y
 
@@ -166,10 +156,6 @@ config ARCH_HAS_CPU_IDLE_WAIT
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
-config GENERIC_TIME_VSYSCALL
-	bool
-	default X86_64
-
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
@@ -236,9 +222,6 @@ config ARCH_HWEIGHT_CFLAGS
 	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
 	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
 
-config KTIME_SCALAR
-	def_bool X86_32
-
 config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
-- 
cgit v1.2.3


From 0a2b9a6ea93650b8a00f9fd5ee8fdd25671e2df6 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 29 Dec 2011 13:09:51 +0100
Subject: X86: integrate CMA with DMA-mapping subsystem

This patch adds support for CMA to dma-mapping subsystem for x86
architecture that uses common pci-dma/pci-nommu implementation. This
allows to test CMA on KVM/QEMU and a lot of common x86 boxes.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
CC: Michal Nazarewicz <mina86@mina86.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/x86/Kconfig                      |  1 +
 arch/x86/include/asm/dma-contiguous.h | 13 +++++++++++++
 arch/x86/include/asm/dma-mapping.h    |  5 +++++
 arch/x86/kernel/pci-dma.c             | 18 ++++++++++++++++--
 arch/x86/kernel/pci-nommu.c           |  8 +-------
 arch/x86/kernel/setup.c               |  2 ++
 6 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/include/asm/dma-contiguous.h

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9866b0b77d8..7cbdfdac3c7c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_DMA_ATTRS
+	select HAVE_DMA_CONTIGUOUS if !SWIOTLB
 	select HAVE_KRETPROBES
 	select HAVE_OPTPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
new file mode 100644
index 000000000000..c09241659971
--- /dev/null
+++ b/arch/x86/include/asm/dma-contiguous.h
@@ -0,0 +1,13 @@
+#ifndef ASMX86_DMA_CONTIGUOUS_H
+#define ASMX86_DMA_CONTIGUOUS_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm-generic/dma-contiguous.h>
+
+static inline void
+dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
+
+#endif
+#endif
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 4b4331d71935..7b9227b44b9b 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -13,6 +13,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <asm-generic/dma-coherent.h>
+#include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -62,6 +63,10 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 					dma_addr_t *dma_addr, gfp_t flag,
 					struct dma_attrs *attrs);
 
+extern void dma_generic_free_coherent(struct device *dev, size_t size,
+				      void *vaddr, dma_addr_t dma_addr,
+				      struct dma_attrs *attrs);
+
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 	if (!dev->dma_mask)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3003250ac51d..62c9457ccd2f 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 				 struct dma_attrs *attrs)
 {
 	unsigned long dma_mask;
-	struct page *page;
+	struct page *page = NULL;
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	dma_addr_t addr;
 
 	dma_mask = dma_alloc_coherent_mask(dev, flag);
 
 	flag |= __GFP_ZERO;
 again:
-	page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+	if (!(flag & GFP_ATOMIC))
+		page = dma_alloc_from_contiguous(dev, count, get_order(size));
+	if (!page)
+		page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
 	if (!page)
 		return NULL;
 
@@ -127,6 +131,16 @@ again:
 	return page_address(page);
 }
 
+void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
+			       dma_addr_t dma_addr, struct dma_attrs *attrs)
+{
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	struct page *page = virt_to_page(vaddr);
+
+	if (!dma_release_from_contiguous(dev, page, count))
+		free_pages((unsigned long)vaddr, get_order(size));
+}
+
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
  * parameter documentation.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index f96050685b46..871be4a84c7d 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 	return nents;
 }
 
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
-				dma_addr_t dma_addr, struct dma_attrs *attrs)
-{
-	free_pages((unsigned long)vaddr, get_order(size));
-}
-
 static void nommu_sync_single_for_device(struct device *dev,
 			dma_addr_t addr, size_t size,
 			enum dma_data_direction dir)
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
 
 struct dma_map_ops nommu_dma_ops = {
 	.alloc			= dma_generic_alloc_coherent,
-	.free			= nommu_free_coherent,
+	.free			= dma_generic_free_coherent,
 	.map_sg			= nommu_map_sg,
 	.map_page		= nommu_map_page,
 	.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a2901562059..d6c956e674cc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -50,6 +50,7 @@
 #include <asm/pci-direct.h>
 #include <linux/init_ohci1394_dma.h>
 #include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
 
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -934,6 +935,7 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 	memblock.current_limit = get_max_mapped();
+	dma_contiguous_reserve(0);
 
 	/*
 	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
-- 
cgit v1.2.3


From 2f1bd67d544d3c086fb5101513f4b6c8f4291b43 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 21 May 2012 09:19:38 -0400
Subject: xen/smp: unbind irqworkX when unplugging vCPUs.

The git commit  1ff2b0c303698e486f1e0886b4d9876200ef8ca5
"xen: implement IRQ_WORK_VECTOR handler" added the functionality
to have a per-cpu "irqworkX" for the IPI APIC functionality.
However it missed the unbind when a vCPU is unplugged resulting
in an orphaned per-cpu interrupt line for unplugged vCPU:

  30:        216          0   xen-dyn-event     hvc_console
  31:        810          4   xen-dyn-event     eth0
  32:         29          0   xen-dyn-event     blkif
- 36:          0          0  xen-percpu-ipi       irqwork2
- 37:        287          0   xen-dyn-event     xenbus
+ 36:        287          0   xen-dyn-event     xenbus
 NMI:          0          0   Non-maskable interrupts
 LOC:          0          0   Local timer interrupts
 SPU:          0          0   Spurious interrupts

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/smp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3ec3f8eb19fc..ce9e98b1e69c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -419,6 +419,7 @@ static void xen_cpu_die(unsigned int cpu)
 	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
 	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
 	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
 	xen_uninit_lock_cpu(cpu);
 	xen_teardown_timer(cpu);
 
-- 
cgit v1.2.3


From 29d679ffd850ea37a303bb930142be14982611e4 Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Tue, 8 May 2012 17:56:12 +0200
Subject: x86, printk: Add missing KERN_CONT to NMI selftest

Fix this behaviour:

----------------
| NMI testsuite:
--------------------
  remote IPI:
  ok  |

   local IPI:
  ok  |

Revealed due to a new modification to printk().

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Link: http://lkml.kernel.org/r/1336492573-17530-3-git-send-email-levinsasha928@gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/nmi_selftest.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 2c39dcd510fa..9f11dd3b6577 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -117,15 +117,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected)
 		unexpected_testcase_failures++;
 
 		if (nmi_fail == FAILURE)
-			printk("FAILED |");
+			printk(KERN_CONT "FAILED |");
 		else if (nmi_fail == TIMEOUT)
-			printk("TIMEOUT|");
+			printk(KERN_CONT "TIMEOUT|");
 		else
-			printk("ERROR  |");
+			printk(KERN_CONT "ERROR  |");
 		dump_stack();
 	} else {
 		testcase_successes++;
-		printk("  ok  |");
+		printk(KERN_CONT "  ok  |");
 	}
 	testcase_total++;
 
@@ -150,10 +150,10 @@ void __init nmi_selftest(void)
 
 	print_testname("remote IPI");
 	dotest(remote_ipi, SUCCESS);
-	printk("\n");
+	printk(KERN_CONT "\n");
 	print_testname("local IPI");
 	dotest(local_ipi, SUCCESS);
-	printk("\n");
+	printk(KERN_CONT "\n");
 
 	cleanup_nmi_testsuite();
 
-- 
cgit v1.2.3


From 68c2c39a76b094e9b2773e5846424ea674bf2c46 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Mon, 21 May 2012 16:54:10 +0100
Subject: xen: do not map the same GSI twice in PVHVM guests.

PV on HVM guests map GSIs into event channels. At restore time the
event channels are resumed by restore_pirqs.

Device drivers might try to register the same GSI again through ACPI at
restore time, but the GSI has already been mapped and bound by
restore_pirqs. This patch detects these situations and avoids
 mapping the same GSI multiple times.

Without this patch we get:
(XEN) irq.c:2235: dom4: pirq 23 or emuirq 28 already mapped
and waste a pirq.

CC: stable@kernel.org
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/pci/xen.c   | 4 ++++
 drivers/xen/events.c | 5 +++--
 include/xen/events.h | 3 +++
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 7415aa927913..56ab74989cf1 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -64,6 +64,10 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
 	int shareable = 0;
 	char *name;
 
+	irq = xen_irq_from_gsi(gsi);
+	if (irq > 0)
+		return irq;
+
 	if (set_pirq)
 		pirq = gsi;
 
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 4b33acd8ed4e..faae2f910ad2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -611,7 +611,7 @@ static void disable_pirq(struct irq_data *data)
 	disable_dynirq(data);
 }
 
-static int find_irq_by_gsi(unsigned gsi)
+int xen_irq_from_gsi(unsigned gsi)
 {
 	struct irq_info *info;
 
@@ -625,6 +625,7 @@ static int find_irq_by_gsi(unsigned gsi)
 
 	return -1;
 }
+EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
 
 /*
  * Do not make any assumptions regarding the relationship between the
@@ -644,7 +645,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
 
 	mutex_lock(&irq_mapping_update_lock);
 
-	irq = find_irq_by_gsi(gsi);
+	irq = xen_irq_from_gsi(gsi);
 	if (irq != -1) {
 		printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
 		       irq, gsi);
diff --git a/include/xen/events.h b/include/xen/events.h
index 0f773708e02c..04399b28e821 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -103,6 +103,9 @@ int xen_irq_from_pirq(unsigned pirq);
 /* Return the pirq allocated to the irq. */
 int xen_pirq_from_irq(unsigned irq);
 
+/* Return the irq allocated to the gsi */
+int xen_irq_from_gsi(unsigned gsi);
+
 /* Determine whether to ignore this IRQ if it is passed to a guest. */
 int xen_test_irq_shared(int irq);
 
-- 
cgit v1.2.3


From 3b7d15bde54be81e3edd773724d85d20ae42a4da Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 Apr 2012 03:27:28 -0400
Subject: um: ->restart_block.fn needs to be reset on sigreturn

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/signal.c | 3 ---
 arch/x86/um/signal.c    | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index fb12f4c5e649..0dfcef92ec91 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -29,9 +29,6 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr,
 	unsigned long sp;
 	int err;
 
-	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
 	/* Did we come from a system call? */
 	if (PT_REGS_SYSCALL_NR(regs) >= 0) {
 		/* If so, check system call restarting.. */
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 4883b9546016..72eafa6c6a52 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -156,6 +156,9 @@ static int copy_sc_from_user(struct pt_regs *regs,
 	struct sigcontext sc;
 	int err, pid;
 
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
 	err = copy_from_user(&sc, from, sizeof(sc));
 	if (err)
 		return err;
-- 
cgit v1.2.3


From b2d668da9307c4c163dd603d2bb3cadb10f9fd37 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Date: Mon, 21 May 2012 20:51:24 +0300
Subject: x86, relocs: Build clean fix

relocs was not cleaned up when "make clean" is issued. This
patch fixes the issue.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1337622684-6834-1-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org> v3.4
---
 arch/x86/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 94e91e401da9..b1c611e6da67 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -206,6 +206,7 @@ archclean:
 	$(Q)rm -rf $(objtree)/arch/i386
 	$(Q)rm -rf $(objtree)/arch/x86_64
 	$(Q)$(MAKE) $(clean)=$(boot)
+	$(Q)$(MAKE) $(clean)=arch/x86/tools
 
 define archhelp
   echo  '* bzImage      - Compressed kernel image (arch/x86/boot/bzImage)'
-- 
cgit v1.2.3


From e47b65b032f2997aa0a7392ecdf656c86d4d7561 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Mon, 21 May 2012 20:45:37 +0200
Subject: net: drop NET dependency from HAVE_BPF_JIT

There is no point having the NET dependency on the select target, as it
forces all users to depend on NET to tell they support BPF_JIT.  Move
the config option to the bottom of the file - this could be a nice place
also for future "selectable" config symbols.

Fix up all users to drop the dependency on NET now that it is not
required to supress warnings for non-NET builds.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/Kconfig     | 2 +-
 arch/powerpc/Kconfig | 2 +-
 arch/sparc/Kconfig   | 2 +-
 arch/x86/Kconfig     | 2 +-
 net/Kconfig          | 7 ++++---
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 36586dba6fa6..e19ed3fd3089 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -33,7 +33,7 @@ config ARM
 	select GENERIC_IRQ_SHOW
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select GENERIC_PCI_IOMAP
-	select HAVE_BPF_JIT if NET
+	select HAVE_BPF_JIT
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index feab3bad6d0f..73ec03945717 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,7 +141,7 @@ config PPC
 	select IRQ_FORCED_THREADING
 	select HAVE_RCU_TABLE_FREE if SMP
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_BPF_JIT if (PPC64 && NET)
+	select HAVE_BPF_JIT if PPC64
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 6c49ed2ee786..d176c03274c5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -30,7 +30,7 @@ config SPARC
 	select USE_GENERIC_SMP_HELPERS if SMP
 	select GENERIC_PCI_IOMAP
 	select HAVE_NMI_WATCHDOG if SPARC64
-	select HAVE_BPF_JIT if NET
+	select HAVE_BPF_JIT
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9866b0b77d8..25f87bccbf8f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,7 +77,7 @@ config X86
 	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select IRQ_FORCED_THREADING
 	select USE_GENERIC_SMP_HELPERS if SMP
-	select HAVE_BPF_JIT if (X86_64 && NET)
+	select HAVE_BPF_JIT if X86_64
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
diff --git a/net/Kconfig b/net/Kconfig
index 1e47bd03dde3..245831bec09a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -246,9 +246,6 @@ config BQL
 	select DQL
 	default y
 
-config HAVE_BPF_JIT
-	bool
-
 config BPF_JIT
 	bool "enable BPF Just In Time compiler"
 	depends on HAVE_BPF_JIT
@@ -340,3 +337,7 @@ source "net/nfc/Kconfig"
 
 
 endif   # if NET
+
+# Used by archs to tell that they support BPF_JIT
+config HAVE_BPF_JIT
+	bool
-- 
cgit v1.2.3


From 243412be9cecfc7fddebb912a277b76119fd4ecd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 20 May 2012 00:05:58 -0400
Subject: um/x86: merge (and trim) 32- and 64-bit variants of ptrace.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/process.c              |   2 +-
 arch/um/kernel/skas/syscall.c         |   2 +-
 arch/x86/um/asm/elf.h                 |  42 +++++++-------
 arch/x86/um/asm/ptrace.h              |  34 ++++++++++++
 arch/x86/um/asm/ptrace_32.h           |  23 --------
 arch/x86/um/asm/ptrace_64.h           |  26 ---------
 arch/x86/um/shared/sysdep/ptrace.h    |  67 ++++++++++++++++++++--
 arch/x86/um/shared/sysdep/ptrace_32.h |  92 +++----------------------------
 arch/x86/um/shared/sysdep/ptrace_64.h | 101 ++--------------------------------
 arch/x86/um/signal.c                  |  22 ++++----
 arch/x86/um/sysrq_32.c                |   8 +--
 arch/x86/um/sysrq_64.c                |   8 +--
 arch/x86/um/tls_32.c                  |   2 +-
 13 files changed, 150 insertions(+), 279 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 4d9af3172d9f..3a2235e0abc3 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -196,7 +196,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	if (current->thread.forking) {
 	  	memcpy(&p->thread.regs.regs, &regs->regs,
 		       sizeof(p->thread.regs.regs));
-		REGS_SET_SYSCALL_RETURN(p->thread.regs.regs.gp, 0);
+		UPT_SET_SYSCALL_RETURN(&p->thread.regs.regs, 0);
 		if (sp != 0)
 			REGS_SP(p->thread.regs.regs.gp) = sp;
 
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index f5173e1ec3ac..05fbeb480e0b 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -34,7 +34,7 @@ void handle_syscall(struct uml_pt_regs *r)
 		result = -ENOSYS;
 	else result = EXECUTE_SYSCALL(syscall, regs);
 
-	REGS_SET_SYSCALL_RETURN(r->gp, result);
+	UPT_SET_SYSCALL_RETURN(r, result);
 
 	syscall_trace(r, 1);
 }
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index f3b0633b69a1..0e07adc8cbe4 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -34,25 +34,25 @@
 #define ELF_ARCH        EM_386
 
 #define ELF_PLAT_INIT(regs, load_addr) do { \
-	PT_REGS_EBX(regs) = 0; \
-	PT_REGS_ECX(regs) = 0; \
-	PT_REGS_EDX(regs) = 0; \
-	PT_REGS_ESI(regs) = 0; \
-	PT_REGS_EDI(regs) = 0; \
-	PT_REGS_EBP(regs) = 0; \
-	PT_REGS_EAX(regs) = 0; \
+	PT_REGS_BX(regs) = 0; \
+	PT_REGS_CX(regs) = 0; \
+	PT_REGS_DX(regs) = 0; \
+	PT_REGS_SI(regs) = 0; \
+	PT_REGS_DI(regs) = 0; \
+	PT_REGS_BP(regs) = 0; \
+	PT_REGS_AX(regs) = 0; \
 } while (0)
 
 /* Shamelessly stolen from include/asm-i386/elf.h */
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs) do {	\
-	pr_reg[0] = PT_REGS_EBX(regs);		\
-	pr_reg[1] = PT_REGS_ECX(regs);		\
-	pr_reg[2] = PT_REGS_EDX(regs);		\
-	pr_reg[3] = PT_REGS_ESI(regs);		\
-	pr_reg[4] = PT_REGS_EDI(regs);		\
-	pr_reg[5] = PT_REGS_EBP(regs);		\
-	pr_reg[6] = PT_REGS_EAX(regs);		\
+	pr_reg[0] = PT_REGS_BX(regs);		\
+	pr_reg[1] = PT_REGS_CX(regs);		\
+	pr_reg[2] = PT_REGS_DX(regs);		\
+	pr_reg[3] = PT_REGS_SI(regs);		\
+	pr_reg[4] = PT_REGS_DI(regs);		\
+	pr_reg[5] = PT_REGS_BP(regs);		\
+	pr_reg[6] = PT_REGS_AX(regs);		\
 	pr_reg[7] = PT_REGS_DS(regs);		\
 	pr_reg[8] = PT_REGS_ES(regs);		\
 	/* fake once used fs and gs selectors? */	\
@@ -130,13 +130,13 @@ do {								\
 #define ELF_ARCH        EM_X86_64
 
 #define ELF_PLAT_INIT(regs, load_addr)    do { \
-	PT_REGS_RBX(regs) = 0; \
-	PT_REGS_RCX(regs) = 0; \
-	PT_REGS_RDX(regs) = 0; \
-	PT_REGS_RSI(regs) = 0; \
-	PT_REGS_RDI(regs) = 0; \
-	PT_REGS_RBP(regs) = 0; \
-	PT_REGS_RAX(regs) = 0; \
+	PT_REGS_BX(regs) = 0; \
+	PT_REGS_CX(regs) = 0; \
+	PT_REGS_DX(regs) = 0; \
+	PT_REGS_SI(regs) = 0; \
+	PT_REGS_DI(regs) = 0; \
+	PT_REGS_BP(regs) = 0; \
+	PT_REGS_AX(regs) = 0; \
 	PT_REGS_R8(regs) = 0; \
 	PT_REGS_R9(regs) = 0; \
 	PT_REGS_R10(regs) = 0; \
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index c8aca8c501b0..950dfb7b8417 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -1,5 +1,39 @@
+#ifndef __UM_X86_PTRACE_H
+#define __UM_X86_PTRACE_H
+
 #ifdef CONFIG_X86_32
 # include "ptrace_32.h"
 #else
 # include "ptrace_64.h"
 #endif
+
+#define PT_REGS_AX(r) UPT_AX(&(r)->regs)
+#define PT_REGS_BX(r) UPT_BX(&(r)->regs)
+#define PT_REGS_CX(r) UPT_CX(&(r)->regs)
+#define PT_REGS_DX(r) UPT_DX(&(r)->regs)
+
+#define PT_REGS_SI(r) UPT_SI(&(r)->regs)
+#define PT_REGS_DI(r) UPT_DI(&(r)->regs)
+#define PT_REGS_BP(r) UPT_BP(&(r)->regs)
+#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs)
+
+#define PT_REGS_CS(r) UPT_CS(&(r)->regs)
+#define PT_REGS_SS(r) UPT_SS(&(r)->regs)
+#define PT_REGS_DS(r) UPT_DS(&(r)->regs)
+#define PT_REGS_ES(r) UPT_ES(&(r)->regs)
+
+#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_AX(r)
+#define PT_REGS_SYSCALL_RET(r) PT_REGS_AX(r)
+
+#define PT_FIX_EXEC_STACK(sp) do ; while(0)
+
+#define profile_pc(regs) PT_REGS_IP(regs)
+
+#define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2)
+#define UPT_SET_SYSCALL_RETURN(r, res) (UPT_AX(r) = (res))
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_AX(regs);
+}
+#endif /* __UM_X86_PTRACE_H */
diff --git a/arch/x86/um/asm/ptrace_32.h b/arch/x86/um/asm/ptrace_32.h
index 5d2a59112537..2cf225351b65 100644
--- a/arch/x86/um/asm/ptrace_32.h
+++ b/arch/x86/um/asm/ptrace_32.h
@@ -11,29 +11,6 @@
 #include "linux/compiler.h"
 #include "asm/ptrace-generic.h"
 
-#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs)
-#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs)
-#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs)
-#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs)
-#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs)
-#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs)
-#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs)
-
-#define PT_REGS_CS(r) UPT_CS(&(r)->regs)
-#define PT_REGS_SS(r) UPT_SS(&(r)->regs)
-#define PT_REGS_DS(r) UPT_DS(&(r)->regs)
-#define PT_REGS_ES(r) UPT_ES(&(r)->regs)
-#define PT_REGS_FS(r) UPT_FS(&(r)->regs)
-#define PT_REGS_GS(r) UPT_GS(&(r)->regs)
-
-#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs)
-
-#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r)
-#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r)
-#define PT_FIX_EXEC_STACK(sp) do ; while(0)
-
-#define profile_pc(regs) PT_REGS_IP(regs)
-
 #define user_mode(r) UPT_IS_USER(&(r)->regs)
 
 /*
diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h
index 706a0d80545c..ea7bff394320 100644
--- a/arch/x86/um/asm/ptrace_64.h
+++ b/arch/x86/um/asm/ptrace_64.h
@@ -15,13 +15,6 @@
 
 #define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64
 
-#define PT_REGS_RBX(r) UPT_RBX(&(r)->regs)
-#define PT_REGS_RCX(r) UPT_RCX(&(r)->regs)
-#define PT_REGS_RDX(r) UPT_RDX(&(r)->regs)
-#define PT_REGS_RSI(r) UPT_RSI(&(r)->regs)
-#define PT_REGS_RDI(r) UPT_RDI(&(r)->regs)
-#define PT_REGS_RBP(r) UPT_RBP(&(r)->regs)
-#define PT_REGS_RAX(r) UPT_RAX(&(r)->regs)
 #define PT_REGS_R8(r) UPT_R8(&(r)->regs)
 #define PT_REGS_R9(r) UPT_R9(&(r)->regs)
 #define PT_REGS_R10(r) UPT_R10(&(r)->regs)
@@ -31,27 +24,8 @@
 #define PT_REGS_R14(r) UPT_R14(&(r)->regs)
 #define PT_REGS_R15(r) UPT_R15(&(r)->regs)
 
-#define PT_REGS_FS(r) UPT_FS(&(r)->regs)
-#define PT_REGS_GS(r) UPT_GS(&(r)->regs)
-#define PT_REGS_DS(r) UPT_DS(&(r)->regs)
-#define PT_REGS_ES(r) UPT_ES(&(r)->regs)
-#define PT_REGS_SS(r) UPT_SS(&(r)->regs)
-#define PT_REGS_CS(r) UPT_CS(&(r)->regs)
-
-#define PT_REGS_ORIG_RAX(r) UPT_ORIG_RAX(&(r)->regs)
-#define PT_REGS_RIP(r) UPT_IP(&(r)->regs)
-#define PT_REGS_SP(r) UPT_SP(&(r)->regs)
-
-#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs)
-
 /* XXX */
 #define user_mode(r) UPT_IS_USER(&(r)->regs)
-#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_RAX(r)
-#define PT_REGS_SYSCALL_RET(r) PT_REGS_RAX(r)
-
-#define PT_FIX_EXEC_STACK(sp) do ; while(0)
-
-#define profile_pc(regs) PT_REGS_IP(regs)
 
 struct user_desc;
 
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 2bbe1ec2d96a..6ce2d76eb908 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,15 +1,74 @@
 #ifndef __SYSDEP_X86_PTRACE_H
 #define __SYSDEP_X86_PTRACE_H
 
+#include <generated/user_constants.h>
+#include "sysdep/faultinfo.h"
+
+#define MAX_REG_OFFSET (UM_FRAME_SIZE)
+#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long))
+
+#define REGS_IP(r) ((r)[HOST_IP])
+#define REGS_SP(r) ((r)[HOST_SP])
+#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS])
+#define REGS_AX(r) ((r)[HOST_AX])
+#define REGS_BX(r) ((r)[HOST_BX])
+#define REGS_CX(r) ((r)[HOST_CX])
+#define REGS_DX(r) ((r)[HOST_DX])
+#define REGS_SI(r) ((r)[HOST_SI])
+#define REGS_DI(r) ((r)[HOST_DI])
+#define REGS_BP(r) ((r)[HOST_BP])
+#define REGS_CS(r) ((r)[HOST_CS])
+#define REGS_SS(r) ((r)[HOST_SS])
+#define REGS_DS(r) ((r)[HOST_DS])
+#define REGS_ES(r) ((r)[HOST_ES])
+
+#define UPT_IP(r) REGS_IP((r)->gp)
+#define UPT_SP(r) REGS_SP((r)->gp)
+#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp)
+#define UPT_AX(r) REGS_AX((r)->gp)
+#define UPT_BX(r) REGS_BX((r)->gp)
+#define UPT_CX(r) REGS_CX((r)->gp)
+#define UPT_DX(r) REGS_DX((r)->gp)
+#define UPT_SI(r) REGS_SI((r)->gp)
+#define UPT_DI(r) REGS_DI((r)->gp)
+#define UPT_BP(r) REGS_BP((r)->gp)
+#define UPT_CS(r) REGS_CS((r)->gp)
+#define UPT_SS(r) REGS_SS((r)->gp)
+#define UPT_DS(r) REGS_DS((r)->gp)
+#define UPT_ES(r) REGS_ES((r)->gp)
+
 #ifdef __i386__
 #include "ptrace_32.h"
 #else
 #include "ptrace_64.h"
 #endif
 
-static inline long regs_return_value(struct uml_pt_regs *regs)
-{
-	return UPT_SYSCALL_RET(regs);
-}
+struct syscall_args {
+	unsigned long args[6];
+};
+
+#define SYSCALL_ARGS(r) ((struct syscall_args) \
+			 { .args = { UPT_SYSCALL_ARG1(r),	 \
+				     UPT_SYSCALL_ARG2(r),	 \
+				     UPT_SYSCALL_ARG3(r),	 \
+				     UPT_SYSCALL_ARG4(r),	 \
+				     UPT_SYSCALL_ARG5(r),	 \
+				     UPT_SYSCALL_ARG6(r) } } )
+
+struct uml_pt_regs {
+	unsigned long gp[MAX_REG_NR];
+	unsigned long fp[MAX_FP_NR];
+	struct faultinfo faultinfo;
+	long syscall;
+	int is_user;
+};
+
+#define EMPTY_UML_PT_REGS { }
+
+#define UPT_SYSCALL_NR(r) ((r)->syscall)
+#define UPT_FAULTINFO(r) (&(r)->faultinfo)
+#define UPT_IS_USER(r) ((r)->is_user)
+
+extern int user_context(unsigned long sp);
 
 #endif /* __SYSDEP_X86_PTRACE_H */
diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h
index befd1df32ed0..b94a108de1dc 100644
--- a/arch/x86/um/shared/sysdep/ptrace_32.h
+++ b/arch/x86/um/shared/sysdep/ptrace_32.h
@@ -6,11 +6,7 @@
 #ifndef __SYSDEP_I386_PTRACE_H
 #define __SYSDEP_I386_PTRACE_H
 
-#include <generated/user_constants.h>
-#include "sysdep/faultinfo.h"
-
-#define MAX_REG_NR (UM_FRAME_SIZE / sizeof(unsigned long))
-#define MAX_REG_OFFSET (UM_FRAME_SIZE)
+#define MAX_FP_NR HOST_FPX_SIZE
 
 static inline void update_debugregs(int seq) {}
 
@@ -24,90 +20,16 @@ void set_using_sysemu(int value);
 int get_using_sysemu(void);
 extern int sysemu_supported;
 
-#define REGS_IP(r) ((r)[HOST_IP])
-#define REGS_SP(r) ((r)[HOST_SP])
-#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS])
-#define REGS_EAX(r) ((r)[HOST_AX])
-#define REGS_EBX(r) ((r)[HOST_BX])
-#define REGS_ECX(r) ((r)[HOST_CX])
-#define REGS_EDX(r) ((r)[HOST_DX])
-#define REGS_ESI(r) ((r)[HOST_SI])
-#define REGS_EDI(r) ((r)[HOST_DI])
-#define REGS_EBP(r) ((r)[HOST_BP])
-#define REGS_CS(r) ((r)[HOST_CS])
-#define REGS_SS(r) ((r)[HOST_SS])
-#define REGS_DS(r) ((r)[HOST_DS])
-#define REGS_ES(r) ((r)[HOST_ES])
-#define REGS_FS(r) ((r)[HOST_FS])
-#define REGS_GS(r) ((r)[HOST_GS])
-
-#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res)
-
-#define IP_RESTART_SYSCALL(ip) ((ip) -= 2)
-#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r))
-
 #ifndef PTRACE_SYSEMU_SINGLESTEP
 #define PTRACE_SYSEMU_SINGLESTEP 32
 #endif
 
-struct uml_pt_regs {
-	unsigned long gp[MAX_REG_NR];
-	unsigned long fp[HOST_FPX_SIZE];
-	struct faultinfo faultinfo;
-	long syscall;
-	int is_user;
-};
-
-#define EMPTY_UML_PT_REGS { }
-
-#define UPT_IP(r) REGS_IP((r)->gp)
-#define UPT_SP(r) REGS_SP((r)->gp)
-#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp)
-#define UPT_EAX(r) REGS_EAX((r)->gp)
-#define UPT_EBX(r) REGS_EBX((r)->gp)
-#define UPT_ECX(r) REGS_ECX((r)->gp)
-#define UPT_EDX(r) REGS_EDX((r)->gp)
-#define UPT_ESI(r) REGS_ESI((r)->gp)
-#define UPT_EDI(r) REGS_EDI((r)->gp)
-#define UPT_EBP(r) REGS_EBP((r)->gp)
-#define UPT_ORIG_EAX(r) ((r)->syscall)
-#define UPT_CS(r) REGS_CS((r)->gp)
-#define UPT_SS(r) REGS_SS((r)->gp)
-#define UPT_DS(r) REGS_DS((r)->gp)
-#define UPT_ES(r) REGS_ES((r)->gp)
-#define UPT_FS(r) REGS_FS((r)->gp)
-#define UPT_GS(r) REGS_GS((r)->gp)
-
-#define UPT_SYSCALL_ARG1(r) UPT_EBX(r)
-#define UPT_SYSCALL_ARG2(r) UPT_ECX(r)
-#define UPT_SYSCALL_ARG3(r) UPT_EDX(r)
-#define UPT_SYSCALL_ARG4(r) UPT_ESI(r)
-#define UPT_SYSCALL_ARG5(r) UPT_EDI(r)
-#define UPT_SYSCALL_ARG6(r) UPT_EBP(r)
-
-extern int user_context(unsigned long sp);
-
-#define UPT_IS_USER(r) ((r)->is_user)
-
-struct syscall_args {
-	unsigned long args[6];
-};
-
-#define SYSCALL_ARGS(r) ((struct syscall_args) \
-			 { .args = { UPT_SYSCALL_ARG1(r),	\
-				     UPT_SYSCALL_ARG2(r),	\
-				     UPT_SYSCALL_ARG3(r),	\
-				     UPT_SYSCALL_ARG4(r),	\
-				     UPT_SYSCALL_ARG5(r),	\
-				     UPT_SYSCALL_ARG6(r) } } )
-
-#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp)
-
-#define UPT_ORIG_SYSCALL(r) UPT_EAX(r)
-#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r)
-#define UPT_SYSCALL_RET(r) UPT_EAX(r)
-
-#define UPT_FAULTINFO(r) (&(r)->faultinfo)
+#define UPT_SYSCALL_ARG1(r) UPT_BX(r)
+#define UPT_SYSCALL_ARG2(r) UPT_CX(r)
+#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
+#define UPT_SYSCALL_ARG4(r) UPT_SI(r)
+#define UPT_SYSCALL_ARG5(r) UPT_DI(r)
+#define UPT_SYSCALL_ARG6(r) UPT_BP(r)
 
 extern void arch_init_registers(int pid);
 
diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h
index 031edc53ac57..919789f1071e 100644
--- a/arch/x86/um/shared/sysdep/ptrace_64.h
+++ b/arch/x86/um/shared/sysdep/ptrace_64.h
@@ -8,22 +8,8 @@
 #ifndef __SYSDEP_X86_64_PTRACE_H
 #define __SYSDEP_X86_64_PTRACE_H
 
-#include <generated/user_constants.h>
-#include "sysdep/faultinfo.h"
+#define MAX_FP_NR HOST_FP_SIZE
 
-#define MAX_REG_OFFSET (UM_FRAME_SIZE)
-#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long))
-
-#define REGS_IP(r) ((r)[HOST_IP])
-#define REGS_SP(r) ((r)[HOST_SP])
-
-#define REGS_RBX(r) ((r)[HOST_BX])
-#define REGS_RCX(r) ((r)[HOST_CX])
-#define REGS_RDX(r) ((r)[HOST_DX])
-#define REGS_RSI(r) ((r)[HOST_SI])
-#define REGS_RDI(r) ((r)[HOST_DI])
-#define REGS_RBP(r) ((r)[HOST_BP])
-#define REGS_RAX(r) ((r)[HOST_AX])
 #define REGS_R8(r) ((r)[HOST_R8])
 #define REGS_R9(r) ((r)[HOST_R9])
 #define REGS_R10(r) ((r)[HOST_R10])
@@ -32,9 +18,6 @@
 #define REGS_R13(r) ((r)[HOST_R13])
 #define REGS_R14(r) ((r)[HOST_R14])
 #define REGS_R15(r) ((r)[HOST_R15])
-#define REGS_CS(r) ((r)[HOST_CS])
-#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS])
-#define REGS_SS(r) ((r)[HOST_SS])
 
 #define HOST_FS_BASE 21
 #define HOST_GS_BASE 22
@@ -58,45 +41,6 @@
 #define GS (HOST_GS * sizeof(long))
 #endif
 
-#define REGS_FS_BASE(r) ((r)[HOST_FS_BASE])
-#define REGS_GS_BASE(r) ((r)[HOST_GS_BASE])
-#define REGS_DS(r) ((r)[HOST_DS])
-#define REGS_ES(r) ((r)[HOST_ES])
-#define REGS_FS(r) ((r)[HOST_FS])
-#define REGS_GS(r) ((r)[HOST_GS])
-
-#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_AX])
-
-#define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res)
-
-#define IP_RESTART_SYSCALL(ip) ((ip) -= 2)
-#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r))
-
-#define REGS_FAULT_ADDR(r) ((r)->fault_addr)
-
-#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type)
-
-#define REGS_TRAP(r) ((r)->trap_type)
-
-#define REGS_ERR(r) ((r)->fault_type)
-
-struct uml_pt_regs {
-	unsigned long gp[MAX_REG_NR];
-	unsigned long fp[HOST_FP_SIZE];
-	struct faultinfo faultinfo;
-	long syscall;
-	int is_user;
-};
-
-#define EMPTY_UML_PT_REGS { }
-
-#define UPT_RBX(r) REGS_RBX((r)->gp)
-#define UPT_RCX(r) REGS_RCX((r)->gp)
-#define UPT_RDX(r) REGS_RDX((r)->gp)
-#define UPT_RSI(r) REGS_RSI((r)->gp)
-#define UPT_RDI(r) REGS_RDI((r)->gp)
-#define UPT_RBP(r) REGS_RBP((r)->gp)
-#define UPT_RAX(r) REGS_RAX((r)->gp)
 #define UPT_R8(r) REGS_R8((r)->gp)
 #define UPT_R9(r) REGS_R9((r)->gp)
 #define UPT_R10(r) REGS_R10((r)->gp)
@@ -105,51 +49,14 @@ struct uml_pt_regs {
 #define UPT_R13(r) REGS_R13((r)->gp)
 #define UPT_R14(r) REGS_R14((r)->gp)
 #define UPT_R15(r) REGS_R15((r)->gp)
-#define UPT_CS(r) REGS_CS((r)->gp)
-#define UPT_FS_BASE(r) REGS_FS_BASE((r)->gp)
-#define UPT_FS(r) REGS_FS((r)->gp)
-#define UPT_GS_BASE(r) REGS_GS_BASE((r)->gp)
-#define UPT_GS(r) REGS_GS((r)->gp)
-#define UPT_DS(r) REGS_DS((r)->gp)
-#define UPT_ES(r) REGS_ES((r)->gp)
-#define UPT_CS(r) REGS_CS((r)->gp)
-#define UPT_SS(r) REGS_SS((r)->gp)
-#define UPT_ORIG_RAX(r) REGS_ORIG_RAX((r)->gp)
-
-#define UPT_IP(r) REGS_IP((r)->gp)
-#define UPT_SP(r) REGS_SP((r)->gp)
-
-#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp)
-#define UPT_SYSCALL_NR(r) ((r)->syscall)
-#define UPT_SYSCALL_RET(r) UPT_RAX(r)
-
-extern int user_context(unsigned long sp);
 
-#define UPT_IS_USER(r) ((r)->is_user)
-
-#define UPT_SYSCALL_ARG1(r) UPT_RDI(r)
-#define UPT_SYSCALL_ARG2(r) UPT_RSI(r)
-#define UPT_SYSCALL_ARG3(r) UPT_RDX(r)
+#define UPT_SYSCALL_ARG1(r) UPT_DI(r)
+#define UPT_SYSCALL_ARG2(r) UPT_SI(r)
+#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
 #define UPT_SYSCALL_ARG4(r) UPT_R10(r)
 #define UPT_SYSCALL_ARG5(r) UPT_R8(r)
 #define UPT_SYSCALL_ARG6(r) UPT_R9(r)
 
-struct syscall_args {
-	unsigned long args[6];
-};
-
-#define SYSCALL_ARGS(r) ((struct syscall_args) \
-			 { .args = { UPT_SYSCALL_ARG1(r),	 \
-				     UPT_SYSCALL_ARG2(r),	 \
-				     UPT_SYSCALL_ARG3(r),	 \
-				     UPT_SYSCALL_ARG4(r),	 \
-				     UPT_SYSCALL_ARG5(r),	 \
-				     UPT_SYSCALL_ARG6(r) } } )
-
-#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp)
-
-#define UPT_FAULTINFO(r) (&(r)->faultinfo)
-
 static inline void arch_init_registers(int pid)
 {
 }
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 72eafa6c6a52..35b283d3df0c 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -413,9 +413,9 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
 
 	PT_REGS_SP(regs) = (unsigned long) frame;
 	PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
-	PT_REGS_EAX(regs) = (unsigned long) sig;
-	PT_REGS_EDX(regs) = (unsigned long) 0;
-	PT_REGS_ECX(regs) = (unsigned long) 0;
+	PT_REGS_AX(regs) = (unsigned long) sig;
+	PT_REGS_DX(regs) = (unsigned long) 0;
+	PT_REGS_CX(regs) = (unsigned long) 0;
 
 	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
 		ptrace_notify(SIGTRAP);
@@ -463,9 +463,9 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
 
 	PT_REGS_SP(regs) = (unsigned long) frame;
 	PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
-	PT_REGS_EAX(regs) = (unsigned long) sig;
-	PT_REGS_EDX(regs) = (unsigned long) &frame->info;
-	PT_REGS_ECX(regs) = (unsigned long) &frame->uc;
+	PT_REGS_AX(regs) = (unsigned long) sig;
+	PT_REGS_DX(regs) = (unsigned long) &frame->info;
+	PT_REGS_CX(regs) = (unsigned long) &frame->uc;
 
 	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
 		ptrace_notify(SIGTRAP);
@@ -573,17 +573,17 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
 	}
 
 	PT_REGS_SP(regs) = (unsigned long) frame;
-	PT_REGS_RDI(regs) = sig;
+	PT_REGS_DI(regs) = sig;
 	/* In case the signal handler was declared without prototypes */
-	PT_REGS_RAX(regs) = 0;
+	PT_REGS_AX(regs) = 0;
 
 	/*
 	 * This also works for non SA_SIGINFO handlers because they expect the
 	 * next argument after the signal number on the stack.
 	 */
-	PT_REGS_RSI(regs) = (unsigned long) &frame->info;
-	PT_REGS_RDX(regs) = (unsigned long) &frame->uc;
-	PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler;
+	PT_REGS_SI(regs) = (unsigned long) &frame->info;
+	PT_REGS_DX(regs) = (unsigned long) &frame->uc;
+	PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
  out:
 	return err;
 }
diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c
index 171b3e9dc867..2d5cc51e9bef 100644
--- a/arch/x86/um/sysrq_32.c
+++ b/arch/x86/um/sysrq_32.c
@@ -23,12 +23,10 @@ void show_regs(struct pt_regs *regs)
         printk(" EFLAGS: %08lx\n    %s\n", PT_REGS_EFLAGS(regs),
 	       print_tainted());
         printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
-                PT_REGS_EAX(regs), PT_REGS_EBX(regs), 
-	       PT_REGS_ECX(regs), 
-	       PT_REGS_EDX(regs));
+               PT_REGS_AX(regs), PT_REGS_BX(regs), 
+	       PT_REGS_CX(regs), PT_REGS_DX(regs));
         printk("ESI: %08lx EDI: %08lx EBP: %08lx",
-	       PT_REGS_ESI(regs), PT_REGS_EDI(regs), 
-	       PT_REGS_EBP(regs));
+	       PT_REGS_SI(regs), PT_REGS_DI(regs), PT_REGS_BP(regs));
         printk(" DS: %04lx ES: %04lx\n",
 	       0xffff & PT_REGS_DS(regs), 
 	       0xffff & PT_REGS_ES(regs));
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index e8913436d7dc..08258f179969 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -19,15 +19,15 @@ void __show_regs(struct pt_regs *regs)
 	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current),
 		current->comm, print_tainted(), init_utsname()->release);
 	printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff,
-	       PT_REGS_RIP(regs));
+	       PT_REGS_IP(regs));
 	printk(KERN_INFO "RSP: %016lx  EFLAGS: %08lx\n", PT_REGS_SP(regs),
 	       PT_REGS_EFLAGS(regs));
 	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
-	       PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs));
+	       PT_REGS_AX(regs), PT_REGS_BX(regs), PT_REGS_CX(regs));
 	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
-	       PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs));
+	       PT_REGS_DX(regs), PT_REGS_SI(regs), PT_REGS_DI(regs));
 	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
-	       PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs));
+	       PT_REGS_BP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs));
 	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
 	       PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs));
 	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index c6c7131e563b..baba84f8ecb8 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -219,7 +219,7 @@ int arch_copy_tls(struct task_struct *new)
 	int idx, ret = -EFAULT;
 
 	if (copy_from_user(&info,
-			   (void __user *) UPT_ESI(&new->thread.regs.regs),
+			   (void __user *) UPT_SI(&new->thread.regs.regs),
 			   sizeof(info)))
 		goto out;
 
-- 
cgit v1.2.3


From 0088b6ec8fa4773dd56b861bfc1630f4c3c069db Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 Apr 2012 03:28:20 -0400
Subject: um: stub_rt_sigsuspend isn't needed these days anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/um/sys_call_table_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 9924776f4265..170bd926a69c 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,7 +31,6 @@
 #define stub_fork sys_fork
 #define stub_vfork sys_vfork
 #define stub_execve sys_execve
-#define stub_rt_sigsuspend sys_rt_sigsuspend
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-- 
cgit v1.2.3


From ffc51be82b17e1c515fdb2dd5b92605798216b30 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 Apr 2012 16:34:27 -0400
Subject: um: missing checks of __put_user()/__get_user() return values

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/um/signal.c      |  4 ++--
 arch/x86/um/syscalls_32.c | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 35b283d3df0c..bb0fb03b9f85 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -544,8 +544,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
 			       set->sig[0]);
 	err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate);
 	if (sizeof(*set) == 16) {
-		__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
-		__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+		err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
+		err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
 	}
 	else
 		err |= __copy_to_user(&frame->uc.uc_sigmask, set,
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
index 70ca357393b8..b853e8600b9d 100644
--- a/arch/x86/um/syscalls_32.c
+++ b/arch/x86/um/syscalls_32.c
@@ -44,10 +44,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act,
 		old_sigset_t mask;
 		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
 		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
-		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+		    __get_user(mask, &act->sa_mask))
 			return -EFAULT;
-		__get_user(new_ka.sa.sa_flags, &act->sa_flags);
-		__get_user(mask, &act->sa_mask);
 		siginitset(&new_ka.sa.sa_mask, mask);
 	}
 
@@ -56,10 +56,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act,
 	if (!ret && oact) {
 		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
 		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
-		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
 			return -EFAULT;
-		__put_user(old_ka.sa.sa_flags, &oact->sa_flags);
-		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 764e0da14fd7ac2d259d98d34ece0a87d32306c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 May 2012 23:16:18 +0200
Subject: timers: Fixup the Kconfig consolidation fallout

Sigh, I missed to check which architecture Kconfig files actually
include the core Kconfig file. There are a few which did not. So we
broke them.

Instead of adding the includes to those, we are better off to move the
include to init/Kconfig like we did already with irqs and others.

This does not change anything for the architectures using the old
style periodic timer mode. It just solves the build wreckage there.

For those architectures which use the clock events infrastructure it
moves the include of the core Kconfig file to "General setup" which is
a way more logical place than having it at random locations specified
by the architecture specific Kconfigs.

Reported-by: Ingo Molnar <mingo@kernel.org>
Cc: Anna-Maria Gleixner <anna-maria@glx-um.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/Kconfig        |  2 --
 arch/avr32/Kconfig      |  2 --
 arch/blackfin/Kconfig   |  2 --
 arch/c6x/Kconfig        |  1 -
 arch/h8300/Kconfig.cpu  |  2 --
 arch/hexagon/Kconfig    |  1 -
 arch/m68k/Kconfig       |  4 ---
 arch/microblaze/Kconfig |  2 --
 arch/mips/Kconfig       |  2 --
 arch/mn10300/Kconfig    |  1 -
 arch/openrisc/Kconfig   |  1 -
 arch/powerpc/Kconfig    |  1 -
 arch/s390/Kconfig       |  2 --
 arch/score/Kconfig      |  1 -
 arch/sh/Kconfig         |  3 --
 arch/sparc/Kconfig      |  2 --
 arch/tile/Kconfig       |  2 --
 arch/um/Kconfig.um      |  1 -
 arch/unicore32/Kconfig  |  2 --
 arch/x86/Kconfig        |  2 --
 init/Kconfig            |  1 +
 kernel/time/Kconfig     | 73 +++++++++++++++++++++++++++----------------------
 22 files changed, 42 insertions(+), 68 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index feccc1d37ecf..c1e5f07fab93 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1459,8 +1459,6 @@ endmenu
 
 menu "Kernel Features"
 
-source "kernel/time/Kconfig"
-
 config HAVE_SMP
 	bool
 	help
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 0bd13ab9f43b..f8bc2d27d148 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -61,8 +61,6 @@ source "kernel/Kconfig.freezer"
 
 menu "System Type and features"
 
-source "kernel/time/Kconfig"
-
 config SUBARCH_AVR32B
 	bool
 config MMU
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index bc21de2e8fed..f7897eefa630 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -631,8 +631,6 @@ config GPTMR0_CLOCKSOURCE
 	depends on !TICKSOURCE_GPTMR0
 endmenu
 
-source kernel/time/Kconfig
-
 comment "Misc"
 
 choice
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 30c04c658b9e..9d446eff2c04 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -132,7 +132,6 @@ source "mm/Kconfig"
 source "kernel/Kconfig.preempt"
 
 source "kernel/Kconfig.hz"
-source "kernel/time/Kconfig"
 
 endmenu
 
diff --git a/arch/h8300/Kconfig.cpu b/arch/h8300/Kconfig.cpu
index 15c22286ae79..321f3922728b 100644
--- a/arch/h8300/Kconfig.cpu
+++ b/arch/h8300/Kconfig.cpu
@@ -1,7 +1,5 @@
 menu "Processor type and features"
 
-source "kernel/time/Kconfig"
-
 choice
 	prompt "H8/300 platform"
 	default H8300H_GENERIC
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 7727ed9d2bf3..35f6c32d040c 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -183,7 +183,6 @@ endchoice
 source "mm/Kconfig"
 
 source "kernel/Kconfig.hz"
-source "kernel/time/Kconfig"
 
 config GENERIC_GPIO
 	bool "Generic GPIO support"
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2f4b0f0610d6..cac5b6be572a 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -106,10 +106,6 @@ if COLDFIRE
 source "kernel/Kconfig.preempt"
 endif
 
-if !MMU || COLDFIRE
-source "kernel/time/Kconfig"
-endif
-
 source "mm/Kconfig"
 
 endmenu
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3e786ac9a655..83460468998d 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -74,8 +74,6 @@ source "arch/microblaze/platform/Kconfig.platform"
 
 menu "Processor type and features"
 
-source "kernel/time/Kconfig"
-
 source "kernel/Kconfig.preempt"
 
 source "kernel/Kconfig.hz"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index c9c330bc4e76..b65a730cba75 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2205,8 +2205,6 @@ config NR_CPUS
 	  performance should round up your number of processors to the next
 	  power of two.
 
-source "kernel/time/Kconfig"
-
 #
 # Timer Interrupt Frequency Configuration
 #
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 7f78057af2f5..687f9b4a2ed6 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -226,7 +226,6 @@ config MN10300_USING_JTAG
 	  single-stepping, which are taken over completely by the JTAG unit.
 
 source "kernel/Kconfig.hz"
-source "kernel/time/Kconfig"
 
 config MN10300_RTC
 	bool "Using MN10300 RTC"
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index be04485431fe..70653039e79b 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -106,7 +106,6 @@ config OPENRISC_HAVE_INST_DIV
 endmenu
 
 
-source "kernel/time/Kconfig"
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
 source "mm/Kconfig"
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 901215f7a2f2..d47cf7ffa792 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -278,7 +278,6 @@ config HIGHMEM
 	bool "High memory support"
 	depends on PPC32
 
-source kernel/time/Kconfig
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
 source "fs/Kconfig.binfmt"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f9edb9303a7e..d0325d9ae21f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -131,8 +131,6 @@ menu "Base setup"
 
 comment "Processor type and features"
 
-source "kernel/time/Kconfig"
-
 config 64BIT
 	def_bool y
 	prompt "64 bit kernel"
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index f5d3b3237419..ba0f412920be 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -66,7 +66,6 @@ config MEMORY_START
 	hex
 	default	0xa0000000
 
-source "kernel/time/Kconfig"
 source "kernel/Kconfig.hz"
 source "kernel/Kconfig.preempt"
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index cffd8b0082d5..820dfe3c7b69 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -577,9 +577,6 @@ config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
 	def_bool y if !CPU_SUBTYPE_SH7785 && !ARCH_SHMOBILE && \
 		      !CPU_SHX3 && !CPU_SUBTYPE_SH7757
-
-source "kernel/time/Kconfig"
-
 endmenu
 
 menu "CPU Frequency scaling"
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 33399d3d90bc..b5a035a5c53a 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -266,8 +266,6 @@ config HOTPLUG_CPU
 	  can be controlled through /sys/devices/system/cpu/cpu#.
 	  Say N if you want to disable CPU hotplug.
 
-source "kernel/time/Kconfig"
-
 if SPARC64
 source "drivers/cpufreq/Kconfig"
 
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index b56772cac5d2..4eec3a1a72c0 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -136,8 +136,6 @@ config NR_CPUS
 	  smaller kernel memory footprint results from using a smaller
 	  value on chips with fewer tiles.
 
-source "kernel/time/Kconfig"
-
 source "kernel/Kconfig.hz"
 
 config KEXEC
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index 70fd690964e4..bf87f25eb2de 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -10,7 +10,6 @@ config STATIC_LINK
 	  2.75G) for UML.
 
 source "mm/Kconfig"
-source "kernel/time/Kconfig"
 
 config LD_SCRIPT_STATIC
 	bool
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index a25ca7606bea..47ad5210606f 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -143,8 +143,6 @@ endmenu
 
 menu "Kernel Features"
 
-source "kernel/time/Kconfig"
-
 source "kernel/Kconfig.preempt"
 
 source "kernel/Kconfig.hz"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3b0a9217836a..1b1e0493ef7f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -241,8 +241,6 @@ config ZONE_DMA
 
 	  If unsure, say Y.
 
-source "kernel/time/Kconfig"
-
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
diff --git a/init/Kconfig b/init/Kconfig
index 6cfd71d06463..528a0c4111cc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -387,6 +387,7 @@ config AUDIT_LOGINUID_IMMUTABLE
 	  but may not be backwards compatible with older init systems.
 
 source "kernel/irq/Kconfig"
+source "kernel/time/Kconfig"
 
 menu "RCU Subsystem"
 
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f6ebc4ff702a..fd42bd452b75 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -2,38 +2,6 @@
 # Timer subsystem related configuration options
 #
 
-# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
-# only related to the tick functionality. Oneshot clockevent devices
-# are supported independ of this.
-config TICK_ONESHOT
-	bool
-
-config NO_HZ
-	bool "Tickless System (Dynamic Ticks)"
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
-	select TICK_ONESHOT
-	help
-	  This option enables a tickless system: timer interrupts will
-	  only trigger on an as-needed basis both when the system is
-	  busy and when the system is idle.
-
-config HIGH_RES_TIMERS
-	bool "High Resolution Timer Support"
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
-	select TICK_ONESHOT
-	help
-	  This option enables high resolution timer support. If your
-	  hardware is not capable then this option only increases
-	  the size of the kernel image.
-
-config GENERIC_CLOCKEVENTS_BUILD
-	bool
-	default y
-	depends on GENERIC_CLOCKEVENTS
-
-config GENERIC_CLOCKEVENTS_MIN_ADJUST
-	bool
-
 # Options selectable by arch Kconfig
 
 # Watchdog function for clocksources to detect instabilities
@@ -60,11 +28,52 @@ config ARCH_USES_GETTIMEOFFSET
 config GENERIC_CLOCKEVENTS
 	bool
 
+# Migration helper. Builds, but does not invoke
+config GENERIC_CLOCKEVENTS_BUILD
+	bool
+	default y
+	depends on GENERIC_CLOCKEVENTS
+
 # Clockevents broadcasting infrastructure
 config GENERIC_CLOCKEVENTS_BROADCAST
 	bool
 	depends on GENERIC_CLOCKEVENTS
 
+# Automatically adjust the min. reprogramming time for
+# clock event device
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+	bool
+
 # Generic update of CMOS clock
 config GENERIC_CMOS_UPDATE
 	bool
+
+if GENERIC_CLOCKEVENTS
+menu "Timers subsystem"
+
+# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
+# only related to the tick functionality. Oneshot clockevent devices
+# are supported independ of this.
+config TICK_ONESHOT
+	bool
+
+config NO_HZ
+	bool "Tickless System (Dynamic Ticks)"
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+	select TICK_ONESHOT
+	help
+	  This option enables a tickless system: timer interrupts will
+	  only trigger on an as-needed basis both when the system is
+	  busy and when the system is idle.
+
+config HIGH_RES_TIMERS
+	bool "High Resolution Timer Support"
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+	select TICK_ONESHOT
+	help
+	  This option enables high resolution timer support. If your
+	  hardware is not capable then this option only increases
+	  the size of the kernel image.
+
+endmenu
+endif
-- 
cgit v1.2.3


From 68f3f16d9ad0f1e28ab3fd0001ab5798c41f15a3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 May 2012 21:42:32 -0400
Subject: new helper: sigsuspend()

guts of saved_sigmask-based sigsuspend/rt_sigsuspend.  Takes
kernel sigset_t *.

Open-coded instances replaced with calling it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c         | 11 +----------
 arch/arm/kernel/signal.c           | 11 +----------
 arch/cris/arch-v10/kernel/signal.c | 16 ++++------------
 arch/cris/arch-v32/kernel/signal.c | 16 ++++------------
 arch/frv/kernel/signal.c           | 14 +++-----------
 arch/m68k/kernel/signal.c          | 15 +++------------
 arch/mips/kernel/signal.c          | 20 ++------------------
 arch/mips/kernel/signal32.c        | 20 ++------------------
 arch/mips/kernel/signal_n32.c      | 10 +---------
 arch/mn10300/kernel/signal.c       | 14 +++-----------
 arch/powerpc/kernel/signal_32.c    | 11 +----------
 arch/s390/kernel/signal.c          |  9 +--------
 arch/sh/kernel/signal_32.c         | 12 +-----------
 arch/sparc/kernel/signal_32.c      | 12 +-----------
 arch/sparc/kernel/signal_64.c      | 13 +------------
 arch/um/kernel/signal.c            |  9 +--------
 arch/x86/ia32/ia32_signal.c        | 12 +-----------
 arch/x86/kernel/signal.c           | 12 +-----------
 include/linux/signal.h             |  1 +
 kernel/compat.c                    | 10 +---------
 kernel/signal.c                    | 25 ++++++++++++++++---------
 21 files changed, 50 insertions(+), 223 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 35f2ef44de12..74b05e6ed441 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -121,17 +121,8 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act,
 SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 73d9a420850d..4e5fdd9bd9e3 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -67,17 +67,8 @@ const unsigned long syscall_restart_code[2] = {
 asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, old_sigset_t mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int 
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 289c584ba499..170f4970d590 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -48,19 +48,11 @@ void do_signal(int canrestart, struct pt_regs *regs);
  * dummy arguments to be able to reach the regs argument.  (Note that this
  * arrangement relies on old_sigset_t occupying one register.)
  */
-int sys_sigsuspend(old_sigset_t mask, long r11, long r12, long r13, long mof,
-	long srp, struct pt_regs *regs)
+int sys_sigsuspend(old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
-	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	sigset_t blocked;
+	siginitset(&blocked, mask);
+	return sigsuspend(&blocked);
 }
 
 int sys_sigaction(int sig, const struct old_sigaction __user *act,
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index ce4ab1a5552c..e09083208cb6 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -59,19 +59,11 @@ void keep_debug_flags(unsigned long oldccs, unsigned long oldspc,
  * dummy arguments to be able to reach the regs argument.
  */
 int
-sys_sigsuspend(old_sigset_t mask, long r11, long r12, long r13, long mof,
-	       long srp, struct pt_regs *regs)
+sys_sigsuspend(old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
-	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	sigset_t blocked;
+	siginitset(&blocked, mask);
+	return sigsuspend(&blocked);
 }
 
 int
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index bab01298b58e..df957c7ba387 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -40,17 +40,9 @@ struct fdpic_func_descriptor {
  */
 asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
-	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	sigset_t blocked;
+	siginitset(&blocked, mask);
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int sys_sigaction(int sig,
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 1747c7030a33..8186982fb320 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -230,18 +230,9 @@ static inline void push_cache(unsigned long vaddr)
 asmlinkage int
 sys_sigsuspend(int unused0, int unused1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
-	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-
-	return -ERESTARTNOHAND;
+	sigset_t blocked;
+	siginitset(&blocked, mask);
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index d5a338a1739c..17f6ee30ad0d 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -255,15 +255,7 @@ asmlinkage int sys_sigsuspend(nabi_no_regargs struct pt_regs regs)
 	uset = (sigset_t __user *) regs.regs[4];
 	if (copy_from_user(&newset, uset, sizeof(sigset_t)))
 		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 #endif
 
@@ -281,15 +273,7 @@ asmlinkage int sys_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
 	unewset = (sigset_t __user *) regs.regs[4];
 	if (copy_from_user(&newset, unewset, sizeof(newset)))
 		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 
 #ifdef CONFIG_TRAD_SIGNALS
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index ac3b8d89aae5..b4fe2eacbd5d 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -288,15 +288,7 @@ asmlinkage int sys32_sigsuspend(nabi_no_regargs struct pt_regs regs)
 	uset = (compat_sigset_t __user *) regs.regs[4];
 	if (get_sigset(&newset, uset))
 		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 
 asmlinkage int sys32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
@@ -313,15 +305,7 @@ asmlinkage int sys32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
 	uset = (compat_sigset_t __user *) regs.regs[4];
 	if (get_sigset(&newset, uset))
 		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 
 SYSCALL_DEFINE3(32_sigaction, long, sig, const struct sigaction32 __user *, act,
diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c
index 86eb4b04631c..63ffac9af7c5 100644
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -91,15 +91,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
 	if (copy_from_user(&uset, unewset, sizeof(uset)))
 		return -EFAULT;
 	sigset_from_compat(&newset, &uset);
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 
 asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 690f4e9507d7..50eb94a05826 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -38,17 +38,9 @@
  */
 asmlinkage long sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
-	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	sigset_t blocked;
+	siginitset(&blocked, mask);
+	return sigsuspend(&blocked);
 }
 
 /*
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 45eb998557f8..ac1f96027bf5 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -244,17 +244,8 @@ static inline int restore_general_regs(struct pt_regs *regs,
 long sys_sigsuspend(old_sigset_t mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
- 	current->state = TASK_INTERRUPTIBLE;
- 	schedule();
-	set_restore_sigmask();
- 	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 long sys_sigaction(int sig, struct old_sigaction __user *act,
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 8a4e2b760d56..f626232e216c 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -59,15 +59,8 @@ typedef struct
 SYSCALL_DEFINE3(sigsuspend, int, history0, int, history1, old_sigset_t, mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-	set_current_state(TASK_INTERRUPTIBLE);
-	schedule();
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act,
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 5901fba3176e..46c9f9b00b14 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -58,18 +58,8 @@ sys_sigsuspend(old_sigset_t mask,
 	       struct pt_regs __regs)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index ac8e66b50f07..2b7e849f7c65 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -64,18 +64,8 @@ struct rt_signal_frame {
 static int _sigpause_common(old_sigset_t set)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	set &= _BLOCKABLE;
 	siginitset(&blocked, set);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int sys_sigsuspend(old_sigset_t set)
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 48b0f57b65f7..eafaab486b2d 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -242,19 +242,8 @@ struct rt_signal_frame {
 static long _sigpause_common(old_sigset_t set)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	set &= _BLOCKABLE;
 	siginitset(&blocked, set);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-
-	set_restore_sigmask();
-
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage long sys_sigpause(unsigned int set)
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index fb12f4c5e649..b9b75b3bd5c9 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -152,15 +152,8 @@ int do_signal(void)
 long sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
 	sigset_t blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index a69245ba27e3..fa54410c6666 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -127,18 +127,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 115eac431483..b68ccadd2ff4 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -478,18 +478,8 @@ asmlinkage int
 sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
 	sigset_t blocked;
-
-	current->saved_sigmask = current->blocked;
-
-	mask &= _BLOCKABLE;
 	siginitset(&blocked, mask);
-	set_current_blocked(&blocked);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&blocked);
 }
 
 asmlinkage int
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 7987ce74874b..17046cc484bc 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -252,6 +252,7 @@ extern int do_sigtimedwait(const sigset_t *, siginfo_t *,
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern void set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
+extern int sigsuspend(sigset_t *);
 
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
 extern void block_sigmask(struct k_sigaction *ka, int signr);
diff --git a/kernel/compat.c b/kernel/compat.c
index d2c67aa49ae6..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1073,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
 		return -EFAULT;
 	sigset_from_compat(&newset, &newset32);
-	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..3ad220a81619 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3236,6 +3236,21 @@ SYSCALL_DEFINE0(pause)
 
 #endif
 
+#ifdef HAVE_SET_RESTORE_SIGMASK
+int sigsuspend(sigset_t *set)
+{
+	sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+	current->saved_sigmask = current->blocked;
+	set_current_blocked(set);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_restore_sigmask();
+	return -ERESTARTNOHAND;
+}
+#endif
+
 #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
 /**
  *  sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3253,15 +3268,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 
 	if (copy_from_user(&newset, unewset, sizeof(newset)))
 		return -EFAULT;
-	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
 
-- 
cgit v1.2.3


From ea4d26ae24e58fbd2c61de9242adab053cb982d8 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:04 +1000
Subject: raid5: add AVX optimized RAID5 checksumming

Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 arch/x86/Makefile              |   5 +-
 arch/x86/include/asm/xor_32.h  |   6 +-
 arch/x86/include/asm/xor_64.h  |   8 +-
 arch/x86/include/asm/xor_avx.h | 214 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 229 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/xor_avx.h

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..7a1cc9ee5c8a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do {							\
 	xor_speed(&xor_block_8regs_p);			\
 	xor_speed(&xor_block_32regs);			\
 	xor_speed(&xor_block_32regs_p);			\
+	AVX_XOR_SPEED;					\
 	if (cpu_has_xmm)				\
 		xor_speed(&xor_block_pIII_sse);		\
 	if (cpu_has_mmx) {				\
@@ -883,6 +887,6 @@ do {							\
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST)			\
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 do {						\
+	AVX_XOR_SPEED;				\
 	xor_speed(&xor_block_sse);		\
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+		BLOCK(32 * i, 0) \
+		BLOCK(32 * (i + 1), 1) \
+		BLOCK(32 * (i + 2), 2) \
+		BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+		BLOCK4(0) \
+		BLOCK4(4) \
+		BLOCK4(8) \
+		BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
-- 
cgit v1.2.3


From e8f380e00840f694599e6ab42806639f7de26f11 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 22 May 2012 12:53:45 +0200
Subject: x86/bitops: Move BIT_64() for a wider use

Needed for shifting 64-bit values on 32-bit, like MSR values,
for example.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frank Arnold <frank.arnold@amd.com>
Link: http://lkml.kernel.org/r/1337684026-19740-1-git-send-email-bp@amd64.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/bitops.h | 2 ++
 drivers/edac/mce_amd.h        | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b97596e2b68c..a6983b277220 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,8 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h>
 
+#define BIT_64(n)			(U64_C(1) << (n))
+
 /*
  * These have to be done with inline assembly: that way the bit-setting
  * is guaranteed to be atomic. All bit operations return 0 if the bit
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index c6074c5cd1ef..8c87a5e87057 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -5,8 +5,6 @@
 
 #include <asm/mce.h>
 
-#define BIT_64(n)			(U64_C(1) << (n))
-
 #define EC(x)				((x) & 0xffff)
 #define XEC(x, mask)			(((x) >> 16) & mask)
 
-- 
cgit v1.2.3


From 80f033610fb968e75f5d470233d8d0260d7a72ed Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 22 May 2012 12:53:46 +0200
Subject: x86/mce: Fix 32-bit build

Got bitten again by the BIT() macro:

 arch/x86/kernel/cpu/mcheck/mce.c: In function '__mcheck_cpu_apply_quirks':
 arch/x86/kernel/cpu/mcheck/mce.c:1453:6: warning: left shift
 count >= width of type arch/x86/kernel/cpu/mcheck/mce.c:1454:7: warning: left shift count >= width of type

Fix it already.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Cc: Frank Arnold <frank.arnold@amd.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1337684026-19740-2-git-send-email-bp@amd64.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 888fbf9d0adf..0456b9a08086 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1450,9 +1450,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 				 rdmsrl(msrs[i], val);
 
 				 /* CntP bit set? */
-				 if (val & BIT(62)) {
-					 val &= ~BIT(62);
-					 wrmsrl(msrs[i], val);
+				 if (val & BIT_64(62)) {
+					val &= ~BIT_64(62);
+					wrmsrl(msrs[i], val);
 				 }
 			 }
 
-- 
cgit v1.2.3


From fd952815307f0f272bf49fd364a7fd2f9992bc42 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 23 May 2012 14:02:34 -0700
Subject: x86-32, relocs: Whitelist more symbols for ld bug workaround

As noted in checkin:

a3e854d95 x86, relocs: Workaround for binutils 2.22.52.0.1 section bug

ld version 2.22.52.0.[12] can incorrectly promote relative symbols to
absolute, if the output section they appear in is otherwise empty.

Since checkin:

6520fe55 x86, realmode: 16-bit real-mode code support for relocs tool

we actually check for this and error out rather than silently creating
a kernel which will malfunction if relocated.

Ingo found a configuration in which __start_builtin_fw triggered the
warning.

Go through the linker script sources and look for more symbols that
could plausibly get bogusly promoted to absolute, and add them to the
whitelist.

In general, if the following error triggers:

	Invalid absolute R_386_32 relocation: <symbol>

... then we should verify that <symbol> is really meant to be
relocated, and add it and any related symbols manually to the S_REL
regexp.

Please note that 6520fe55 does not introduce the error, only the check
for the error -- without 6520fe55 this version of ld will simply
produce a corrupt kernel if CONFIG_RELOCATABLE is set on x86-32.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org> v3.4
---
 arch/x86/tools/relocs.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b43cfcd9bf40..b8f7c65fc40c 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -60,6 +60,17 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"__x86_cpu_dev_(start|end)|"
 	"(__parainstructions|__alt_instructions)(|_end)|"
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
+	"__(start|end)_pci_.*|"
+	"__(start|end)_builtin_fw|"
+	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+	"__(start|stop)___param|"
+	"__(start|stop)___modver|"
+	"__(start|stop)___bug_table|"
+	"__tracedata_(start|end)|"
+	"__(start|stop)_notes|"
+	"__end_rodata|"
+	"__initramfs_start|"
 	"_end)$"
 };
 
-- 
cgit v1.2.3


From a129a7c84582629741e5fa6f40026efcd7a65bd4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 19 Nov 2010 13:16:22 +0100
Subject: MCE: Fix vm86 handling for 32bit mce handler

When running on 32bit the mce handler could misinterpret
vm86 mode as ring 0. This can affect whether it does recovery
or not; it was possible to panic when recovery was actually
possible.

Fix this by always forcing vm86 to look like ring 3.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 66e1c51be084..5f793e6c854b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 			m->ip = regs->ip;
 			m->cs = regs->cs;
+
+			/*
+			 * When in VM86 mode make the cs look like ring 3
+			 * always. This is a lie, but it's better than passing
+			 * the additional vm86 bit around everywhere.
+			 */
+			if (v8086_mode(regs))
+				m->cs |= 3;
 		}
 		/* Use accurate RIP reporting if available. */
 		if (rip_msr)
-- 
cgit v1.2.3


From 875e26648cf9b6db9d8dc07b7959d7c61fb3f49c Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 23 May 2012 14:14:22 -0700
Subject: x86/mce: Fix check for processor context when machine check was
 taken.

Linus pointed out that there was no value is checking whether m->ip
was zero - because zero is a legimate value.  If we have a reliable
(or faked in the VM86 case) "m->cs" we can use it to tell whether we
were in user mode or kernelwhen the machine check hit.

Reported-by: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 0c82091b1652..1ccd453903d8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -165,15 +165,19 @@ static struct severity {
 };
 
 /*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user from from one
+ * taken in the kernel.
  */
 static int error_context(struct mce *m)
 {
-	if (m->mcgstatus & MCG_STATUS_EIPV)
-		return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
-	/* Unknown, assume kernel */
-	return IN_KERNEL;
+	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
 int mce_severity(struct mce *m, int tolerant, char **msg)
-- 
cgit v1.2.3


From 37c3459b67dd5a396a968e819cf4a86d24ac9ace Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 10 May 2012 11:12:14 -0700
Subject: x86/mce: Add instruction recovery signatures to mce-severity table

Instruction recovery cases are very similar to the data recovery one
we already have. Just trade out for a new MCACOD value.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1ccd453903d8..413c2ced887c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
 		),
+	MCESEV(
+		KEEP, "HT thread notices Action required: instruction fetch error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+		MCGMASK(MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		AR, "Action required: instruction fetch error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+		USER
+		),
 #endif
 	MCESEV(
 		PANIC, "Action required: unknown MCACOD",
-- 
cgit v1.2.3


From a42c6ded827dbd396d2efde7530620be029a72d1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 May 2012 14:44:37 -0400
Subject: move key_repace_session_keyring() into tracehook_notify_resume()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c      | 2 --
 arch/arm/kernel/signal.c        | 2 --
 arch/avr32/kernel/signal.c      | 2 --
 arch/blackfin/kernel/signal.c   | 2 --
 arch/c6x/kernel/signal.c        | 2 --
 arch/cris/kernel/ptrace.c       | 2 --
 arch/frv/kernel/signal.c        | 2 --
 arch/h8300/kernel/signal.c      | 2 --
 arch/hexagon/kernel/signal.c    | 2 --
 arch/ia64/kernel/process.c      | 2 --
 arch/m32r/kernel/signal.c       | 2 --
 arch/m68k/kernel/signal.c       | 5 +----
 arch/microblaze/kernel/signal.c | 5 +----
 arch/mips/kernel/signal.c       | 2 --
 arch/mn10300/kernel/signal.c    | 2 --
 arch/openrisc/kernel/signal.c   | 2 --
 arch/parisc/kernel/signal.c     | 2 --
 arch/powerpc/kernel/signal.c    | 2 --
 arch/s390/kernel/signal.c       | 2 --
 arch/score/kernel/signal.c      | 2 --
 arch/sh/kernel/signal_32.c      | 2 --
 arch/sh/kernel/signal_64.c      | 2 --
 arch/sparc/kernel/signal_32.c   | 2 --
 arch/sparc/kernel/signal_64.c   | 2 --
 arch/tile/kernel/process.c      | 2 --
 arch/um/kernel/process.c        | 5 +----
 arch/unicore32/kernel/signal.c  | 2 --
 arch/x86/kernel/signal.c        | 2 --
 arch/xtensa/kernel/signal.c     | 5 +----
 include/linux/tracehook.h       | 2 ++
 30 files changed, 6 insertions(+), 66 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 10ab2d74ecbb..f6db3032ddf0 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -590,7 +590,5 @@ do_notify_resume(struct pt_regs *regs, struct switch_stack *sw,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 4e5fdd9bd9e3..ec640412aed0 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -728,7 +728,5 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	if (thread_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index ae386c304bee..e7595ef74f51 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -321,7 +321,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
 	if (ti->flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index e5bbc1a5edc2..fc9ecce8b6ce 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -336,8 +336,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs)
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index cf37478c1169..9493f0bbf0a6 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -364,7 +364,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags,
 	if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c
index d114ad3da9b1..58d44ee1a71f 100644
--- a/arch/cris/kernel/ptrace.c
+++ b/arch/cris/kernel/ptrace.c
@@ -40,7 +40,5 @@ void do_notify_resume(int canrestart, struct pt_regs *regs,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 8cf5dca01758..595bf1e5a5dc 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -562,8 +562,6 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(__frame);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 
 } /* end do_notify_resume() */
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index d4b0555d2904..e58992ad789e 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -513,7 +513,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 434866eb0f1c..21a3018cb9bf 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -273,8 +273,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 5e0e86ddb12f..dd6fc1449741 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -199,8 +199,6 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(&scr->pt);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 
 	/* copy user rbs to kernel rbs */
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index f54d96993ea1..64804f1f5141 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -383,8 +383,6 @@ void do_notify_resume(struct pt_regs *regs, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 
 	clear_thread_flag(TIF_IRET);
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index d9f3d1900eed..973eec60cad4 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -1193,9 +1193,6 @@ void do_notify_resume(struct pt_regs *regs)
 	if (test_thread_flag(TIF_SIGPENDING))
 		do_signal(regs);
 
-	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) {
+	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
-	}
 }
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 7f4c7bef1642..5d796e32786e 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -401,9 +401,6 @@ void do_notify_resume(struct pt_regs *regs, int in_syscall)
 	if (test_thread_flag(TIF_SIGPENDING))
 		do_signal(regs, in_syscall);
 
-	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) {
+	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
-	}
 }
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 17f6ee30ad0d..8a6e6d116ab0 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -636,8 +636,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 890cf91767cc..b8b6aa1a6837 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -554,7 +554,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(current_frame());
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index e970743251ae..9ae611522953 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -376,7 +376,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs)
 	if (current_thread_info()->flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 4b9cb0d546d1..e7a7cd3e1120 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -638,7 +638,5 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall)
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 651c5963662b..bfc3ec1382fb 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -193,8 +193,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index f626232e216c..42a6e8b47f06 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -494,6 +494,4 @@ void do_notify_resume(struct pt_regs *regs)
 {
 	clear_thread_flag(TIF_NOTIFY_RESUME);
 	tracehook_notify_resume(regs);
-	if (current->replacement_session_keyring)
-		key_replace_session_keyring();
 }
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index d4a49011c48a..302838d3acf6 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -356,7 +356,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index cb4172c8af7d..9d7bfd66f189 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -626,7 +626,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index b589a354c069..aa6428430842 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -685,7 +685,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 2b7e849f7c65..6b42e8622d12 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -590,8 +590,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index eafaab486b2d..c82cf1cc3965 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -607,8 +607,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index f572c19c4082..32817ab6062a 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -569,8 +569,6 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 		return 1;
 	}
 	if (thread_info_flags & _TIF_SINGLESTEP) {
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 3a2235e0abc3..ccb9a9d283f1 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -117,11 +117,8 @@ void interrupt_end(void)
 		schedule();
 	if (test_thread_flag(TIF_SIGPENDING))
 		do_signal();
-	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) {
+	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
 		tracehook_notify_resume(&current->thread.regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
-	}
 }
 
 void exit_thread(void)
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 7754df6ef7d4..28782ad47b93 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -464,8 +464,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	if (thread_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b68ccadd2ff4..9363b58b967c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -821,8 +821,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 	}
 	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
 		fire_user_return_notifiers();
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index c5e4ec0598d2..ea7e17778a75 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -548,9 +548,6 @@ void do_notify_resume(struct pt_regs *regs)
 	if (test_thread_flag(TIF_SIGPENDING))
 		do_signal(regs);
 
-	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) {
+	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
-	}
 }
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8a2a3fc9bd05..b9ca903bb553 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -183,6 +183,8 @@ static inline void set_notify_resume(struct task_struct *task)
  */
 static inline void tracehook_notify_resume(struct pt_regs *regs)
 {
+	if (current->replacement_session_keyring)
+		key_replace_session_keyring();
 }
 
 #endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From ea17e7414bc62e8d3bde8d08e3df1d921c518c17 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 24 May 2012 07:01:38 -0700
Subject: x86, relocs: Add jiffies and jiffies_64 to the relative whitelist

The symbol jiffies is created in the linker script as an alias to
jiffies_64.  Unfortunately this is done outside any section, and
apparently GNU ld 2.21 doesn't carry the section with it, so we end up
with an absolute symbol and therefore a broken kernel.

Add jiffies and jiffies_64 to the whitelist.

The most disturbing bit with this discovery is that it shows that we
have had multiple linker bugs in this area crossing multiple
generations, and have been silently building bad kernels for some time.

Link: http://lkml.kernel.org/r/20120524171604.0d98284f3affc643e9714470@canb.auug.org.au
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org> v3.4
---
 arch/x86/tools/relocs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b8f7c65fc40c..b685296d4464 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -71,6 +71,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"__(start|stop)_notes|"
 	"__end_rodata|"
 	"__initramfs_start|"
+	"(jiffies|jiffies_64)|"
 	"_end)$"
 };
 
-- 
cgit v1.2.3


From 446969084d33a4064a39d280806da642c54ba4ac Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 23 May 2012 20:12:50 -0700
Subject: kernel: Move REPEAT_BYTE definition into linux/kernel.h

And make sure that everything using it explicitly includes
that header file.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/usercopy.c             | 3 +--
 arch/x86/include/asm/word-at-a-time.h | 4 ++--
 fs/namei.c                            | 1 +
 include/linux/kernel.h                | 2 ++
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c
index f61ed820cb61..0b12e91d6ccc 100644
--- a/arch/sparc/lib/usercopy.c
+++ b/arch/sparc/lib/usercopy.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/bug.h>
 
@@ -11,8 +12,6 @@ void copy_from_user_overflow(void)
 }
 EXPORT_SYMBOL(copy_from_user_overflow);
 
-#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
-
 static inline long find_zero(unsigned long mask)
 {
 	long byte = 0;
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e58f03b206c3..ae03facfadd6 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_WORD_AT_A_TIME_H
 #define _ASM_WORD_AT_A_TIME_H
 
+#include <linux/kernel.h>
+
 /*
  * This is largely generic for little-endian machines, but the
  * optimal byte mask counting is probably going to be something
@@ -35,8 +37,6 @@ static inline long count_masked_bytes(long mask)
 
 #endif
 
-#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
-
 /* Return the high bit set in the first byte that is a zero */
 static inline unsigned long has_zero(unsigned long a)
 {
diff --git a/fs/namei.c b/fs/namei.c
index f9e883c1b856..8d2ba420e42f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -16,6 +16,7 @@
 
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 645231c373c8..fbe9bfacb8db 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -38,6 +38,8 @@
 
 #define STACK_MAGIC	0xdeadbeef
 
+#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
+
 #define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
 #define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
-- 
cgit v1.2.3


From 1b38a3a10f2ad96a3c0130f63b7f3610bab7090d Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 25 May 2012 11:40:09 +0100
Subject: x86: hpet: Fix copy-and-paste mistake in earlier change

This fixes an oversight in 396e2c6fed4ff13b53ce0e573105531cf53b0cad
("x86: Clear HPET configuration registers on startup"), noticed by
Thomas Gleixner.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4FBF7DA902000078000861EE@nat28.tlf.novell.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/hpet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9cc7b4392f7c..1460a5df92f7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -870,7 +870,7 @@ int __init hpet_enable(void)
 	else
 		pr_warn("HPET initial state will not be saved\n");
 	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
-	hpet_writel(cfg, HPET_Tn_CFG(i));
+	hpet_writel(cfg, HPET_CFG);
 	if (cfg)
 		pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
 			cfg);
-- 
cgit v1.2.3


From 4ae73f2d53255c388d50bf83c1681112a6f9cba1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 26 May 2012 10:14:39 -0700
Subject: x86: use generic strncpy_from_user routine

The generic strncpy_from_user() is not really optimal, since it is
designed to work on both little-endian and big-endian.  And on
little-endian you can simplify much of the logic to find the first zero
byte, since little-endian arithmetic doesn't have to worry about the
carry bit propagating into earlier bytes (only later bytes, which we
don't care about).

But I have patches to make the generic routines use the architecture-
specific <asm/word-at-a-time.h> infrastructure, so that we can regain
the little-endian optimizations.  But before we do that, switch over to
the generic routines to make the patches each do just one well-defined
thing.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/uaccess.h |  1 +
 arch/x86/lib/usercopy.c        | 97 ------------------------------------------
 3 files changed, 2 insertions(+), 97 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 81c3e8be789a..3220d44e24d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
 	select GENERIC_TIME_VSYSCALL if X86_64
 	select KTIME_SCALAR if X86_32
+	select GENERIC_STRNCPY_FROM_USER
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS || UPROBES)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 851fe0dc13bc..1354facd8f63 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -32,6 +32,7 @@
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
+#define user_addr_max() (current_thread_info()->addr_limit.seg)
 #define __addr_ok(addr)					\
 	((unsigned long __force)(addr) <		\
 	 (current_thread_info()->addr_limit.seg))
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 2e4e4b02c37a..f61ee67ec00f 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -43,100 +43,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
-
-/*
- * Do a strncpy, return length of string without final '\0'.
- * 'count' is the user-supplied count (return 'count' if we
- * hit it), 'max' is the address space maximum (and we return
- * -EFAULT if we hit it).
- */
-static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max)
-{
-	long res = 0;
-
-	/*
-	 * Truncate 'max' to the user-specified limit, so that
-	 * we only have one limit we need to check in the loop
-	 */
-	if (max > count)
-		max = count;
-
-	while (max >= sizeof(unsigned long)) {
-		unsigned long c, mask;
-
-		/* Fall back to byte-at-a-time if we get a page fault */
-		if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
-			break;
-		mask = has_zero(c);
-		if (mask) {
-			mask = (mask - 1) & ~mask;
-			mask >>= 7;
-			*(unsigned long *)(dst+res) = c & mask;
-			return res + count_masked_bytes(mask);
-		}
-		*(unsigned long *)(dst+res) = c;
-		res += sizeof(unsigned long);
-		max -= sizeof(unsigned long);
-	}
-
-	while (max) {
-		char c;
-
-		if (unlikely(__get_user(c,src+res)))
-			return -EFAULT;
-		dst[res] = c;
-		if (!c)
-			return res;
-		res++;
-		max--;
-	}
-
-	/*
-	 * Uhhuh. We hit 'max'. But was that the user-specified maximum
-	 * too? If so, that's ok - we got as much as the user asked for.
-	 */
-	if (res >= count)
-		return res;
-
-	/*
-	 * Nope: we hit the address space limit, and we still had more
-	 * characters the caller would have wanted. That's an EFAULT.
-	 */
-	return -EFAULT;
-}
-
-/**
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	unsigned long max_addr, src_addr;
-
-	if (unlikely(count <= 0))
-		return 0;
-
-	max_addr = current_thread_info()->addr_limit.seg;
-	src_addr = (unsigned long)src;
-	if (likely(src_addr < max_addr)) {
-		unsigned long max = max_addr - src_addr;
-		return do_strncpy_from_user(dst, src, count, max);
-	}
-	return -EFAULT;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-- 
cgit v1.2.3


From 36126f8f2ed8168eb13aa0662b9b9585cba100a9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 26 May 2012 10:43:17 -0700
Subject: word-at-a-time: make the interfaces truly generic

This changes the interfaces in <asm/word-at-a-time.h> to be a bit more
complicated, but a lot more generic.

In particular, it allows us to really do the operations efficiently on
both little-endian and big-endian machines, pretty much regardless of
machine details.  For example, if you can rely on a fast population
count instruction on your architecture, this will allow you to make your
optimized <asm/word-at-a-time.h> file with that.

NOTE! The "generic" version in include/asm-generic/word-at-a-time.h is
not truly generic, it actually only works on big-endian.  Why? Because
on little-endian the generic algorithms are wasteful, since you can
inevitably do better. The x86 implementation is an example of that.

(The only truly non-generic part of the asm-generic implementation is
the "find_zero()" function, and you could make a little-endian version
of it.  And if the Kbuild infrastructure allowed us to pick a particular
header file, that would be lovely)

The <asm/word-at-a-time.h> functions are as follows:

 - WORD_AT_A_TIME_CONSTANTS: specific constants that the algorithm
   uses.

 - has_zero(): take a word, and determine if it has a zero byte in it.
   It gets the word, the pointer to the constant pool, and a pointer to
   an intermediate "data" field it can set.

   This is the "quick-and-dirty" zero tester: it's what is run inside
   the hot loops.

 - "prep_zero_mask()": take the word, the data that has_zero() produced,
   and the constant pool, and generate an *exact* mask of which byte had
   the first zero.  This is run directly *outside* the loop, and allows
   the "has_zero()" function to answer the "is there a zero byte"
   question without necessarily getting exactly *which* byte is the
   first one to contain a zero.

   If you do multiple byte lookups concurrently (eg "hash_name()", which
   looks for both NUL and '/' bytes), after you've done the prep_zero_mask()
   phase, the result of those can be or'ed together to get the "either
   or" case.

 - The result from "prep_zero_mask()" can then be fed into "find_zero()"
   (to find the byte offset of the first byte that was zero) or into
   "zero_bytemask()" (to find the bytemask of the bytes preceding the
   zero byte).

   The existence of zero_bytemask() is optional, and is not necessary
   for the normal string routines.  But dentry name hashing needs it, so
   if you enable DENTRY_WORD_AT_A_TIME you need to expose it.

This changes the generic strncpy_from_user() function and the dentry
hashing functions to use these modified word-at-a-time interfaces.  This
gets us back to the optimized state of the x86 strncpy that we lost in
the previous commit when moving over to the generic version.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/openrisc/include/asm/Kbuild      |  1 +
 arch/sparc/include/asm/Kbuild         |  1 +
 arch/x86/include/asm/word-at-a-time.h | 32 +++++++++++++++++++--
 fs/namei.c                            | 22 ++++++++-------
 include/asm-generic/word-at-a-time.h  | 52 +++++++++++++++++++++++++++++++++++
 lib/strncpy_from_user.c               | 47 +++++--------------------------
 6 files changed, 102 insertions(+), 53 deletions(-)
 create mode 100644 include/asm-generic/word-at-a-time.h

(limited to 'arch/x86')

diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index c936483bc8e2..3f35c38d7b64 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -66,3 +66,4 @@ generic-y += topology.h
 generic-y += types.h
 generic-y += ucontext.h
 generic-y += user.h
+generic-y += word-at-a-time.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 2c2e38821f60..67f83e0a0d68 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -21,3 +21,4 @@ generic-y += div64.h
 generic-y += local64.h
 generic-y += irq_regs.h
 generic-y += local.h
+generic-y += word-at-a-time.h
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index ae03facfadd6..5b238981542a 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -10,6 +10,11 @@
  * bit count instruction, that might be better than the multiply
  * and shift, for example.
  */
+struct word_at_a_time {
+	const unsigned long one_bits, high_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
 
 #ifdef CONFIG_64BIT
 
@@ -37,10 +42,31 @@ static inline long count_masked_bytes(long mask)
 
 #endif
 
-/* Return the high bit set in the first byte that is a zero */
-static inline unsigned long has_zero(unsigned long a)
+/* Return nonzero if it has a zero */
+static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
+{
+	unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
+	*bits = mask;
+	return mask;
+}
+
+static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
+{
+	return bits;
+}
+
+static inline unsigned long create_zero_mask(unsigned long bits)
+{
+	bits = (bits - 1) & ~bits;
+	return bits >> 7;
+}
+
+/* The mask we created is directly usable as a bytemask */
+#define zero_bytemask(mask) (mask)
+
+static inline unsigned long find_zero(unsigned long mask)
 {
-	return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80);
+	return count_masked_bytes(mask);
 }
 
 /*
diff --git a/fs/namei.c b/fs/namei.c
index 93ff12b1a1de..c651f02c9fec 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1452,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash);
  */
 static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 {
-	unsigned long a, mask, hash, len;
+	unsigned long a, b, adata, bdata, mask, hash, len;
+	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
 	hash = a = 0;
 	len = -sizeof(unsigned long);
@@ -1460,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 		hash = (hash + a) * 9;
 		len += sizeof(unsigned long);
 		a = load_unaligned_zeropad(name+len);
-		/* Do we have any NUL or '/' bytes in this word? */
-		mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
-	} while (!mask);
-
-	/* The mask *below* the first high bit set */
-	mask = (mask - 1) & ~mask;
-	mask >>= 7;
-	hash += a & mask;
+		b = a ^ REPEAT_BYTE('/');
+	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
+
+	adata = prep_zero_mask(a, adata, &constants);
+	bdata = prep_zero_mask(b, bdata, &constants);
+
+	mask = create_zero_mask(adata | bdata);
+
+	hash += a & zero_bytemask(mask);
 	*hashp = fold_hash(hash);
 
-	return len + count_masked_bytes(mask);
+	return len + find_zero(mask);
 }
 
 #else
diff --git a/include/asm-generic/word-at-a-time.h b/include/asm-generic/word-at-a-time.h
new file mode 100644
index 000000000000..3f21f1b72e45
--- /dev/null
+++ b/include/asm-generic/word-at-a-time.h
@@ -0,0 +1,52 @@
+#ifndef _ASM_WORD_AT_A_TIME_H
+#define _ASM_WORD_AT_A_TIME_H
+
+/*
+ * This says "generic", but it's actually big-endian only.
+ * Little-endian can use more efficient versions of these
+ * interfaces, see for example
+ *	 arch/x86/include/asm/word-at-a-time.h
+ * for those.
+ */
+
+#include <linux/kernel.h>
+
+struct word_at_a_time {
+	const unsigned long high_bits, low_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0xfe) + 1, REPEAT_BYTE(0x7f) }
+
+/* Bit set in the bytes that have a zero */
+static inline long prep_zero_mask(unsigned long val, unsigned long rhs, const struct word_at_a_time *c)
+{
+	unsigned long mask = (val & c->low_bits) + c->low_bits;
+	return ~(mask | rhs);
+}
+
+#define create_zero_mask(mask) (mask)
+
+static inline long find_zero(unsigned long mask)
+{
+	long byte = 0;
+#ifdef CONFIG_64BIT
+	if (mask >> 32)
+		mask >>= 32;
+	else
+		byte = 4;
+#endif
+	if (mask >> 16)
+		mask >>= 16;
+	else
+		byte += 2;
+	return (mask >> 8) ? byte : byte + 1;
+}
+
+static inline bool has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c)
+{
+	unsigned long rhs = val | c->low_bits;
+	*data = rhs;
+	return (val + c->high_bits) & ~rhs;
+}
+
+#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index c4c09b0e96ba..bb2b201d6ad0 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -4,37 +4,7 @@
 #include <linux/errno.h>
 
 #include <asm/byteorder.h>
-
-static inline long find_zero(unsigned long mask)
-{
-	long byte = 0;
-
-#ifdef __BIG_ENDIAN
-#ifdef CONFIG_64BIT
-	if (mask >> 32)
-		mask >>= 32;
-	else
-		byte = 4;
-#endif
-	if (mask >> 16)
-		mask >>= 16;
-	else
-		byte += 2;
-	return (mask >> 8) ? byte : byte + 1;
-#else
-#ifdef CONFIG_64BIT
-	if (!((unsigned int) mask)) {
-		mask >>= 32;
-		byte = 4;
-	}
-#endif
-	if (!(mask & 0xffff)) {
-		mask >>= 16;
-		byte += 2;
-	}
-	return (mask & 0xff) ? byte : byte + 1;
-#endif
-}
+#include <asm/word-at-a-time.h>
 
 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 #define IS_UNALIGNED(src, dst)	0
@@ -51,8 +21,7 @@ static inline long find_zero(unsigned long mask)
  */
 static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max)
 {
-	const unsigned long high_bits = REPEAT_BYTE(0xfe) + 1;
-	const unsigned long low_bits = REPEAT_BYTE(0x7f);
+	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 	long res = 0;
 
 	/*
@@ -66,18 +35,16 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
 		goto byte_at_a_time;
 
 	while (max >= sizeof(unsigned long)) {
-		unsigned long c, v, rhs;
+		unsigned long c, data;
 
 		/* Fall back to byte-at-a-time if we get a page fault */
 		if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
 			break;
-		rhs = c | low_bits;
-		v = (c + high_bits) & ~rhs;
 		*(unsigned long *)(dst+res) = c;
-		if (v) {
-			v = (c & low_bits) + low_bits;
-			v = ~(v | rhs);
-			return res + find_zero(v);
+		if (has_zero(c, &data, &constants)) {
+			data = prep_zero_mask(c, data, &constants);
+			data = create_zero_mask(data);
+			return res + find_zero(data);
 		}
 		res += sizeof(unsigned long);
 		max -= sizeof(unsigned long);
-- 
cgit v1.2.3


From 5723aa993d83803157c22327e90cd59e3dcbe879 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 26 May 2012 11:09:53 -0700
Subject: x86: use the new generic strnlen_user() function

This throws away the old x86-specific functions in favor of the generic
optimized version.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                  |  1 +
 arch/x86/include/asm/uaccess.h    |  3 +++
 arch/x86/include/asm/uaccess_32.h | 17 --------------
 arch/x86/include/asm/uaccess_64.h |  3 ---
 arch/x86/lib/usercopy_32.c        | 41 ---------------------------------
 arch/x86/lib/usercopy_64.c        | 48 ---------------------------------------
 6 files changed, 4 insertions(+), 109 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3220d44e24d0..d700811785ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
 	select GENERIC_TIME_VSYSCALL if X86_64
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
+	select GENERIC_STRNLEN_USER
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS || UPROBES)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1354facd8f63..04cd6882308e 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -566,6 +566,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
 extern __must_check long
 strncpy_from_user(char *dst, const char __user *src, long count);
 
+extern __must_check long strlen_user(const char __user *str);
+extern __must_check long strnlen_user(const char __user *str, long n);
+
 /*
  * movsl can be slow when source and dest are not both 8-byte aligned
  */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 8084bc73b18c..576e39bca6ad 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -213,23 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to,
 	return n;
 }
 
-/**
- * strlen_user: - Get the size of a string in user space.
- * @str: The string to measure.
- *
- * Context: User context only.  This function may sleep.
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- *
- * If there is a limit on the length of a valid string, you may wish to
- * consider using strnlen_user() instead.
- */
-#define strlen_user(str) strnlen_user(str, LONG_MAX)
-
-long strnlen_user(const char __user *str, long n);
 unsigned long __must_check clear_user(void __user *mem, unsigned long len);
 unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
 
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index fcd4b6f3ef02..8e796fbbf9c6 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -208,9 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 	}
 }
 
-__must_check long strnlen_user(const char __user *str, long n);
-__must_check long __strnlen_user(const char __user *str, long n);
-__must_check long strlen_user(const char __user *str);
 __must_check unsigned long clear_user(void __user *mem, unsigned long len);
 __must_check unsigned long __clear_user(void __user *mem, unsigned long len);
 
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 883b216c60b2..1781b2f950e2 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -95,47 +95,6 @@ __clear_user(void __user *to, unsigned long n)
 }
 EXPORT_SYMBOL(__clear_user);
 
-/**
- * strnlen_user: - Get the size of a string in user space.
- * @s: The string to measure.
- * @n: The maximum valid length
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- * If the string is too long, returns a value greater than @n.
- */
-long strnlen_user(const char __user *s, long n)
-{
-	unsigned long mask = -__addr_ok(s);
-	unsigned long res, tmp;
-
-	might_fault();
-
-	__asm__ __volatile__(
-		"	testl %0, %0\n"
-		"	jz 3f\n"
-		"	andl %0,%%ecx\n"
-		"0:	repne; scasb\n"
-		"	setne %%al\n"
-		"	subl %%ecx,%0\n"
-		"	addl %0,%%eax\n"
-		"1:\n"
-		".section .fixup,\"ax\"\n"
-		"2:	xorl %%eax,%%eax\n"
-		"	jmp 1b\n"
-		"3:	movb $1,%%al\n"
-		"	jmp 1b\n"
-		".previous\n"
-		_ASM_EXTABLE(0b,2b)
-		:"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
-		:"0" (n), "1" (s), "2" (0), "3" (mask)
-		:"cc");
-	return res & mask;
-}
-EXPORT_SYMBOL(strnlen_user);
-
 #ifdef CONFIG_X86_INTEL_USERCOPY
 static unsigned long
 __copy_user_intel(void __user *to, const void *from, unsigned long size)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0d0326f388c0..e5b130bc2d0e 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -52,54 +52,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
 }
 EXPORT_SYMBOL(clear_user);
 
-/*
- * Return the size of a string (including the ending 0)
- *
- * Return 0 on exception, a value greater than N if too long
- */
-
-long __strnlen_user(const char __user *s, long n)
-{
-	long res = 0;
-	char c;
-
-	while (1) {
-		if (res>n)
-			return n+1;
-		if (__get_user(c, s))
-			return 0;
-		if (!c)
-			return res+1;
-		res++;
-		s++;
-	}
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long strnlen_user(const char __user *s, long n)
-{
-	if (!access_ok(VERIFY_READ, s, 1))
-		return 0;
-	return __strnlen_user(s, n);
-}
-EXPORT_SYMBOL(strnlen_user);
-
-long strlen_user(const char __user *s)
-{
-	long res = 0;
-	char c;
-
-	for (;;) {
-		if (get_user(c, s))
-			return 0;
-		if (!c)
-			return res+1;
-		res++;
-		s++;
-	}
-}
-EXPORT_SYMBOL(strlen_user);
-
 unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
 {
 	if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
-- 
cgit v1.2.3


From c35866678391861942b3836c219a8898a259255a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 May 2012 14:10:43 +0800
Subject: KVM: MMU: fix huge page adapted on non-PAE host

The huge page size is 4M on non-PAE host, but 2M page size is used in
transparent_hugepage_adjust(), so the page we get after adjust the
mapping level is not the head page, the BUG_ON() will be triggered

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 72102e0ab7cb..be3cea4407ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2595,8 +2595,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 			*gfnp = gfn;
 			kvm_release_pfn_clean(pfn);
 			pfn &= ~mask;
-			if (!get_page_unless_zero(pfn_to_page(pfn)))
-				BUG();
+			kvm_get_pfn(pfn);
 			*pfnp = pfn;
 		}
 	}
-- 
cgit v1.2.3


From 91eb0f67c38c7104766faa49c5aaee2b4876511e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:28 -0700
Subject: x86: print e820 physical addresses consistently with other parts of
 kernel

Print physical address info in a style consistent with the %pR style used
elsewhere in the kernel.  For example:

    -BIOS-provided physical RAM map:
    +e820: BIOS-provided physical RAM map:
    - BIOS-e820: 0000000000000100 - 000000000009e000 (usable)
    +BIOS-e820: [mem 0x0000000000000100-0x000000000009dfff] usable
    -Allocating PCI resources starting at 90000000 (gap: 90000000:6ed1c000)
    +e820: [mem 0x90000000-0xfed1bfff] available for PCI devices
    -reserve RAM buffer: 000000000009e000 - 000000000009ffff
    +e820: reserve RAM buffer [mem 0x0009e000-0x0009ffff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976eb..41857970517f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
 	int x = e820x->nr_map;
 
 	if (x >= ARRAY_SIZE(e820x->map)) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       (unsigned long long) start,
+		       (unsigned long long) (start + size - 1));
 		return;
 	}
 
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
 	switch (type) {
 	case E820_RAM:
 	case E820_RESERVED_KERN:
-		printk(KERN_CONT "(usable)");
+		printk(KERN_CONT "usable");
 		break;
 	case E820_RESERVED:
-		printk(KERN_CONT "(reserved)");
+		printk(KERN_CONT "reserved");
 		break;
 	case E820_ACPI:
-		printk(KERN_CONT "(ACPI data)");
+		printk(KERN_CONT "ACPI data");
 		break;
 	case E820_NVS:
-		printk(KERN_CONT "(ACPI NVS)");
+		printk(KERN_CONT "ACPI NVS");
 		break;
 	case E820_UNUSABLE:
-		printk(KERN_CONT "(unusable)");
+		printk(KERN_CONT "unusable");
 		break;
 	default:
 		printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
 	int i;
 
 	for (i = 0; i < e820.nr_map; i++) {
-		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+		printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
-		       (e820.map[i].addr + e820.map[i].size));
+		       (e820.map[i].addr + e820.map[i].size - 1));
 		e820_print_type(e820.map[i].type);
 		printk(KERN_CONT "\n");
 	}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	e820_print_type(old_type);
 	printk(KERN_CONT " ==> ");
 	e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	if (checktype)
 		e820_print_type(old_type);
 	printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
+	printk(KERN_INFO "e820: modified physical RAM map:\n");
 	e820_print_map("modified");
 }
 static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
 	if (!found) {
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR
-	"PCI: Warning: Cannot find a gap in the 32bit address range\n"
-	"PCI: Unassigned devices with 32bit resource registers may break!\n");
+	"e820: cannot find a gap in the 32bit address range\n"
+	"e820: PCI devices with unassigned 32bit BARs may break!\n");
 	}
 #endif
 
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
 	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
-	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-	       pci_mem_start, gapstart, gapsize);
+	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+	       gapstart, gapstart + gapsize - 1);
 }
 
 /**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	printk(KERN_INFO "extended physical RAM map:\n");
+	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
 
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
-		printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+		printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
 		update_e820_saved();
 	}
 
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
 			early_panic("Invalid user supplied memory map");
 		e820.nr_map = nr;
 
-		printk(KERN_INFO "user-defined physical RAM map:\n");
+		printk(KERN_INFO "e820: user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
 }
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
 			end = MAX_RESOURCE_SIZE;
 		if (start >= end)
 			continue;
-		printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
-			       start, end);
+		printk(KERN_DEBUG
+		       "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+		       start, end);
 		reserve_region_with_split(&iomem_resource, start, end,
 					  "RAM buffer");
 	}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
 
 	who = x86_init.resources.memory_setup();
 	memcpy(&e820_saved, &e820, sizeof(struct e820map));
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
 
-- 
cgit v1.2.3


From 365811d6f9bd98543bedc02b72d94f0f0faf3670 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 29 May 2012 15:06:29 -0700
Subject: x86: print physical addresses consistently with other parts of kernel

Print physical address info in a style consistent with the %pR style used
elsewhere in the kernel.  For example:

    -found SMP MP-table at [ffff8800000fce90] fce90
    +found SMP MP-table at [mem 0x000fce90-0x000fce9f] mapped at [ffff8800000fce90]
    -initial memory mapped : 0 - 20000000
    +initial memory mapped: [mem 0x00000000-0x1fffffff]
    -Base memory trampoline at [ffff88000009c000] 9c000 size 8192
    +Base memory trampoline [mem 0x0009c000-0x0009dfff] mapped at [ffff88000009c000]
    -SRAT: Node 0 PXM 0 0-80000000
    +SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff]

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/mpparse.c    | 10 ++++++----
 arch/x86/kernel/setup.c      | 16 ++++++++--------
 arch/x86/mm/init.c           | 16 +++++++++-------
 arch/x86/mm/numa.c           | 32 ++++++++++++++++----------------
 arch/x86/mm/numa_emulation.c |  4 ++--
 arch/x86/mm/pat.c            | 42 +++++++++++++++++++-----------------------
 arch/x86/mm/srat.c           |  5 +++--
 7 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b02d4dd6b8a3..fbca2e6223bf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -568,8 +568,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 	struct mpf_intel *mpf;
 	unsigned long mem;
 
-	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
-			bp, length);
+	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+		    base, base + length - 1);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -584,8 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #endif
 			mpf_found = mpf;
 
-			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
-			       mpf, (u64)virt_to_phys(mpf));
+			printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+			       (unsigned long long) virt_to_phys(mpf),
+			       (unsigned long long) virt_to_phys(mpf) +
+			       sizeof(*mpf) - 1, mpf);
 
 			mem = virt_to_phys(mpf);
 			memblock_reserve(mem, sizeof(*mpf));
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f2afee6a19c1..982e44f960db 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -334,8 +334,8 @@ static void __init relocate_initrd(void)
 	memblock_reserve(ramdisk_here, area_size);
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
-	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
-			 ramdisk_here, ramdisk_here + ramdisk_size);
+	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size - 1);
 
 	q = (char *)initrd_start;
 
@@ -366,8 +366,8 @@ static void __init relocate_initrd(void)
 	/* high pages is not converted by early_res_to_bootmem */
 	ramdisk_image = boot_params.hdr.ramdisk_image;
 	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
-		" %08llx - %08llx\n",
+	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
@@ -392,8 +392,8 @@ static void __init reserve_initrd(void)
 		       ramdisk_size, end_of_lowmem>>1);
 	}
 
-	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
-			ramdisk_end);
+	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+			ramdisk_end - 1);
 
 
 	if (ramdisk_end <= end_of_lowmem) {
@@ -906,8 +906,8 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
-	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
-			max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+			(max_pfn_mapped<<PAGE_SHIFT) - 1);
 
 	setup_trampolines();
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 319b6f2fb8b9..97141c26a13a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -84,8 +84,9 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
 	pgt_buf_end = pgt_buf_start;
 	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
+		end - 1, pgt_buf_start << PAGE_SHIFT,
+		(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
 void __init native_pagetable_reserve(u64 start, u64 end)
@@ -132,7 +133,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	int nr_range, i;
 	int use_pse, use_gbpages;
 
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+	printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
+	       start, end - 1);
 
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
 	/*
@@ -251,8 +253,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	}
 
 	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
+		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+				mr[i].start, mr[i].end - 1,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
@@ -350,8 +352,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	 * create a kernel page fault:
 	 */
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, end);
+	printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+		begin, end - 1);
 	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
 #else
 	/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa08b119..2d125be1bae9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 
 	/* whine about and ignore invalid blks */
 	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
-			   nid, start, end);
+		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			   nid, start, end - 1);
 		return 0;
 	}
 
@@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
 	start = roundup(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
-	       nid, start, end);
+	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
+	       nid, start, end - 1);
 
 	/*
 	 * Allocate node data.  Try remap allocator first, node-local
@@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	}
 
 	/* report and initialize */
-	printk(KERN_INFO "  NODE_DATA [%016Lx - %016Lx]%s\n",
+	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
 	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
 	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
 	if (!remapped && tnid != nid)
@@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			 */
 			if (bi->end > bj->start && bi->start < bj->end) {
 				if (bi->nid != bj->nid) {
-					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
-					       bi->nid, bi->start, bi->end,
-					       bj->nid, bj->start, bj->end);
+					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
 					return -EINVAL;
 				}
-				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
-					   bi->nid, bi->start, bi->end,
-					   bj->start, bj->end);
+				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					   bi->nid, bi->start, bi->end - 1,
+					   bj->start, bj->end - 1);
 			}
 
 			/*
@@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			}
 			if (k < mi->nr_blks)
 				continue;
-			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
-			       bi->nid, bi->start, bi->end, bj->start, bj->end,
-			       start, end);
+			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+			       bi->nid, bi->start, bi->end - 1, bj->start,
+			       bj->end - 1, start, end - 1);
 			bi->start = start;
 			bi->end = end;
 			numa_remove_memblk_from(j--, mi);
@@ -616,8 +616,8 @@ static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
-	printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
-	       0LLU, PFN_PHYS(max_pfn));
+	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+	       0LLU, PFN_PHYS(max_pfn) - 1);
 
 	node_set(0, numa_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 871dd8868170..dbbbb47260cc 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 		numa_remove_memblk_from(phys_blk, pi);
 	}
 
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
 	return 0;
 }
 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa5..f11729fd019c 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -209,9 +209,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			printk(KERN_INFO "reserve_ram_pages_type failed "
-				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
-				start, end, type, req_type);
+			printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
 
@@ -314,9 +313,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
-		       "track %s, req %s\n",
-		       start, end, cattr_name(new->type), cattr_name(req_type));
+		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+		       start, end - 1,
+		       cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -325,8 +324,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_unlock(&memtype_lock);
 
-	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
-		start, end, cattr_name(new->type), cattr_name(req_type),
+	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+		start, end - 1, cattr_name(new->type), cattr_name(req_type),
 		new_type ? cattr_name(*new_type) : "-");
 
 	return err;
@@ -360,14 +359,14 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
-			current->comm, current->pid, start, end);
+		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+		       current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
 	kfree(entry);
 
-	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
 
 	return 0;
 }
@@ -491,9 +490,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO
-		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-				current->comm, from, to);
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -554,12 +552,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 				size;
 
 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
-		printk(KERN_INFO
-			"%s:%d ioremap_change_attr failed %s "
-			"for %Lx-%Lx\n",
+		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+			"for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(flags),
-			base, (unsigned long long)(base + size));
+			base, (unsigned long long)(base + size-1));
 		return -EINVAL;
 	}
 	return 0;
@@ -591,12 +588,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 
 		flags = lookup_memtype(paddr);
 		if (want_flags != flags) {
-			printk(KERN_WARNING
-			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
 					      (~_PAGE_CACHE_MASK)) |
@@ -614,11 +610,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
 			free_memtype(paddr, paddr + size);
 			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for %Lx-%Lx, got %s\n",
+				" for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			return -EINVAL;
 		}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index efb5b4b93711..732af3a96183 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -176,8 +176,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
-	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
-	       start, end);
+	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+	       node, pxm,
+	       (unsigned long long) start, (unsigned long long) end - 1);
 }
 
 void __init acpi_numa_arch_fixup(void) {}
-- 
cgit v1.2.3


From 26c191788f18129af0eb32a358cdaea0c7479626 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 29 May 2012 15:06:49 -0700
Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race
 condition

When holding the mmap_sem for reading, pmd_offset_map_lock should only
run on a pmd_t that has been read atomically from the pmdp pointer,
otherwise we may read only half of it leading to this crash.

PID: 11679  TASK: f06e8000  CPU: 3   COMMAND: "do_race_2_panic"
 #0 [f06a9dd8] crash_kexec at c049b5ec
 #1 [f06a9e2c] oops_end at c083d1c2
 #2 [f06a9e40] no_context at c0433ded
 #3 [f06a9e64] bad_area_nosemaphore at c043401a
 #4 [f06a9e6c] __do_page_fault at c0434493
 #5 [f06a9eec] do_page_fault at c083eb45
 #6 [f06a9f04] error_code (via page_fault) at c083c5d5
    EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP:
    00000000
    DS:  007b     ESI: 9e201000 ES:  007b     EDI: 01fb4700 GS:  00e0
    CS:  0060     EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246
 #7 [f06a9f38] _spin_lock at c083bc14
 #8 [f06a9f44] sys_mincore at c0507b7d
 #9 [f06a9fb0] system_call at c083becd
                         start           len
    EAX: ffffffda  EBX: 9e200000  ECX: 00001000  EDX: 6228537f
    DS:  007b      ESI: 00000000  ES:  007b      EDI: 003d0f00
    SS:  007b      ESP: 62285354  EBP: 62285388  GS:  0033
    CS:  0073      EIP: 00291416  ERR: 000000da  EFLAGS: 00000286

This should be a longstanding bug affecting x86 32bit PAE without THP.
Only archs with 64bit large pmd_t and 32bit unsigned long should be
affected.

With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad()
would partly hide the bug when the pmd transition from none to stable,
by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is
enabled a new set of problem arises by the fact could then transition
freely in any of the none, pmd_trans_huge or pmd_trans_stable states.
So making the barrier in pmd_none_or_trans_huge_or_clear_bad()
unconditional isn't good idea and it would be a flakey solution.

This should be fully fixed by introducing a pmd_read_atomic that reads
the pmd in order with THP disabled, or by reading the pmd atomically
with cmpxchg8b with THP enabled.

Luckily this new race condition only triggers in the places that must
already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix
is localized there but this bug is not related to THP.

NOTE: this can trigger on x86 32bit systems with PAE enabled with more
than 4G of ram, otherwise the high part of the pmd will never risk to be
truncated because it would be zero at all times, in turn so hiding the
SMP race.

This bug was discovered and fully debugged by Ulrich, quote:

----
[..]
pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and
eax.

    496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t
    *pmd)
    497 {
    498         /* depend on compiler for an atomic pmd read */
    499         pmd_t pmdval = *pmd;

                                // edi = pmd pointer
0xc0507a74 <sys_mincore+548>:   mov    0x8(%esp),%edi
...
                                // edx = PTE page table high address
0xc0507a84 <sys_mincore+564>:   mov    0x4(%edi),%edx
...
                                // eax = PTE page table low address
0xc0507a8e <sys_mincore+574>:   mov    (%edi),%eax

[..]

Please note that the PMD is not read atomically. These are two "mov"
instructions where the high order bits of the PMD entry are fetched
first. Hence, the above machine code is prone to the following race.

-  The PMD entry {high|low} is 0x0000000000000000.
   The "mov" at 0xc0507a84 loads 0x00000000 into edx.

-  A page fault (on another CPU) sneaks in between the two "mov"
   instructions and instantiates the PMD.

-  The PMD entry {high|low} is now 0x00000003fda38067.
   The "mov" at 0xc0507a8e loads 0xfda38067 into eax.
----

Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++++
 include/asm-generic/pgtable.h         | 22 +++++++++++++--
 2 files changed, 70 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index effff47a3c82..43876f16caf1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
 	ptep->pte_low = pte.pte_low;
 }
 
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
+ * where pte_offset_map_lock is called, concurrent page faults are
+ * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+ * because gcc will not read the 64bit of the pmd atomically. To fix
+ * this all places running pmd_offset_map_lock() while holding the
+ * mmap_sem in read mode, shall read the pmdp pointer using this
+ * function to know if the pmd is null nor not, and in turn to know if
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * operations.
+ *
+ * Without THP if the mmap_sem is hold for reading, the
+ * pmd can only transition from null to not null while pmd_read_atomic runs.
+ * So there's no need of literally reading it atomically.
+ *
+ * With THP if the mmap_sem is hold for reading, the pmd can become
+ * THP or null or point to a pte (and in turn become "stable") at any
+ * time under pmd_read_atomic, so it's mandatory to read it atomically
+ * with cmpxchg8b.
+ */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	pmdval_t ret;
+	u32 *tmp = (u32 *)pmdp;
+
+	ret = (pmdval_t) (*tmp);
+	if (ret) {
+		/*
+		 * If the low part is null, we must not read the high part
+		 * or we can end up with a partial pmd.
+		 */
+		smp_rmb();
+		ret |= ((pmdval_t)*(tmp + 1)) << 32;
+	}
+
+	return (pmd_t) { ret };
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2768f188f55..6f2b45a9b6bc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifndef pmd_read_atomic
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	/*
+	 * Depend on compiler for an atomic pmd read. NOTE: this is
+	 * only going to work, if the pmdval_t isn't larger than
+	 * an unsigned long.
+	 */
+	return *pmdp;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd)
  * undefined so behaving like if the pmd was none is safe (because it
  * can return none anyway). The compiler level barrier() is critically
  * important to compute the two checks atomically on the same pmdval.
+ *
+ * For 32bit kernels with a 64bit large pmd_t this automatically takes
+ * care of reading the pmd atomically to avoid SMP race conditions
+ * against pmd_populate() when the mmap_sem is hold for reading by the
+ * caller (a special atomic read not done by "gcc" as in the generic
+ * version above, is also needed when THP is disabled because the page
+ * fault can populate the pmd from under us).
  */
 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 {
-	/* depend on compiler for an atomic pmd read */
-	pmd_t pmdval = *pmd;
+	pmd_t pmdval = pmd_read_atomic(pmd);
 	/*
 	 * The barrier will stabilize the pmdval in a register or on
 	 * the stack so that it will stop changing under the code.
-- 
cgit v1.2.3


From fa83523f45fbb403eba4ebc5704bf98aa4da0163 Mon Sep 17 00:00:00 2001
From: John Dykstra <jdykstra@cray.com>
Date: Fri, 25 May 2012 16:12:46 -0500
Subject: x86/mm/pat: Improve scaling of pat_pagerange_is_ram()

Function pat_pagerange_is_ram() scales poorly to large address
ranges, because it probes the resource tree for each page.

On a 2.6 GHz Opteron, this function consumes 34 ms for a 1 GB range.

It is called twice during untrack_pfn_vma(), slowing process
cleanup and handicapping the OOM killer.

This replacement consumes less than 1ms, under the same conditions.

Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc.
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1337980366.1979.6.camel@redwood
[ Small stylistic cleanups and renames ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c | 56 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa5..bea6e573e02b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 	return req_type;
 }
 
+struct pagerange_state {
+	unsigned long		cur_pfn;
+	int			ram;
+	int			not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+	struct pagerange_state *state = arg;
+
+	state->not_ram	|= initial_pfn > state->cur_pfn;
+	state->ram	|= total_nr_pages > 0;
+	state->cur_pfn	 = initial_pfn + total_nr_pages;
+
+	return state->ram && state->not_ram;
+}
+
 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 {
-	int ram_page = 0, not_rampage = 0;
-	unsigned long page_nr;
+	int ret = 0;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct pagerange_state state = {start_pfn, 0, 0};
 
-	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
-	     ++page_nr) {
-		/*
-		 * For legacy reasons, physical address range in the legacy ISA
-		 * region is tracked as non-RAM. This will allow users of
-		 * /dev/mem to map portions of legacy ISA region, even when
-		 * some of those portions are listed(or not even listed) with
-		 * different e820 types(RAM/reserved/..)
-		 */
-		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
-		    page_is_ram(page_nr))
-			ram_page = 1;
-		else
-			not_rampage = 1;
-
-		if (ram_page == not_rampage)
-			return -1;
+	/*
+	 * For legacy reasons, physical address range in the legacy ISA
+	 * region is tracked as non-RAM. This will allow users of
+	 * /dev/mem to map portions of legacy ISA region, even when
+	 * some of those portions are listed(or not even listed) with
+	 * different e820 types(RAM/reserved/..)
+	 */
+	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+	if (start_pfn < end_pfn) {
+		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+				&state, pagerange_is_ram_callback);
 	}
 
-	return ram_page;
+	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
 }
 
 /*
-- 
cgit v1.2.3


From 9f646389aa7727a2fd8f9ae6337b92af9cfbc264 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 29 May 2012 16:39:09 +0200
Subject: sched/x86: Use cpu_llc_shared_mask(cpu) for coregroup_mask

Commit commit 8e7fbcbc2 ("sched: Remove stale power aware scheduling
remnants and dysfunctional knobs") made a boo-boo with removing the
power aware scheduling muck from the x86 topology bits.

We should unconditionally use the llc_shared mask for multi-core.

Reported-and-tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Borislav Petkov <bp@amd64.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Link: http://lkml.kernel.org/n/tip-lsksc2kfyeveb13avh327p0d@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f56f96da77f5..fd019d78b1f4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 /* maps the cpu to the sched domain representing multi-core */
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	/*
-	 * For perf, we return last level cache shared map.
-	 * And for power savings, we return cpu_core_map
-	 */
-	if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
-		return cpu_core_mask(cpu);
-	else
-		return cpu_llc_shared_mask(cpu);
+	return cpu_llc_shared_mask(cpu);
 }
 
 static void impress_friends(void)
-- 
cgit v1.2.3


From 319b6ffc6df892e4ccffff823cc5521a4a5d2dca Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@kernel.org>
Date: Wed, 30 May 2012 12:33:41 +0300
Subject: x86, realmode: Unbreak the ia64 build of drivers/acpi/sleep.c

Revert usage of acpi_wakeup_address and move definition
to x86 architecture code in order to make compilation work
in ia64.

[jsakkine: tested compilation in ia64/x86-64 and added
proper commit message]

Reported-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Originally-by: H. Peter Anvin <hpa@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@intel.com>
Link: http://lkml.kernel.org/r/1338370421-27735-1-git-send-email-jarkko.sakkinen@intel.com
Cc: Tony Luck <tony.luck@intel.com>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/acpi.h | 7 +++----
 drivers/acpi/sleep.c        | 8 ++------
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 724aa441de7d..0c44630d1789 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
+#include <asm/realmode.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -116,10 +117,8 @@ static inline void acpi_disable_pci(void)
 /* Low-level suspend routine. */
 extern int acpi_suspend_lowlevel(void);
 
-extern const unsigned char acpi_wakeup_code[];
-
-/* early initialization routine */
-extern void acpi_reserve_wakeup_memory(void);
+/* Physical address to resume after wakeup */
+#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
 
 /*
  * Check if the CPU can handle C2 and deeper
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index ebaa04593236..74ee4ab577b6 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -25,8 +25,6 @@
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 
-#include <asm/realmode.h>
-
 #include "internal.h"
 #include "sleep.h"
 
@@ -93,13 +91,11 @@ static struct notifier_block tts_notifier = {
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
-	unsigned long wakeup_pa = real_mode_header->wakeup_start;
 	/* do we have a wakeup address for S2 and S3? */
 	if (acpi_state == ACPI_STATE_S3) {
-		if (!wakeup_pa)
+		if (!acpi_wakeup_address)
 			return -EFAULT;
-		acpi_set_firmware_waking_vector(
-				(acpi_physical_address)wakeup_pa);
+		acpi_set_firmware_waking_vector(acpi_wakeup_address);
 
 	}
 	ACPI_FLUSH_CPU_CACHE();
-- 
cgit v1.2.3


From 2da06af8106f8f35318bb084baf8448797ef058a Mon Sep 17 00:00:00 2001
From: "zhenzhong.duan" <zhenzhong.duan@oracle.com>
Date: Wed, 30 May 2012 12:52:15 +0800
Subject: x86, mtrr: Fix a type overflow in range_to_mtrr func

When boot on sun G5+ with 4T mem, see an overflow in mtrr cleanup as below.

*BAD*gran_size: 2G      chunk_size: 2G  num_reg: 10     lose cover RAM:
-18014398505283592M

This is because 1<<31 sign extended. Use an unsigned long constant to
fix it.  Useful for mem larger than or equal to 4T.

-v2: Use 64bit constant instead of explicit type conversion as suggested
by Yinghai. Description updated too.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Link: http://lkml.kernel.org/r/4FC5A77F.6060505@oracle.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be396..bdda2e6c673b 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 		if (align > max_align)
 			align = max_align;
 
-		sizek = 1 << align;
+		sizek = 1UL << align;
 		if (debug_print) {
 			char start_factor = 'K', size_factor = 'K';
 			unsigned long start_base, size_base;
-- 
cgit v1.2.3


From 82f7af09e6fb58fb725c850d725d5e8780a9bec2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 24 May 2012 17:54:51 +0000
Subject: x86/mce: Cleanup timer mess

Use unsigned long for dealing with jiffies not int. Rename the
callback to something sensible. Use __this_cpu_read/write for
accessing per cpu data.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5f793e6c854b..98003bfc5556 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1243,15 +1243,15 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
 
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mce_start_timer(unsigned long data)
+static void mce_timer_fn(unsigned long data)
 {
-	struct timer_list *t = &per_cpu(mce_timer, data);
-	int *n;
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned long iv;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -1264,13 +1264,14 @@ static void mce_start_timer(unsigned long data)
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
-	n = &__get_cpu_var(mce_next_interval);
+	iv = __this_cpu_read(mce_next_interval);
 	if (mce_notify_irq())
-		*n = max(*n/2, HZ/100);
+		iv = max(iv, (unsigned long) HZ/100);
 	else
-		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+	__this_cpu_write(mce_next_interval, iv);
 
-	t->expires = jiffies + *n;
+	t->expires = jiffies + iv;
 	add_timer_on(t, smp_processor_id());
 }
 
@@ -1511,17 +1512,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 static void __mcheck_cpu_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
-	int *n = &__get_cpu_var(mce_next_interval);
+	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	setup_timer(t, mce_start_timer, smp_processor_id());
+	setup_timer(t, mce_timer_fn, smp_processor_id());
 
 	if (mce_ignore_ce)
 		return;
 
-	*n = check_interval * HZ;
-	if (!*n)
+	__this_cpu_write(mce_next_interval, iv);
+	if (!iv)
 		return;
-	t->expires = round_jiffies(jiffies + *n);
+	t->expires = round_jiffies(jiffies + iv);
 	add_timer_on(t, smp_processor_id());
 }
 
@@ -2231,7 +2232,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_FAILED_FROZEN:
 		if (!mce_ignore_ce && check_interval) {
 			t->expires = round_jiffies(jiffies +
-					   __get_cpu_var(mce_next_interval));
+					per_cpu(mce_next_interval, cpu));
 			add_timer_on(t, cpu);
 		}
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
-- 
cgit v1.2.3


From 1ab46fd319bcf1fcd9fb6311727d532b580e4eba Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad@darnok.org>
Date: Wed, 30 May 2012 18:23:56 -0400
Subject: x86, amd, xen: Avoid NULL pointer paravirt references

Stub out MSR methods that aren't actually needed.  This fixes a crash
as Xen Dom0 on AMD Trinity systems.  A bigger patch should be added to
remove the paravirt machinery completely for the methods which
apparently have no users!

Reported-by: Andre Przywara <andre.przywara@amd.com>
Link: http://lkml.kernel.org/r/20120530222356.GA28417@andromeda.dapyr.net
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org>
---
 arch/x86/xen/enlighten.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75f33b2a5933..e74df9548a02 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1116,7 +1116,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.wbinvd = native_wbinvd,
 
 	.read_msr = native_read_msr_safe,
+	.rdmsr_regs = native_rdmsr_safe_regs,
 	.write_msr = xen_write_msr_safe,
+	.wrmsr_regs = native_wrmsr_safe_regs,
+
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 
-- 
cgit v1.2.3


From bb8ac181a5cf50458a0d83b4460790badc9fdc16 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 19 May 2012 10:25:23 -0400
Subject: bury __kernel_nlink_t, make internal nlink_t consistent

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/asm/posix_types.h    | 3 ---
 arch/arm/include/asm/posix_types.h      | 3 ---
 arch/avr32/include/asm/posix_types.h    | 3 ---
 arch/blackfin/include/asm/posix_types.h | 3 ---
 arch/cris/include/asm/posix_types.h     | 3 ---
 arch/frv/include/asm/posix_types.h      | 3 ---
 arch/h8300/include/asm/posix_types.h    | 3 ---
 arch/ia64/include/asm/posix_types.h     | 3 ---
 arch/m32r/include/asm/posix_types.h     | 3 ---
 arch/m68k/include/asm/posix_types.h     | 3 ---
 arch/mips/include/asm/posix_types.h     | 5 -----
 arch/mn10300/include/asm/posix_types.h  | 3 ---
 arch/parisc/include/asm/posix_types.h   | 3 ---
 arch/powerpc/include/asm/posix_types.h  | 3 ---
 arch/s390/include/asm/posix_types.h     | 3 ---
 arch/sh/include/asm/posix_types_32.h    | 2 --
 arch/sh/include/asm/posix_types_64.h    | 2 --
 arch/sparc/include/asm/posix_types.h    | 5 -----
 arch/tile/include/asm/compat.h          | 1 -
 arch/x86/include/asm/posix_types_32.h   | 3 ---
 include/asm-generic/posix_types.h       | 4 ----
 include/linux/types.h                   | 2 +-
 22 files changed, 1 insertion(+), 65 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/posix_types.h b/arch/alpha/include/asm/posix_types.h
index 24779fc95994..5a8a48320efe 100644
--- a/arch/alpha/include/asm/posix_types.h
+++ b/arch/alpha/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned int	__kernel_ino_t;
 #define __kernel_ino_t __kernel_ino_t
 
-typedef unsigned int	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long	__kernel_sigset_t;	/* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
diff --git a/arch/arm/include/asm/posix_types.h b/arch/arm/include/asm/posix_types.h
index efdf99045d87..d2de9cbbcd9b 100644
--- a/arch/arm/include/asm/posix_types.h
+++ b/arch/arm/include/asm/posix_types.h
@@ -22,9 +22,6 @@
 typedef unsigned short		__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short		__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short		__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/avr32/include/asm/posix_types.h b/arch/avr32/include/asm/posix_types.h
index 74667bfc88cc..9ba9e749b3f3 100644
--- a/arch/avr32/include/asm/posix_types.h
+++ b/arch/avr32/include/asm/posix_types.h
@@ -17,9 +17,6 @@
 typedef unsigned short  __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/blackfin/include/asm/posix_types.h b/arch/blackfin/include/asm/posix_types.h
index 41bc1875c4d7..1bd3436db6a7 100644
--- a/arch/blackfin/include/asm/posix_types.h
+++ b/arch/blackfin/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned int __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/cris/include/asm/posix_types.h b/arch/cris/include/asm/posix_types.h
index 234891c74e2b..ce4e51793151 100644
--- a/arch/cris/include/asm/posix_types.h
+++ b/arch/cris/include/asm/posix_types.h
@@ -15,9 +15,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/frv/include/asm/posix_types.h b/arch/frv/include/asm/posix_types.h
index 3f34cb45fbb3..fe512af74a5a 100644
--- a/arch/frv/include/asm/posix_types.h
+++ b/arch/frv/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/h8300/include/asm/posix_types.h b/arch/h8300/include/asm/posix_types.h
index bc4c34efb1ad..91e62ba4c7b0 100644
--- a/arch/h8300/include/asm/posix_types.h
+++ b/arch/h8300/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/ia64/include/asm/posix_types.h b/arch/ia64/include/asm/posix_types.h
index 7323ab9467eb..99ee1d6510cf 100644
--- a/arch/ia64/include/asm/posix_types.h
+++ b/arch/ia64/include/asm/posix_types.h
@@ -1,9 +1,6 @@
 #ifndef _ASM_IA64_POSIX_TYPES_H
 #define _ASM_IA64_POSIX_TYPES_H
 
-typedef unsigned int	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long	__kernel_sigset_t;	/* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
diff --git a/arch/m32r/include/asm/posix_types.h b/arch/m32r/include/asm/posix_types.h
index 0195850e1f88..236de26a409b 100644
--- a/arch/m32r/include/asm/posix_types.h
+++ b/arch/m32r/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/m68k/include/asm/posix_types.h b/arch/m68k/include/asm/posix_types.h
index 6373093be72b..cf4dbf70fdc7 100644
--- a/arch/m68k/include/asm/posix_types.h
+++ b/arch/m68k/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/mips/include/asm/posix_types.h b/arch/mips/include/asm/posix_types.h
index e0308dcca135..fa03ec3fbf89 100644
--- a/arch/mips/include/asm/posix_types.h
+++ b/arch/mips/include/asm/posix_types.h
@@ -17,11 +17,6 @@
  * assume GCC is being used.
  */
 
-#if (_MIPS_SZLONG == 64)
-typedef unsigned int	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-#endif
-
 typedef long		__kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
diff --git a/arch/mn10300/include/asm/posix_types.h b/arch/mn10300/include/asm/posix_types.h
index ab506181ec31..d31eeea480cf 100644
--- a/arch/mn10300/include/asm/posix_types.h
+++ b/arch/mn10300/include/asm/posix_types.h
@@ -20,9 +20,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/parisc/include/asm/posix_types.h b/arch/parisc/include/asm/posix_types.h
index 5212b0357daf..b9344256f76b 100644
--- a/arch/parisc/include/asm/posix_types.h
+++ b/arch/parisc/include/asm/posix_types.h
@@ -10,9 +10,6 @@
 typedef unsigned short		__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short		__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short		__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/arch/powerpc/include/asm/posix_types.h b/arch/powerpc/include/asm/posix_types.h
index f1393252bbda..2958c5b97b2d 100644
--- a/arch/powerpc/include/asm/posix_types.h
+++ b/arch/powerpc/include/asm/posix_types.h
@@ -16,9 +16,6 @@ typedef int		__kernel_ssize_t;
 typedef long		__kernel_ptrdiff_t;
 #define __kernel_size_t __kernel_size_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef short		__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #endif
diff --git a/arch/s390/include/asm/posix_types.h b/arch/s390/include/asm/posix_types.h
index edf8527ff08d..7be104c0f192 100644
--- a/arch/s390/include/asm/posix_types.h
+++ b/arch/s390/include/asm/posix_types.h
@@ -24,7 +24,6 @@ typedef unsigned short	__kernel_old_dev_t;
 
 typedef unsigned long   __kernel_ino_t;
 typedef unsigned short  __kernel_mode_t;
-typedef unsigned short  __kernel_nlink_t;
 typedef unsigned short  __kernel_ipc_pid_t;
 typedef unsigned short  __kernel_uid_t;
 typedef unsigned short  __kernel_gid_t;
@@ -35,7 +34,6 @@ typedef int             __kernel_ptrdiff_t;
 
 typedef unsigned int    __kernel_ino_t;
 typedef unsigned int    __kernel_mode_t;
-typedef unsigned int    __kernel_nlink_t;
 typedef int             __kernel_ipc_pid_t;
 typedef unsigned int    __kernel_uid_t;
 typedef unsigned int    __kernel_gid_t;
@@ -47,7 +45,6 @@ typedef unsigned long   __kernel_sigset_t;      /* at least 32 bits */
 
 #define __kernel_ino_t  __kernel_ino_t
 #define __kernel_mode_t __kernel_mode_t
-#define __kernel_nlink_t __kernel_nlink_t
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #define __kernel_uid_t __kernel_uid_t
 #define __kernel_gid_t __kernel_gid_t
diff --git a/arch/sh/include/asm/posix_types_32.h b/arch/sh/include/asm/posix_types_32.h
index abda58467ece..ba0bdc423b07 100644
--- a/arch/sh/include/asm/posix_types_32.h
+++ b/arch/sh/include/asm/posix_types_32.h
@@ -3,8 +3,6 @@
 
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short	__kernel_uid_t;
diff --git a/arch/sh/include/asm/posix_types_64.h b/arch/sh/include/asm/posix_types_64.h
index fcda07b4a616..244f7e950e17 100644
--- a/arch/sh/include/asm/posix_types_64.h
+++ b/arch/sh/include/asm/posix_types_64.h
@@ -3,8 +3,6 @@
 
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short	__kernel_uid_t;
diff --git a/arch/sparc/include/asm/posix_types.h b/arch/sparc/include/asm/posix_types.h
index 3070f25ae90a..156220ed99eb 100644
--- a/arch/sparc/include/asm/posix_types.h
+++ b/arch/sparc/include/asm/posix_types.h
@@ -9,8 +9,6 @@
 
 #if defined(__sparc__) && defined(__arch64__)
 /* sparc 64 bit */
-typedef unsigned int           __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 
 typedef unsigned short 	       __kernel_old_uid_t;
 typedef unsigned short         __kernel_old_gid_t;
@@ -38,9 +36,6 @@ typedef unsigned short         __kernel_gid_t;
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef short                  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef long                   __kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 69adc08d36a5..6e74450ff0a1 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
 typedef __kernel_mode_t compat_mode_t;
 typedef __kernel_dev_t compat_dev_t;
 typedef __kernel_loff_t compat_loff_t;
-typedef __kernel_nlink_t compat_nlink_t;
 typedef __kernel_ipc_pid_t compat_ipc_pid_t;
 typedef __kernel_daddr_t compat_daddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
index 99f262e04b91..8e525059e7d8 100644
--- a/arch/x86/include/asm/posix_types_32.h
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -10,9 +10,6 @@
 typedef unsigned short	__kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short	__kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short	__kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
diff --git a/include/asm-generic/posix_types.h b/include/asm-generic/posix_types.h
index 91d44bd4dde3..fe74fccf18db 100644
--- a/include/asm-generic/posix_types.h
+++ b/include/asm-generic/posix_types.h
@@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t;
 typedef unsigned int	__kernel_mode_t;
 #endif
 
-#ifndef __kernel_nlink_t
-typedef __kernel_ulong_t __kernel_nlink_t;
-#endif
-
 #ifndef __kernel_pid_t
 typedef int		__kernel_pid_t;
 #endif
diff --git a/include/linux/types.h b/include/linux/types.h
index 7f480db60231..9c1bd539ea70 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -25,7 +25,7 @@ typedef __kernel_dev_t		dev_t;
 typedef __kernel_ino_t		ino_t;
 typedef __kernel_mode_t		mode_t;
 typedef unsigned short		umode_t;
-typedef __kernel_nlink_t	nlink_t;
+typedef __u32			nlink_t;
 typedef __kernel_off_t		off_t;
 typedef __kernel_pid_t		pid_t;
 typedef __kernel_daddr_t	daddr_t;
-- 
cgit v1.2.3


From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 31 May 2012 16:26:44 -0700
Subject: syscalls, x86: add __NR_kcmp syscall

While doing the checkpoint-restore in the user space one need to determine
whether various kernel objects (like mm_struct-s of file_struct-s) are
shared between tasks and restore this state.

The 2nd step can be solved by using appropriate CLONE_ flags and the
unshare syscall, while there's currently no ways for solving the 1st one.

One of the ways for checking whether two tasks share e.g.  mm_struct is to
provide some mm_struct ID of a task to its proc file, but showing such
info considered to be not that good for security reasons.

Thus after some debates we end up in conclusion that using that named
'comparison' syscall might be the best candidate.  So here is it --
__NR_kcmp.

It takes up to 5 arguments - the pids of the two tasks (which
characteristics should be compared), the comparison type and (in case of
comparison of files) two file descriptors.

Lookups for pids are done in the caller's PID namespace only.

At moment only x86 is supported and tested.

[akpm@linux-foundation.org: fix up selftests, warnings]
[akpm@linux-foundation.org: include errno.h]
[akpm@linux-foundation.org: tweak comment text]
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Valdis.Kletnieks@vt.edu
Cc: Michal Marek <mmarek@suse.cz>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/syscalls/syscall_32.tbl         |   1 +
 arch/x86/syscalls/syscall_64.tbl         |   2 +
 include/linux/kcmp.h                     |  17 +++
 include/linux/syscalls.h                 |   2 +
 kernel/Makefile                          |   3 +
 kernel/kcmp.c                            | 196 +++++++++++++++++++++++++++++++
 kernel/sys_ni.c                          |   3 +
 tools/testing/selftests/Makefile         |   2 +-
 tools/testing/selftests/kcmp/Makefile    |  29 +++++
 tools/testing/selftests/kcmp/kcmp_test.c |  94 +++++++++++++++
 10 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kcmp.h
 create mode 100644 kernel/kcmp.c
 create mode 100644 tools/testing/selftests/kcmp/Makefile
 create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f0554f7d..7a35a6e71d44 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,4 @@
 346	i386	setns			sys_setns
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+349	i386	kcmp			sys_kcmp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9ea27c5..51171aeff0dc 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
 309	common	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+312	64	kcmp			sys_kcmp
+
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h
new file mode 100644
index 000000000000..2dcd1b3aafc8
--- /dev/null
+++ b/include/linux/kcmp.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_KCMP_H
+#define _LINUX_KCMP_H
+
+/* Comparison type */
+enum kcmp_type {
+	KCMP_FILE,
+	KCMP_VM,
+	KCMP_FILES,
+	KCMP_FS,
+	KCMP_SIGHAND,
+	KCMP_IO,
+	KCMP_SYSVSEM,
+
+	KCMP_TYPES,
+};
+
+#endif /* _LINUX_KCMP_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3de3acb84a95..19439c75c5b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 				      unsigned long riovcnt,
 				      unsigned long flags);
 
+asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
+			 unsigned long idx1, unsigned long idx2);
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..80be6ca0cc75 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,9 @@ endif
 obj-y += sched/
 obj-y += power/
 
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/kcmp.h>
+
+#include <asm/unistd.h>
+
+/*
+ * We don't expose the real in-memory order of objects for security reasons.
+ * But still the comparison results should be suitable for sorting. So we
+ * obfuscate kernel pointers values and compare the production instead.
+ *
+ * The obfuscation is done in two steps. First we xor the kernel pointer with
+ * a random value, which puts pointer into a new position in a reordered space.
+ * Secondly we multiply the xor production with a large odd random number to
+ * permute its bits even more (the odd multiplier guarantees that the product
+ * is unique ever after the high bits are truncated, since any odd number is
+ * relative prime to 2^n).
+ *
+ * Note also that the obfuscation itself is invisible to userspace and if needed
+ * it can be changed to an alternate scheme.
+ */
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+	return (v ^ cookies[type][0]) * cookies[type][1];
+}
+
+/*
+ * 0 - equal, i.e. v1 = v2
+ * 1 - less than, i.e. v1 < v2
+ * 2 - greater than, i.e. v1 > v2
+ * 3 - not equal but ordering unavailable (reserved for future)
+ */
+static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+{
+	long ret;
+
+	ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+
+	return (ret < 0) | ((ret > 0) << 1);
+}
+
+/* The caller must have pinned the task */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+	struct file *file = NULL;
+
+	task_lock(task);
+	rcu_read_lock();
+
+	if (task->files)
+		file = fcheck_files(task->files, idx);
+
+	rcu_read_unlock();
+	task_unlock(task);
+
+	return file;
+}
+
+static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+{
+	if (likely(m2 != m1))
+		mutex_unlock(m2);
+	mutex_unlock(m1);
+}
+
+static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+{
+	int err;
+
+	if (m2 > m1)
+		swap(m1, m2);
+
+	err = mutex_lock_killable(m1);
+	if (!err && likely(m1 != m2)) {
+		err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+		if (err)
+			mutex_unlock(m1);
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+		unsigned long, idx1, unsigned long, idx2)
+{
+	struct task_struct *task1, *task2;
+	int ret;
+
+	rcu_read_lock();
+
+	/*
+	 * Tasks are looked up in caller's PID namespace only.
+	 */
+	task1 = find_task_by_vpid(pid1);
+	task2 = find_task_by_vpid(pid2);
+	if (!task1 || !task2)
+		goto err_no_task;
+
+	get_task_struct(task1);
+	get_task_struct(task2);
+
+	rcu_read_unlock();
+
+	/*
+	 * One should have enough rights to inspect task details.
+	 */
+	ret = kcmp_lock(&task1->signal->cred_guard_mutex,
+			&task2->signal->cred_guard_mutex);
+	if (ret)
+		goto err;
+	if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+	    !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+		ret = -EPERM;
+		goto err_unlock;
+	}
+
+	switch (type) {
+	case KCMP_FILE: {
+		struct file *filp1, *filp2;
+
+		filp1 = get_file_raw_ptr(task1, idx1);
+		filp2 = get_file_raw_ptr(task2, idx2);
+
+		if (filp1 && filp2)
+			ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
+		else
+			ret = -EBADF;
+		break;
+	}
+	case KCMP_VM:
+		ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
+		break;
+	case KCMP_FILES:
+		ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
+		break;
+	case KCMP_FS:
+		ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
+		break;
+	case KCMP_SIGHAND:
+		ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
+		break;
+	case KCMP_IO:
+		ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
+		break;
+	case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+		ret = kcmp_ptr(task1->sysvsem.undo_list,
+			       task2->sysvsem.undo_list,
+			       KCMP_SYSVSEM);
+#else
+		ret = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+err_unlock:
+	kcmp_unlock(&task1->signal->cred_guard_mutex,
+		    &task2->signal->cred_guard_mutex);
+err:
+	put_task_struct(task1);
+	put_task_struct(task2);
+
+	return ret;
+
+err_no_task:
+	rcu_read_unlock();
+	return -ESRCH;
+}
+
+static __init int kcmp_cookies_init(void)
+{
+	int i;
+
+	get_random_bytes(cookies, sizeof(cookies));
+
+	for (i = 0; i < KCMP_TYPES; i++)
+		cookies[i][1] |= (~(~0UL >>  1) | 1);
+
+	return 0;
+}
+arch_initcall(kcmp_cookies_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
 cond_syscall(sys_name_to_handle_at);
 cond_syscall(sys_open_by_handle_at);
 cond_syscall(compat_sys_open_by_handle_at);
+
+/* compare kernel pointers */
+cond_syscall(sys_kcmp);
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 14972017a43e..a4162e15c25f 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
-TARGETS = breakpoints mqueue vm
+TARGETS = breakpoints kcmp mqueue vm
 
 all:
 	for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile
new file mode 100644
index 000000000000..dc79b86ea65c
--- /dev/null
+++ b/tools/testing/selftests/kcmp/Makefile
@@ -0,0 +1,29 @@
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+        ARCH := X86
+	CFLAGS := -DCONFIG_X86_32 -D__i386__
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH := X86
+	CFLAGS := -DCONFIG_X86_64 -D__x86_64__
+endif
+
+CFLAGS += -I../../../../arch/x86/include/generated/
+CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../arch/x86/include/
+
+all:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) kcmp_test.c -o run_test
+else
+	echo "Not an x86 target, can't build kcmp selftest"
+endif
+
+run-tests: all
+	./kcmp_test
+
+clean:
+	rm -fr ./run_test
+	rm -fr ./test-file
diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c
new file mode 100644
index 000000000000..358cc6bfa35d
--- /dev/null
+++ b/tools/testing/selftests/kcmp/kcmp_test.c
@@ -0,0 +1,94 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <linux/unistd.h>
+#include <linux/kcmp.h>
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2)
+{
+	return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2);
+}
+
+int main(int argc, char **argv)
+{
+	const char kpath[] = "kcmp-test-file";
+	int pid1, pid2;
+	int fd1, fd2;
+	int status;
+
+	fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644);
+	pid1 = getpid();
+
+	if (fd1 < 0) {
+		perror("Can't create file");
+		exit(1);
+	}
+
+	pid2 = fork();
+	if (pid2 < 0) {
+		perror("fork failed");
+		exit(1);
+	}
+
+	if (!pid2) {
+		int pid2 = getpid();
+		int ret;
+
+		fd2 = open(kpath, O_RDWR, 0644);
+		if (fd2 < 0) {
+			perror("Can't open file");
+			exit(1);
+		}
+
+		/* An example of output and arguments */
+		printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld "
+		       "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld "
+		       "INV: %2ld\n",
+		       pid1, pid2,
+		       sys_kcmp(pid1, pid2, KCMP_FILE,		fd1, fd2),
+		       sys_kcmp(pid1, pid2, KCMP_FILES,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_VM,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_FS,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SIGHAND,	0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_IO,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SYSVSEM,	0, 0),
+
+			/* This one should fail */
+		       sys_kcmp(pid1, pid2, KCMP_TYPES + 1,	0, 0));
+
+		/* This one should return same fd */
+		ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1);
+		if (ret) {
+			printf("FAIL: 0 expected but %d returned\n", ret);
+			ret = -1;
+		} else
+			printf("PASS: 0 returned as expected\n");
+
+		/* Compare with self */
+		ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0);
+		if (ret) {
+			printf("FAIL: 0 expected but %li returned\n", ret);
+			ret = -1;
+		} else
+			printf("PASS: 0 returned as expected\n");
+
+		exit(ret);
+	}
+
+	waitpid(pid2, &status, P_ALL);
+
+	return 0;
+}
-- 
cgit v1.2.3


From a192cd0413b71c2a3e4e48dd365af704be72b748 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 30 May 2012 13:26:37 -0400
Subject: ftrace: Synchronize variable setting with breakpoints

When the function tracer starts modifying the code via breakpoints
it sets a variable (modifying_ftrace_code) to inform the breakpoint
handler to call the ftrace int3 code.

But there's no synchronization between setting this code and the
handler, thus it is possible for the handler to be called on another
CPU before it sees the variable. This will cause a kernel crash as
the int3 handler will not know what to do with it.

I originally added smp_mb()'s to force the visibility of the variable
but H. Peter Anvin suggested that I just make it atomic.

[ Added comments as suggested by Peter Zijlstra ]

Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/ftrace.h |  2 +-
 arch/x86/kernel/ftrace.c      | 38 +++++++++++++++++++++++++++++++++++---
 arch/x86/kernel/traps.c       |  8 ++++++--
 3 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 18d9005d9e4f..b0767bc08740 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -34,7 +34,7 @@
 
 #ifndef __ASSEMBLY__
 extern void mcount(void);
-extern int modifying_ftrace_code;
+extern atomic_t modifying_ftrace_code;
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 32ff36596ab1..2407a6d81cb7 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -168,7 +168,38 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ret;
 }
 
-int modifying_ftrace_code __read_mostly;
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
+ *
+ *   CPU-0                          CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ *				<trap-int3> // implicit (r)mb
+ *				if (atomic_read(mfc))
+ *					call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly;
 
 /*
  * A breakpoint was added to the code address we are about to
@@ -491,11 +522,12 @@ void ftrace_replace_code(int enable)
 
 void arch_ftrace_update_code(int command)
 {
-	modifying_ftrace_code++;
+	/* See comment above by declaration of modifying_ftrace_code */
+	atomic_inc(&modifying_ftrace_code);
 
 	ftrace_modify_all_code(command);
 
-	modifying_ftrace_code--;
+	atomic_dec(&modifying_ftrace_code);
 }
 
 int __init ftrace_dyn_arch_init(void *data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff08457a025d..05b31d92f69c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -303,8 +303,12 @@ gp_in_kernel:
 dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
-	/* ftrace must be first, everything else may cause a recursive crash */
-	if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+	/*
+	 * ftrace must be first, everything else may cause a recursive crash.
+	 * See note by declaration of modifying_ftrace_code in ftrace.c
+	 */
+	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+	    ftrace_int3_handler(regs))
 		return;
 #endif
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
-- 
cgit v1.2.3


From 8a4d0a687a599f39b7df3fe15f2d51d2157caf44 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 30 May 2012 13:36:38 -0400
Subject: ftrace: Use breakpoint method to update ftrace caller

On boot up and module load, it is fine to modify the code directly,
without the use of breakpoints. This is because boot up modification
is done before SMP is initialized, thus the modification is serial,
and module load is done before the module executes.

But after that we must use a SMP safe method to modify running code.
Otherwise, if we are running the function tracer and update its
function (by starting off the stack tracer, or perf tracing)
the change of the function called by the ftrace trampoline is done
directly. If this is being executed on another CPU, that CPU may
take a GPF and crash the kernel.

The breakpoint method is used to change the nops at all the functions, but
the change of the ftrace callback handler itself was still using a
direct modification. If tracing was enabled and the function callback
was changed then another CPU could fault if it was currently calling
the original callback. This modification must use the breakpoint method
too.

Note, the direct method is still used for boot up and module load.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 88 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 2407a6d81cb7..c3a7cb4bf6e6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void)
 }
 
 static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
 		   unsigned const char *new_code)
 {
 	unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod,
 	old = ftrace_call_replace(ip, addr);
 	new = ftrace_nop_replace();
 
-	return ftrace_modify_code(rec->ip, old, new);
+	/*
+	 * On boot up, and when modules are loaded, the MCOUNT_ADDR
+	 * is converted to a nop, and will never become MCOUNT_ADDR
+	 * again. This code is either running before SMP (on boot up)
+	 * or before the code will ever be executed (module load).
+	 * We do not want to use the breakpoint version in this case,
+	 * just modify the code directly.
+	 */
+	if (addr == MCOUNT_ADDR)
+		return ftrace_modify_code_direct(rec->ip, old, new);
+
+	/* Normal cases use add_brk_on_nop */
+	WARN_ONCE(1, "invalid use of ftrace_make_nop");
+	return -EINVAL;
 }
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -152,20 +165,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	old = ftrace_nop_replace();
 	new = ftrace_call_replace(ip, addr);
 
-	return ftrace_modify_code(rec->ip, old, new);
-}
-
-int ftrace_update_ftrace_func(ftrace_func_t func)
-{
-	unsigned long ip = (unsigned long)(&ftrace_call);
-	unsigned char old[MCOUNT_INSN_SIZE], *new;
-	int ret;
-
-	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(ip, (unsigned long)func);
-	ret = ftrace_modify_code(ip, old, new);
-
-	return ret;
+	/* Should only be called when module is loaded */
+	return ftrace_modify_code_direct(rec->ip, old, new);
 }
 
 /*
@@ -201,6 +202,29 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
  */
 atomic_t modifying_ftrace_code __read_mostly;
 
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+		   unsigned const char *new_code);
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned char old[MCOUNT_INSN_SIZE], *new;
+	int ret;
+
+	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(ip, (unsigned long)func);
+
+	/* See comment above by declaration of modifying_ftrace_code */
+	atomic_inc(&modifying_ftrace_code);
+
+	ret = ftrace_modify_code(ip, old, new);
+
+	atomic_dec(&modifying_ftrace_code);
+
+	return ret;
+}
+
 /*
  * A breakpoint was added to the code address we are about to
  * modify, and this is the handle that will just skip over it.
@@ -520,6 +544,38 @@ void ftrace_replace_code(int enable)
 	}
 }
 
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+		   unsigned const char *new_code)
+{
+	int ret;
+
+	ret = add_break(ip, old_code);
+	if (ret)
+		goto out;
+
+	run_sync();
+
+	ret = add_update_code(ip, new_code);
+	if (ret)
+		goto fail_update;
+
+	run_sync();
+
+	ret = ftrace_write(ip, new_code, 1);
+	if (ret) {
+		ret = -EPERM;
+		goto out;
+	}
+	run_sync();
+ out:
+	return ret;
+
+ fail_update:
+	probe_kernel_write((void *)ip, &old_code[0], 1);
+	goto out;
+}
+
 void arch_ftrace_update_code(int command)
 {
 	/* See comment above by declaration of modifying_ftrace_code */
-- 
cgit v1.2.3


From c0525a6972d3f1fb83058ef503e183475d6e4e26 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 30 May 2012 11:43:19 -0400
Subject: x86: Reset the debug_stack update counter

When an NMI goes off and it sees that it preempted the debug stack,
to keep the debug stack safe, it changes the IDT to point to one that
does not modify the stack on breakpoint (to allow breakpoints in NMIs).

But the variable that gets set to know to undo it on exit never gets
cleared on exit. Thus every NMI will reset it on exit the first time
it is done even if it does not need to be reset.

[ Added H. Peter Anvin's suggestion to use this_cpu_read/write ]

Cc: <stable@vger.kernel.org> # v3.3
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/nmi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 90875279ef3d..a0b2f84457be 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
 		debug_stack_set_zero();
-		__get_cpu_var(update_debug_stack) = 1;
+		this_cpu_write(update_debug_stack, 1);
 	}
 }
 
 static inline void nmi_nesting_postprocess(void)
 {
-	if (unlikely(__get_cpu_var(update_debug_stack)))
+	if (unlikely(this_cpu_read(update_debug_stack))) {
 		debug_stack_reset();
+		this_cpu_write(update_debug_stack, 0);
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From f8988175fd70874d1fb3712b1c5d3bfc6d455202 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 30 May 2012 11:47:00 -0400
Subject: x86: Allow nesting of the debug stack IDT setting

When the NMI handler runs, it checks if it preempted a debug handler
and if that handler is using the debug stack. If it is, it changes the
IDT table not to update the stack, otherwise it will reset the debug
stack and corrupt the debug handler it preempted.

Now that ftrace uses breakpoints to change functions from nops to
callers, many more places may hit a breakpoint. Unfortunately this
includes some of the calls that lockdep performs. Which causes issues
with the debug stack. It too needs to change the debug stack before
tracing (if called from the debug handler).

Allow the debug_stack_set_zero() and debug_stack_reset() to be nested
so that the debug handlers can take advantage of them too.

[ Used this_cpu_*() over __get_cpu_var() as suggested by H. Peter Anvin ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/cpu/common.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 82f29e70d058..6b9333b429ba 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr)
 		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
+static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
+
 void debug_stack_set_zero(void)
 {
+	this_cpu_inc(debug_stack_use_ctr);
 	load_idt((const struct desc_ptr *)&nmi_idt_descr);
 }
 
 void debug_stack_reset(void)
 {
-	load_idt((const struct desc_ptr *)&idt_descr);
+	if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
+		return;
+	if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
+		load_idt((const struct desc_ptr *)&idt_descr);
 }
 
 #else	/* CONFIG_X86_64 */
-- 
cgit v1.2.3


From 5963e317b1e9d2a4511503916d8fd664bb8fa8fb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 30 May 2012 11:54:53 -0400
Subject: ftrace/x86: Do not change stacks in DEBUG when calling lockdep

When both DYNAMIC_FTRACE and LOCKDEP are set, the TRACE_IRQS_ON/OFF
will call into the lockdep code. The lockdep code can call lots of
functions that may be traced by ftrace. When ftrace is updating its
code and hits a breakpoint, the breakpoint handler will call into
lockdep. If lockdep happens to call a function that also has a breakpoint
attached, it will jump back into the breakpoint handler resetting
the stack to the debug stack and corrupt the contents currently on
that stack.

The 'do_sym' call that calls do_int3() is protected by modifying the
IST table to point to a different location if another breakpoint is
hit. But the TRACE_IRQS_OFF/ON are outside that protection, and if
a breakpoint is hit from those, the stack will get corrupted, and
the kernel will crash:

[ 1013.243754] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002
[ 1013.272665] IP: [<ffff880145cc0000>] 0xffff880145cbffff
[ 1013.285186] PGD 1401b2067 PUD 14324c067 PMD 0
[ 1013.298832] Oops: 0010 [#1] PREEMPT SMP
[ 1013.310600] CPU 2
[ 1013.317904] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables crc32c_intel ghash_clmulni_intel microcode usb_debug serio_raw pcspkr iTCO_wdt i2c_i801 iTCO_vendor_support e1000e nfsd nfs_acl auth_rpcgss lockd sunrpc i915 video i2c_algo_bit drm_kms_helper drm i2c_core [last unloaded: scsi_wait_scan]
[ 1013.401848]
[ 1013.407399] Pid: 112, comm: kworker/2:1 Not tainted 3.4.0+ #30
[ 1013.437943] RIP: 8eb8:[<ffff88014630a000>]  [<ffff88014630a000>] 0xffff880146309fff
[ 1013.459871] RSP: ffffffff8165e919:ffff88014780f408  EFLAGS: 00010046
[ 1013.477909] RAX: 0000000000000001 RBX: ffffffff81104020 RCX: 0000000000000000
[ 1013.499458] RDX: ffff880148008ea8 RSI: ffffffff8131ef40 RDI: ffffffff82203b20
[ 1013.521612] RBP: ffffffff81005751 R08: 0000000000000000 R09: 0000000000000000
[ 1013.543121] R10: ffffffff82cdc318 R11: 0000000000000000 R12: ffff880145cc0000
[ 1013.564614] R13: ffff880148008eb8 R14: 0000000000000002 R15: ffff88014780cb40
[ 1013.586108] FS:  0000000000000000(0000) GS:ffff880148000000(0000) knlGS:0000000000000000
[ 1013.609458] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1013.627420] CR2: 0000000000000002 CR3: 0000000141f10000 CR4: 00000000001407e0
[ 1013.649051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1013.670724] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 1013.692376] Process kworker/2:1 (pid: 112, threadinfo ffff88013fe0e000, task ffff88014020a6a0)
[ 1013.717028] Stack:
[ 1013.724131]  ffff88014780f570 ffff880145cc0000 0000400000004000 0000000000000000
[ 1013.745918]  cccccccccccccccc ffff88014780cca8 ffffffff811072bb ffffffff81651627
[ 1013.767870]  ffffffff8118f8a7 ffffffff811072bb ffffffff81f2b6c5 ffffffff81f11bdb
[ 1013.790021] Call Trace:
[ 1013.800701] Code: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a <e7> d7 64 81 ff ff ff ff 01 00 00 00 00 00 00 00 65 d9 64 81 ff
[ 1013.861443] RIP  [<ffff88014630a000>] 0xffff880146309fff
[ 1013.884466]  RSP <ffff88014780f408>
[ 1013.901507] CR2: 0000000000000002

The solution was to reuse the NMI functions that change the IDT table to make the debug
stack keep its current stack (in kernel mode) when hitting a breakpoint:

  call debug_stack_set_zero
  TRACE_IRQS_ON
  call debug_stack_reset

If the TRACE_IRQS_ON happens to hit a breakpoint then it will keep the current stack
and not crash the box.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 320852d02026..7d65133b51be 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -190,6 +190,44 @@ ENDPROC(native_usergs_sysret64)
 #endif
 .endm
 
+/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+	call debug_stack_set_zero
+	TRACE_IRQS_OFF
+	call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+	call debug_stack_set_zero
+	TRACE_IRQS_ON
+	call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	jnc  1f
+	TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
+#endif
+
 /*
  * C code is not supposed to know about undefined top of stack. Every time
  * a C function with an pt_regs argument is called from the SYSCALL based
@@ -1098,7 +1136,7 @@ ENTRY(\sym)
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 	call save_paranoid
-	TRACE_IRQS_OFF
+	TRACE_IRQS_OFF_DEBUG
 	movq %rsp,%rdi		/* pt_regs pointer */
 	xorl %esi,%esi		/* no error code */
 	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
 ENTRY(paranoid_exit)
 	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
+	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz paranoid_restore
 	testl $3,CS(%rsp)
@@ -1404,7 +1442,7 @@ paranoid_swapgs:
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_restore:
-	TRACE_IRQS_IRETQ 0
+	TRACE_IRQS_IRETQ_DEBUG 0
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_userspace:
-- 
cgit v1.2.3


From 30dc0d0fe5d08396dbdaa2d70972149131340960 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Thu, 15 Mar 2012 19:13:25 +0000
Subject: x86, efi: Only close open files in error path

The loop at the 'close_handles' label in handle_ramdisks() should be
using 'i', which represents the number of initrd files that were
successfully opened, not 'nr_initrds' which is the number of initrd=
arguments passed on the command line.

Currently, if we execute the loop to close all file handles and we
failed to open any initrds we'll try to call the close function on a
garbage pointer, causing the machine to hang.

Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1331907517-3985-2-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/eboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 2c14e76bb4c7..52a4e667b258 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -674,7 +674,7 @@ free_initrd_total:
 	low_free(initrd_total, initrd_addr);
 
 close_handles:
-	for (k = j; k < nr_initrds; k++)
+	for (k = j; k < i; k++)
 		efi_call_phys1(fh->close, initrds[k].handle);
 free_initrds:
 	efi_call_phys1(sys_table->boottime->free_pool, initrds);
-- 
cgit v1.2.3


From 9fa7dedad3d30345c843bd82db02c4d6169e5f61 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Mon, 20 Feb 2012 13:20:59 +0000
Subject: x86, efi; Add EFI boot stub console support

We need a way of printing useful messages to the user, for example
when we fail to open an initrd file, instead of just hanging the
machine without giving the user any indication of what went wrong. So
sprinkle some error messages throughout the EFI boot stub code to make
it easier for users to diagnose/report problems.

Reported-by: Keshav P R <the.ridikulus.rat@gmail.com>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1331907517-3985-3-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/eboot.c | 85 ++++++++++++++++++++++++++++++++--------
 arch/x86/boot/compressed/eboot.h |  6 +++
 2 files changed, 75 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 52a4e667b258..4e85f5f85837 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -16,6 +16,26 @@
 
 static efi_system_table_t *sys_table;
 
+static void efi_printk(char *str)
+{
+	char *s8;
+
+	for (s8 = str; *s8; s8++) {
+		struct efi_simple_text_output_protocol *out;
+		efi_char16_t ch[2] = { 0 };
+
+		ch[0] = *s8;
+		out = (struct efi_simple_text_output_protocol *)sys_table->con_out;
+
+		if (*s8 == '\n') {
+			efi_char16_t nl[2] = { '\r', 0 };
+			efi_call_phys2(out->output_string, out, nl);
+		}
+
+		efi_call_phys2(out->output_string, out, ch);
+	}
+}
+
 static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
 			      unsigned long *desc_size)
 {
@@ -531,8 +551,10 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 				EFI_LOADER_DATA,
 				nr_initrds * sizeof(*initrds),
 				&initrds);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for initrds\n");
 		goto fail;
+	}
 
 	str = (char *)(unsigned long)hdr->cmd_line_ptr;
 	for (i = 0; i < nr_initrds; i++) {
@@ -575,32 +597,42 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 
 			status = efi_call_phys3(boottime->handle_protocol,
 					image->device_handle, &fs_proto, &io);
-			if (status != EFI_SUCCESS)
+			if (status != EFI_SUCCESS) {
+				efi_printk("Failed to handle fs_proto\n");
 				goto free_initrds;
+			}
 
 			status = efi_call_phys2(io->open_volume, io, &fh);
-			if (status != EFI_SUCCESS)
+			if (status != EFI_SUCCESS) {
+				efi_printk("Failed to open volume\n");
 				goto free_initrds;
+			}
 		}
 
 		status = efi_call_phys5(fh->open, fh, &h, filename_16,
 					EFI_FILE_MODE_READ, (u64)0);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to open initrd file\n");
 			goto close_handles;
+		}
 
 		initrd->handle = h;
 
 		info_sz = 0;
 		status = efi_call_phys4(h->get_info, h, &info_guid,
 					&info_sz, NULL);
-		if (status != EFI_BUFFER_TOO_SMALL)
+		if (status != EFI_BUFFER_TOO_SMALL) {
+			efi_printk("Failed to get initrd info size\n");
 			goto close_handles;
+		}
 
 grow:
 		status = efi_call_phys3(sys_table->boottime->allocate_pool,
 					EFI_LOADER_DATA, info_sz, &info);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to alloc mem for initrd info\n");
 			goto close_handles;
+		}
 
 		status = efi_call_phys4(h->get_info, h, &info_guid,
 					&info_sz, info);
@@ -612,8 +644,10 @@ grow:
 		file_sz = info->file_size;
 		efi_call_phys1(sys_table->boottime->free_pool, info);
 
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to get initrd info\n");
 			goto close_handles;
+		}
 
 		initrd->size = file_sz;
 		initrd_total += file_sz;
@@ -629,11 +663,14 @@ grow:
 		 */
 		status = high_alloc(initrd_total, 0x1000,
 				   &initrd_addr, hdr->initrd_addr_max);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to alloc highmem for initrds\n");
 			goto close_handles;
+		}
 
 		/* We've run out of free low memory. */
 		if (initrd_addr > hdr->initrd_addr_max) {
+			efi_printk("We've run out of free low memory\n");
 			status = EFI_INVALID_PARAMETER;
 			goto free_initrd_total;
 		}
@@ -652,8 +689,10 @@ grow:
 				status = efi_call_phys3(fh->read,
 							initrds[j].handle,
 							&chunksize, addr);
-				if (status != EFI_SUCCESS)
+				if (status != EFI_SUCCESS) {
+					efi_printk("Failed to read initrd\n");
 					goto free_initrd_total;
+				}
 				addr += chunksize;
 				size -= chunksize;
 			}
@@ -732,8 +771,10 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,
 			options_size++;	/* NUL termination */
 
 			status = low_alloc(options_size, 1, &cmdline);
-			if (status != EFI_SUCCESS)
+			if (status != EFI_SUCCESS) {
+				efi_printk("Failed to alloc mem for cmdline\n");
 				goto fail;
+			}
 
 			s1 = (u8 *)(unsigned long)cmdline;
 			s2 = (u16 *)options;
@@ -895,12 +936,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 
 	status = efi_call_phys3(sys_table->boottime->handle_protocol,
 				handle, &proto, (void *)&image);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
 		goto fail;
+	}
 
 	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc lowmem for boot params\n");
 		goto fail;
+	}
 
 	memset(boot_params, 0x0, 0x4000);
 
@@ -933,8 +978,10 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	if (status != EFI_SUCCESS) {
 		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
 				   &start);
-		if (status != EFI_SUCCESS)
+		if (status != EFI_SUCCESS) {
+			efi_printk("Failed to alloc mem for kernel\n");
 			goto fail;
+		}
 	}
 
 	hdr->code32_start = (__u32)start;
@@ -945,19 +992,25 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, sizeof(*gdt),
 				(void **)&gdt);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for gdt structure\n");
 		goto fail;
+	}
 
 	gdt->size = 0x800;
 	status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for gdt\n");
 		goto fail;
+	}
 
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, sizeof(*idt),
 				(void **)&idt);
-	if (status != EFI_SUCCESS)
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc mem for idt structure\n");
 		goto fail;
+	}
 
 	idt->size = 0;
 	idt->address = 0;
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index 39251663e65b..3b6e15627c55 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -58,4 +58,10 @@ struct efi_uga_draw_protocol {
 	void *blt;
 };
 
+struct efi_simple_text_output_protocol {
+	void *reset;
+	void *output_string;
+	void *test_string;
+};
+
 #endif /* BOOT_COMPRESSED_EBOOT_H */
-- 
cgit v1.2.3


From 0c7596621e313bfcfbacb288e768c7150f5de9e0 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 16 Mar 2012 12:03:13 +0000
Subject: x86, efi: Add EFI boot stub documentation

Since we can't expect every user to read the EFI boot stub code it
seems prudent to have a couple of paragraphs explaining what it is and
how it works.

The "initrd=" option in particular is tricky because it only
understands absolute EFI-style paths (backslashes as directory
separators), and until now this hasn't been documented anywhere. This
has tripped up a couple of users.

Cc: Matthew Garrett <mjg@redhat.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1331907517-3985-4-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/efi-stub.txt | 65 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/Kconfig               |  2 ++
 2 files changed, 67 insertions(+)
 create mode 100644 Documentation/x86/efi-stub.txt

(limited to 'arch/x86')

diff --git a/Documentation/x86/efi-stub.txt b/Documentation/x86/efi-stub.txt
new file mode 100644
index 000000000000..44e6bb6ead10
--- /dev/null
+++ b/Documentation/x86/efi-stub.txt
@@ -0,0 +1,65 @@
+			  The EFI Boot Stub
+		     ---------------------------
+
+On the x86 platform, a bzImage can masquerade as a PE/COFF image,
+thereby convincing EFI firmware loaders to load it as an EFI
+executable. The code that modifies the bzImage header, along with the
+EFI-specific entry point that the firmware loader jumps to are
+collectively known as the "EFI boot stub", and live in
+arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
+respectively.
+
+By using the EFI boot stub it's possible to boot a Linux kernel
+without the use of a conventional EFI boot loader, such as grub or
+elilo. Since the EFI boot stub performs the jobs of a boot loader, in
+a certain sense it *IS* the boot loader.
+
+The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option.
+
+
+**** How to install bzImage.efi
+
+The bzImage located in arch/x86/boot/bzImage must be copied to the EFI
+System Partiion (ESP) and renamed with the extension ".efi". Without
+the extension the EFI firmware loader will refuse to execute it. It's
+not possible to execute bzImage.efi from the usual Linux file systems
+because EFI firmware doesn't have support for them.
+
+
+**** Passing kernel parameters from the EFI shell
+
+Arguments to the kernel can be passed after bzImage.efi, e.g.
+
+	fs0:> bzImage.efi console=ttyS0 root=/dev/sda4
+
+
+**** The "initrd=" option
+
+Like most boot loaders, the EFI stub allows the user to specify
+multiple initrd files using the "initrd=" option. This is the only EFI
+stub-specific command line parameter, everything else is passed to the
+kernel when it boots.
+
+The path to the initrd file must be an absolute path from the
+beginning of the ESP, relative path names do not work. Also, the path
+is an EFI-style path and directory elements must be separated with
+backslashes (\). For example, given the following directory layout,
+
+fs0:>
+	Kernels\
+			bzImage.efi
+			initrd-large.img
+
+	Ramdisks\
+			initrd-small.img
+			initrd-medium.img
+
+to boot with the initrd-large.img file if the current working
+directory is fs0:\Kernels, the following command must be used,
+
+	fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img
+
+Notice how bzImage.efi can be specified with a relative path. That's
+because the image we're executing is interpreted by the EFI shell,
+which understands relative paths, whereas the rest of the command line
+is passed to bzImage.efi.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d700811785ea..c70684f859e1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1506,6 +1506,8 @@ config EFI_STUB
           This kernel feature allows a bzImage to be loaded directly
 	  by EFI firmware without the use of a bootloader.
 
+	  See Documentation/x86/efi-stub.txt for more information.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
-- 
cgit v1.2.3


From 4ebefe3ec729003443daf153ed6fad1739271283 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 26 Apr 2012 22:29:20 -0400
Subject: new helpers: {clear,test,test_and_clear}_restore_sigmask()

helpers parallel to set_restore_sigmask(), used in the next commits

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/include/asm/thread_info.h       | 16 ++++++++++++++++
 arch/microblaze/include/asm/thread_info.h | 16 ++++++++++++++++
 arch/powerpc/include/asm/thread_info.h    | 16 ++++++++++++++++
 arch/sh/include/asm/thread_info.h         | 17 +++++++++++++++++
 arch/sparc/include/asm/thread_info_64.h   | 16 ++++++++++++++++
 arch/tile/include/asm/thread_info.h       | 16 ++++++++++++++++
 arch/x86/include/asm/thread_info.h        | 16 ++++++++++++++++
 include/linux/thread_info.h               | 12 ++++++++++++
 8 files changed, 125 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 310d9734f02d..8d600363fa57 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -143,6 +143,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, &ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 1a8ab6a5c03f..12e39206b3ef 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -168,6 +168,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index a556ccc16b58..85d50a93a92f 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -142,6 +142,22 @@ static inline void set_restore_sigmask(void)
 	ti->local_flags |= _TLF_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, &ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->local_flags & _TLF_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->local_flags & _TLF_RESTORE_SIGMASK))
+		return false;
+	ti->local_flags &= ~_TLF_RESTORE_SIGMASK;
+	return true;
+}
 
 static inline bool test_thread_local_flags(unsigned int flags)
 {
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index 0c04ffc4f12c..a109157c6b8f 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -189,6 +189,23 @@ static inline unsigned int get_thread_fault_code(void)
 	struct thread_info *ti = current_thread_info();
 	return ti->flags >> TI_FLAG_FAULT_CODE_SHIFT;
 }
+
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 7f0981b09451..cb9b7a9f5fc1 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -240,6 +240,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, &ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 656c486e64fa..5aef371921e4 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -168,6 +168,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, &ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_TILE_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 5c25de07cba8..8f3f1ff69fa9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -250,6 +250,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
+}
 
 static inline bool is_ia32_task(void)
 {
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index eee729428683..ed279701ac79 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -127,6 +127,18 @@ static inline void set_restore_sigmask(void)
 	set_thread_flag(TIF_RESTORE_SIGMASK);
 	set_thread_flag(TIF_SIGPENDING);
 }
+static inline void clear_restore_sigmask(void)
+{
+	clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_restore_sigmask(void)
+{
+	return test_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
 #endif	/* TIF_RESTORE_SIGMASK && !HAVE_SET_RESTORE_SIGMASK */
 
 #ifndef HAVE_SET_RESTORE_SIGMASK
-- 
cgit v1.2.3


From 51a7b448d4134e3e8eec633435e3e8faee14a828 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 May 2012 23:33:55 -0400
Subject: new helper: restore_saved_sigmask()

first fruits of ..._restore_sigmask() helpers: now we can take
boilerplate "signal didn't have a handler, clear RESTORE_SIGMASK
and restore the blocked mask from ->saved_mask" into a common
helper.  Open-coded instances switched...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c         | 4 +---
 arch/arm/kernel/signal.c           | 6 +-----
 arch/avr32/kernel/signal.c         | 5 +----
 arch/blackfin/kernel/signal.c      | 5 +----
 arch/c6x/kernel/signal.c           | 5 +----
 arch/cris/arch-v10/kernel/signal.c | 5 +----
 arch/cris/arch-v32/kernel/signal.c | 5 +----
 arch/frv/kernel/signal.c           | 6 +-----
 arch/h8300/kernel/signal.c         | 3 +--
 arch/hexagon/kernel/signal.c       | 5 +----
 arch/ia64/kernel/signal.c          | 5 +----
 arch/m32r/kernel/signal.c          | 5 +----
 arch/m68k/kernel/signal.c          | 5 +----
 arch/microblaze/kernel/signal.c    | 5 +----
 arch/mips/kernel/signal.c          | 5 +----
 arch/mn10300/kernel/signal.c       | 5 +----
 arch/openrisc/kernel/signal.c      | 6 +-----
 arch/parisc/kernel/signal.c        | 7 +------
 arch/powerpc/kernel/signal.c       | 6 +-----
 arch/s390/kernel/signal.c          | 5 +----
 arch/score/kernel/signal.c         | 5 +----
 arch/sh/kernel/signal_32.c         | 5 +----
 arch/sh/kernel/signal_64.c         | 7 +------
 arch/sparc/kernel/signal32.c       | 5 +----
 arch/sparc/kernel/signal_32.c      | 5 +----
 arch/sparc/kernel/signal_64.c      | 5 +----
 arch/tile/kernel/signal.c          | 5 +----
 arch/um/kernel/signal.c            | 6 ++----
 arch/unicore32/kernel/signal.c     | 3 +--
 arch/x86/kernel/signal.c           | 5 +----
 arch/xtensa/kernel/signal.c        | 3 +--
 include/linux/sched.h              | 6 ++++++
 32 files changed, 38 insertions(+), 125 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index f6db3032ddf0..cadf4571ca31 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -572,9 +572,7 @@ do_signal(struct pt_regs * regs, struct switch_stack * sw,
 	}
 
 	/* If there's no signal to deliver, we just restore the saved mask.  */
-	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
-		set_current_blocked(&current->saved_sigmask);
-
+	restore_saved_sigmask();
 	if (single_stepping)
 		ptrace_set_bpt(current);	/* re-set breakpoint */
 }
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 63f327dd5198..3d1daac8ea04 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -663,11 +663,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 			set_thread_flag(TIF_SYSCALL_RESTARTSYS);
 	}
 
-	/* If there's no signal to deliver, we just put the saved sigmask
-	 * back.
-	 */
-	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
-		set_current_blocked(&current->saved_sigmask);
+	restore_saved_sigmask();
 }
 
 asmlinkage void
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index e7595ef74f51..8b12c3046137 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -297,10 +297,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall)
 
 	if (signr == 0) {
 		/* No signal to deliver -- put the saved sigmask back */
-		if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-			clear_thread_flag(TIF_RESTORE_SIGMASK);
-			sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-		}
+		restore_saved_sigmask();
 		return 0;
 	}
 
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index fc9ecce8b6ce..9d692a1277b3 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -319,10 +319,7 @@ asmlinkage void do_signal(struct pt_regs *regs)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index 9493f0bbf0a6..bfbcc958bbb4 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -343,10 +343,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index e16f8f297f61..06885e94e455 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -525,8 +525,5 @@ void do_signal(int canrestart, struct pt_regs *regs)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index b338d8fc0c12..fe12cdca0bac 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -560,10 +560,7 @@ do_signal(int canrestart, struct pt_regs *regs)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 asmlinkage void
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 595bf1e5a5dc..16351cc8c36c 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -536,11 +536,7 @@ no_signal:
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
-
+	restore_saved_sigmask();
 } /* end do_signal() */
 
 /*****************************************************************************/
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index e58992ad789e..63623dabab32 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -501,8 +501,7 @@ statis void do_signal(struct pt_regs *regs)
 	}
 
 	/* If there's no signal to deliver, we just restore the saved mask.  */
-	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
-		set_current_blocked(&current->saved_sigmask);
+	restore_saved_sigmask();
 }
 
 asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 21a3018cb9bf..acd6272913b3 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -259,10 +259,7 @@ no_signal:
 
 no_restart:
 	/* If there's no signal to deliver, put the saved sigmask back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 7523501d3bc0..39d8f3afff49 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -538,8 +538,5 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 64804f1f5141..2ad7c4587669 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -360,10 +360,7 @@ static void do_signal(struct pt_regs *regs)
 			prev_insn(regs);
 		}
 	}
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 973eec60cad4..685cbe84f33f 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -1182,10 +1182,7 @@ static void do_signal(struct pt_regs *regs)
 		handle_restart(regs, NULL, 0);
 
 	/* If there's no signal to deliver, we just restore the saved mask.  */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs)
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 5d796e32786e..8e644dfaba4f 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -381,10 +381,7 @@ static void do_signal(struct pt_regs *regs, int in_syscall)
 	 * If there's no signal to deliver, we just put the saved sigmask
 	 * back.
 	 */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs, int in_syscall)
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 8a6e6d116ab0..aad2d2da5eec 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -614,10 +614,7 @@ static void do_signal(struct pt_regs *regs)
 	 * If there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index b8b6aa1a6837..b7994c38eacc 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -525,10 +525,7 @@ static void do_signal(struct pt_regs *regs)
 
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 9ae611522953..266c6fd2eb5c 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -339,11 +339,7 @@ void do_signal(struct pt_regs *regs)
 	if (signr <= 0) {
 		/* no signal to deliver so we just put the saved sigmask
 		 * back */
-		if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-			clear_thread_flag(TIF_RESTORE_SIGMASK);
-			sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-		}
-
+		restore_saved_sigmask();
 	} else {		/* signr > 0 */
 		sigset_t *oldset;
 
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index e7a7cd3e1120..277cacadf653 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -621,12 +621,7 @@ do_signal(struct pt_regs *regs, long in_syscall)
 	DBG(1,"do_signal: Exit (not delivered), regs->gr[28] = %ld\n", 
 		regs->gr[28]);
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
-
-	return;
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs, long in_syscall)
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index bfc3ec1382fb..0f4cc67f4268 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -132,12 +132,8 @@ static int do_signal(struct pt_regs *regs)
 	check_syscall_restart(regs, &ka, signr > 0);
 
 	if (signr <= 0) {
-		struct thread_info *ti = current_thread_info();
 		/* No signal to deliver -- put the saved sigmask back */
-		if (ti->local_flags & _TLF_RESTORE_SIGMASK) {
-			ti->local_flags &= ~_TLF_RESTORE_SIGMASK;
-			sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-		}
+		restore_saved_sigmask();
 		regs->trap = 0;
 		return 0;               /* no signals delivered */
 	}
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 42a6e8b47f06..37799089c38e 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -484,10 +484,7 @@ void do_signal(struct pt_regs *regs)
 	/*
 	 * If there's no signal to deliver, we just put the saved sigmask back.
 	 */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs)
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 302838d3acf6..9e751559375b 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -337,10 +337,7 @@ static void do_signal(struct pt_regs *regs)
 	 * If there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 9d7bfd66f189..92f4173ad29a 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -610,10 +610,7 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0)
 	 * If there's no signal to deliver, we just put the saved sigmask
 	 * back.
 	 */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 }
 
 asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0,
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index aa6428430842..6e191ef0aa62 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -143,12 +143,7 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* No signal to deliver -- put the saved sigmask back */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
-
-	return;
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index bb1513e45f1a..88e0d8122d2c 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -872,10 +872,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs)
 	/* If there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		set_current_blocked(&current->saved_sigmask);
-	}
+	restore_saved_sigmask();
 }
 
 struct sigstack32 {
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 6b42e8622d12..9dd97d2e171e 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -576,10 +576,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	/* if there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		set_current_blocked(&current->saved_sigmask);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0,
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index c82cf1cc3965..55b820ee0ac9 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -594,10 +594,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	/* If there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		set_current_blocked(&current->saved_sigmask);
-	}
+	restore_saved_sigmask();
 }
 
 void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags)
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index f79d4b88c747..62b3493ea77d 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -350,10 +350,7 @@ void do_signal(struct pt_regs *regs)
 	}
 
 	/* If there's no signal to deliver, just put the saved sigmask back. */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 
 done:
 	/* Avoid double syscall restart if there are nested signals. */
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 292e706016c5..6acf13c1740b 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -130,10 +130,8 @@ static int kern_do_signal(struct pt_regs *regs)
 	 * if there's no signal to deliver, we just put the saved sigmask
 	 * back
 	 */
-	if (!handled_sig && test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	if (!handled_sig)
+		restore_saved_sigmask();
 	return handled_sig;
 }
 
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 28782ad47b93..65a5ed3b6f2a 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -451,8 +451,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 	/* If there's no signal to deliver, we just put the saved
 	 * sigmask back.
 	 */
-	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
-		set_current_blocked(&current->saved_sigmask);
+	restore_saved_sigmask();
 }
 
 asmlinkage void do_notify_resume(struct pt_regs *regs,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2e937a5ad531..25a4a81a51aa 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -796,10 +796,7 @@ static void do_signal(struct pt_regs *regs)
 	 * If there's no signal to deliver, we just put the saved sigmask
 	 * back.
 	 */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		set_current_blocked(&current->saved_sigmask);
-	}
+	restore_saved_sigmask();
 }
 
 /*
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index ea7e17778a75..8c4e751e3b83 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -532,8 +532,7 @@ no_signal:
 	}
 
 	/* If there's no signal to deliver, we just restore the saved mask.  */
-	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
-		set_current_blocked(&current->saved_sigmask);
+	restore_saved_sigmask();
 
 	if (current->ptrace & PT_SINGLESTEP)
 		task_pt_regs(current)->icountlevel = 1;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 660c8ae93471..f1b46b88f6f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2207,6 +2207,12 @@ extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
 
+static inline void restore_saved_sigmask(void)
+{
+	if (test_and_clear_restore_sigmask())
+		set_current_blocked(&current->saved_sigmask);
+}
+
 static inline int kill_cad_pid(int sig, int priv)
 {
 	return kill_pid(cad_pid, sig, priv);
-- 
cgit v1.2.3


From b7f9a11a6cf1ea9ee6be3eb2b90d91327a09ad14 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 May 2012 09:59:21 -0400
Subject: new helper: sigmask_to_save()

replace boilerplate "should we use ->saved_sigmask or ->blocked?"
with calls of obvious inlined helper...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c         |  5 +----
 arch/arm/kernel/signal.c           | 12 +++---------
 arch/avr32/kernel/signal.c         | 20 +++++++-------------
 arch/blackfin/kernel/signal.c      | 12 +++---------
 arch/c6x/kernel/signal.c           | 14 +++-----------
 arch/cris/arch-v10/kernel/signal.c | 11 +++--------
 arch/cris/arch-v32/kernel/signal.c | 11 +++--------
 arch/frv/kernel/signal.c           | 10 +++-------
 arch/h8300/kernel/signal.c         | 11 +++--------
 arch/hexagon/kernel/signal.c       | 13 +++----------
 arch/ia64/kernel/signal.c          | 12 +++---------
 arch/m32r/kernel/signal.c          | 12 +++---------
 arch/m68k/kernel/signal.c          | 11 +++--------
 arch/microblaze/kernel/signal.c    |  9 ++-------
 arch/mips/kernel/signal.c          | 11 +++--------
 arch/mn10300/kernel/signal.c       | 11 +++--------
 arch/openrisc/kernel/signal.c      | 12 +++---------
 arch/parisc/kernel/signal.c        | 21 +++++----------------
 arch/powerpc/kernel/signal.c       |  7 +------
 arch/s390/kernel/signal.c          |  7 +------
 arch/score/kernel/signal.c         | 12 +++---------
 arch/sh/kernel/signal_32.c         | 11 +++--------
 arch/sh/kernel/signal_64.c         | 13 ++++---------
 arch/sparc/kernel/signal_32.c      | 11 +++--------
 arch/sparc/kernel/signal_64.c      |  7 +------
 arch/tile/kernel/signal.c          | 11 +++--------
 arch/um/kernel/signal.c            | 11 +++--------
 arch/unicore32/kernel/signal.c     | 13 +++----------
 arch/x86/kernel/signal.c           |  5 +----
 arch/xtensa/kernel/signal.c        |  8 +-------
 include/linux/sched.h              |  8 ++++++++
 31 files changed, 92 insertions(+), 250 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index cadf4571ca31..f1e7d2aa2586 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -468,12 +468,9 @@ static inline void
 handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
 	      struct pt_regs * regs, struct switch_stack *sw)
 {
-	sigset_t *oldset = &current->blocked;
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-
 	if (ka->sa.sa_flags & SA_SIGINFO)
 		ret = setup_rt_frame(sig, ka, info, oldset, regs, sw);
 	else
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3d1daac8ea04..2e66c93973c3 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -530,11 +530,11 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info,
  */	
 static int
 handle_signal(unsigned long sig, struct k_sigaction *ka,
-	      siginfo_t *info, sigset_t *oldset,
-	      struct pt_regs * regs)
+	      siginfo_t *info, struct pt_regs *regs)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = current;
+	sigset_t *oldset = sigmask_to_save();
 	int usig = sig;
 	int ret;
 
@@ -617,8 +617,6 @@ static void do_signal(struct pt_regs *regs, int syscall)
 	 */
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		sigset_t *oldset;
-
 		/*
 		 * Depending on the signal settings we may need to revert the
 		 * decision to restart the system call.  But skip this if a
@@ -635,11 +633,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 			clear_thread_flag(TIF_SYSCALL_RESTARTSYS);
 		}
 
-		if (test_thread_flag(TIF_RESTORE_SIGMASK))
-			oldset = &current->saved_sigmask;
-		else
-			oldset = &current->blocked;
-		if (handle_signal(signr, &ka, &info, oldset, regs) == 0) {
+		if (handle_signal(signr, &ka, &info, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index 8b12c3046137..0e2c0527c9fe 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -224,14 +224,14 @@ static inline void setup_syscall_restart(struct pt_regs *regs)
 
 static inline void
 handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
-	      sigset_t *oldset, struct pt_regs *regs, int syscall)
+	      struct pt_regs *regs, int syscall)
 {
 	int ret;
 
 	/*
 	 * Set up the stack frame
 	 */
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
 
 	/*
 	 * Check that the resulting registers are sane
@@ -255,7 +255,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
  * doesn't want to handle. Thus you cannot kill init even with a
  * SIGKILL even by mistake.
  */
-int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall)
+static void do_signal(struct pt_regs *regs, int syscall)
 {
 	siginfo_t info;
 	int signr;
@@ -267,12 +267,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall)
 	 * without doing anything if so.
 	 */
 	if (!user_mode(regs))
-		return 0;
-
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else if (!oldset)
-		oldset = &current->blocked;
+		return;
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (syscall) {
@@ -298,11 +293,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall)
 	if (signr == 0) {
 		/* No signal to deliver -- put the saved sigmask back */
 		restore_saved_sigmask();
-		return 0;
+		return;
 	}
 
-	handle_signal(signr, &ka, &info, oldset, regs, syscall);
-	return 1;
+	handle_signal(signr, &ka, &info, regs, syscall);
 }
 
 asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
@@ -313,7 +307,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
 		syscall = 1;
 
 	if (ti->flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
-		do_signal(regs, &current->blocked, syscall);
+		do_signal(regs, syscall);
 
 	if (ti->flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index 9d692a1277b3..7f4205ddfa4d 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -249,7 +249,7 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler)
  */
 static int
 handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
-	      sigset_t *oldset, struct pt_regs *regs)
+	      struct pt_regs *regs)
 {
 	int ret;
 
@@ -259,7 +259,7 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 		handle_restart(regs, ka, 1);
 
 	/* set up the stack frame */
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
 
 	if (ret == 0)
 		block_sigmask(ka, sig);
@@ -281,22 +281,16 @@ asmlinkage void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	current->thread.esp0 = (unsigned long)regs;
 
 	if (try_to_freeze())
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index bfbcc958bbb4..38bb501eb117 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -250,8 +250,7 @@ do_restart:
  */
 static int handle_signal(int sig,
 			 siginfo_t *info, struct k_sigaction *ka,
-			 sigset_t *oldset, struct pt_regs *regs,
-			 int syscall)
+			 struct pt_regs *regs, int syscall)
 {
 	int ret;
 
@@ -278,7 +277,7 @@ static int handle_signal(int sig,
 	}
 
 	/* Set up the stack frame */
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
 	if (ret == 0)
 		block_sigmask(ka, sig);
 
@@ -292,7 +291,6 @@ static void do_signal(struct pt_regs *regs, int syscall)
 {
 	struct k_sigaction ka;
 	siginfo_t info;
-	sigset_t *oldset;
 	int signr;
 
 	/* we want the common case to go fast, which is why we may in certain
@@ -300,15 +298,9 @@ static void do_signal(struct pt_regs *regs, int syscall)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &info, &ka, oldset,
-				  regs, syscall) == 0) {
+		if (handle_signal(signr, &info, &ka, regs, syscall) == 0) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 06885e94e455..09a4cf4eb08a 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -417,8 +417,9 @@ give_sigsegv:
 
 static inline int handle_signal(int canrestart, unsigned long sig,
 	siginfo_t *info, struct k_sigaction *ka,
-	sigset_t *oldset, struct pt_regs *regs)
+	struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Are we from a system call? */
@@ -478,7 +479,6 @@ void do_signal(int canrestart, struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
         struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which
@@ -489,16 +489,11 @@ void do_signal(int canrestart, struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(canrestart, signr, &info, &ka,
-				oldset, regs)) {
+				regs)) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index fe12cdca0bac..d52276ddae4b 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -437,8 +437,9 @@ give_sigsegv:
 static inline int
 handle_signal(int canrestart, unsigned long sig,
 	      siginfo_t *info, struct k_sigaction *ka,
-              sigset_t *oldset, struct pt_regs * regs)
+              struct pt_regs * regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Check if this got called from a system call. */
@@ -511,7 +512,6 @@ do_signal(int canrestart, struct pt_regs *regs)
 	int signr;
 	siginfo_t info;
         struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * The common case should go fast, which is why this point is
@@ -521,17 +521,12 @@ do_signal(int canrestart, struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(canrestart, signr, &info, &ka,
-				oldset, regs)) {
+				regs)) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 16351cc8c36c..22efe8d25038 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -427,8 +427,9 @@ give_sigsegv:
  * OK, we're invoking a handler
  */
 static int handle_signal(unsigned long sig, siginfo_t *info,
-			 struct k_sigaction *ka, sigset_t *oldset)
+			 struct k_sigaction *ka)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Are we from a system call? */
@@ -492,14 +493,9 @@ static void do_signal(void)
 	if (try_to_freeze())
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, __frame, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &info, &ka, oldset) == 0) {
+		if (handle_signal(signr, &info, &ka) == 0) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 63623dabab32..d4d2f72672ad 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -412,8 +412,9 @@ give_sigsegv:
  */
 static void
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-	      sigset_t *oldset,	struct pt_regs * regs)
+	      struct pt_regs * regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 	/* are we from a system call? */
 	if (regs->orig_er0 >= 0) {
@@ -457,7 +458,6 @@ statis void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which
@@ -473,15 +473,10 @@ statis void do_signal(struct pt_regs *regs)
 
 	current->thread.esp0 = (unsigned long) regs;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		handle_signal(signr, &info, &ka, oldset, regs);
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
  no_signal:
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index acd6272913b3..f73fcee09bac 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -150,7 +150,7 @@ sigsegv:
  * Setup invocation of signal handler
  */
 static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
-			 sigset_t *oldset, struct pt_regs *regs)
+			 struct pt_regs *regs)
 {
 	int rc;
 
@@ -186,7 +186,7 @@ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 	 * Set up the stack frame; not doing the SA_SIGINFO thing.  We
 	 * only set up the rt_frame flavor.
 	 */
-	rc = setup_rt_frame(sig, ka, info, oldset, regs);
+	rc = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
 
 	/* If there was an error on setup, no signal was delivered. */
 	if (rc)
@@ -215,14 +215,7 @@ static void do_signal(struct pt_regs *regs)
 	signo = get_signal_to_deliver(&info, &sigact, regs, NULL);
 
 	if (signo > 0) {
-		sigset_t *oldset;
-
-		if (test_thread_flag(TIF_RESTORE_SIGMASK))
-			oldset = &current->saved_sigmask;
-		else
-			oldset = &current->blocked;
-
-		if (handle_signal(signo, &info, &sigact, oldset, regs) == 0) {
+		if (handle_signal(signo, &info, &sigact, regs) == 0) {
 			/*
 			 * Successful delivery case.  The saved sigmask is
 			 * stored in the signal frame, and will be restored
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 39d8f3afff49..9fee6d6a3f21 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -415,10 +415,10 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 }
 
 static long
-handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset,
+handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	       struct sigscratch *scr)
 {
-	if (!setup_frame(sig, ka, info, oldset, scr))
+	if (!setup_frame(sig, ka, info, sigmask_to_save(), scr))
 		return 0;
 
 	block_sigmask(ka, sig);
@@ -440,7 +440,6 @@ void
 ia64_do_signal (struct sigscratch *scr, long in_syscall)
 {
 	struct k_sigaction ka;
-	sigset_t *oldset;
 	siginfo_t info;
 	long restart = in_syscall;
 	long errno = scr->pt.r8;
@@ -453,11 +452,6 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
 	if (!user_mode(&scr->pt))
 		return;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	/*
 	 * This only loops in the rare cases of handle_signal() failing, in which case we
 	 * need to push through a forced SIGSEGV.
@@ -507,7 +501,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
 		 * Whee!  Actually deliver the signal.  If the delivery failed, we need to
 		 * continue to iterate in this loop so we can deliver the SIGSEGV...
 		 */
-		if (handle_signal(signr, &ka, &info, oldset, scr)) {
+		if (handle_signal(signr, &ka, &info, scr)) {
 			/*
 			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 2ad7c4587669..e0d6d1079f33 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -269,7 +269,7 @@ static int prev_insn(struct pt_regs *regs)
 
 static int
 handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
-	      sigset_t *oldset, struct pt_regs *regs)
+	      struct pt_regs *regs)
 {
 	/* Are we from a system call? */
 	if (regs->syscall_nr >= 0) {
@@ -294,7 +294,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	}
 
 	/* Set up the stack frame */
-	if (setup_rt_frame(sig, ka, info, oldset, regs))
+	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs))
 		return -EFAULT;
 
 	block_sigmask(ka, sig);
@@ -311,7 +311,6 @@ static void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which
@@ -325,11 +324,6 @@ static void do_signal(struct pt_regs *regs)
 	if (try_to_freeze()) 
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Re-enable any watchpoints before delivering the
@@ -339,7 +333,7 @@ static void do_signal(struct pt_regs *regs)
 		 */
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &ka, &info, oldset, regs) == 0)
+		if (handle_signal(signr, &ka, &info, regs) == 0)
 			clear_thread_flag(TIF_RESTORE_SIGMASK);
 
 		return;
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 685cbe84f33f..c83eb5a8ed8b 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -1123,8 +1123,9 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler)
  */
 static void
 handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
-	      sigset_t *oldset, struct pt_regs *regs)
+	      struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int err;
 	/* are we from a system call? */
 	if (regs->orig_d0 >= 0)
@@ -1160,19 +1161,13 @@ static void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	struct k_sigaction ka;
 	int signr;
-	sigset_t *oldset;
 
 	current->thread.esp0 = (unsigned long) regs;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		handle_signal(signr, &ka, &info, oldset, regs);
+		handle_signal(signr, &ka, &info, regs);
 		return;
 	}
 
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 8e644dfaba4f..fd2de5718a4e 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -312,8 +312,9 @@ do_restart:
 
 static int
 handle_signal(unsigned long sig, struct k_sigaction *ka,
-		siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
+		siginfo_t *info, struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Set up the stack frame */
@@ -344,18 +345,12 @@ static void do_signal(struct pt_regs *regs, int in_syscall)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 #ifdef DEBUG_SIG
 	printk(KERN_INFO "do signal: %p %d\n", regs, in_syscall);
 	printk(KERN_INFO "do signal2: %lx %lx %ld [%lx]\n", regs->pc, regs->r1,
 			regs->r12, current_thread_info()->flags);
 #endif
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal. */
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index aad2d2da5eec..18355060f241 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -515,8 +515,9 @@ struct mips_abi mips_abi = {
 };
 
 static int handle_signal(unsigned long sig, siginfo_t *info,
-	struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs)
+	struct k_sigaction *ka, struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 	struct mips_abi *abi = current->thread.abi;
 	void *vdso = current->mm->context.vdso;
@@ -560,7 +561,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 static void do_signal(struct pt_regs *regs)
 {
 	struct k_sigaction ka;
-	sigset_t *oldset;
 	siginfo_t info;
 	int signr;
 
@@ -572,15 +572,10 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index b7994c38eacc..26a1d98c62a1 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -430,8 +430,9 @@ static inline void stepback(struct pt_regs *regs)
  */
 static int handle_signal(int sig,
 			 siginfo_t *info, struct k_sigaction *ka,
-			 sigset_t *oldset, struct pt_regs *regs)
+			 struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Are we from a system call? */
@@ -475,7 +476,6 @@ static void do_signal(struct pt_regs *regs)
 {
 	struct k_sigaction ka;
 	siginfo_t info;
-	sigset_t *oldset;
 	int signr;
 
 	/* we want the common case to go fast, which is why we may in certain
@@ -483,14 +483,9 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 266c6fd2eb5c..721c584ff44a 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -254,11 +254,11 @@ give_sigsegv:
 static inline int
 handle_signal(unsigned long sig,
 	      siginfo_t *info, struct k_sigaction *ka,
-	      sigset_t *oldset, struct pt_regs *regs)
+	      struct pt_regs *regs)
 {
 	int ret;
 
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
 	if (ret)
 		return ret;
 
@@ -341,15 +341,9 @@ void do_signal(struct pt_regs *regs)
 		 * back */
 		restore_saved_sigmask();
 	} else {		/* signr > 0 */
-		sigset_t *oldset;
-
-		if (current_thread_info()->flags & _TIF_RESTORE_SIGMASK)
-			oldset = &current->saved_sigmask;
-		else
-			oldset = &current->blocked;
 
 		/* Whee!  Actually deliver the signal.  */
-		if (!handle_signal(signr, &info, &ka, oldset, regs)) {
+		if (!handle_signal(signr, &info, &ka, regs)) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 277cacadf653..441b25992846 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -443,8 +443,9 @@ give_sigsegv:
 
 static long
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-		sigset_t *oldset, struct pt_regs *regs, int in_syscall)
+		struct pt_regs *regs, int in_syscall)
 {
+	sigset_t *oldset = sigmask_to_save();
 	DBG(1,"handle_signal: sig=%ld, ka=%p, info=%p, oldset=%p, regs=%p\n",
 	       sig, ka, info, oldset, regs);
 	
@@ -568,28 +569,17 @@ do_signal(struct pt_regs *regs, long in_syscall)
 	siginfo_t info;
 	struct k_sigaction ka;
 	int signr;
-	sigset_t *oldset;
 
-	DBG(1,"\ndo_signal: oldset=0x%p, regs=0x%p, sr7 %#lx, in_syscall=%d\n",
-	       oldset, regs, regs->sr[7], in_syscall);
+	DBG(1,"\ndo_signal: regs=0x%p, sr7 %#lx, in_syscall=%d\n",
+	       regs, regs->sr[7], in_syscall);
 
 	/* Everyone else checks to see if they are in kernel mode at
 	   this point and exits if that's the case.  I'm not sure why
 	   we would be called in that case, but for some reason we
 	   are. */
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
-	DBG(1,"do_signal: oldset %08lx / %08lx\n", 
-		oldset->sig[0], oldset->sig[1]);
-
-
 	/* May need to force signal if handle_signal failed to deliver */
 	while (1) {
-	  
 		signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 		DBG(3,"do_signal: signr = %d, regs->gr[28] = %ld\n", signr, regs->gr[28]); 
 	
@@ -603,8 +593,7 @@ do_signal(struct pt_regs *regs, long in_syscall)
 		/* Whee!  Actually deliver the signal.  If the
 		   delivery failed, we need to continue to iterate in
 		   this loop so we can deliver the SIGSEGV... */
-		if (handle_signal(signr, &info, &ka, oldset,
-				  regs, in_syscall)) {
+		if (handle_signal(signr, &info, &ka, regs, in_syscall)) {
 			DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n",
 				regs->gr[28]);
 			if (test_thread_flag(TIF_RESTORE_SIGMASK))
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 0f4cc67f4268..8e9ddab7ade6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -114,18 +114,13 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
 
 static int do_signal(struct pt_regs *regs)
 {
-	sigset_t *oldset;
+	sigset_t *oldset = sigmask_to_save();
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
 	int ret;
 	int is32 = is_32bit_task();
 
-	if (current_thread_info()->local_flags & _TLF_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 
 	/* Is there any syscall restart business here ? */
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 37799089c38e..c880c48a09f3 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -398,12 +398,7 @@ void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
-
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
+	sigset_t *oldset = sigmask_to_save();
 
 	/*
 	 * Get signal to deliver. When running under ptrace, at this point
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 9e751559375b..b24dfaf2462f 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -242,7 +242,7 @@ give_sigsegv:
 }
 
 static int handle_signal(unsigned long sig, siginfo_t *info,
-	struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs)
+	struct k_sigaction *ka, struct pt_regs *regs)
 {
 	int ret;
 
@@ -269,7 +269,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 	/*
 	 * Set up the stack frame
 	 */
-	ret = setup_rt_frame(ka, regs, sig, oldset, info);
+	ret = setup_rt_frame(ka, regs, sig, sigmask_to_save(), info);
 
 	if (ret == 0)
 		block_sigmask(ka, sig);
@@ -280,7 +280,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 static void do_signal(struct pt_regs *regs)
 {
 	struct k_sigaction ka;
-	sigset_t *oldset;
 	siginfo_t info;
 	int signr;
 
@@ -292,15 +291,10 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 92f4173ad29a..bfb3d599f032 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -524,8 +524,9 @@ handle_syscall_restart(unsigned long save_r0, struct pt_regs *regs,
  */
 static int
 handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
-	      sigset_t *oldset, struct pt_regs *regs, unsigned int save_r0)
+	      struct pt_regs *regs, unsigned int save_r0)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Set up the stack frame */
@@ -554,7 +555,6 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which
@@ -565,17 +565,12 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0)
 	if (!user_mode(regs))
 		return;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		handle_syscall_restart(save_r0, regs, &ka.sa);
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &ka, &info, oldset,
+		if (handle_signal(signr, &ka, &info,
 				  regs, save_r0) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 6e191ef0aa62..cc22d2b2e3f2 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -45,7 +45,7 @@
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-		sigset_t *oldset, struct pt_regs * regs);
+		struct pt_regs * regs);
 
 static inline void
 handle_syscall_restart(struct pt_regs *regs, struct sigaction *sa)
@@ -88,7 +88,6 @@ static void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which
@@ -99,17 +98,12 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, 0);
 	if (signr > 0) {
 		handle_syscall_restart(regs, &ka.sa);
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/*
 			 * If a signal was successfully delivered, the
 			 * saved sigmask is in its frame, and we can
@@ -656,8 +650,9 @@ give_sigsegv:
  */
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-		sigset_t *oldset, struct pt_regs * regs)
+		struct pt_regs * regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Set up the stack frame */
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 9dd97d2e171e..5d74410c787b 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -451,8 +451,9 @@ sigsegv:
 
 static inline int
 handle_signal(unsigned long signr, struct k_sigaction *ka,
-	      siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
+	      siginfo_t *info, struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int err;
 
 	if (ka->sa.sa_flags & SA_SIGINFO)
@@ -498,7 +499,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 {
 	struct k_sigaction ka;
 	int restart_syscall;
-	sigset_t *oldset;
 	siginfo_t info;
 	int signr;
 
@@ -523,11 +523,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	if (pt_regs_is_syscall(regs) && (regs->psr & PSR_C))
 		regs->u_regs[UREG_G6] = orig_i0;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 
 	/* If the debugger messes with the program counter, it clears
@@ -544,7 +539,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	if (signr > 0) {
 		if (restart_syscall)
 			syscall_restart(orig_i0, regs, &ka.sa);
-		if (handle_signal(signr, &ka, &info, oldset, regs) == 0) {
+		if (handle_signal(signr, &ka, &info, regs) == 0) {
 			/* a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 55b820ee0ac9..088a733f83f9 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -512,7 +512,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 {
 	struct k_sigaction ka;
 	int restart_syscall;
-	sigset_t *oldset;
+	sigset_t *oldset = sigmask_to_save();
 	siginfo_t info;
 	int signr;
 	
@@ -538,11 +538,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	    (regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
 		regs->u_regs[UREG_G6] = orig_i0;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 #ifdef CONFIG_COMPAT
 	if (test_thread_flag(TIF_32BIT)) {
 		extern void do_signal32(sigset_t *, struct pt_regs *);
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 62b3493ea77d..588c28b2db58 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -243,9 +243,10 @@ give_sigsegv:
  */
 
 static int handle_signal(unsigned long sig, siginfo_t *info,
-			 struct k_sigaction *ka, sigset_t *oldset,
+			 struct k_sigaction *ka,
 			 struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Are we from a system call? */
@@ -299,7 +300,6 @@ void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * i386 will check if we're coming from kernel mode and bail out
@@ -308,15 +308,10 @@ void do_signal(struct pt_regs *regs)
 	 * helpful, we can reinstate the check on "!user_mode(regs)".
 	 */
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+		if (handle_signal(signr, &info, &ka, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 6acf13c1740b..909e9b8d6612 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -23,9 +23,9 @@ EXPORT_SYMBOL(unblock_signals);
  * OK, we're invoking a handler
  */
 static int handle_signal(struct pt_regs *regs, unsigned long signr,
-			 struct k_sigaction *ka, siginfo_t *info,
-			 sigset_t *oldset)
+			 struct k_sigaction *ka, siginfo_t *info)
 {
+	sigset_t *oldset = sigmask_to_save();
 	unsigned long sp;
 	int err;
 
@@ -77,14 +77,9 @@ static int kern_do_signal(struct pt_regs *regs)
 	int sig, handled_sig = 0;
 
 	while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) {
-		sigset_t *oldset;
-		if (test_thread_flag(TIF_RESTORE_SIGMASK))
-			oldset = &current->saved_sigmask;
-		else
-			oldset = &current->blocked;
 		handled_sig = 1;
 		/* Whee!  Actually deliver the signal.  */
-		if (!handle_signal(regs, sig, &ka_copy, &info, oldset)) {
+		if (!handle_signal(regs, sig, &ka_copy, &info)) {
 			/*
 			 * a signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 65a5ed3b6f2a..bf23194dc74d 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -313,12 +313,11 @@ static inline void setup_syscall_restart(struct pt_regs *regs)
  * OK, we're invoking a handler
  */
 static int handle_signal(unsigned long sig, struct k_sigaction *ka,
-	      siginfo_t *info, sigset_t *oldset,
-	      struct pt_regs *regs, int syscall)
+	      siginfo_t *info, struct pt_regs *regs, int syscall)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = current;
-	sigset_t blocked;
+	sigset_t *oldset = sigmask_to_save();
 	int usig = sig;
 	int ret;
 
@@ -404,13 +403,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		sigset_t *oldset;
-
-		if (test_thread_flag(TIF_RESTORE_SIGMASK))
-			oldset = &current->saved_sigmask;
-		else
-			oldset = &current->blocked;
-		if (handle_signal(signr, &ka, &info, oldset, regs, syscall)
+		if (handle_signal(signr, &ka, &info, regs, syscall)
 				== 0) {
 			/*
 			 * A signal was successfully delivered; the saved
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 25a4a81a51aa..56f3062c5111 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -647,12 +647,9 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		struct pt_regs *regs)
 {
 	int usig = signr_convert(sig);
-	sigset_t *set = &current->blocked;
+	sigset_t *set = sigmask_to_save();
 	int ret;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		set = &current->saved_sigmask;
-
 	/* Set up the stack frame */
 	if (is_ia32) {
 		if (ka->sa.sa_flags & SA_SIGINFO)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 8c4e751e3b83..e4b06e2d4eb9 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -452,16 +452,10 @@ static void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t oldset;
 
 	if (try_to_freeze())
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	task_pt_regs(current)->icountlevel = 0;
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
@@ -501,7 +495,7 @@ static void do_signal(struct pt_regs *regs)
 
 		/* Whee!  Actually deliver the signal.  */
 		/* Set up the stack frame */
-		ret = setup_frame(signr, &ka, &info, oldset, regs);
+		ret = setup_frame(signr, &ka, &info, sigmask_to_save(), regs);
 		if (ret)
 			return;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f1b46b88f6f5..ded3fb63fb06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2213,6 +2213,14 @@ static inline void restore_saved_sigmask(void)
 		set_current_blocked(&current->saved_sigmask);
 }
 
+static inline sigset_t *sigmask_to_save(void)
+{
+	sigset_t *res = &current->blocked;
+	if (unlikely(test_restore_sigmask()))
+		res = &current->saved_sigmask;
+	return res;
+}
+
 static inline int kill_cad_pid(int sig, int priv)
 {
 	return kill_pid(cad_pid, sig, priv);
-- 
cgit v1.2.3


From a610d6e672d6d3723e8da257ad4a8a288a8f2f89 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 May 2012 23:42:15 -0400
Subject: pull clearing RESTORE_SIGMASK into block_sigmask()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c         |  5 -----
 arch/arm/kernel/signal.c           | 18 +++---------------
 arch/avr32/kernel/signal.c         | 11 ++++-------
 arch/blackfin/kernel/signal.c      | 24 +++++++-----------------
 arch/c6x/kernel/signal.c           | 23 ++++++-----------------
 arch/cris/arch-v10/kernel/signal.c | 14 ++------------
 arch/cris/arch-v32/kernel/signal.c | 15 ++-------------
 arch/frv/kernel/signal.c           | 24 +++++++-----------------
 arch/h8300/kernel/signal.c         |  4 +---
 arch/hexagon/kernel/signal.c       | 26 ++++++--------------------
 arch/ia64/kernel/signal.c          | 10 +---------
 arch/m32r/kernel/signal.c          |  8 +++-----
 arch/m68k/kernel/signal.c          |  2 --
 arch/microblaze/kernel/signal.c    | 17 +++--------------
 arch/mips/kernel/signal.c          | 18 +++---------------
 arch/mn10300/kernel/signal.c       | 18 +++++-------------
 arch/openrisc/kernel/signal.c      |  2 --
 arch/parisc/kernel/signal.c        | 10 ++++------
 arch/powerpc/kernel/signal.c       |  7 -------
 arch/s390/kernel/compat_signal.c   | 10 +++++++---
 arch/s390/kernel/entry.h           |  2 +-
 arch/s390/kernel/signal.c          | 32 +++++++++++---------------------
 arch/score/kernel/signal.c         | 24 +++++-------------------
 arch/sh/kernel/signal_32.c         | 26 +++++++-------------------
 arch/sh/kernel/signal_64.c         | 24 ++++++++----------------
 arch/sparc/kernel/signal32.c       | 15 +++------------
 arch/sparc/kernel/signal_32.c      | 16 +++-------------
 arch/sparc/kernel/signal_64.c      | 15 +++------------
 arch/tile/kernel/signal.c          | 25 +++++--------------------
 arch/um/kernel/signal.c            | 16 ++--------------
 arch/unicore32/kernel/signal.c     | 18 +++---------------
 arch/x86/kernel/signal.c           | 31 +++++++++----------------------
 arch/xtensa/kernel/signal.c        |  1 -
 kernel/signal.c                    |  6 ++++++
 34 files changed, 130 insertions(+), 387 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index f1e7d2aa2586..bb45a8813393 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -481,11 +481,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
 		return;
 	}
 	block_sigmask(ka, sig);
-	/* A signal was successfully delivered, and the
-	   saved sigmask was stored on the signal frame,
-	   and will be restored by sigreturn.  So we can
-	   simply clear the restore sigmask flag.  */
-	clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
 
 static inline void
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 2e66c93973c3..7f9abd75fc2e 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -528,7 +528,7 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info,
 /*
  * OK, we're invoking a handler
  */	
-static int
+static void
 handle_signal(unsigned long sig, struct k_sigaction *ka,
 	      siginfo_t *info, struct pt_regs *regs)
 {
@@ -559,17 +559,14 @@ handle_signal(unsigned long sig, struct k_sigaction *ka,
 
 	if (ret != 0) {
 		force_sigsegv(sig, tsk);
-		return ret;
+		return;
 	}
 
 	/*
 	 * Block the signal if we were successful.
 	 */
 	block_sigmask(ka, sig);
-
 	tracehook_signal_handler(sig, info, ka, regs, 0);
-
-	return 0;
 }
 
 /*
@@ -633,16 +630,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 			clear_thread_flag(TIF_SYSCALL_RESTARTSYS);
 		}
 
-		if (handle_signal(signr, &ka, &info, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
+		handle_signal(signr, &ka, &info, regs);
 		return;
 	}
 
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index 0e2c0527c9fe..dc7875a0ad79 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -238,16 +238,13 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	 */
 	ret |= !valid_user_regs(regs);
 
-	if (ret != 0) {
-		force_sigsegv(sig, current);
-		return;
-	}
-
 	/*
 	 * Block the signal if we were successful.
 	 */
-	block_sigmask(ka, sig);
-	clear_thread_flag(TIF_RESTORE_SIGMASK);
+	if (ret != 0)
+		force_sigsegv(sig, current);
+	else
+		block_sigmask(ka, sig);
 }
 
 /*
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index 7f4205ddfa4d..b25cbfef8192 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -247,7 +247,7 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler)
 /*
  * OK, we're invoking a handler
  */
-static int
+static void
 handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 	      struct pt_regs *regs)
 {
@@ -260,11 +260,12 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 
 	/* set up the stack frame */
 	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
+	if (ret)
+		return;
 
-	if (ret == 0)
-		block_sigmask(ka, sig);
-
-	return ret;
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, regs,
+			test_thread_flag(TIF_SINGLESTEP));
 }
 
 /*
@@ -290,18 +291,7 @@ asmlinkage void do_signal(struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			tracehook_signal_handler(signr, &info, &ka, regs,
-				test_thread_flag(TIF_SINGLESTEP));
-		}
-
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index 38bb501eb117..f39346f1f2d6 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -248,7 +248,7 @@ do_restart:
 /*
  * handle the actual delivery of a signal to userspace
  */
-static int handle_signal(int sig,
+static void handle_signal(int sig,
 			 siginfo_t *info, struct k_sigaction *ka,
 			 struct pt_regs *regs, int syscall)
 {
@@ -277,11 +277,10 @@ static int handle_signal(int sig,
 	}
 
 	/* Set up the stack frame */
-	ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
-	if (ret == 0)
-		block_sigmask(ka, sig);
-
-	return ret;
+	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0)
+		return;
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, regs, 0);
 }
 
 /*
@@ -300,17 +299,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &info, &ka, regs, syscall) == 0) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			tracehook_signal_handler(signr, &info, &ka, regs, 0);
-		}
-
+		handle_signal(signr, &info, &ka, regs, syscall);
 		return;
 	}
 
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 09a4cf4eb08a..46c8ca605e4d 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -415,7 +415,7 @@ give_sigsegv:
  * OK, we're invoking a handler
  */
 
-static inline int handle_signal(int canrestart, unsigned long sig,
+static inline void handle_signal(int canrestart, unsigned long sig,
 	siginfo_t *info, struct k_sigaction *ka,
 	struct pt_regs *regs)
 {
@@ -458,8 +458,6 @@ static inline int handle_signal(int canrestart, unsigned long sig,
 
 	if (ret == 0)
 		block_sigmask(ka, sig);
-
-	return ret;
 }
 
 /*
@@ -492,15 +490,7 @@ void do_signal(int canrestart, struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(canrestart, signr, &info, &ka,
-				regs)) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
+		handle_signal(canrestart, signr, &info, &ka, regs);
 		return;
 	}
 
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index d52276ddae4b..e0431328b7cd 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -434,7 +434,7 @@ give_sigsegv:
 }
 
 /* Invoke a signal handler to, well, handle the signal. */
-static inline int
+static inline void
 handle_signal(int canrestart, unsigned long sig,
 	      siginfo_t *info, struct k_sigaction *ka,
               struct pt_regs * regs)
@@ -491,8 +491,6 @@ handle_signal(int canrestart, unsigned long sig,
 
 	if (ret == 0)
 		block_sigmask(ka, sig);
-
-	return ret;
 }
 
 /*
@@ -525,16 +523,7 @@ do_signal(int canrestart, struct pt_regs *regs)
 
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(canrestart, signr, &info, &ka,
-				regs)) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
-
+		handle_signal(canrestart, signr, &info, &ka, regs);
 		return;
 	}
 
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 22efe8d25038..8dd0492bfb7b 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -426,7 +426,7 @@ give_sigsegv:
 /*
  * OK, we're invoking a handler
  */
-static int handle_signal(unsigned long sig, siginfo_t *info,
+static void handle_signal(unsigned long sig, siginfo_t *info,
 			 struct k_sigaction *ka)
 {
 	sigset_t *oldset = sigmask_to_save();
@@ -461,11 +461,12 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 	else
 		ret = setup_frame(sig, ka, oldset);
 
-	if (ret == 0)
-		block_sigmask(ka, sig);
-
-	return ret;
+	if (ret)
+		return;
 
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, __frame,
+				 test_thread_flag(TIF_SINGLESTEP));
 } /* end handle_signal() */
 
 /*****************************************************************************/
@@ -495,18 +496,7 @@ static void do_signal(void)
 
 	signr = get_signal_to_deliver(&info, &ka, __frame, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &info, &ka) == 0) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			tracehook_signal_handler(signr, &info, &ka, __frame,
-						 test_thread_flag(TIF_SINGLESTEP));
-		}
-
+		handle_signal(signr, &info, &ka);
 		return;
 	}
 
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index d4d2f72672ad..eac26c9ffc44 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -442,10 +442,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	else
 		ret = setup_frame(sig, ka, oldset, regs);
 
-	if (!ret) {
+	if (!ret)
 		block_sigmask(ka, sig);
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
-	}
 }
 
 /*
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index f73fcee09bac..5f7d7c8a1328 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -149,11 +149,9 @@ sigsegv:
 /*
  * Setup invocation of signal handler
  */
-static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
+static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 			 struct pt_regs *regs)
 {
-	int rc;
-
 	/*
 	 * If we're handling a signal that aborted a system call,
 	 * set up the error return value before adding the signal
@@ -186,15 +184,13 @@ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 	 * Set up the stack frame; not doing the SA_SIGINFO thing.  We
 	 * only set up the rt_frame flavor.
 	 */
-	rc = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs);
-
 	/* If there was an error on setup, no signal was delivered. */
-	if (rc)
-		return rc;
+	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0)
+		return;
 
 	block_sigmask(ka, sig);
-
-	return 0;
+	tracehook_signal_handler(sig, info, ka, regs,
+			test_thread_flag(TIF_SINGLESTEP));
 }
 
 /*
@@ -215,17 +211,7 @@ static void do_signal(struct pt_regs *regs)
 	signo = get_signal_to_deliver(&info, &sigact, regs, NULL);
 
 	if (signo > 0) {
-		if (handle_signal(signo, &info, &sigact, regs) == 0) {
-			/*
-			 * Successful delivery case.  The saved sigmask is
-			 * stored in the signal frame, and will be restored
-			 * by sigreturn.  We can clear the TIF flag.
-			 */
-			clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			tracehook_signal_handler(signo, &info, &sigact, regs,
-				test_thread_flag(TIF_SINGLESTEP));
-		}
+		handle_signal(signo, &info, &sigact, regs);
 		return;
 	}
 
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 9fee6d6a3f21..dc6fe6573465 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -501,16 +501,8 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
 		 * Whee!  Actually deliver the signal.  If the delivery failed, we need to
 		 * continue to iterate in this loop so we can deliver the SIGSEGV...
 		 */
-		if (handle_signal(signr, &ka, &info, scr)) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+		if (handle_signal(signr, &ka, &info, scr))
 			return;
-		}
 	}
 
 	/* Did we come from a system call? */
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index e0d6d1079f33..970f46dbf24f 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -267,7 +267,7 @@ static int prev_insn(struct pt_regs *regs)
  * OK, we're invoking a handler
  */
 
-static int
+static void
 handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	      struct pt_regs *regs)
 {
@@ -295,10 +295,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 
 	/* Set up the stack frame */
 	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs))
-		return -EFAULT;
+		return;
 
 	block_sigmask(ka, sig);
-	return 0;
 }
 
 /*
@@ -333,8 +332,7 @@ static void do_signal(struct pt_regs *regs)
 		 */
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &ka, &info, regs) == 0)
-			clear_thread_flag(TIF_RESTORE_SIGMASK);
+		handle_signal(signr, &ka, &info, regs);
 
 		return;
 	}
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index c83eb5a8ed8b..6dbee8a167a5 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -1147,8 +1147,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
 		regs->sr &= ~0x8000;
 		send_sig(SIGTRAP, current, 1);
 	}
-
-	clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
 
 /*
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index fd2de5718a4e..03641199666e 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -310,7 +310,7 @@ do_restart:
  * OK, we're invoking a handler
  */
 
-static int
+static void
 handle_signal(unsigned long sig, struct k_sigaction *ka,
 		siginfo_t *info, struct pt_regs *regs)
 {
@@ -324,11 +324,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka,
 		ret = setup_rt_frame(sig, ka, NULL, oldset, regs);
 
 	if (ret)
-		return ret;
+		return;
 
 	block_sigmask(ka, sig);
-
-	return 0;
 }
 
 /*
@@ -356,16 +354,7 @@ static void do_signal(struct pt_regs *regs, int in_syscall)
 		/* Whee! Actually deliver the signal. */
 		if (in_syscall)
 			handle_restart(regs, &ka, 1);
-		if (!handle_signal(signr, &ka, &info, oldset, regs)) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &=
-			    ~TS_RESTORE_SIGMASK;
-		}
+		handle_signal(signr, &ka, &info, regs);
 		return;
 	}
 
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 18355060f241..53c6e90082f0 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -514,7 +514,7 @@ struct mips_abi mips_abi = {
 	.restart	= __NR_restart_syscall
 };
 
-static int handle_signal(unsigned long sig, siginfo_t *info,
+static void handle_signal(unsigned long sig, siginfo_t *info,
 	struct k_sigaction *ka, struct pt_regs *regs)
 {
 	sigset_t *oldset = sigmask_to_save();
@@ -551,11 +551,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 				       ka, regs, sig, oldset);
 
 	if (ret)
-		return ret;
+		return;
 
 	block_sigmask(ka, sig);
-
-	return ret;
 }
 
 static void do_signal(struct pt_regs *regs)
@@ -575,17 +573,7 @@ static void do_signal(struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
-
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 26a1d98c62a1..1715478f4e94 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -462,11 +462,12 @@ static int handle_signal(int sig,
 		ret = setup_rt_frame(sig, ka, info, oldset, regs);
 	else
 		ret = setup_frame(sig, ka, oldset, regs);
+	if (ret)
+		return;
 
-	if (ret == 0)
-		block_sigmask(ka, sig);
-
-	return ret;
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLESTEP));
 }
 
 /*
@@ -486,15 +487,6 @@ static void do_signal(struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			tracehook_signal_handler(signr, &info, &ka, regs,
-						 test_thread_flag(TIF_SINGLESTEP));
 		}
 
 		return;
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 6c41778410e6..aa1105c1618f 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -263,8 +263,6 @@ handle_signal(unsigned long sig,
 		return;
 
 	block_sigmask(ka, sig);
-	clear_thread_flag(TIF_RESTORE_SIGMASK);
-
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
 }
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 441b25992846..d6ddc572eba1 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -459,6 +459,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		test_thread_flag(TIF_SINGLESTEP) ||
 		test_thread_flag(TIF_BLOCKSTEP));
 
+	DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n",
+		regs->gr[28]);
+
 	return 1;
 }
 
@@ -593,13 +596,8 @@ do_signal(struct pt_regs *regs, long in_syscall)
 		/* Whee!  Actually deliver the signal.  If the
 		   delivery failed, we need to continue to iterate in
 		   this loop so we can deliver the SIGSEGV... */
-		if (handle_signal(signr, &info, &ka, regs, in_syscall)) {
-			DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n",
-				regs->gr[28]);
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
+		if (handle_signal(signr, &info, &ka, regs, in_syscall))
 			return;
-		}
 	}
 	/* end of while(1) looping forever if we can't force a signal */
 
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 8e9ddab7ade6..d926d2e4611a 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -159,13 +159,6 @@ static int do_signal(struct pt_regs *regs)
 	regs->trap = 0;
 	if (ret) {
 		block_sigmask(&ka, signr);
-
-		/*
-		 * A signal was successfully delivered; the saved sigmask is in
-		 * its frame, and we can clear the TLF_RESTORE_SIGMASK flag.
-		 */
-		current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK;
-
 		/*
 		 * Let tracing know that we've done the handler setup.
 		 */
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 377c096ca4a7..233db1d68eee 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -572,7 +572,7 @@ give_sigsegv:
  * OK, we're invoking a handler
  */	
 
-int handle_signal32(unsigned long sig, struct k_sigaction *ka,
+void handle_signal32(unsigned long sig, struct k_sigaction *ka,
 		    siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
 {
 	int ret;
@@ -583,8 +583,12 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 	else
 		ret = setup_frame32(sig, ka, oldset, regs);
 	if (ret)
-		return ret;
+		return;
 	block_sigmask(ka, sig);
-	return 0;
+	/*
+	 * Let tracing know that we've done the handler setup.
+	 */
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLE_STEP));
 }
 
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 6cdddac93a2e..f66a229ab0b3 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -31,7 +31,7 @@ void do_per_trap(struct pt_regs *regs);
 void syscall_trace(struct pt_regs *regs, int entryexit);
 void kernel_stack_overflow(struct pt_regs * regs);
 void do_signal(struct pt_regs *regs);
-int handle_signal32(unsigned long sig, struct k_sigaction *ka,
+void handle_signal32(unsigned long sig, struct k_sigaction *ka,
 		    siginfo_t *info, sigset_t *oldset, struct pt_regs *regs);
 void do_notify_resume(struct pt_regs *regs);
 
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index c880c48a09f3..7f9a862a161a 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -367,7 +367,7 @@ give_sigsegv:
 	return -EFAULT;
 }
 
-static int handle_signal(unsigned long sig, struct k_sigaction *ka,
+static void handle_signal(unsigned long sig, struct k_sigaction *ka,
 			 siginfo_t *info, sigset_t *oldset,
 			 struct pt_regs *regs)
 {
@@ -379,9 +379,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka,
 	else
 		ret = setup_frame(sig, ka, oldset, regs);
 	if (ret)
-		return ret;
+		return;
 	block_sigmask(ka, sig);
-	return 0;
+	/*
+	 * Let tracing know that we've done the handler setup.
+	 */
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLE_STEP));
 }
 
 /*
@@ -436,24 +440,10 @@ void do_signal(struct pt_regs *regs)
 		/* No longer in a system call */
 		clear_thread_flag(TIF_SYSCALL);
 
-		if ((is_compat_task() ?
-		     handle_signal32(signr, &ka, &info, oldset, regs) :
-		     handle_signal(signr, &ka, &info, oldset, regs)) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
-			/*
-			 * Let tracing know that we've done the handler setup.
-			 */
-			tracehook_signal_handler(signr, &info, &ka, regs,
-					 test_thread_flag(TIF_SINGLE_STEP));
-		}
+		if (is_compat_task())
+			handle_signal32(signr, &ka, &info, oldset, regs);
+		else
+			handle_signal(signr, &ka, &info, oldset, regs);
 		return;
 	}
 
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index b24dfaf2462f..13e0eed0e301 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -241,11 +241,9 @@ give_sigsegv:
 	return -EFAULT;
 }
 
-static int handle_signal(unsigned long sig, siginfo_t *info,
+static void handle_signal(unsigned long sig, siginfo_t *info,
 	struct k_sigaction *ka, struct pt_regs *regs)
 {
-	int ret;
-
 	if (regs->is_syscall) {
 		switch (regs->regs[4]) {
 		case ERESTART_RESTARTBLOCK:
@@ -269,12 +267,10 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 	/*
 	 * Set up the stack frame
 	 */
-	ret = setup_rt_frame(ka, regs, sig, sigmask_to_save(), info);
-
-	if (ret == 0)
-		block_sigmask(ka, sig);
+	if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0)
+		return;
 
-	return ret;
+	block_sigmask(ka, sig);
 }
 
 static void do_signal(struct pt_regs *regs)
@@ -294,17 +290,7 @@ static void do_signal(struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
-
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index bfb3d599f032..2675a97f374f 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -522,7 +522,7 @@ handle_syscall_restart(unsigned long save_r0, struct pt_regs *regs,
 /*
  * OK, we're invoking a handler
  */
-static int
+static void
 handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	      struct pt_regs *regs, unsigned int save_r0)
 {
@@ -535,10 +535,11 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	else
 		ret = setup_frame(sig, ka, oldset, regs);
 
-	if (ret == 0)
-		block_sigmask(ka, sig);
-
-	return ret;
+	if (ret)
+		return;
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, regs,
+			test_thread_flag(TIF_SINGLESTEP));
 }
 
 /*
@@ -570,20 +571,7 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0)
 		handle_syscall_restart(save_r0, regs, &ka.sa);
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &ka, &info,
-				  regs, save_r0) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-
-			tracehook_signal_handler(signr, &info, &ka, regs,
-					test_thread_flag(TIF_SINGLESTEP));
-		}
-
+		handle_signal(signr, &ka, &info, regs, save_r0);
 		return;
 	}
 
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index aeeab070aaa9..7075c63bfc6f 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -43,7 +43,7 @@
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
-static int
+static void
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs * regs);
 
@@ -103,17 +103,7 @@ static void do_signal(struct pt_regs *regs)
 		handle_syscall_restart(regs, &ka.sa);
 
 		/* Whee!  Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/*
-			 * If a signal was successfully delivered, the
-			 * saved sigmask is in its frame, and we can
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-
-			tracehook_signal_handler(signr, &info, &ka, regs,
-					test_thread_flag(TIF_SINGLESTEP));
-		}
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
@@ -648,7 +638,7 @@ give_sigsegv:
 /*
  * OK, we're invoking a handler
  */
-static int
+static void
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs * regs)
 {
@@ -661,10 +651,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	else
 		ret = setup_frame(sig, ka, oldset, regs);
 
-	if (ret == 0)
-		block_sigmask(ka, sig);
+	if (ret)
+		return;
 
-	return ret;
+	block_sigmask(ka, sig);
+	tracehook_signal_handler(sig, info, ka, regs,
+			test_thread_flag(TIF_SINGLESTEP));
 }
 
 asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 88e0d8122d2c..8c93c00922a7 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -775,7 +775,7 @@ sigsegv:
 	return -EFAULT;
 }
 
-static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
+static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka,
 				  siginfo_t *info,
 				  sigset_t *oldset, struct pt_regs *regs)
 {
@@ -787,12 +787,10 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
 		err = setup_frame32(ka, regs, signr, oldset);
 
 	if (err)
-		return err;
+		return;
 
 	block_sigmask(ka, signr);
 	tracehook_signal_handler(signr, info, ka, regs, 0);
-
-	return 0;
 }
 
 static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs,
@@ -841,14 +839,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs)
 	if (signr > 0) {
 		if (restart_syscall)
 			syscall_restart32(orig_i0, regs, &ka.sa);
-		if (handle_signal32(signr, &ka, &info, oldset, regs) == 0) {
-			/* A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
+		handle_signal32(signr, &ka, &info, oldset, regs);
 		return;
 	}
 	if (restart_syscall &&
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 5d74410c787b..ee81b90c532f 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -449,7 +449,7 @@ sigsegv:
 	return -EFAULT;
 }
 
-static inline int
+static inline void
 handle_signal(unsigned long signr, struct k_sigaction *ka,
 	      siginfo_t *info, struct pt_regs *regs)
 {
@@ -462,12 +462,10 @@ handle_signal(unsigned long signr, struct k_sigaction *ka,
 		err = setup_frame(ka, regs, signr, oldset);
 
 	if (err)
-		return err;
+		return;
 
 	block_sigmask(ka, signr);
 	tracehook_signal_handler(signr, info, ka, regs, 0);
-
-	return 0;
 }
 
 static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs,
@@ -539,15 +537,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	if (signr > 0) {
 		if (restart_syscall)
 			syscall_restart(orig_i0, regs, &ka.sa);
-		if (handle_signal(signr, &ka, &info, regs) == 0) {
-			/* a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
+		handle_signal(signr, &ka, &info, regs);
 		return;
 	}
 	if (restart_syscall &&
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 088a733f83f9..febbc4b697ba 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -466,7 +466,7 @@ sigsegv:
 	return -EFAULT;
 }
 
-static inline int handle_signal(unsigned long signr, struct k_sigaction *ka,
+static inline void handle_signal(unsigned long signr, struct k_sigaction *ka,
 				siginfo_t *info,
 				sigset_t *oldset, struct pt_regs *regs)
 {
@@ -475,12 +475,10 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka,
 	err = setup_rt_frame(ka, regs, signr, oldset,
 			     (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL);
 	if (err)
-		return err;
+		return;
 
 	block_sigmask(ka, signr);
 	tracehook_signal_handler(signr, info, ka, regs, 0);
-
-	return 0;
 }
 
 static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs,
@@ -558,14 +556,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 	if (signr > 0) {
 		if (restart_syscall)
 			syscall_restart(orig_i0, regs, &ka.sa);
-		if (handle_signal(signr, &ka, &info, oldset, regs) == 0) {
-			/* A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
+		handle_signal(signr, &ka, &info, oldset, regs);
 		return;
 	}
 	if (restart_syscall &&
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 588c28b2db58..9b71bfd4913d 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -242,7 +242,7 @@ give_sigsegv:
  * OK, we're invoking a handler
  */
 
-static int handle_signal(unsigned long sig, siginfo_t *info,
+static void handle_signal(unsigned long sig, siginfo_t *info,
 			 struct k_sigaction *ka,
 			 struct pt_regs *regs)
 {
@@ -279,15 +279,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 	else
 #endif
 		ret = setup_rt_frame(sig, ka, info, oldset, regs);
-	if (ret == 0) {
-		/* This code is only called from system calls or from
-		 * the work_pending path in the return-to-user code, and
-		 * either way we can re-enable interrupts unconditionally.
-		 */
-		block_sigmask(ka, sig);
-	}
-
-	return ret;
+	if (ret)
+		return;
+	block_sigmask(ka, sig);
 }
 
 /*
@@ -311,16 +305,7 @@ void do_signal(struct pt_regs *regs)
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
-
+		handle_signal(signr, &info, &ka, regs);
 		goto done;
 	}
 
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 909e9b8d6612..549a51c8e54f 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -22,7 +22,7 @@ EXPORT_SYMBOL(unblock_signals);
 /*
  * OK, we're invoking a handler
  */
-static int handle_signal(struct pt_regs *regs, unsigned long signr,
+static void handle_signal(struct pt_regs *regs, unsigned long signr,
 			 struct k_sigaction *ka, siginfo_t *info)
 {
 	sigset_t *oldset = sigmask_to_save();
@@ -66,8 +66,6 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr,
 		force_sigsegv(signr, current);
 	else
 		block_sigmask(ka, signr);
-
-	return err;
 }
 
 static int kern_do_signal(struct pt_regs *regs)
@@ -79,17 +77,7 @@ static int kern_do_signal(struct pt_regs *regs)
 	while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) {
 		handled_sig = 1;
 		/* Whee!  Actually deliver the signal.  */
-		if (!handle_signal(regs, sig, &ka_copy, &info)) {
-			/*
-			 * a signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-			break;
-		}
+		handle_signal(regs, sig, &ka_copy, &info);
 	}
 
 	/* Did we come from a system call? */
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index bf23194dc74d..dc41b11f8a57 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -312,7 +312,7 @@ static inline void setup_syscall_restart(struct pt_regs *regs)
 /*
  * OK, we're invoking a handler
  */
-static int handle_signal(unsigned long sig, struct k_sigaction *ka,
+static void handle_signal(unsigned long sig, struct k_sigaction *ka,
 	      siginfo_t *info, struct pt_regs *regs, int syscall)
 {
 	struct thread_info *thread = current_thread_info();
@@ -363,15 +363,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka,
 
 	if (ret != 0) {
 		force_sigsegv(sig, tsk);
-		return ret;
+		return;
 	}
 
 	/*
 	 * Block the signal if we were successful.
 	 */
 	block_sigmask(ka, sig);
-
-	return 0;
 }
 
 /*
@@ -403,17 +401,7 @@ static void do_signal(struct pt_regs *regs, int syscall)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		if (handle_signal(signr, &ka, &info, regs, syscall)
-				== 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag.
-			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-		}
+		handle_signal(signr, &ka, &info, regs, syscall);
 		return;
 	}
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 56f3062c5111..700c49dcd84e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -648,38 +648,27 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 {
 	int usig = signr_convert(sig);
 	sigset_t *set = sigmask_to_save();
-	int ret;
 
 	/* Set up the stack frame */
 	if (is_ia32) {
 		if (ka->sa.sa_flags & SA_SIGINFO)
-			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+			return ia32_setup_rt_frame(usig, ka, info, set, regs);
 		else
-			ret = ia32_setup_frame(usig, ka, set, regs);
+			return ia32_setup_frame(usig, ka, set, regs);
 #ifdef CONFIG_X86_X32_ABI
 	} else if (is_x32) {
-		ret = x32_setup_rt_frame(usig, ka, info,
+		return x32_setup_rt_frame(usig, ka, info,
 					 (compat_sigset_t *)set, regs);
 #endif
 	} else {
-		ret = __setup_rt_frame(sig, ka, info, set, regs);
+		return __setup_rt_frame(sig, ka, info, set, regs);
 	}
-
-	if (ret) {
-		force_sigsegv(sig, current);
-		return -EFAULT;
-	}
-
-	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-	return ret;
 }
 
-static int
+static void
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs *regs)
 {
-	int ret;
-
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -710,10 +699,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	ret = setup_rt_frame(sig, ka, info, regs);
-
-	if (ret)
-		return ret;
+	if (setup_rt_frame(sig, ka, info, regs) < 0) {
+		force_sigsegv(sig, current);
+		return;
+	}
 
 	/*
 	 * Clear the direction flag as per the ABI for function entry.
@@ -732,8 +721,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
-
-	return 0;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index e4b06e2d4eb9..3e83913a3c7c 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -499,7 +499,6 @@ static void do_signal(struct pt_regs *regs)
 		if (ret)
 			return;
 
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
 		block_sigmask(&ka, signr);
 		if (current->ptrace & PT_SINGLESTEP)
 			task_pt_regs(current)->icountlevel = 1;
diff --git a/kernel/signal.c b/kernel/signal.c
index 95a9d9d8122b..b9be7e0fe41a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2382,6 +2382,12 @@ void block_sigmask(struct k_sigaction *ka, int signr)
 {
 	sigset_t blocked;
 
+	/* A signal was successfully delivered, and the
+	   saved sigmask was stored on the signal frame,
+	   and will be restored by sigreturn.  So we can
+	   simply clear the restore sigmask flag.  */
+	clear_restore_sigmask();
+
 	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
 	if (!(ka->sa.sa_flags & SA_NODEFER))
 		sigaddset(&blocked, signr);
-- 
cgit v1.2.3


From edd63a2763bdae0daa4f0a4d4c5d61d1154352a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Apr 2012 13:42:45 -0400
Subject: set_restore_sigmask() is never called without SIGPENDING (and never
 should be)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/include/asm/thread_info.h       | 2 +-
 arch/microblaze/include/asm/thread_info.h | 2 +-
 arch/powerpc/include/asm/thread_info.h    | 2 +-
 arch/sh/include/asm/thread_info.h         | 2 +-
 arch/sparc/include/asm/thread_info_64.h   | 2 +-
 arch/tile/include/asm/thread_info.h       | 2 +-
 arch/x86/include/asm/thread_info.h        | 2 +-
 include/linux/thread_info.h               | 3 ++-
 8 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 8d600363fa57..f7ee85378311 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -141,7 +141,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 12e39206b3ef..6c610234ffab 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -166,7 +166,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 85d50a93a92f..68831e9cf82f 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -140,7 +140,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->local_flags |= _TLF_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index a109157c6b8f..bc13b57cdc83 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -169,7 +169,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags));
 }
 
 #define TI_FLAG_FAULT_CODE_SHIFT	24
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index cb9b7a9f5fc1..cfa8c38fb9c8 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -238,7 +238,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 5aef371921e4..7e1fef36bde6 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -166,7 +166,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8f3f1ff69fa9..89f794f007ec 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -248,7 +248,7 @@ static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags));
 }
 static inline void clear_restore_sigmask(void)
 {
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ed279701ac79..ccc1899bd62e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -8,6 +8,7 @@
 #define _LINUX_THREAD_INFO_H
 
 #include <linux/types.h>
+#include <linux/bug.h>
 
 struct timespec;
 struct compat_timespec;
@@ -125,7 +126,7 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 static inline void set_restore_sigmask(void)
 {
 	set_thread_flag(TIF_RESTORE_SIGMASK);
-	set_thread_flag(TIF_SIGPENDING);
+	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 static inline void clear_restore_sigmask(void)
 {
-- 
cgit v1.2.3


From 77097ae503b170120ab66dd1d547f8577193f91f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Apr 2012 13:58:59 -0400
Subject: most of set_current_blocked() callers want SIGKILL/SIGSTOP removed
 from set

Only 3 out of 63 do not.  Renamed the current variant to __set_current_blocked(),
added set_current_blocked() that will exclude unblockable signals, switched
open-coded instances to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c          |  2 --
 arch/arm/kernel/signal.c            |  6 +-----
 arch/avr32/kernel/signal.c          |  3 ---
 arch/blackfin/kernel/signal.c       |  3 ---
 arch/c6x/kernel/signal.c            |  3 ---
 arch/cris/arch-v10/kernel/signal.c  |  4 ----
 arch/cris/arch-v32/kernel/signal.c  |  5 -----
 arch/frv/kernel/signal.c            |  4 ----
 arch/h8300/kernel/signal.c          |  4 ----
 arch/hexagon/kernel/signal.c        |  3 ---
 arch/ia64/kernel/signal.c           |  2 --
 arch/m32r/kernel/signal.c           |  3 ---
 arch/m68k/kernel/signal.c           |  4 ----
 arch/microblaze/kernel/signal.c     |  3 ---
 arch/mips/kernel/signal-common.h    |  2 --
 arch/mips/kernel/signal.c           |  2 --
 arch/mips/kernel/signal32.c         |  2 --
 arch/mips/kernel/signal_n32.c       |  1 -
 arch/mn10300/kernel/signal.c        |  4 ----
 arch/openrisc/kernel/signal.c       |  3 ---
 arch/parisc/kernel/signal.c         |  4 ----
 arch/parisc/kernel/signal32.c       |  2 --
 arch/powerpc/kernel/signal.c        |  1 -
 arch/powerpc/kernel/signal.h        |  2 --
 arch/s390/kernel/compat_signal.c    |  4 ----
 arch/s390/kernel/signal.c           |  5 -----
 arch/score/kernel/signal.c          |  3 ---
 arch/sh/kernel/signal_32.c          |  4 ----
 arch/sh/kernel/signal_64.c          |  4 ----
 arch/sparc/kernel/signal32.c        |  4 ----
 arch/sparc/kernel/signal_32.c       |  4 ----
 arch/sparc/kernel/signal_64.c       |  4 ----
 arch/tile/kernel/compat_signal.c    |  3 ---
 arch/tile/kernel/signal.c           |  3 ---
 arch/um/include/shared/frame_kern.h |  3 ---
 arch/um/kernel/signal.c             |  4 ----
 arch/unicore32/kernel/signal.c      |  6 +-----
 arch/x86/ia32/ia32_signal.c         |  2 --
 arch/x86/include/asm/sighandling.h  |  2 --
 arch/x86/kernel/signal.c            |  3 ---
 arch/x86/um/signal.c                |  2 --
 arch/xtensa/kernel/signal.c         |  3 ---
 include/linux/sched.h               |  2 +-
 include/linux/signal.h              |  3 ++-
 kernel/signal.c                     | 18 ++++++++++++------
 45 files changed, 17 insertions(+), 141 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index bb45a8813393..48c4df2389ac 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -226,7 +226,6 @@ do_sigreturn(struct sigcontext __user *sc, struct pt_regs *regs,
 	if (__get_user(set.sig[0], &sc->sc_mask))
 		goto give_sigsegv;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(sc, regs, sw))
@@ -261,7 +260,6 @@ do_rt_sigreturn(struct rt_sigframe __user *frame, struct pt_regs *regs,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto give_sigsegv;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(&frame->uc.uc_mcontext, regs, sw))
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7f9abd75fc2e..c126eba8411d 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -22,8 +22,6 @@
 
 #include "signal.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * For ARM syscalls, we encode the syscall number into the instruction.
  */
@@ -210,10 +208,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
 	int err;
 
 	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
-	if (err == 0) {
-		sigdelsetmask(&set, ~_BLOCKABLE);
+	if (err == 0)
 		set_current_blocked(&set);
-	}
 
 	__get_user_error(regs->ARM_r0, &sf->uc.uc_mcontext.arm_r0, err);
 	__get_user_error(regs->ARM_r1, &sf->uc.uc_mcontext.arm_r1, err);
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index 3ac1a60f9eb6..e883fa5eb845 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -22,8 +22,6 @@
 #include <asm/ucontext.h>
 #include <asm/syscalls.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 			       struct pt_regs *regs)
 {
@@ -89,7 +87,6 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index b20d435d084a..463612643821 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -19,8 +19,6 @@
 #include <asm/fixed_code.h>
 #include <asm/syscall.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /* Location of the trace bit in SYSCFG. */
 #define TRACE_BITS 0x0001
 
@@ -98,7 +96,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (rt_restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0))
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index d599a7fb5d24..eb1b3086ae00 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -20,8 +20,6 @@
 #include <asm/cacheflush.h>
 
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * Do a signal return, undo the signal stack.
  */
@@ -87,7 +85,6 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 46c8ca605e4d..cf6380cb9a57 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -31,8 +31,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /* a syscall in Linux/CRIS is a break 13 instruction which is 2 bytes */
 /* manipulate regs so that upon return, it will be re-executed */
 
@@ -176,7 +174,6 @@ asmlinkage int sys_sigreturn(long r10, long r11, long r12, long r13, long mof,
 				    sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc))
@@ -212,7 +209,6 @@ asmlinkage int sys_rt_sigreturn(long r10, long r11, long r12, long r13,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index e0431328b7cd..07b81ee09f65 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -24,9 +24,6 @@
 
 extern unsigned long cris_signal_return_page;
 
-/* Flag to check if a signal is blockable. */
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * A syscall in CRIS is really a "break 13" instruction, which is 2
  * bytes. The registers is manipulated so upon return the instruction
@@ -167,7 +164,6 @@ sys_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp,
 						 sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc))
@@ -208,7 +204,6 @@ sys_rt_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 9ec3d2e27b4c..511285fa2461 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -28,8 +28,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 struct fdpic_func_descriptor {
 	unsigned long	text;
 	unsigned long	GOT;
@@ -149,7 +147,6 @@ asmlinkage int sys_sigreturn(void)
 	    __copy_from_user(&set.sig[1], &frame->extramask, sizeof(frame->extramask)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(&frame->sc, &gr8))
@@ -172,7 +169,6 @@ asmlinkage int sys_rt_sigreturn(void)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(&frame->uc.uc_mcontext, &gr8))
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 8fbfc39574f5..aa6f09666915 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -47,8 +47,6 @@
 #include <asm/traps.h>
 #include <asm/ucontext.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * Atomically swap in the new signal mask, and wait for a signal.
  */
@@ -186,7 +184,6 @@ asmlinkage int do_sigreturn(unsigned long __unused,...)
 			      sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	
 	if (restore_sigcontext(regs, &frame->sc, &er0))
@@ -211,7 +208,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused,...)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &er0))
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index c9caf7401191..439f11a3a8ef 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -31,8 +31,6 @@
 #include <asm/signal.h>
 #include <asm/vdso.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 struct rt_sigframe {
 	unsigned long tramp[2];
 	struct siginfo info;
@@ -273,7 +271,6 @@ asmlinkage int sys_rt_sigreturn(void)
 	if (__copy_from_user(&blocked, &frame->uc.uc_sigmask, sizeof(blocked)))
 		goto badframe;
 
-	sigdelsetmask(&blocked, ~_BLOCKABLE);
 	set_current_blocked(&blocked);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index dc6fe6573465..c4041c76c07d 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -30,7 +30,6 @@
 
 #define DEBUG_SIG	0
 #define STACK_ALIGN	16		/* minimal alignment for stack pointer */
-#define _BLOCKABLE	(~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
 
 #if _NSIG_WORDS > 1
 # define PUT_SIGSET(k,u)	__copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t))
@@ -200,7 +199,6 @@ ia64_rt_sigreturn (struct sigscratch *scr)
 	if (GET_SIGSET(&set, &sc->sc_mask))
 		goto give_sigsegv;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(sc, scr))
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 7cbfa639fbfa..07f9032576c0 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -28,8 +28,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 asmlinkage int
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 		unsigned long r2, unsigned long r3, unsigned long r4,
@@ -111,7 +109,6 @@ sys_rt_sigreturn(unsigned long r0, unsigned long r1,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &result))
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 6dbee8a167a5..c00caad215a6 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -51,8 +51,6 @@
 #include <asm/traps.h>
 #include <asm/ucontext.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 #ifdef CONFIG_MMU
 
 /*
@@ -795,7 +793,6 @@ asmlinkage int do_sigreturn(unsigned long __unused)
 			      sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, frame + 1))
@@ -820,7 +817,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (rt_restore_ucontext(regs, sw, &frame->uc))
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 03641199666e..c662e68671a2 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -41,8 +41,6 @@
 #include <asm/cacheflush.h>
 #include <asm/syscalls.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 asmlinkage long
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 		struct pt_regs *regs)
@@ -106,7 +104,6 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &rval))
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h
index 10263b405981..9c60d09e62a7 100644
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -19,8 +19,6 @@
 #  define DEBUGP(fmt, args...)
 #endif
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * Determine which stack to use..
  */
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 896165757e6f..02e0cba24f82 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -339,7 +339,6 @@ asmlinkage void sys_sigreturn(nabi_no_regargs struct pt_regs regs)
 	if (__copy_from_user(&blocked, &frame->sf_mask, sizeof(blocked)))
 		goto badframe;
 
-	sigdelsetmask(&blocked, ~_BLOCKABLE);
 	set_current_blocked(&blocked);
 
 	sig = restore_sigcontext(&regs, &frame->sf_sc);
@@ -375,7 +374,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 	if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext);
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index b4fe2eacbd5d..da1b56a39ac7 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -465,7 +465,6 @@ asmlinkage void sys32_sigreturn(nabi_no_regargs struct pt_regs regs)
 	if (__copy_conv_sigset_from_user(&blocked, &frame->sf_mask))
 		goto badframe;
 
-	sigdelsetmask(&blocked, ~_BLOCKABLE);
 	set_current_blocked(&blocked);
 
 	sig = restore_sigcontext32(&regs, &frame->sf_sc);
@@ -503,7 +502,6 @@ asmlinkage void sys32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 	if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	sig = restore_sigcontext32(&regs, &frame->rs_uc.uc_mcontext);
diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c
index 63ffac9af7c5..3574c145511b 100644
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -109,7 +109,6 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 	if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext);
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index d57013e06ea0..4f6d20763061 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -31,8 +31,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * atomically swap in the new signal mask, and wait for a signal.
  */
@@ -163,7 +161,6 @@ asmlinkage long sys_sigreturn(void)
 			     sizeof(frame->extramask)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(current_frame(), &frame->sc, &d0))
@@ -191,7 +188,6 @@ asmlinkage long sys_rt_sigreturn(void)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(current_frame(), &frame->uc.uc_mcontext, &d0))
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index aa1105c1618f..53972b7260b7 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -33,8 +33,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 asmlinkage long
 _sys_sigaltstack(const stack_t *uss, stack_t *uoss, struct pt_regs *regs)
 {
@@ -101,7 +99,6 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 7f3c8f2c962d..25161eaf720d 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -48,9 +48,6 @@
 #define DBG(LEVEL, ...)
 #endif
 	
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /* gcc will complain if a pointer is cast to an integer of different
  * size.  If you really need to do this (and we do for an ELF32 user
  * application in an ELF64 kernel) then you have to do a cast to an
@@ -131,7 +128,6 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall)
 			goto give_sigsegv;
 	}
 		
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	/* Good thing we saved the old gr[30], eh? */
diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c
index e14132430762..fd49aeda9eb8 100644
--- a/arch/parisc/kernel/signal32.c
+++ b/arch/parisc/kernel/signal32.c
@@ -47,8 +47,6 @@
 #define DBG(LEVEL, ...)
 #endif
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 inline void
 sigset_32to64(sigset_t *s64, compat_sigset_t *s32)
 {
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index d926d2e4611a..3a3413c049c3 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -57,7 +57,6 @@ void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
  */
 void restore_sigmask(sigset_t *set)
 {
-	sigdelsetmask(set, ~_BLOCKABLE);
 	set_current_blocked(set);
 }
 
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 8dde973aaaf5..11439ea18ed4 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -10,8 +10,6 @@
 #ifndef _POWERPC_ARCH_SIGNAL_H
 #define _POWERPC_ARCH_SIGNAL_H
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
 
 extern void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 233db1d68eee..923baa96c0b0 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -32,8 +32,6 @@
 #include "compat_ptrace.h"
 #include "entry.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 typedef struct 
 {
 	__u8 callee_used_stack[__SIGNAL_FRAMESIZE32];
@@ -364,7 +362,6 @@ asmlinkage long sys32_sigreturn(void)
 		goto badframe;
 	if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32))
 		goto badframe;
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	if (restore_sigregs32(regs, &frame->sregs))
 		goto badframe;
@@ -390,7 +387,6 @@ asmlinkage long sys32_rt_sigreturn(void)
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	if (restore_sigregs32(regs, &frame->uc.uc_mcontext))
 		goto badframe;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 7f9a862a161a..8332a6943384 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -33,9 +33,6 @@
 #include <asm/switch_to.h>
 #include "entry.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-
 typedef struct 
 {
 	__u8 callee_used_stack[__SIGNAL_FRAMESIZE];
@@ -169,7 +166,6 @@ SYSCALL_DEFINE0(sigreturn)
 		goto badframe;
 	if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE))
 		goto badframe;
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	if (restore_sigregs(regs, &frame->sregs))
 		goto badframe;
@@ -189,7 +185,6 @@ SYSCALL_DEFINE0(rt_sigreturn)
 		goto badframe;
 	if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	if (restore_sigregs(regs, &frame->uc.uc_mcontext))
 		goto badframe;
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 13e0eed0e301..f1b3fef0907b 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -34,8 +34,6 @@
 #include <asm/syscalls.h>
 #include <asm/ucontext.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 struct rt_sigframe {
 	u32 rs_ass[4];		/* argument save space */
 	u32 rs_code[2];		/* signal trampoline */
@@ -162,7 +160,6 @@ score_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	sig = restore_sigcontext(regs, &frame->rs_uc.uc_mcontext);
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 2675a97f374f..e4a531414e19 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -32,8 +32,6 @@
 #include <asm/syscalls.h>
 #include <asm/fpu.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 struct fdpic_func_descriptor {
 	unsigned long	text;
 	unsigned long	GOT;
@@ -226,7 +224,6 @@ asmlinkage int sys_sigreturn(unsigned long r4, unsigned long r5,
 				    sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, &r0))
@@ -256,7 +253,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r4, unsigned long r5,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0))
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 7075c63bfc6f..75960ef6c1d1 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -41,8 +41,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 static void
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs * regs);
@@ -330,7 +328,6 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
 				    sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, &ret))
@@ -363,7 +360,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ret))
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 8c93c00922a7..ba3dbfcdb28e 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -32,8 +32,6 @@
 
 #include "sigutil.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /* This magic should be in g_upper[0] for all upper parts
  * to be valid.
  */
@@ -274,7 +272,6 @@ void do_sigreturn32(struct pt_regs *regs)
 		case 2: set.sig[1] = seta[2] + (((long)seta[3]) << 32);
 		case 1: set.sig[0] = seta[0] + (((long)seta[1]) << 32);
 	}
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	return;
 
@@ -376,7 +373,6 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
 		case 2: set.sig[1] = seta.sig[2] + (((long)seta.sig[3]) << 32);
 		case 1: set.sig[0] = seta.sig[0] + (((long)seta.sig[1]) << 32);
 	}
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	return;
 segv:
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index f6722427203d..1bfa854be602 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -29,8 +29,6 @@
 
 #include "sigutil.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
 		   void *fpqueue, unsigned long *fpqdepth);
 extern void fpload(unsigned long *fpregs, unsigned long *fsr);
@@ -130,7 +128,6 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
 	if (err)
 		goto segv_and_exit;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	return;
 
@@ -197,7 +194,6 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs)
 			goto segv;
 	}
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	return;
 segv:
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index febbc4b697ba..23b60caa6c43 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -38,8 +38,6 @@
 #include "systbls.h"
 #include "sigutil.h"
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /* {set, get}context() needed for 64-bit SparcLinux userland. */
 asmlinkage void sparc64_set_context(struct pt_regs *regs)
 {
@@ -71,7 +69,6 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs)
 			if (__copy_from_user(&set, &ucp->uc_sigmask, sizeof(sigset_t)))
 				goto do_sigsegv;
 		}
-		sigdelsetmask(&set, ~_BLOCKABLE);
 		set_current_blocked(&set);
 	}
 	if (test_thread_flag(TIF_32BIT)) {
@@ -315,7 +312,6 @@ void do_rt_sigreturn(struct pt_regs *regs)
 	/* Prevent syscall restart.  */
 	pt_regs_clear_syscall(regs);
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 	return;
 segv:
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index cdef6e5ec022..474571b84085 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -118,8 +118,6 @@ struct compat_rt_sigframe {
 	struct compat_ucontext uc;
 };
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
 			     struct compat_sigaction __user *oact,
 			     size_t sigsetsize)
@@ -302,7 +300,6 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 9b71bfd4913d..e068aa0c6dfc 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -37,8 +37,6 @@
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss,
 		stack_t __user *, uoss, struct pt_regs *, regs)
 {
@@ -96,7 +94,6 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h
index 76078490c258..e584e40ee832 100644
--- a/arch/um/include/shared/frame_kern.h
+++ b/arch/um/include/shared/frame_kern.h
@@ -6,9 +6,6 @@
 #ifndef __FRAME_KERN_H_
 #define __FRAME_KERN_H_
 
-#define _S(nr) (1<<((nr)-1))
-#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
-
 extern int setup_signal_stack_sc(unsigned long stack_top, int sig, 
 				 struct k_sigaction *ka,
 				 struct pt_regs *regs, 
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 549a51c8e54f..4ce6ab2d2996 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -15,10 +15,6 @@
 EXPORT_SYMBOL(block_signals);
 EXPORT_SYMBOL(unblock_signals);
 
-#define _S(nr) (1<<((nr)-1))
-
-#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
-
 /*
  * OK, we're invoking a handler
  */
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index af962e57efb2..4d9c4841989d 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -21,8 +21,6 @@
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 /*
  * For UniCore syscalls, we encode the syscall number into the instruction.
  */
@@ -61,10 +59,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
 	int err;
 
 	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
-	if (err == 0) {
-		sigdelsetmask(&set, ~_BLOCKABLE);
+	if (err == 0)
 		set_current_blocked(&set);
-	}
 
 	err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00);
 	err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 98bd70faccc5..daeca56211e3 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -273,7 +273,6 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
 				    sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
@@ -299,7 +298,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index ada93b3b8c66..beff97f7df37 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -7,8 +7,6 @@
 
 #include <asm/processor-flags.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 #define __FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
 			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
 			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 700c49dcd84e..11e206f0f45a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -555,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
 				    sizeof(frame->extramask))))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -581,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
@@ -915,7 +913,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index bb0fb03b9f85..a508cea13503 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -486,7 +486,6 @@ long sys_sigreturn(struct pt_regs *regs)
 	    copy_from_user(&set.sig[1], extramask, sig_size))
 		goto segfault;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (copy_sc_from_user(&current->thread.regs, sc))
@@ -600,7 +599,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
 	if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
 		goto segfault;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext))
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index ca98b86ef9a7..4da3c6f6d929 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -30,8 +30,6 @@
 
 #define DEBUG_SIG  0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
 extern struct task_struct *coproc_owners[];
 
 struct rt_sigframe
@@ -261,7 +259,6 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, frame))
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ded3fb63fb06..f34437e835a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2210,7 +2210,7 @@ extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned lon
 static inline void restore_saved_sigmask(void)
 {
 	if (test_and_clear_restore_sigmask())
-		set_current_blocked(&current->saved_sigmask);
+		__set_current_blocked(&current->saved_sigmask);
 }
 
 static inline sigset_t *sigmask_to_save(void)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 17046cc484bc..065e76330398 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -250,7 +250,8 @@ extern long do_sigpending(void __user *, unsigned long);
 extern int do_sigtimedwait(const sigset_t *, siginfo_t *,
 				const struct timespec *);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern void set_current_blocked(const sigset_t *);
+extern void set_current_blocked(sigset_t *);
+extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 extern int sigsuspend(sigset_t *);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index b9be7e0fe41a..df8d721a9e6f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2524,7 +2524,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
  * It is wrong to change ->blocked directly, this helper should be used
  * to ensure the process can't miss a shared signal we are going to block.
  */
-void set_current_blocked(const sigset_t *newset)
+void set_current_blocked(sigset_t *newset)
+{
+	struct task_struct *tsk = current;
+	sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	spin_lock_irq(&tsk->sighand->siglock);
+	__set_task_blocked(tsk, newset);
+	spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+void __set_current_blocked(const sigset_t *newset)
 {
 	struct task_struct *tsk = current;
 
@@ -2564,7 +2573,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 		return -EINVAL;
 	}
 
-	set_current_blocked(&newset);
+	__set_current_blocked(&newset);
 	return 0;
 }
 
@@ -3138,7 +3147,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
 			return -EINVAL;
 		}
 
-		set_current_blocked(&new_blocked);
+		__set_current_blocked(&new_blocked);
 	}
 
 	if (oset) {
@@ -3202,7 +3211,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
 	int old = current->blocked.sig[0];
 	sigset_t newset;
 
-	siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
 	set_current_blocked(&newset);
 
 	return old;
@@ -3243,8 +3251,6 @@ SYSCALL_DEFINE0(pause)
 
 int sigsuspend(sigset_t *set)
 {
-	sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
 	current->saved_sigmask = current->blocked;
 	set_current_blocked(set);
 
-- 
cgit v1.2.3


From efee984c27b67e3ebef40410f35671997441b57c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Apr 2012 02:04:15 -0400
Subject: new helper: signal_delivered()

Does block_sigmask() + tracehook_signal_handler();  called when
sigframe has been successfully built.  All architectures converted
to it; block_sigmask() itself is gone now (merged into this one).

I'm still not too happy with the signature, but that's a separate
story (IMO we need a structure that would contain signal number +
siginfo + k_sigaction, so that get_signal_to_deliver() would fill one,
signal_delivered(), handle_signal() and probably setup...frame() -
take one).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/signal.c         |  2 +-
 arch/arm/kernel/signal.c           |  7 +------
 arch/avr32/kernel/signal.c         |  2 +-
 arch/blackfin/kernel/signal.c      |  3 +--
 arch/c6x/kernel/signal.c           |  3 +--
 arch/cris/arch-v10/kernel/signal.c |  2 +-
 arch/cris/arch-v32/kernel/signal.c |  2 +-
 arch/frv/kernel/signal.c           |  3 +--
 arch/h8300/kernel/signal.c         |  2 +-
 arch/hexagon/kernel/signal.c       |  3 +--
 arch/ia64/kernel/signal.c          |  7 +------
 arch/m32r/kernel/signal.c          |  2 +-
 arch/m68k/kernel/signal.c          |  2 +-
 arch/microblaze/kernel/signal.c    |  2 +-
 arch/mips/kernel/signal.c          |  2 +-
 arch/mn10300/kernel/signal.c       |  3 +--
 arch/openrisc/kernel/signal.c      |  3 +--
 arch/parisc/kernel/signal.c        |  4 +---
 arch/powerpc/kernel/signal.c       |  6 +-----
 arch/s390/kernel/compat_signal.c   |  6 +-----
 arch/s390/kernel/signal.c          |  6 +-----
 arch/score/kernel/signal.c         |  2 +-
 arch/sh/kernel/signal_32.c         |  3 +--
 arch/sh/kernel/signal_64.c         |  3 +--
 arch/sparc/kernel/signal32.c       |  3 +--
 arch/sparc/kernel/signal_32.c      |  3 +--
 arch/sparc/kernel/signal_64.c      |  3 +--
 arch/tile/kernel/signal.c          |  2 +-
 arch/um/kernel/signal.c            |  2 +-
 arch/unicore32/kernel/signal.c     |  5 +----
 arch/x86/kernel/signal.c           |  6 ++----
 arch/xtensa/kernel/signal.c        |  2 +-
 include/linux/signal.h             |  2 +-
 kernel/signal.c                    | 22 +++++++++++++---------
 34 files changed, 47 insertions(+), 83 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 48c4df2389ac..a8c97d42ec8e 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -478,7 +478,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
 		force_sigsegv(sig, current);
 		return;
 	}
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 static inline void
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index c126eba8411d..fd2392a17ac1 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -557,12 +557,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka,
 		force_sigsegv(sig, tsk);
 		return;
 	}
-
-	/*
-	 * Block the signal if we were successful.
-	 */
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs, 0);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index e883fa5eb845..c140f9b41dce 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -241,7 +241,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	if (ret != 0)
 		force_sigsegv(sig, current);
 	else
-		block_sigmask(ka, sig);
+		signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index 463612643821..35459e681483 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -260,8 +260,7 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 			test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index eb1b3086ae00..3d8f3c22a94f 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -276,8 +276,7 @@ static void handle_signal(int sig,
 	/* Set up the stack frame */
 	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0)
 		return;
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs, 0);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index cf6380cb9a57..0bb477c13a4e 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -453,7 +453,7 @@ static inline void handle_signal(int canrestart, unsigned long sig,
 		ret = setup_frame(sig, ka, oldset, regs);
 
 	if (ret == 0)
-		block_sigmask(ka, sig);
+		signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index 07b81ee09f65..b60d1b65a426 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -485,7 +485,7 @@ handle_signal(int canrestart, unsigned long sig,
 		ret = setup_frame(sig, ka, oldset, regs);
 
 	if (ret == 0)
-		block_sigmask(ka, sig);
+		signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 511285fa2461..4e134c7eceea 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -460,8 +460,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, __frame,
+	signal_delivered(sig, info, ka, __frame,
 				 test_thread_flag(TIF_SINGLESTEP));
 } /* end handle_signal() */
 
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index aa6f09666915..fca10378701b 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -439,7 +439,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		ret = setup_frame(sig, ka, oldset, regs);
 
 	if (!ret)
-		block_sigmask(ka, sig);
+		signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 439f11a3a8ef..304b0808d072 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -186,8 +186,7 @@ static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka,
 	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 			test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index c4041c76c07d..a199be1fe619 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -419,12 +419,7 @@ handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	if (!setup_frame(sig, ka, info, sigmask_to_save(), scr))
 		return 0;
 
-	block_sigmask(ka, sig);
-
-	/*
-	 * Let tracing know that we've done the handler setup.
-	 */
-	tracehook_signal_handler(sig, info, ka, &scr->pt,
+	signal_delivered(sig, info, ka, &scr->pt,
 				 test_thread_flag(TIF_SINGLESTEP));
 
 	return 1;
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 07f9032576c0..f3fb2c029cfc 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -294,7 +294,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 	if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs))
 		return;
 
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index c00caad215a6..710a528b928b 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -1137,7 +1137,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (err)
 		return;
 
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 
 	if (test_thread_flag(TIF_DELAYED_TRACE)) {
 		regs->sr &= ~0x8000;
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index c662e68671a2..76b9722557db 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -323,7 +323,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 02e0cba24f82..f2c09cfc60ac 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -551,7 +551,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 static void do_signal(struct pt_regs *regs)
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 4f6d20763061..6ab0bee2a54f 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -461,8 +461,7 @@ static int handle_signal(int sig,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 53972b7260b7..30110297f4f9 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -259,8 +259,7 @@ handle_signal(unsigned long sig,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 25161eaf720d..594459bde14e 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -449,9 +449,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	if (!setup_rt_frame(sig, ka, info, oldset, regs, in_syscall))
 		return 0;
 
-	block_sigmask(ka, sig);
-
-	tracehook_signal_handler(sig, info, ka, regs, 
+	signal_delivered(sig, info, ka, regs, 
 		test_thread_flag(TIF_SINGLESTEP) ||
 		test_thread_flag(TIF_BLOCKSTEP));
 
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 129bdffc6daf..5c023c9cf16e 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -148,11 +148,7 @@ static int do_signal(struct pt_regs *regs)
 
 	regs->trap = 0;
 	if (ret) {
-		block_sigmask(&ka, signr);
-		/*
-		 * Let tracing know that we've done the handler setup.
-		 */
-		tracehook_signal_handler(signr, &info, &ka, regs,
+		signal_delivered(signr, &info, &ka, regs,
 					 test_thread_flag(TIF_SINGLESTEP));
 	}
 
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 923baa96c0b0..3c0c19830c37 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -580,11 +580,7 @@ void handle_signal32(unsigned long sig, struct k_sigaction *ka,
 		ret = setup_frame32(sig, ka, oldset, regs);
 	if (ret)
 		return;
-	block_sigmask(ka, sig);
-	/*
-	 * Let tracing know that we've done the handler setup.
-	 */
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLE_STEP));
 }
 
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 8332a6943384..ac565b44aabb 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -375,11 +375,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka,
 		ret = setup_frame(sig, ka, oldset, regs);
 	if (ret)
 		return;
-	block_sigmask(ka, sig);
-	/*
-	 * Let tracing know that we've done the handler setup.
-	 */
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLE_STEP));
 }
 
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index f1b3fef0907b..e382c52ca0d9 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -267,7 +267,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 	if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0)
 		return;
 
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 static void do_signal(struct pt_regs *regs)
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index e4a531414e19..d6b7b6154f87 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -533,8 +533,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
 
 	if (ret)
 		return;
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 			test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 75960ef6c1d1..6b5b3dfe886b 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -650,8 +650,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	if (ret)
 		return;
 
-	block_sigmask(ka, sig);
-	tracehook_signal_handler(sig, info, ka, regs,
+	signal_delivered(sig, info, ka, regs,
 			test_thread_flag(TIF_SINGLESTEP));
 }
 
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index ba3dbfcdb28e..a53e0a5fd3a3 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -785,8 +785,7 @@ static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka,
 	if (err)
 		return;
 
-	block_sigmask(ka, signr);
-	tracehook_signal_handler(signr, info, ka, regs, 0);
+	signal_delivered(signr, info, ka, regs, 0);
 }
 
 static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs,
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 1bfa854be602..68f9c8650af4 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -460,8 +460,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka,
 	if (err)
 		return;
 
-	block_sigmask(ka, signr);
-	tracehook_signal_handler(signr, info, ka, regs, 0);
+	signal_delivered(signr, info, ka, regs, 0);
 }
 
 static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs,
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 23b60caa6c43..867de2f8189c 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -473,8 +473,7 @@ static inline void handle_signal(unsigned long signr, struct k_sigaction *ka,
 	if (err)
 		return;
 
-	block_sigmask(ka, signr);
-	tracehook_signal_handler(signr, info, ka, regs, 0);
+	signal_delivered(signr, info, ka, regs, 0);
 }
 
 static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs,
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index e068aa0c6dfc..e29b0553211d 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -278,7 +278,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 		ret = setup_rt_frame(sig, ka, info, oldset, regs);
 	if (ret)
 		return;
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 4ce6ab2d2996..7362d58efc29 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -61,7 +61,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
 	if (err)
 		force_sigsegv(signr, current);
 	else
-		block_sigmask(ka, signr);
+		signal_delivered(signr, info, ka, regs, 0);
 }
 
 static int kern_do_signal(struct pt_regs *regs)
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 4d9c4841989d..8adedb37720a 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -362,10 +362,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka,
 		return;
 	}
 
-	/*
-	 * Block the signal if we were successful.
-	 */
-	block_sigmask(ka, sig);
+	signal_delivered(sig, info, ka, regs, 0);
 }
 
 /*
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 11e206f0f45a..e8a89374d356 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -715,10 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
 
-	block_sigmask(ka, sig);
-
-	tracehook_signal_handler(sig, info, ka, regs,
-				 test_thread_flag(TIF_SINGLESTEP));
+	signal_delivered(sig, info, ka, regs,
+			 test_thread_flag(TIF_SINGLESTEP));
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 4da3c6f6d929..b9f8e5850d3a 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs)
 		if (ret)
 			return;
 
-		block_sigmask(&ka, signr);
+		signal_delivered(signr, info, ka, regs, 0);
 		if (current->ptrace & PT_SINGLESTEP)
 			task_pt_regs(current)->icountlevel = 1;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 065e76330398..26b424adc842 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -256,7 +256,7 @@ extern int show_unhandled_signals;
 extern int sigsuspend(sigset_t *);
 
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
-extern void block_sigmask(struct k_sigaction *ka, int signr);
+extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping);
 extern void exit_signals(struct task_struct *tsk);
 
 extern struct kmem_cache *sighand_cachep;
diff --git a/kernel/signal.c b/kernel/signal.c
index df8d721a9e6f..677102789cf2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2368,17 +2368,20 @@ relock:
 }
 
 /**
- * block_sigmask - add @ka's signal mask to current->blocked
- * @ka: action for @signr
- * @signr: signal that has been successfully delivered
+ * signal_delivered - 
+ * @sig:		number of signal being delivered
+ * @info:		siginfo_t of signal being delivered
+ * @ka:			sigaction setting that chose the handler
+ * @regs:		user register state
+ * @stepping:		nonzero if debugger single-step or block-step in use
  *
  * This function should be called when a signal has succesfully been
- * delivered. It adds the mask of signals for @ka to current->blocked
- * so that they are blocked during the execution of the signal
- * handler. In addition, @signr will be blocked unless %SA_NODEFER is
- * set in @ka->sa.sa_flags.
+ * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is set in @ka->sa.sa_flags.  Tracing is notified.
  */
-void block_sigmask(struct k_sigaction *ka, int signr)
+void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
+			struct pt_regs *regs, int stepping)
 {
 	sigset_t blocked;
 
@@ -2390,8 +2393,9 @@ void block_sigmask(struct k_sigaction *ka, int signr)
 
 	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
 	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&blocked, signr);
+		sigaddset(&blocked, sig);
 	set_current_blocked(&blocked);
+	tracehook_signal_handler(sig, info, ka, regs, stepping);
 }
 
 /*
-- 
cgit v1.2.3


From 44fbbb3dc687c9709a6f2236197316e5c79ab1eb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Apr 2012 18:24:46 -0400
Subject: x86: get rid of calling do_notify_resume() when returning to kernel
 mode

If we end up calling do_notify_resume() with !user_mode(refs), it
does nothing (do_signal() explicitly bails out and we can't get there
with TIF_NOTIFY_RESUME in such situations).  Then we jump to
resume_userspace_sig, which rechecks the same thing and bails out
to resume_kernel, thus breaking the loop.

It's easier and cheaper to check *before* calling do_notify_resume()
and bail out to resume_kernel immediately.  And kill the check in
do_signal()...

Note that on amd64 we can't get there with !user_mode() at all - asm
glue takes care of that.

Acked-and-reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/kernel/entry_32.S | 13 ++++++++++---
 arch/x86/kernel/signal.c   | 10 ----------
 2 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 01ccf9b71473..623f28837476 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -316,7 +316,6 @@ ret_from_exception:
 	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
-resume_userspace_sig:
 #ifdef CONFIG_VM86
 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
 	movb PT_CS(%esp), %al
@@ -615,9 +614,13 @@ work_notifysig:				# deal with pending signals and
 					# vm86-space
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	movb PT_CS(%esp), %bl
+	andb $SEGMENT_RPL_MASK, %bl
+	cmpb $USER_RPL, %bl
+	jb resume_kernel
 	xorl %edx, %edx
 	call do_notify_resume
-	jmp resume_userspace_sig
+	jmp resume_userspace
 
 	ALIGN
 work_notifysig_v86:
@@ -630,9 +633,13 @@ work_notifysig_v86:
 #endif
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	movb PT_CS(%esp), %bl
+	andb $SEGMENT_RPL_MASK, %bl
+	cmpb $USER_RPL, %bl
+	jb resume_kernel
 	xorl %edx, %edx
 	call do_notify_resume
-	jmp resume_userspace_sig
+	jmp resume_userspace
 END(work_pending)
 
 	# perform syscall exit tracing
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e8a89374d356..21af737053aa 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -737,16 +737,6 @@ static void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 
-	/*
-	 * We want the common case to go fast, which is why we may in certain
-	 * cases get here from kernel mode. Just return without doing anything
-	 * if so.
-	 * X86_32: vm86 regs switched out by assembly code before reaching
-	 * here, so testing against kernel CS suffices.
-	 */
-	if (!user_mode(regs))
-		return;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-- 
cgit v1.2.3


From bad1a753d4d4deb09d4bc0bac1dd4fc3298502e9 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 May 2012 20:29:45 -0700
Subject: x86, x32, ptrace: Remove PTRACE_ARCH_PRCTL for x32

When I added x32 ptrace to 3.4 kernel, I also include PTRACE_ARCH_PRCTL
support for x32 GDB  For ARCH_GET_FS/GS, it takes a pointer to int64.  But
at user level, ARCH_GET_FS/GS takes a pointer to int32.  So I have to add
x32 ptrace to glibc to handle it with a temporary int64 passed to kernel and
copy it back to GDB as int32.  Roland suggested that PTRACE_ARCH_PRCTL
is obsolete and x32 GDB should use fs_base and gs_base fields of
user_regs_struct instead.

Accordingly, remove PTRACE_ARCH_PRCTL completely from the x32 code to
avoid possible memory overrun when pointer to int32 is passed to
kernel.

Link: http://lkml.kernel.org/r/CAMe9rOpDzHfS7NH7m1vmD9QRw8SSj4Sc%2BaNOgcWm_WJME2eRsQ@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org> v3.4
---
 arch/x86/kernel/ptrace.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 13b1990c7c58..c4c6a5c2bf0f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child,
 					     0, sizeof(struct user_i387_struct),
 					     datap);
 
-		/* normal 64bit interface to access TLS data.
-		   Works just like arch_prctl, except that the arguments
-		   are reversed. */
-	case PTRACE_ARCH_PRCTL:
-		return do_arch_prctl(child, data, addr);
-
 	default:
 		return compat_ptrace_request(child, request, addr, data);
 	}
-- 
cgit v1.2.3


From 76eb9a30db4bc8fd172f9155247264b5f2686d7b Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Mon, 20 Feb 2012 14:20:06 +0800
Subject: ACPI, x86: fix Dell M6600 ACPI reboot regression via DMI

Dell Precision M6600 is known to require PCI reboot, so add it to
the reboot blacklist in pci_reboot_dmi_table[].

https://bugzilla.kernel.org/show_bug.cgi?id=42749

cc: x86@kernel.org
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/reboot.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 79c45af81604..412db5716d91 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -451,6 +451,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
 		},
 	},
+	{	/* Handle problems with rebooting on the Precision M6600. */
+		.callback = set_pci_reboot,
+		.ident = "Dell OptiPlex 990",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From c2238f10e0c34a85a2a555c8a197316d1ca3fb7e Mon Sep 17 00:00:00 2001
From: Chen Gong <gong.chen@linux.intel.com>
Date: Tue, 5 Jun 2012 10:35:02 +0800
Subject: x86/mce: Fix the MCE poll timer logic

In commit 82f7af09 (x86/mce: Cleanup timer mess), Thomas just forgot
the "/ 2" there while cleaning up.

Signed-off-by: Chen Gong <gong.chen@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 98003bfc5556..d6b18a4d0b95 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1266,7 +1266,7 @@ static void mce_timer_fn(unsigned long data)
 	 */
 	iv = __this_cpu_read(mce_next_interval);
 	if (mce_notify_irq())
-		iv = max(iv, (unsigned long) HZ/100);
+		iv = max(iv / 2, (unsigned long) HZ/100);
 	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
 	__this_cpu_write(mce_next_interval, iv);
-- 
cgit v1.2.3


From 958fb3c51295764599d6abce87e1a01ace897a3e Mon Sep 17 00:00:00 2001
From: Chen Gong <gong.chen@linux.intel.com>
Date: Tue, 5 Jun 2012 10:35:02 +0800
Subject: x86/mce: Fix the MCE poll timer logic

In commit 82f7af09 ("x86/mce: Cleanup timer mess), Thomas just
forgot the "/ 2" there while cleaning up.

Signed-off-by: Chen Gong <gong.chen@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@amd64.org
Cc: tony.luck@intel.com
Link: http://lkml.kernel.org/r/1338863702-9245-1-git-send-email-gong.chen@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 0a687fd185e6..a97f3c4a3946 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1274,7 +1274,7 @@ static void mce_timer_fn(unsigned long data)
 	 */
 	iv = __this_cpu_read(mce_next_interval);
 	if (mce_notify_irq())
-		iv = max(iv, (unsigned long) HZ/100);
+		iv = max(iv / 2, (unsigned long) HZ/100);
 	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
 	__this_cpu_write(mce_next_interval, iv);
-- 
cgit v1.2.3


From 436d03faf6961b30e13b2d0967aea9d772d6cf44 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Tue, 5 Jun 2012 00:09:11 +0900
Subject: x86/decoder: Fix bsr/bsf/jmpe decoding with operand-size prefix

Fix the x86 instruction decoder to decode bsr/bsf/jmpe with
operand-size prefix (66h). This fixes the test case failure
reported by Linus, attached below.

bsf/bsr/jmpe have a special encoding. Opcode map in
Intel Software Developers Manual vol2 says they have
TZCNT/LZCNT variants if it has F3h prefix. However, there
is no information if it has other 66h or F2h prefixes.
Current instruction decoder supposes that those are
bad instructions, but it actually accepts at least
operand-size prefixes.

H. Peter Anvin further explains:

 " TZCNT/LZCNT are F3 + BSF/BSR exactly because the F2 and
   F3 prefixes have historically been no-ops with most instructions.
   This allows software to unconditionally use the prefixed versions
   and get TZCNT/LZCNT on the processors that have them if they don't
   care about the difference. "

This fixes errors reported by test_get_len:

  Warning: arch/x86/tools/test_get_len found difference at <em_bsf>:ffffffff81036d87
  Warning: ffffffff81036de5:	66 0f bc c2          	bsf    %dx,%ax
  Warning: objdump says 4 bytes, but insn_get_length() says 3
  Warning: arch/x86/tools/test_get_len found difference at <em_bsr>:ffffffff81036ea6
  Warning: ffffffff81036f04:	66 0f bd c2          	bsr    %dx,%ax
  Warning: objdump says 4 bytes, but insn_get_length() says 3
  Warning: decoded and checked 13298882 instructions with 2 warnings

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <yrl.pp-manager.tt@hitachi.com>
Link: http://lkml.kernel.org/r/20120604150911.22338.43296.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/x86-opcode-map.txt      |  8 ++++----
 arch/x86/tools/gen-insn-attr-x86.awk | 14 +++++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 819137904428..5d7e51f3fd28 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -28,7 +28,7 @@
 #  - (66): the last prefix is 0x66
 #  - (F3): the last prefix is 0xF3
 #  - (F2): the last prefix is 0xF2
-#
+#  - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
 
 Table: one byte opcode
 Referrer:
@@ -515,12 +515,12 @@ b4: LFS Gv,Mp
 b5: LGS Gv,Mp
 b6: MOVZX Gv,Eb
 b7: MOVZX Gv,Ew
-b8: JMPE | POPCNT Gv,Ev (F3)
+b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
 b9: Grp10 (1A)
 ba: Grp8 Ev,Ib (1A)
 bb: BTC Ev,Gv
-bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
-bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
+bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
 be: MOVSX Gv,Eb
 bf: MOVSX Gv,Ew
 # 0x0f 0xc0-0xcf
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index 5f6a5b6c3a15..ddcf39b1a18d 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -66,9 +66,10 @@ BEGIN {
 	rex_expr = "^REX(\\.[XRWB]+)*"
 	fpu_expr = "^ESC" # TODO
 
-	lprefix1_expr = "\\(66\\)"
+	lprefix1_expr = "\\((66|!F3)\\)"
 	lprefix2_expr = "\\(F3\\)"
-	lprefix3_expr = "\\(F2\\)"
+	lprefix3_expr = "\\((F2|!F3)\\)"
+	lprefix_expr = "\\((66|F2|F3)\\)"
 	max_lprefix = 4
 
 	# All opcodes starting with lower-case 'v' or with (v1) superscript
@@ -333,13 +334,16 @@ function convert_operands(count,opnd,       i,j,imm,mod)
 		if (match(ext, lprefix1_expr)) {
 			lptable1[idx] = add_flags(lptable1[idx],flags)
 			variant = "INAT_VARIANT"
-		} else if (match(ext, lprefix2_expr)) {
+		}
+		if (match(ext, lprefix2_expr)) {
 			lptable2[idx] = add_flags(lptable2[idx],flags)
 			variant = "INAT_VARIANT"
-		} else if (match(ext, lprefix3_expr)) {
+		}
+		if (match(ext, lprefix3_expr)) {
 			lptable3[idx] = add_flags(lptable3[idx],flags)
 			variant = "INAT_VARIANT"
-		} else {
+		}
+		if (!match(ext, lprefix_expr)){
 			table[idx] = add_flags(table[idx],flags)
 		}
 	}
-- 
cgit v1.2.3


From 1a87fc1ec7b05b9bc60df9dc52297d4c225d7f1a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 6 Jun 2012 11:33:21 +0200
Subject: x86: mce: Add the dropped timer interval init back

commit 82f7af09 ("x86/mce: Cleanup timer mess) dropped the
initialization of the per cpu timer interval. Duh :(

Restore the previous behaviour.

Reported-by: Chen Gong <gong.chen@linux.intel.com>
Cc: bp@amd64.org
Cc: tony.luck@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a97f3c4a3946..da27c5d2168a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1557,7 +1557,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 static void __mcheck_cpu_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
-	unsigned long iv = __this_cpu_read(mce_next_interval);
+	unsigned long iv = check_interval * HZ;
 
 	setup_timer(t, mce_timer_fn, smp_processor_id());
 
-- 
cgit v1.2.3


From aff5a62d52ff03956ff6992b9fe4b561fd855804 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <xtfeng@gmail.com>
Date: Tue, 5 Jun 2012 15:00:31 -0400
Subject: x86/gart: Fix kmemleak warning

aperture_64.c now is using memblock, the previous
kmemleak_ignore() for alloc_bootmem() should be removed then.

Otherwise, with kmemleak enabled, kernel will throw warnings
like:

[    0.000000] kmemleak: Trying to color unknown object at 0xffff8800c4000000 as Black
[    0.000000] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1-next-20120605+ #130
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff811b27e6>] paint_ptr+0x66/0xc0
[    0.000000]  [<ffffffff816b90fb>] kmemleak_ignore+0x2b/0x60
[    0.000000]  [<ffffffff81ef7bc0>] kmemleak_init+0x217/0x2c1
[    0.000000]  [<ffffffff81ed2b97>] start_kernel+0x32d/0x3eb
[    0.000000]  [<ffffffff81ed25e4>] ? repair_env_string+0x5a/0x5a
[    0.000000]  [<ffffffff81ed2356>] x86_64_start_reservations+0x131/0x135
[    0.000000]  [<ffffffff81ed2120>] ? early_idt_handlers+0x120/0x120
[    0.000000]  [<ffffffff81ed245c>] x86_64_start_kernel+0x102/0x111
[    0.000000] kmemleak: Early log backtrace:
[    0.000000]    [<ffffffff816b911b>] kmemleak_ignore+0x4b/0x60
[    0.000000]    [<ffffffff81ee6a38>] gart_iommu_hole_init+0x3e7/0x547
[    0.000000]    [<ffffffff81edb20b>] pci_iommu_alloc+0x44/0x6f
[    0.000000]    [<ffffffff81ee81ad>] mem_init+0x19/0xec
[    0.000000]    [<ffffffff81ed2a54>] start_kernel+0x1ea/0x3eb
[    0.000000]    [<ffffffff81ed2356>] x86_64_start_reservations+0x131/0x135
[    0.000000]    [<ffffffff81ed245c>] x86_64_start_kernel+0x102/0x111
[    0.000000]    [<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Xiaotian Feng <dannyfeng@tencent.com>
Cc: Xiaotian Feng <xtfeng@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1338922831-2847-1-git-send-email-xtfeng@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/aperture_64.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a835..d5fd66f0d4cd 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,7 +20,6 @@
 #include <linux/bitops.h>
 #include <linux/ioport.h>
 #include <linux/suspend.h>
-#include <linux/kmemleak.h>
 #include <asm/e820.h>
 #include <asm/io.h>
 #include <asm/iommu.h>
@@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void)
 		return 0;
 	}
 	memblock_reserve(addr, aper_size);
-	/*
-	 * Kmemleak should not scan this block as it may not be mapped via the
-	 * kernel direct mapping.
-	 */
-	kmemleak_ignore(phys_to_virt(addr));
 	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
 			aper_size >> 10, addr);
 	insert_aperture_resource((u32)addr, aper_size);
-- 
cgit v1.2.3


From 4af463d28f1a026e25c0b879fac2a0d2b7bff599 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Mon, 4 Jun 2012 11:42:32 +0900
Subject: x86/numa: Set numa_nodes_parsed at acpi_numa_memory_affinity_init()

When hot-adding a CPU, the system outputs following messages
since node_to_cpumask_map[2] was not allocated memory.

Booting Node 2 Processor 32 APIC 0xc0
node_to_cpumask_map[2] NULL
Pid: 0, comm: swapper/32 Tainted: G       A     3.3.5-acd #21
Call Trace:
 [<ffffffff81048845>] debug_cpumask_set_cpu+0x155/0x160
 [<ffffffff8105e28a>] ? add_timer_on+0xaa/0x120
 [<ffffffff8150665f>] numa_add_cpu+0x1e/0x22
 [<ffffffff815020bb>] identify_cpu+0x1df/0x1e4
 [<ffffffff815020d6>] identify_econdary_cpu+0x16/0x1d
 [<ffffffff81504614>] smp_store_cpu_info+0x3c/0x3e
 [<ffffffff81505263>] smp_callin+0x139/0x1be
 [<ffffffff815052fb>] start_secondary+0x13/0xeb

The reason is that the bit of node 2 was not set at
numa_nodes_parsed. numa_nodes_parsed is set by only
acpi_numa_processor_affinity_init /
acpi_numa_x2apic_affinity_init. Thus even if hot-added memory
which is same PXM as hot-added CPU is written in ACPI SRAT
Table, if the hot-added CPU is not written in ACPI SRAT table,
numa_nodes_parsed is not set.

But according to ACPI Spec Rev 5.0, it says about ACPI SRAT
table as follows: This optional table provides information that
allows OSPM to associate processors and memory ranges, including
ranges of memory provided by hot-added memory devices, with
system localities / proximity domains and clock domains.

It means that ACPI SRAT table only provides information for CPUs
present at boot time and for memory including hot-added memory.
So hot-added memory is written in ACPI SRAT table, but hot-added
CPU is not written in it. Thus numa_nodes_parsed should be set
by not only acpi_numa_processor_affinity_init /
acpi_numa_x2apic_affinity_init but also
acpi_numa_memory_affinity_init for the case.

Additionally, if system has cpuless memory node,
acpi_numa_processor_affinity_init /
acpi_numa_x2apic_affinity_init cannot set numa_nodes_parseds
since these functions cannot find cpu description for the node.
In this case, numa_nodes_parsed needs to be set by
acpi_numa_memory_affinity_init.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: liuj97@gmail.com
Cc: kosaki.motohiro@gmail.com
Link: http://lkml.kernel.org/r/4FCC2098.4030007@jp.fujitsu.com
[ merged it ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/srat.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 732af3a96183..4599c3e8bcb6 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -176,6 +176,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
+	node_set(node, numa_nodes_parsed);
+
 	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
 	       node, pxm,
 	       (unsigned long long) start, (unsigned long long) end - 1);
-- 
cgit v1.2.3


From 7071f6b2889bb41bea61891d8a3e6e70517ef5e6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Thu, 31 May 2012 23:20:25 +0200
Subject: x86/intel/moorestown: Change intel_scu_devices_create() to __devinit

The allmodconfig hits:

 WARNING: vmlinux.o(.text+0x6553d): Section mismatch in
          reference from the function intel_scu_devices_create() to the
          function .devinit.text: spi_register_board_info()
	  [...]

This patch marks intel_scu_devices_create() as devinit because
it only calls a devinit function, spi_register_board_info().

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Samuel Ortiz <sameo@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Link: http://lkml.kernel.org/r/20120531212025.GA8519@breakpoint.cc
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/mrst/mrst.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index e31bcd8f2eee..fd41a9262d65 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier);
 EXPORT_SYMBOL_GPL(intel_scu_notifier);
 
 /* Called by IPC driver */
-void intel_scu_devices_create(void)
+void __devinit intel_scu_devices_create(void)
 {
 	int i;
 
-- 
cgit v1.2.3


From 55c844a4dd16a4d1fdc0cf2a283ec631a02ec448 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Wed, 30 May 2012 23:15:41 +0800
Subject: x86/reboot: Fix a warning message triggered by stop_other_cpus()

When rebooting our 24 CPU Westmere servers with 3.4-rc6, we
always see this warning msg:

Restarting system.
machine restart
------------[ cut here ]------------
WARNING: at arch/x86/kernel/smp.c:125
native_smp_send_reschedule+0x74/0xa7() Hardware name: X8DTN
Modules linked in: igb [last unloaded: scsi_wait_scan]
Pid: 1, comm: systemd-shutdow Not tainted 3.4.0-rc6+ #22
Call Trace:
 <IRQ>  [<ffffffff8102a41f>] warn_slowpath_common+0x7e/0x96
 [<ffffffff8102a44c>] warn_slowpath_null+0x15/0x17
 [<ffffffff81018cf7>] native_smp_send_reschedule+0x74/0xa7
 [<ffffffff810561c1>] trigger_load_balance+0x279/0x2a6
 [<ffffffff81050112>] scheduler_tick+0xe0/0xe9
 [<ffffffff81036768>] update_process_times+0x60/0x70
 [<ffffffff81062f2f>] tick_sched_timer+0x68/0x92
 [<ffffffff81046e33>] __run_hrtimer+0xb3/0x13c
 [<ffffffff81062ec7>] ? tick_nohz_handler+0xd0/0xd0
 [<ffffffff810474f2>] hrtimer_interrupt+0xdb/0x198
 [<ffffffff81019a35>] smp_apic_timer_interrupt+0x81/0x94
 [<ffffffff81655187>] apic_timer_interrupt+0x67/0x70
 <EOI>  [<ffffffff8101a3c4>] ? default_send_IPI_mask_allbutself_phys+0xb4/0xc4
 [<ffffffff8101c680>] physflat_send_IPI_allbutself+0x12/0x14
 [<ffffffff81018db4>] native_nmi_stop_other_cpus+0x8a/0xd6
 [<ffffffff810188ba>] native_machine_shutdown+0x50/0x67
 [<ffffffff81018926>] machine_shutdown+0xa/0xc
 [<ffffffff8101897e>] native_machine_restart+0x20/0x32
 [<ffffffff810189b0>] machine_restart+0xa/0xc
 [<ffffffff8103b196>] kernel_restart+0x47/0x4c
 [<ffffffff8103b2e6>] sys_reboot+0x13e/0x17c
 [<ffffffff8164e436>] ? _raw_spin_unlock_bh+0x10/0x12
 [<ffffffff810fcac9>] ? bdi_queue_work+0xcf/0xd8
 [<ffffffff810fe82f>] ? __bdi_start_writeback+0xae/0xb7
 [<ffffffff810e0d64>] ? iterate_supers+0xa3/0xb7
 [<ffffffff816547a2>] system_call_fastpath+0x16/0x1b
---[ end trace 320af5cb1cb60c5b ]---

The root cause seems to be the
default_send_IPI_mask_allbutself_phys() takes quite some time (I
measured it could be several ms) to complete sending NMIs to all
the other 23 CPUs, and for HZ=250/1000 system, the time is long
enough for a timer interrupt to happen, which will in turn
trigger to kick load balance to a stopped CPU and cause this
warning in native_smp_send_reschedule().

So disabling the local irq before stop_other_cpu() can fix this
problem (tested 25 times reboot ok), and it is fine as there
should be nobody caring the timer interrupt in such reboot
stage.

The latest 3.4 kernel slightly changes this behavior by sending
REBOOT_VECTOR first and only send NMI_VECTOR if the REBOOT_VCTOR
fails, and this patch is still needed to prevent the problem.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120530231541.4c13433a@feng-i7
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/reboot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 79c45af81604..25b48edb847c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -639,9 +639,11 @@ void native_machine_shutdown(void)
 	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
 
 	/*
-	 * O.K Now that I'm on the appropriate processor,
-	 * stop all of the others.
+	 * O.K Now that I'm on the appropriate processor, stop all of the
+	 * others. Also disable the local irq to not receive the per-cpu
+	 * timer interrupt which may trigger scheduler's load balance.
 	 */
+	local_irq_disable();
 	stop_other_cpus();
 #endif
 
-- 
cgit v1.2.3


From f6175f5bfb4c9f2ed32758c95f765b529b1a7f15 Mon Sep 17 00:00:00 2001
From: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Date: Mon, 28 May 2012 18:09:18 +0900
Subject: x86/ioapic: Fix NULL pointer dereference on CPU hotplug after
 disabling irqs

In current Linux, percpu variable `vector_irq' is not cleared on
offlined cpus while disabling devices' irqs. If the cpu that has
the disabled irqs in vector_irq is hotplugged,
__setup_vector_irq() hits invalid irq vector and may crash.

This bug can be reproduced as following;

  # echo 0 > /sys/devices/system/cpu/cpu7/online
  # modprobe -r some_driver_using_interrupts      # vector_irq@cpu7 uncleared
  # echo 1 > /sys/devices/system/cpu/cpu7/online  # kernel may crash

This patch fixes this bug by clearing vector_irq in
__clear_irq_vector() even if the cpu is offlined.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: ltc-kernel@ml.yrl.intra.hitachi.co.jp
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/4FC340BE.7080101@hitachi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac96561d1a99..5f0ff597437c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
-	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+	for_each_cpu(cpu, cfg->domain)
 		per_cpu(vector_irq, cpu)[vector] = -1;
 
 	cfg->vector = 0;
@@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	if (likely(!cfg->move_in_progress))
 		return;
-	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+	for_each_cpu(cpu, cfg->old_domain) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 								vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
-- 
cgit v1.2.3


From ceb1cbac8eda66cf0f889def226b4e82f8ff857b Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Date: Thu, 31 May 2012 13:07:38 +0530
Subject: sched/x86: Calculate booted cores after construction of sibling_mask

Commit 316ad248307fb ("sched/x86: Rewrite set_cpu_sibling_map()")
broke the booted_cores accounting.

The problem is that the booted_cores accounting needs all the
sibling links set up. So restore the second loop and add a comment as
to why its needed.

On qemu booted with -smp sockets=1,cores=2,threads=2;
Before:
 $ grep cores /proc/cpuinfo
 cpu cores       : 2
 cpu cores       : 1
 cpu cores       : 4
 cpu cores       : 3

With the patch:
 $ grep cores /proc/cpuinfo
 cpu cores       : 2
 cpu cores       : 2
 cpu cores       : 2
 cpu cores       : 2

Reported-by: Prarit Bhargava <prarit@redhat.com>
Reported-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120531073738.GH7511@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fd019d78b1f4..3fab55bea29b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		if ((i == cpu) || (has_mc && match_llc(c, o)))
 			link_mask(llc_shared, cpu, i);
 
+	}
+
+	/*
+	 * This needs a separate iteration over the cpus because we rely on all
+	 * cpu_sibling_mask links to be set-up.
+	 */
+	for_each_cpu(i, cpu_sibling_setup_mask) {
+		o = &cpu_data(i);
+
 		if ((i == cpu) || (has_mc && match_mc(c, o))) {
 			link_mask(core, cpu, i);
 
-- 
cgit v1.2.3


From b430f7c4706aeba4270c7ab7744fc504b9315e1c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Jun 2012 15:30:31 +0200
Subject: perf/x86: Fix Intel shared extra MSR allocation

Zheng Yan reported that event group validation can wreck event state
when Intel extra_reg allocation changes event state.

Validation shouldn't change any persistent state. Cloning events in
validate_{event,group}() isn't really pretty either, so add a few
special cases to avoid modifying the event state.

The code is restructured to minimize the special case impact.

Reported-by: Zheng Yan <zheng.z.yan@linux.intel.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  1 +
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 92 +++++++++++++++++++++++-----------
 3 files changed, 66 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e049d6da0183..cb608383e4f6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
 		if (!cpuc->shared_regs)
 			goto error;
 	}
+	cpuc->is_fake = 1;
 	return cpuc;
 error:
 	free_fake_cpuc(cpuc);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf54493..83794d8e6af0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -117,6 +117,7 @@ struct cpu_hw_events {
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
 
 	unsigned int		group_flag;
+	int			is_fake;
 
 	/*
 	 * Intel DebugStore bits
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 166546ec6aef..965baa2fa790 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
 {
 	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
-		return false;
+		return idx;
 
-	if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
-		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-		event->hw.config |= 0x01bb;
-		event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
-		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
-	} else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+	if (idx == EXTRA_REG_RSP_0)
+		return EXTRA_REG_RSP_1;
+
+	if (idx == EXTRA_REG_RSP_1)
+		return EXTRA_REG_RSP_0;
+
+	return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+	event->hw.extra_reg.idx = idx;
+
+	if (idx == EXTRA_REG_RSP_0) {
 		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
 		event->hw.config |= 0x01b7;
-		event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
 		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+	} else if (idx == EXTRA_REG_RSP_1) {
+		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+		event->hw.config |= 0x01bb;
+		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
 	}
-
-	if (event->hw.extra_reg.idx == orig_idx)
-		return false;
-
-	return true;
 }
 
 /*
@@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
 	struct event_constraint *c = &emptyconstraint;
 	struct er_account *era;
 	unsigned long flags;
-	int orig_idx = reg->idx;
+	int idx = reg->idx;
 
-	/* already allocated shared msr */
-	if (reg->alloc)
+	/*
+	 * reg->alloc can be set due to existing state, so for fake cpuc we
+	 * need to ignore this, otherwise we might fail to allocate proper fake
+	 * state for this extra reg constraint. Also see the comment below.
+	 */
+	if (reg->alloc && !cpuc->is_fake)
 		return NULL; /* call x86_get_event_constraint() */
 
 again:
-	era = &cpuc->shared_regs->regs[reg->idx];
+	era = &cpuc->shared_regs->regs[idx];
 	/*
 	 * we use spin_lock_irqsave() to avoid lockdep issues when
 	 * passing a fake cpuc
@@ -1173,6 +1183,29 @@ again:
 
 	if (!atomic_read(&era->ref) || era->config == reg->config) {
 
+		/*
+		 * If its a fake cpuc -- as per validate_{group,event}() we
+		 * shouldn't touch event state and we can avoid doing so
+		 * since both will only call get_event_constraints() once
+		 * on each event, this avoids the need for reg->alloc.
+		 *
+		 * Not doing the ER fixup will only result in era->reg being
+		 * wrong, but since we won't actually try and program hardware
+		 * this isn't a problem either.
+		 */
+		if (!cpuc->is_fake) {
+			if (idx != reg->idx)
+				intel_fixup_er(event, idx);
+
+			/*
+			 * x86_schedule_events() can call get_event_constraints()
+			 * multiple times on events in the case of incremental
+			 * scheduling(). reg->alloc ensures we only do the ER
+			 * allocation once.
+			 */
+			reg->alloc = 1;
+		}
+
 		/* lock in msr value */
 		era->config = reg->config;
 		era->reg = reg->reg;
@@ -1180,17 +1213,17 @@ again:
 		/* one more user */
 		atomic_inc(&era->ref);
 
-		/* no need to reallocate during incremental event scheduling */
-		reg->alloc = 1;
-
 		/*
 		 * need to call x86_get_event_constraint()
 		 * to check if associated event has constraints
 		 */
 		c = NULL;
-	} else if (intel_try_alt_er(event, orig_idx)) {
-		raw_spin_unlock_irqrestore(&era->lock, flags);
-		goto again;
+	} else {
+		idx = intel_alt_er(idx);
+		if (idx != reg->idx) {
+			raw_spin_unlock_irqrestore(&era->lock, flags);
+			goto again;
+		}
 	}
 	raw_spin_unlock_irqrestore(&era->lock, flags);
 
@@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
 	struct er_account *era;
 
 	/*
-	 * only put constraint if extra reg was actually
-	 * allocated. Also takes care of event which do
-	 * not use an extra shared reg
+	 * Only put constraint if extra reg was actually allocated. Also takes
+	 * care of event which do not use an extra shared reg.
+	 *
+	 * Also, if this is a fake cpuc we shouldn't touch any event state
+	 * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+	 * either since it'll be thrown out.
 	 */
-	if (!reg->alloc)
+	if (!reg->alloc || cpuc->is_fake)
 		return;
 
 	era = &cpuc->shared_regs->regs[reg->idx];
-- 
cgit v1.2.3


From cccb9ba9e4ee0d750265f53de9258df69655c40b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Jun 2012 10:26:43 +0200
Subject: perf/x86: Implement cycles:p for SNB/IVB

Now that there's finally a chip with working PEBS (IvyBridge), we can
enable the hardware and implement cycles:p for SNB/IVB.

Cc: Stephane Eranian <eranian@google.com>
Requested-and-tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 83794d8e6af0..7241e2fc3c17 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -365,6 +365,7 @@ struct x86_pmu {
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
+	void		(*pebs_aliases)(struct perf_event *event);
 
 	/*
 	 * Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 965baa2fa790..2312c1ff1b19 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 	intel_put_shared_regs_event_constraints(cpuc, event);
 }
 
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
 {
-	int ret = x86_pmu_hw_config(event);
-
-	if (ret)
-		return ret;
-
-	if (event->attr.precise_ip &&
-	    (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+	if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
 		/*
 		 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
 		 * (0x003c) so that we can use it with PEBS.
@@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		 */
 		u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
 
+		alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+		event->hw.config = alt_config;
+	}
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+	if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+		/*
+		 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+		 * (0x003c) so that we can use it with PEBS.
+		 *
+		 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+		 * PEBS capable. However we can use UOPS_RETIRED.ALL
+		 * (0x01c2), which is a PEBS capable event, to get the same
+		 * count.
+		 *
+		 * UOPS_RETIRED.ALL counts the number of cycles that retires
+		 * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+		 * larger than the maximum number of micro-ops that can be
+		 * retired per cycle (4) and then inverting the condition, we
+		 * count all cycles that retire 16 or less micro-ops, which
+		 * is every cycle.
+		 *
+		 * Thereby we gain a PEBS capable cycle counter.
+		 */
+		u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
 
 		alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
 		event->hw.config = alt_config;
 	}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+		x86_pmu.pebs_aliases(event);
 
 	if (intel_pmu_needs_lbr_smpl(event)) {
 		ret = intel_pmu_setup_lbr_filter(event);
@@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
 	.put_event_constraints	= intel_put_event_constraints,
+	.pebs_aliases		= intel_pebs_aliases_core2,
 
 	.format_attrs		= intel_arch3_formats_attr,
 
@@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-- 
cgit v1.2.3


From b6db437ba8322f5cee0bd355ad2ef9f73c413754 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Jun 2012 10:26:43 +0200
Subject: perf/x86: Enable/Add IvyBridge hardware support

Implement rudimentary IVB perf support. The SDM states its identical
to SNB with exception of the exact event tables, but a quick look
suggests they're similar enough.

Also mark SNB-EP as broken for now.

Requested-and-tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2312c1ff1b19..187c294bc658 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void)
 		break;
 
 	case 42: /* SandyBridge */
-		x86_add_quirk(intel_sandybridge_quirk);
 	case 45: /* SandyBridge, "Romely-EP" */
+		x86_add_quirk(intel_sandybridge_quirk);
+	case 58: /* IvyBridge */
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
-- 
cgit v1.2.3


From 8440ccb43fc0ecffcf1acee0273d766e6a8cd51d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Jun 2012 10:26:43 +0200
Subject: perf/x86: Update SNB PEBS constraints

Afaict there's no need to (incompletely) iterate the
MEM_UOPS_RETIRED.* umask state.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5a3edc27f6e5..35e2192df9f4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8),    /* MEM_TRANS_RETIRED.* */
-	INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+	INTEL_EVENT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
 	INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
-- 
cgit v1.2.3


From 302fa4b58ac754a6da13f4f5546f710fecc3b945 Mon Sep 17 00:00:00 2001
From: Arun Sharma <asharma@fb.com>
Date: Fri, 20 Apr 2012 15:41:33 -0700
Subject: perf/x86: Allow multiple stacks

Without this patch, applications with two different stack
regions (eg: native stack vs JIT stack) get truncated
callchains even when RBP chaining is present. GDB shows proper
stack traces and the frame pointer chaining is intact.

This patch disables the (fp < RSP) check, hoping that other checks
in the code save the day for us. In our limited testing, this
didn't seem to break anything.

In the long term, we could potentially have userspace advise
the kernel on the range of valid stack addresses, so we don't
spend a lot of time unwinding from bogus addresses.

Signed-off-by: Arun Sharma <asharma@fb.com>
CC: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-perf-users@vger.kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1334961696-19580-2-git-send-email-asharma@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cb608383e4f6..e78bc256aea8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1781,9 +1781,6 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (bytes != sizeof(frame))
 			break;
 
-		if (fp < compat_ptr(regs->sp))
-			break;
-
 		perf_callchain_store(entry, frame.return_address);
 		fp = compat_ptr(frame.next_frame);
 	}
@@ -1827,9 +1824,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		if (bytes != sizeof(frame))
 			break;
 
-		if ((unsigned long)fp < regs->sp)
-			break;
-
 		perf_callchain_store(entry, frame.return_address);
 		fp = frame.next_frame;
 	}
-- 
cgit v1.2.3


From bc6ca7b342d5ae15c3ba3081fd40271b8039fb25 Mon Sep 17 00:00:00 2001
From: Arun Sharma <asharma@fb.com>
Date: Fri, 20 Apr 2012 15:41:35 -0700
Subject: perf/x86: Check if user fp is valid

Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1334961696-19580-4-git-send-email-asharma@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/uaccess.h   | 12 ++++++------
 arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 04cd6882308e..e1f3a17034fc 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -33,9 +33,8 @@
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
 #define user_addr_max() (current_thread_info()->addr_limit.seg)
-#define __addr_ok(addr)					\
-	((unsigned long __force)(addr) <		\
-	 (current_thread_info()->addr_limit.seg))
+#define __addr_ok(addr) 	\
+	((unsigned long __force)(addr) < user_addr_max())
 
 /*
  * Test whether a block of memory is a valid user space address.
@@ -47,14 +46,14 @@
  * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */
 
-#define __range_not_ok(addr, size)					\
+#define __range_not_ok(addr, size, limit)				\
 ({									\
 	unsigned long flag, roksum;					\
 	__chk_user_ptr(addr);						\
 	asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0"		\
 	    : "=&r" (flag), "=r" (roksum)				\
 	    : "1" (addr), "g" ((long)(size)),				\
-	      "rm" (current_thread_info()->addr_limit.seg));		\
+	      "rm" (limit));						\
 	flag;								\
 })
 
@@ -77,7 +76,8 @@
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
+#define access_ok(type, addr, size) \
+	(likely(__range_not_ok(addr, size, user_addr_max()) == 0))
 
 /*
  * The exception table consists of pairs of addresses relative to the
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e78bc256aea8..c4706cf9c011 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1757,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
 
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
 #ifdef CONFIG_COMPAT
 
 #include <asm/compat.h>
@@ -1781,6 +1787,9 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (bytes != sizeof(frame))
 			break;
 
+		if (!valid_user_frame(fp, sizeof(frame)))
+			break;
+
 		perf_callchain_store(entry, frame.return_address);
 		fp = compat_ptr(frame.next_frame);
 	}
@@ -1824,6 +1833,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		if (bytes != sizeof(frame))
 			break;
 
+		if (!valid_user_frame(fp, sizeof(frame)))
+			break;
+
 		perf_callchain_store(entry, frame.return_address);
 		fp = frame.next_frame;
 	}
-- 
cgit v1.2.3


From db0dc75d6403b6663c0eab4c6ccb672eb9b2ed72 Mon Sep 17 00:00:00 2001
From: Arun Sharma <asharma@fb.com>
Date: Fri, 20 Apr 2012 15:41:36 -0700
Subject: perf/x86: Check user address explicitly in copy_from_user_nmi()

Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1334961696-19580-5-git-send-email-asharma@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/usercopy.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index f61ee67ec00f..677b1ed184c9 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 
 #include <asm/word-at-a-time.h>
+#include <linux/sched.h>
 
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
@@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	void *map;
 	int ret;
 
+	if (__range_not_ok(from, n, TASK_SIZE) == 0)
+		return len;
+
 	do {
 		ret = __get_user_pages_fast(addr, 1, 0, &page);
 		if (!ret)
-- 
cgit v1.2.3


From 743628e868c5992354fc80b4d1e9a6143da1c0e6 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Thu, 7 Jun 2012 09:05:21 -0700
Subject: x86, efi stub: Add .reloc section back into image

Some UEFI firmware will not load a .efi with a .reloc section
with a size of 0.

Therefore, we create a .efi image with 4 main areas and 3 sections.
1. PE/COFF file header
2. .setup section (covers all setup code following the first sector)
3. .reloc section (contains 1 dummy reloc entry, created in build.c)
4. .text section (covers the remaining kernel image)

To make room for the new .setup section data, the header
bugger_off_msg had to be shortened.

Reported-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Link: http://lkml.kernel.org/r/1339085121-12760-1-git-send-email-jordan.l.justen@intel.com
Tested-by: Lee G Rosenbaum <lee.g.rosenbaum@intel.com>
Tested-by: Henrik Rydberg <rydberg@euromail.se>
Cc: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/header.S      |  42 ++++++++---
 arch/x86/boot/tools/build.c | 172 ++++++++++++++++++++++++++++----------------
 2 files changed, 140 insertions(+), 74 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 8bbea6aa40d9..efe5acfc79c3 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -94,10 +94,10 @@ bs_die:
 
 	.section ".bsdata", "a"
 bugger_off_msg:
-	.ascii	"Direct booting from floppy is no longer supported.\r\n"
-	.ascii	"Please use a boot loader program instead.\r\n"
+	.ascii	"Direct floppy boot is not supported. "
+	.ascii	"Use a boot loader program instead.\r\n"
 	.ascii	"\n"
-	.ascii	"Remove disk and press any key to reboot . . .\r\n"
+	.ascii	"Remove disk and press any key to reboot ...\r\n"
 	.byte	0
 
 #ifdef CONFIG_EFI_STUB
@@ -111,7 +111,7 @@ coff_header:
 #else
 	.word	0x8664				# x86-64
 #endif
-	.word	2				# nr_sections
+	.word	3				# nr_sections
 	.long	0 				# TimeDateStamp
 	.long	0				# PointerToSymbolTable
 	.long	1				# NumberOfSymbols
@@ -158,8 +158,8 @@ extra_header_fields:
 #else
 	.quad	0				# ImageBase
 #endif
-	.long	0x1000				# SectionAlignment
-	.long	0x200				# FileAlignment
+	.long	0x20				# SectionAlignment
+	.long	0x20				# FileAlignment
 	.word	0				# MajorOperatingSystemVersion
 	.word	0				# MinorOperatingSystemVersion
 	.word	0				# MajorImageVersion
@@ -200,8 +200,10 @@ extra_header_fields:
 
 	# Section table
 section_table:
-	.ascii	".text"
-	.byte	0
+	#
+	# The offset & size fields are filled in by build.c.
+	#
+	.ascii	".setup"
 	.byte	0
 	.byte	0
 	.long	0
@@ -217,9 +219,8 @@ section_table:
 
 	#
 	# The EFI application loader requires a relocation section
-	# because EFI applications must be relocatable. But since
-	# we don't need the loader to fixup any relocs for us, we
-	# just create an empty (zero-length) .reloc section header.
+	# because EFI applications must be relocatable. The .reloc
+	# offset & size fields are filled in by build.c.
 	#
 	.ascii	".reloc"
 	.byte	0
@@ -233,6 +234,25 @@ section_table:
 	.word	0				# NumberOfRelocations
 	.word	0				# NumberOfLineNumbers
 	.long	0x42100040			# Characteristics (section flags)
+
+	#
+	# The offset & size fields are filled in by build.c.
+	#
+	.ascii	".text"
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0				# startup_{32,64}
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0				# startup_{32,64}
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x60500020			# Characteristics (section flags)
+
 #endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup.  This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 3f61f6e2b46f..4b8e165ee572 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -50,6 +50,8 @@ typedef unsigned int   u32;
 u8 buf[SETUP_SECT_MAX*512];
 int is_big_kernel;
 
+#define PECOFF_RELOC_RESERVE 0x20
+
 /*----------------------------------------------------------------------*/
 
 static const u32 crctab32[] = {
@@ -133,11 +135,103 @@ static void usage(void)
 	die("Usage: build setup system [> image]");
 }
 
-int main(int argc, char ** argv)
-{
 #ifdef CONFIG_EFI_STUB
-	unsigned int file_sz, pe_header;
+
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+	unsigned int pe_header;
+	unsigned short num_sections;
+	u8 *section;
+
+	pe_header = get_unaligned_le32(&buf[0x3c]);
+	num_sections = get_unaligned_le16(&buf[pe_header + 6]);
+
+#ifdef CONFIG_X86_32
+	section = &buf[pe_header + 0xa8];
+#else
+	section = &buf[pe_header + 0xb8];
 #endif
+
+	while (num_sections > 0) {
+		if (strncmp((char*)section, section_name, 8) == 0) {
+			/* section header size field */
+			put_unaligned_le32(size, section + 0x8);
+
+			/* section header vma field */
+			put_unaligned_le32(offset, section + 0xc);
+
+			/* section header 'size of initialised data' field */
+			put_unaligned_le32(size, section + 0x10);
+
+			/* section header 'file offset' field */
+			put_unaligned_le32(offset, section + 0x14);
+
+			break;
+		}
+		section += 0x28;
+		num_sections--;
+	}
+}
+
+static void update_pecoff_setup_and_reloc(unsigned int size)
+{
+	u32 setup_offset = 0x200;
+	u32 reloc_offset = size - PECOFF_RELOC_RESERVE;
+	u32 setup_size = reloc_offset - setup_offset;
+
+	update_pecoff_section_header(".setup", setup_offset, setup_size);
+	update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE);
+
+	/*
+	 * Modify .reloc section contents with a single entry. The
+	 * relocation is applied to offset 10 of the relocation section.
+	 */
+	put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]);
+	put_unaligned_le32(10, &buf[reloc_offset + 4]);
+}
+
+static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
+{
+	unsigned int pe_header;
+	unsigned int text_sz = file_sz - text_start;
+
+	pe_header = get_unaligned_le32(&buf[0x3c]);
+
+	/* Size of image */
+	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
+
+	/*
+	 * Size of code: Subtract the size of the first sector (512 bytes)
+	 * which includes the header.
+	 */
+	put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Address of entry point.
+	 *
+	 * The EFI stub entry point is +16 bytes from the start of
+	 * the .text section.
+	 */
+	put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]);
+#else
+	/*
+	 * Address of entry point. startup_32 is at the beginning and
+	 * the 64-bit entry point (startup_64) is always 512 bytes
+	 * after. The EFI stub entry point is 16 bytes after that, as
+	 * the first instruction allows legacy loaders to jump over
+	 * the EFI stub initialisation
+	 */
+	put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]);
+#endif /* CONFIG_X86_32 */
+
+	update_pecoff_section_header(".text", text_start, text_sz);
+}
+
+#endif /* CONFIG_EFI_STUB */
+
+int main(int argc, char ** argv)
+{
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
@@ -163,6 +257,12 @@ int main(int argc, char ** argv)
 		die("Boot block hasn't got boot flag (0xAA55)");
 	fclose(file);
 
+#ifdef CONFIG_EFI_STUB
+	/* Reserve 0x20 bytes for .reloc section */
+	memset(buf+c, 0, PECOFF_RELOC_RESERVE);
+	c += PECOFF_RELOC_RESERVE;
+#endif
+
 	/* Pad unused space with zeros */
 	setup_sectors = (c + 511) / 512;
 	if (setup_sectors < SETUP_SECT_MIN)
@@ -170,6 +270,10 @@ int main(int argc, char ** argv)
 	i = setup_sectors*512;
 	memset(buf+c, 0, i-c);
 
+#ifdef CONFIG_EFI_STUB
+	update_pecoff_setup_and_reloc(i);
+#endif
+
 	/* Set the default root device */
 	put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
 
@@ -194,66 +298,8 @@ int main(int argc, char ** argv)
 	put_unaligned_le32(sys_size, &buf[0x1f4]);
 
 #ifdef CONFIG_EFI_STUB
-	file_sz = sz + i + ((sys_size * 16) - sz);
-
-	pe_header = get_unaligned_le32(&buf[0x3c]);
-
-	/* Size of image */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
-	/*
-	 * Subtract the size of the first section (512 bytes) which
-	 * includes the header and .reloc section. The remaining size
-	 * is that of the .text section.
-	 */
-	file_sz -= 512;
-
-	/* Size of code */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Address of entry point.
-	 *
-	 * The EFI stub entry point is +16 bytes from the start of
-	 * the .text section.
-	 */
-	put_unaligned_le32(i + 16, &buf[pe_header + 0x28]);
-
-	/* .text size */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]);
-
-	/* .text vma */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xb4]);
-
-	/* .text size of initialised data */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]);
-
-	/* .text file offset */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xbc]);
-#else
-	/*
-	 * Address of entry point. startup_32 is at the beginning and
-	 * the 64-bit entry point (startup_64) is always 512 bytes
-	 * after. The EFI stub entry point is 16 bytes after that, as
-	 * the first instruction allows legacy loaders to jump over
-	 * the EFI stub initialisation
-	 */
-	put_unaligned_le32(i + 528, &buf[pe_header + 0x28]);
-
-	/* .text size */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]);
-
-	/* .text vma */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xc4]);
-
-	/* .text size of initialised data */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]);
-
-	/* .text file offset */
-	put_unaligned_le32(0x200, &buf[pe_header + 0xcc]);
-#endif /* CONFIG_X86_32 */
-#endif /* CONFIG_EFI_STUB */
+	update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+#endif
 
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, stdout) != i)
-- 
cgit v1.2.3


From bd2753b2dda7bb43c7468826de75f49c6a7e8965 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 Jun 2012 10:55:40 -0700
Subject: x86/mm: Only add extra pages count for the first memory range during
 pre-allocation early page table space

Robin found this regression:

| I just tried to boot an 8TB system.  It fails very early in boot with:
| Kernel panic - not syncing: Cannot find space for the kernel page tables

git bisect commit 722bc6b16771ed80871e1fd81c86d3627dda2ac8.

A git revert of that commit does boot past that point on the 8TB
configuration.

That commit will add up extra pages for all memory range even
above 4g.

Try to limit that extra page count adding to first entry only.

Bisected-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/CAE9FiQUj3wyzQxtq9yzBNc9u220p8JZ1FYHG7t%3DMOzJ%3D9BZMYA@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 97141c26a13a..bc4e9d84157f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
 		extra += PMD_SIZE;
 #endif
 		/* The first 2/4M doesn't use large pages. */
-		extra += mr->end - mr->start;
+		if (mr->start < PMD_SIZE)
+			extra += mr->end - mr->start;
 
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
-- 
cgit v1.2.3


From d5d2d2eea84b0d8450b082edbc3dbde41fb8bfd8 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Thu, 7 Jun 2012 08:31:40 -0500
Subject: x86/uv: Fix UV2 BAU legacy mode

The SGI Altix UV2 BAU (Broadcast Assist Unit) as used for
tlb-shootdown (selective broadcast mode) always uses UV2
broadcast descriptor format. There is no need to clear the
'legacy' (UV1) mode, because the hardware always uses UV2 mode
for selective broadcast.

But the BIOS uses general broadcast and legacy mode, and the
hardware pays attention to the legacy mode bit for general
broadcast. So the kernel must not clear that mode bit.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/E1SccoO-0002Lh-Cb@eag09.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/uv/uv_bau.h | 1 -
 arch/x86/platform/uv/tlb_uv.c    | 1 -
 2 files changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index becf47b81735..6149b476d9df 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -149,7 +149,6 @@
 /* 4 bits of software ack period */
 #define UV2_ACK_MASK			0x7UL
 #define UV2_ACK_UNITS_SHFT		3
-#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
 #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
 
 /*
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 3ae0e61abd23..59880afa851f 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1295,7 +1295,6 @@ static void __init enable_timeouts(void)
 		 */
 		mmr_image |= (1L << SOFTACK_MSHIFT);
 		if (is_uv2_hub()) {
-			mmr_image &= ~(1L << UV2_LEG_SHFT);
 			mmr_image |= (1L << UV2_EXT_SHFT);
 		}
 		write_mmr_misc_control(pnode, mmr_image);
-- 
cgit v1.2.3


From eeaaa96a3a2134a174100afd129bb0891d05f4b2 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Wed, 6 Jun 2012 10:05:42 -0400
Subject: x86/nmi: Fix section mismatch warnings on 32-bit

It was reported that compiling for 32-bit caused a bunch of
section mismatch warnings:

 VDSOSYM arch/x86/vdso/vdso32-syms.lds
  LD      arch/x86/vdso/built-in.o
  LD      arch/x86/built-in.o

 WARNING: arch/x86/built-in.o(.data+0x5af0): Section mismatch in
 reference from the variable test_nmi_ipi_callback_na.10451 to
 the function .init.text:test_nmi_ipi_callback() [...]

 WARNING: arch/x86/built-in.o(.data+0x5b04): Section mismatch in
 reference from the variable nmi_unk_cb_na.10399 to the function
 .init.text:nmi_unk_cb() The variable nmi_unk_cb_na.10399
 references the function __init nmi_unk_cb() [...]

Both of these are attributed to the internal representation of
the nmiaction struct created during register_nmi_handler.  The
reason for this is that those structs are not defined in the
init section whereas the rest of the code in nmi_selftest.c is.

To resolve this, I created a new #define,
register_nmi_handler_initonly, that tags the struct as
__initdata to resolve the mismatch.  This #define should only be
used in rare situations where the register/unregister is called
during init of the kernel.

Big thanks to Jan Beulich for decoding this for me as I didn't
have a clue what was going on.

Reported-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Tested-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Cc: Jan Beulich <JBeulich@suse.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/1338991542-23000-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nmi.h     | 14 ++++++++++++++
 arch/x86/kernel/nmi_selftest.c |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 0e3793b821ef..dc580c42851c 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -54,6 +54,20 @@ struct nmiaction {
 	__register_nmi_handler((t), &fn##_na);	\
 })
 
+/*
+ * For special handlers that register/unregister in the
+ * init section only.  This should be considered rare.
+ */
+#define register_nmi_handler_initonly(t, fn, fg, n)		\
+({							\
+	static struct nmiaction fn##_na __initdata = {		\
+		.handler = (fn),			\
+		.name = (n),				\
+		.flags = (fg),				\
+	};						\
+	__register_nmi_handler((t), &fn##_na);	\
+})
+
 int __register_nmi_handler(unsigned int, struct nmiaction *);
 
 void unregister_nmi_handler(unsigned int, const char *);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index e31bf8d5c4d2..149b8d9c6ad4 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
 static void __init init_nmi_testsuite(void)
 {
 	/* trap all the unknown NMIs we may generate */
-	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+	register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
 }
 
 static void __init cleanup_nmi_testsuite(void)
@@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
 {
 	unsigned long timeout;
 
-	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+	if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback,
 				 NMI_FLAG_FIRST, "nmi_selftest")) {
 		nmi_fail = FAILURE;
 		return;
-- 
cgit v1.2.3


From ae10ccdc3093486f8c2369d227583f9d79f628e5 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:04 +0800
Subject: ACPI: Make acpi_skip_timer_override cover all source_irq==0 cases

Currently when acpi_skip_timer_override is set, it only cover the
(source_irq == 0 && global_irq == 2) cases. While there is also
platform which need use this option and its global_irq is not 2.
This patch will extend acpi_skip_timer_override to cover all
timer overriding cases as long as the source irq is 0.

This is the first part of a fix to kernel bug bugzilla 40002:
	"IRQ 0 assigned to VGA"
https://bugzilla.kernel.org/show_bug.cgi?id=40002

Reported-and-tested-by: Szymon Kowalczyk <fazerxlo@o2.pl>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/boot.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8afb69319815..e7c698e9c7ec 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 		return 0;
 	}
 
-	if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+	if (intsrc->source_irq == 0) {
 		if (acpi_skip_timer_override) {
-			printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+			printk(PREFIX "BIOS IRQ0 override ignored.\n");
 			return 0;
 		}
-		if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+		if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+			&& (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
 			intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
 			printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
 		}
@@ -1334,7 +1336,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
 }
 
 /*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
@@ -1344,7 +1346,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 	 */
 	if (!acpi_skip_timer_override) {
 		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
-		pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+		pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
 			d->ident);
 		acpi_skip_timer_override = 1;
 	}
@@ -1438,7 +1440,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
 	 * is enabled.  This input is incorrectly designated the
 	 * ISA IRQ 0 via an interrupt source override even though
 	 * it is wired to the output of the master 8259A and INTIN0
-	 * is not connected at all.  Force ignoring BIOS IRQ0 pin2
+	 * is not connected at all.  Force ignoring BIOS IRQ0
 	 * override in that cases.
 	 */
 	{
-- 
cgit v1.2.3


From 7f68b4c2e158019c2ec494b5cfbd9c83b4e5b253 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:05 +0800
Subject: ACPI: Remove one board specific WARN when ignoring timer overriding

Current WARN msg is only for the ati_ixp4x0 board, while this function
is used by mulitple platforms. So this one board specific warning
is not appropriate any more.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/boot.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e7c698e9c7ec..3a6afba6eca7 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1340,12 +1340,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
-	/*
-	 * The ati_ixp4x0_rev() early PCI quirk should have set
-	 * the acpi_skip_timer_override flag already:
-	 */
 	if (!acpi_skip_timer_override) {
-		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
 		pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
 			d->ident);
 		acpi_skip_timer_override = 1;
-- 
cgit v1.2.3


From f6b54f083cc66cf9b11d2120d8df3c2ad4e0836d Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Mon, 4 Jun 2012 15:00:06 +0800
Subject: ACPI: Add a quirk for "AMILO PRO V2030" to ignore the timer
 overriding

This is the 2nd part of fix for kernel bugzilla 40002:
    "IRQ 0 assigned to VGA"
https://bugzilla.kernel.org/show_bug.cgi?id=40002

The root cause is the buggy FW, whose ACPI tables assign the GSI 16
to 2 irqs 0 and 16(VGA), and the VGA is the right owner of GSI 16.
So add a quirk to ignore the irq0 overriding GSI 16 for the
FUJITSU SIEMENS AMILO PRO V2030 platform will solve this issue.

Reported-and-tested-by: Szymon Kowalczyk <fazerxlo@o2.pl>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/boot.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3a6afba6eca7..b2297e58c6ed 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1470,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
 		     },
 	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "FUJITSU SIEMENS",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+		     },
+	 },
 	{}
 };
 
-- 
cgit v1.2.3