From ff14c1d01576fb839a925a42596582f6c68a1a1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:16 +0200
Subject: x86, mm: Use MAX_DMA_PFN for ZONE_DMA on 32-bit

Use MAX_DMA_PFN which represents the 16 MB ISA DMA limit on
32-bit x86 just like we do on 64-bit.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-1-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 29f7c6d98179..434c97d620c2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -679,8 +679,7 @@ static void __init zone_sizes_init(void)
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-- 
cgit v1.2.3


From 4c0b2e5f8940fec7cbeafcf641fecd5e746329c5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:17 +0200
Subject: x86, mm: Move zone init from paging_init() on 64-bit

This patch introduces a zone_sizes_init() helper function on
64-bit to make it more similar to 32-bit init.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-2-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bbaaa005bf0e..3ddda59f7087 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,7 +612,7 @@ void __init initmem_init(void)
 }
 #endif
 
-void __init paging_init(void)
+static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 
@@ -623,6 +623,11 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
+	free_area_init_nodes(max_zone_pfns);
+}
+
+void __init paging_init(void)
+{
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
 
@@ -634,7 +639,7 @@ void __init paging_init(void)
 	 */
 	node_clear_state(0, N_NORMAL_MEMORY);
 
-	free_area_init_nodes(max_zone_pfns);
+	zone_sizes_init();
 }
 
 /*
-- 
cgit v1.2.3


From e4794640ca408acda18eb31b126f58a58803b9c9 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:18 +0200
Subject: x86, mm: Use max_pfn instead of highend_pfn

The 'highend_pfn' variable is always set to 'max_pfn' so just
use the latter directly.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-3-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 434c97d620c2..5ac0118b7610 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -683,7 +683,7 @@ static void __init zone_sizes_init(void)
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.2.3


From 80b3cac97bc14fdf839d967602e599cbf82ea336 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:19 +0200
Subject: x86, mm: Wrap ZONE_DMA32 with CONFIG_ZONE_DMA32

In preparation for unifying 32-bit and 64-bit zone_sizes_init()
make sure ZONE_DMA32 is wrapped in CONFIG_ZONE_DMA32.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Arun Sharma <asharma@fb.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-4-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3ddda59f7087..a9214e6e721a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -620,7 +620,9 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
+#ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.2.3


From ece838b6257412647197c072fe59dfc6615df144 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:20 +0200
Subject: x86, mm: Use max_low_pfn for ZONE_NORMAL on 64-bit

64-bit has no highmem so max_low_pfn is always the same as
'max_pfn'.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-5-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a9214e6e721a..f6b1f087cced 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -623,7 +623,7 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
-	max_zone_pfns[ZONE_NORMAL] = max_pfn;
+	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.2.3


From 248b52b97da7a712d2263a51d8d84c959f38ef75 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:21 +0200
Subject: x86, mm: Prepare zone_sizes_init() for unification

Make 32-bit and 64-bit zone_sizes_init() identical in
preparation for unification.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-6-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 4 ++++
 arch/x86/mm/init_64.c | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5ac0118b7610..27455b958b8d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -677,9 +677,13 @@ void __init initmem_init(void)
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6b1f087cced..06c4360cf796 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -624,6 +624,9 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.2.3


From 176239153049a023d060ce95b05f7ef31667e362 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:22 +0200
Subject: x86, mm: Unify zone_sizes_init()

Now that zone_sizes_init() is identical on 32-bit and 64-bit,
move the code to arch/x86/mm/init.c and use it for both
architectures.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-7-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/init.h |  2 ++
 arch/x86/mm/init.c          | 23 +++++++++++++++++++++++
 arch/x86/mm/init_32.c       | 19 -------------------
 arch/x86/mm/init_64.c       | 19 -------------------
 4 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353e41e1..adcc0ae73d09 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
 extern void __init early_ioremap_page_table_range_init(void);
 #endif
 
+extern void __init zone_sizes_init(void);
+
 extern unsigned long __init
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b93a65c..2426b60bb409 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>	/* for max_low_pfn */
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
@@ -15,6 +16,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/proto.h>
+#include <asm/dma.h>		/* for MAX_DMA_PFN */
 
 unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, PAGE_ALIGN(end));
 }
 #endif
+
+void __init zone_sizes_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN;
+#endif
+	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
+#endif
+
+	free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27455b958b8d..3bebaed5021c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -674,25 +674,6 @@ void __init initmem_init(void)
 }
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init setup_bootmem_allocator(void)
 {
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 06c4360cf796..6fcce7d34555 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,25 +612,6 @@ void __init initmem_init(void)
 }
 #endif
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init paging_init(void)
 {
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-- 
cgit v1.2.3


From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:43:02 -0800
Subject: x86-64, syscall: Adjust comment spacing and remove typo

Adjust spacing for comment so that it matches the multiline comment
style used in the rest of the kernel, and remove word duplication.

It is not really clear what version of gcc this refers to, but the
extra & doesn't cause any harm, so there is no reason to remove it.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/syscall_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..0edfafa1b269 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -21,9 +21,9 @@ extern void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	*Smells like a like a compiler bug -- it doesn't work
-	*when the & below is removed.
-	*/
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
 #include <asm/unistd_64.h>
 };
-- 
cgit v1.2.3


From e79a7fccfb2ab10f8753ac634a1c8473e870ae6c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:48:42 -0800
Subject: x86-64, ia32: Move compat_ni_syscall into C and its own file

Move compat_ni_syscall out of ia32entry.S and into its own .c file.
Although this is a trivial function, it is not performance-critical,
and this will simplify further cleanups.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/ia32/Makefile    | 1 +
 arch/x86/ia32/ia32entry.S | 3 ---
 arch/x86/ia32/nosyscall.c | 7 +++++++
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/ia32/nosyscall.c

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccfcf6ea..eea9a1c77d38 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec1b284..59538a777695 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -453,9 +453,6 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
-quiet_ni_syscall:
-	movq $-ENOSYS,%rax
-	ret
 	CFI_ENDPROC
 	
 	.macro PTREGSCALL label, func, arg
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
new file mode 100644
index 000000000000..51ecd5b4e787
--- /dev/null
+++ b/arch/x86/ia32/nosyscall.c
@@ -0,0 +1,7 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+long compat_ni_syscall(void)
+{
+	return -ENOSYS;
+}
-- 
cgit v1.2.3


From d181764ccf6207e02abb95fb3052639b947f4833 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:55:49 -0800
Subject: x86: Machine-readable syscall tables and scripts to process them

Create a simple set of syscall tables and scripts to turn them into
both header files (unistd_*.h) and macros for generating the system
call tables.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/Makefile       |  43 +++++
 arch/x86/syscalls/syscall_32.tbl | 357 +++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_64.tbl | 320 +++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscallhdr.sh  |  36 ++++
 arch/x86/syscalls/syscalltbl.sh  |  15 ++
 5 files changed, 771 insertions(+)
 create mode 100644 arch/x86/syscalls/Makefile
 create mode 100644 arch/x86/syscalls/syscall_32.tbl
 create mode 100644 arch/x86/syscalls/syscall_64.tbl
 create mode 100644 arch/x86/syscalls/syscallhdr.sh
 create mode 100644 arch/x86/syscalls/syscalltbl.sh

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
new file mode 100644
index 000000000000..564b2476fede
--- /dev/null
+++ b/arch/x86/syscalls/Makefile
@@ -0,0 +1,43 @@
+out := $(obj)/../include/generated/asm
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+syscall32 := $(srctree)/$(src)/syscall_32.tbl
+syscall64 := $(srctree)/$(src)/syscall_64.tbl
+
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
+		   $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
+quiet_cmd_systbl = SYSTBL  $@
+      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+
+syshdr_abi_unistd_32 := i386
+$(out)/unistd_32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_32_ia32 := i386
+syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_64 := 64
+$(out)/unistd_64.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: $(syscall32) $(systbl)
+	$(call if_changed,systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl)
+	$(call if_changed,systbl)
+
+syshdr-y			+= unistd_32.h unistd_64.h
+syshdr-y			+= syscalls_32.h
+syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
+syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+
+targets	+= $(syshdr-y)
+
+all: $(addprefix $(out)/,$(targets))
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..ce98e287c066
--- /dev/null
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,357 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The abi is always "i386" for this file.
+#
+0	i386	restart_syscall		sys_restart_syscall
+1	i386	exit			sys_exit
+2	i386	fork			ptregs_fork			stub32_fork
+3	i386	read			sys_read
+4	i386	write			sys_write
+5	i386	open			sys_open			compat_sys_open
+6	i386	close			sys_close
+7	i386	waitpid			sys_waitpid			sys32_waitpid
+8	i386	creat			sys_creat
+9	i386	link			sys_link
+10	i386	unlink			sys_unlink
+11	i386	execve			ptregs_execve			stub32_execve
+12	i386	chdir			sys_chdir
+13	i386	time			sys_time			compat_sys_time
+14	i386	mknod			sys_mknod
+15	i386	chmod			sys_chmod
+16	i386	lchown			sys_lchown16
+17	i386	break
+18	i386	oldstat			sys_stat
+19	i386	lseek			sys_lseek			sys32_lseek
+20	i386	getpid			sys_getpid
+21	i386	mount			sys_mount			compat_sys_mount
+22	i386	umount			sys_oldumount
+23	i386	setuid			sys_setuid16
+24	i386	getuid			sys_getuid16
+25	i386	stime			sys_stime			compat_sys_stime
+26	i386	ptrace			sys_ptrace			compat_sys_ptrace
+27	i386	alarm			sys_alarm
+28	i386	oldfstat		sys_fstat
+29	i386	pause			sys_pause
+30	i386	utime			sys_utime			compat_sys_utime
+31	i386	stty
+32	i386	gtty
+33	i386	access			sys_access
+34	i386	nice			sys_nice
+35	i386	ftime
+36	i386	sync			sys_sync
+37	i386	kill			sys_kill			sys32_kill
+38	i386	rename			sys_rename
+39	i386	mkdir			sys_mkdir
+40	i386	rmdir			sys_rmdir
+41	i386	dup			sys_dup
+42	i386	pipe			sys_pipe
+43	i386	times			sys_times			compat_sys_times
+44	i386	prof
+45	i386	brk			sys_brk
+46	i386	setgid			sys_setgid16
+47	i386	getgid			sys_getgid16
+48	i386	signal			sys_signal
+49	i386	geteuid			sys_geteuid16
+50	i386	getegid			sys_getegid16
+51	i386	acct			sys_acct
+52	i386	umount2			sys_umount
+53	i386	lock
+54	i386	ioctl			sys_ioctl			compat_sys_ioctl
+55	i386	fcntl			sys_fcntl			compat_sys_fcntl64
+56	i386	mpx
+57	i386	setpgid			sys_setpgid
+58	i386	ulimit
+59	i386	oldolduname		sys_olduname
+60	i386	umask			sys_umask
+61	i386	chroot			sys_chroot
+62	i386	ustat			sys_ustat			compat_sys_ustat
+63	i386	dup2			sys_dup2
+64	i386	getppid			sys_getppid
+65	i386	getpgrp			sys_getpgrp
+66	i386	setsid			sys_setsid
+67	i386	sigaction		sys_sigaction			sys32_sigaction
+68	i386	sgetmask		sys_sgetmask
+69	i386	ssetmask		sys_ssetmask
+70	i386	setreuid		sys_setreuid16
+71	i386	setregid		sys_setregid16
+72	i386	sigsuspend		sys_sigsuspend			sys32_sigsuspend
+73	i386	sigpending		sys_sigpending			compat_sys_sigpending
+74	i386	sethostname		sys_sethostname
+75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit
+76	i386	getrlimit		sys_old_getrlimit		compat_sys_old_getrlimit
+77	i386	getrusage		sys_getrusage			compat_sys_getrusage
+78	i386	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday
+79	i386	settimeofday		sys_settimeofday		compat_sys_settimeofday
+80	i386	getgroups		sys_getgroups16
+81	i386	setgroups		sys_setgroups16
+82	i386	select			sys_old_select			compat_sys_old_select
+83	i386	symlink			sys_symlink
+84	i386	oldlstat		sys_lstat
+85	i386	readlink		sys_readlink
+86	i386	uselib			sys_uselib
+87	i386	swapon			sys_swapon
+88	i386	reboot			sys_reboot
+89	i386	readdir			sys_old_readdir			compat_sys_old_readdir
+90	i386	mmap			sys_old_mmap			sys32_mmap
+91	i386	munmap			sys_munmap
+92	i386	truncate		sys_truncate
+93	i386	ftruncate		sys_ftruncate
+94	i386	fchmod			sys_fchmod
+95	i386	fchown			sys_fchown16
+96	i386	getpriority		sys_getpriority
+97	i386	setpriority		sys_setpriority
+98	i386	profil
+99	i386	statfs			sys_statfs			compat_sys_statfs
+100	i386	fstatfs			sys_fstatfs			compat_sys_fstatfs
+101	i386	ioperm			sys_ioperm
+102	i386	socketcall		sys_socketcall			compat_sys_socketcall
+103	i386	syslog			sys_syslog
+104	i386	setitimer		sys_setitimer			compat_sys_setitimer
+105	i386	getitimer		sys_getitimer			compat_sys_getitimer
+106	i386	stat			sys_newstat			compat_sys_newstat
+107	i386	lstat			sys_newlstat			compat_sys_newlstat
+108	i386	fstat			sys_newfstat			compat_sys_newfstat
+109	i386	olduname		sys_uname
+110	i386	iopl			ptregs_iopl			stub32_iopl
+111	i386	vhangup			sys_vhangup
+112	i386	idle
+113	i386	vm86old			ptregs_vm86old			sys32_vm86_warning
+114	i386	wait4			sys_wait4			compat_sys_wait4
+115	i386	swapoff			sys_swapoff
+116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
+117	i386	ipc			sys_ipc				sys32_ipc
+118	i386	fsync			sys_fsync
+119	i386	sigreturn		ptregs_sigreturn		stub32_sigreturn
+120	i386	clone			ptregs_clone			stub32_clone
+121	i386	setdomainname		sys_setdomainname
+122	i386	uname			sys_newuname
+123	i386	modify_ldt		sys_modify_ldt
+124	i386	adjtimex		sys_adjtimex			compat_sys_adjtimex
+125	i386	mprotect		sys_mprotect			sys32_mprotect
+126	i386	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
+127	i386	create_module
+128	i386	init_module		sys_init_module
+129	i386	delete_module		sys_delete_module
+130	i386	get_kernel_syms
+131	i386	quotactl		sys_quotactl			sys32_quotactl
+132	i386	getpgid			sys_getpgid
+133	i386	fchdir			sys_fchdir
+134	i386	bdflush			sys_bdflush
+135	i386	sysfs			sys_sysfs
+136	i386	personality		sys_personality
+137	i386	afs_syscall
+138	i386	setfsuid		sys_setfsuid16
+139	i386	setfsgid		sys_setfsgid16
+140	i386	_llseek			sys_llseek
+141	i386	getdents		sys_getdents			compat_sys_getdents
+142	i386	_newselect		sys_select			compat_sys_select
+143	i386	flock			sys_flock
+144	i386	msync			sys_msync
+145	i386	readv			sys_readv			compat_sys_readv
+146	i386	writev			sys_writev			compat_sys_writev
+147	i386	getsid			sys_getsid
+148	i386	fdatasync		sys_fdatasync
+149	i386	_sysctl			sys_sysctl			compat_sys_sysctl
+150	i386	mlock			sys_mlock
+151	i386	munlock			sys_munlock
+152	i386	mlockall		sys_mlockall
+153	i386	munlockall		sys_munlockall
+154	i386	sched_setparam		sys_sched_setparam
+155	i386	sched_getparam		sys_sched_getparam
+156	i386	sched_setscheduler	sys_sched_setscheduler
+157	i386	sched_getscheduler	sys_sched_getscheduler
+158	i386	sched_yield		sys_sched_yield
+159	i386	sched_get_priority_max	sys_sched_get_priority_max
+160	i386	sched_get_priority_min	sys_sched_get_priority_min
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	sys32_sched_rr_get_interval
+162	i386	nanosleep		sys_nanosleep			compat_sys_nanosleep
+163	i386	mremap			sys_mremap
+164	i386	setresuid		sys_setresuid16
+165	i386	getresuid		sys_getresuid16
+166	i386	vm86			ptregs_vm86			sys32_vm86_warning
+167	i386	query_module
+168	i386	poll			sys_poll
+169	i386	nfsservctl
+170	i386	setresgid		sys_setresgid16
+171	i386	getresgid		sys_getresgid16
+172	i386	prctl			sys_prctl
+173	i386	rt_sigreturn		ptregs_rt_sigreturn		stub32_rt_sigreturn
+174	i386	rt_sigaction		sys_rt_sigaction		sys32_rt_sigaction
+175	i386	rt_sigprocmask		sys_rt_sigprocmask		sys32_rt_sigprocmask
+176	i386	rt_sigpending		sys_rt_sigpending		sys32_rt_sigpending
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		sys32_rt_sigqueueinfo
+179	i386	rt_sigsuspend		sys_rt_sigsuspend
+180	i386	pread64			sys_pread64			sys32_pread
+181	i386	pwrite64		sys_pwrite64			sys32_pwrite
+182	i386	chown			sys_chown16
+183	i386	getcwd			sys_getcwd
+184	i386	capget			sys_capget
+185	i386	capset			sys_capset
+186	i386	sigaltstack		ptregs_sigaltstack		stub32_sigaltstack
+187	i386	sendfile		sys_sendfile			sys32_sendfile
+188	i386	getpmsg
+189	i386	putpmsg
+190	i386	vfork			ptregs_vfork			stub32_vfork
+191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit
+192	i386	mmap2			sys_mmap_pgoff
+193	i386	truncate64		sys_truncate64			sys32_truncate64
+194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64
+195	i386	stat64			sys_stat64			sys32_stat64
+196	i386	lstat64			sys_lstat64			sys32_lstat64
+197	i386	fstat64			sys_fstat64			sys32_fstat64
+198	i386	lchown32		sys_lchown
+199	i386	getuid32		sys_getuid
+200	i386	getgid32		sys_getgid
+201	i386	geteuid32		sys_geteuid
+202	i386	getegid32		sys_getegid
+203	i386	setreuid32		sys_setreuid
+204	i386	setregid32		sys_setregid
+205	i386	getgroups32		sys_getgroups
+206	i386	setgroups32		sys_setgroups
+207	i386	fchown32		sys_fchown
+208	i386	setresuid32		sys_setresuid
+209	i386	getresuid32		sys_getresuid
+210	i386	setresgid32		sys_setresgid
+211	i386	getresgid32		sys_getresgid
+212	i386	chown32			sys_chown
+213	i386	setuid32		sys_setuid
+214	i386	setgid32		sys_setgid
+215	i386	setfsuid32		sys_setfsuid
+216	i386	setfsgid32		sys_setfsgid
+217	i386	pivot_root		sys_pivot_root
+218	i386	mincore			sys_mincore
+219	i386	madvise			sys_madvise
+220	i386	getdents64		sys_getdents64			compat_sys_getdents64
+221	i386	fcntl64			sys_fcntl64			compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224	i386	gettid			sys_gettid
+225	i386	readahead		sys_readahead			sys32_readahead
+226	i386	setxattr		sys_setxattr
+227	i386	lsetxattr		sys_lsetxattr
+228	i386	fsetxattr		sys_fsetxattr
+229	i386	getxattr		sys_getxattr
+230	i386	lgetxattr		sys_lgetxattr
+231	i386	fgetxattr		sys_fgetxattr
+232	i386	listxattr		sys_listxattr
+233	i386	llistxattr		sys_llistxattr
+234	i386	flistxattr		sys_flistxattr
+235	i386	removexattr		sys_removexattr
+236	i386	lremovexattr		sys_lremovexattr
+237	i386	fremovexattr		sys_fremovexattr
+238	i386	tkill			sys_tkill
+239	i386	sendfile64		sys_sendfile64
+240	i386	futex			sys_futex			compat_sys_futex
+241	i386	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
+242	i386	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
+243	i386	set_thread_area		sys_set_thread_area
+244	i386	get_thread_area		sys_get_thread_area
+245	i386	io_setup		sys_io_setup			compat_sys_io_setup
+246	i386	io_destroy		sys_io_destroy
+247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents
+248	i386	io_submit		sys_io_submit			compat_sys_io_submit
+249	i386	io_cancel		sys_io_cancel
+250	i386	fadvise64		sys_fadvise64			sys32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252	i386	exit_group		sys_exit_group
+253	i386	lookup_dcookie		sys_lookup_dcookie		sys32_lookup_dcookie
+254	i386	epoll_create		sys_epoll_create
+255	i386	epoll_ctl		sys_epoll_ctl
+256	i386	epoll_wait		sys_epoll_wait
+257	i386	remap_file_pages	sys_remap_file_pages
+258	i386	set_tid_address		sys_set_tid_address
+259	i386	timer_create		sys_timer_create		compat_sys_timer_create
+260	i386	timer_settime		sys_timer_settime		compat_sys_timer_settime
+261	i386	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+262	i386	timer_getoverrun	sys_timer_getoverrun
+263	i386	timer_delete		sys_timer_delete
+264	i386	clock_settime		sys_clock_settime		compat_sys_clock_settime
+265	i386	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
+266	i386	clock_getres		sys_clock_getres		compat_sys_clock_getres
+267	i386	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+268	i386	statfs64		sys_statfs64			compat_sys_statfs64
+269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
+270	i386	tgkill			sys_tgkill
+271	i386	utimes			sys_utimes			compat_sys_utimes
+272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64
+273	i386	vserver
+274	i386	mbind			sys_mbind
+275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
+276	i386	set_mempolicy		sys_set_mempolicy
+277	i386	mq_open			sys_mq_open			compat_sys_mq_open
+278	i386	mq_unlink		sys_mq_unlink
+279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
+280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify
+282	i386	mq_getsetaddr		sys_mq_getsetattr		compat_sys_mq_getsetattr
+283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load
+284	i386	waitid			sys_waitid			compat_sys_waitid
+# 285 sys_setaltroot
+286	i386	add_key			sys_add_key
+287	i386	request_key		sys_request_key
+288	i386	keyctl			sys_keyctl
+289	i386	ioprio_set		sys_ioprio_set
+290	i386	ioprio_get		sys_ioprio_get
+291	i386	inotify_init		sys_inotify_init
+292	i386	inotify_add_watch	sys_inotify_add_watch
+293	i386	inotify_rm_watch	sys_inotify_rm_watch
+294	i386	migrate_pages		sys_migrate_pages
+295	i386	openat			sys_openat			compat_sys_openat
+296	i386	mkdirat			sys_mkdirat
+297	i386	mknodat			sys_mknodat
+298	i386	fchownat		sys_fchownat
+299	i386	futimesat		sys_futimesat			compat_sys_futimesat
+300	i386	fstatat64		sys_fstatat64			sys32_fstatat
+301	i386	unlinkat		sys_unlinkat
+302	i386	renameat		sys_renameat
+303	i386	linkat			sys_linkat
+304	i386	symlinkat		sys_symlinkat
+305	i386	readlinkat		sys_readlinkat
+306	i386	fchmodat		sys_fchmodat
+307	i386	faccessat		sys_faccessat
+308	i386	pselect6		sys_pselect6			compat_sys_pselect6
+309	i386	ppoll			sys_ppoll			compat_sys_ppoll
+310	i386	unshare			sys_unshare
+311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
+312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
+313	i386	splice			sys_splice
+314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range
+315	i386	tee			sys_tee
+316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
+317	i386	move_pages		sys_move_pages			compat_sys_move_pages
+318	i386	getcpu			sys_getcpu
+319	i386	epoll_pwait		sys_epoll_pwait
+320	i386	utimensat		sys_utimensat			compat_sys_utimensat
+321	i386	signalfd		sys_signalfd			compat_sys_signalfd
+322	i386	timerfd_create		sys_timerfd_create
+323	i386	eventfd			sys_eventfd
+324	i386	fallocate		sys_fallocate			sys32_fallocate
+325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
+326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4
+328	i386	eventfd2		sys_eventfd2
+329	i386	epoll_create1		sys_epoll_create1
+330	i386	dup3			sys_dup3
+331	i386	pipe2			sys_pipe2
+332	i386	inotify_init1		sys_inotify_init1
+333	i386	preadv			sys_preadv			compat_sys_preadv
+334	i386	pwritev			sys_pwritev			compat_sys_pwritev
+335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
+336	i386	perf_event_open		sys_perf_event_open
+337	i386	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+338	i386	fanotify_init		sys_fanotify_init
+339	i386	fanotify_mark		sys_fanotify_mark		sys32_fanotify_mark
+340	i386	prlimit64		sys_prlimit64
+341	i386	name_to_handle_at	sys_name_to_handle_at
+342	i386	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
+343	i386	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+344	i386	syncfs			sys_syncfs
+345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
+346	i386	setns			sys_setns
+347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
+348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..b440a8f7eefa
--- /dev/null
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,320 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The abi is always "64" for this file (for now.)
+#
+0	64	read			sys_read
+1	64	write			sys_write
+2	64	open			sys_open
+3	64	close			sys_close
+4	64	stat			sys_newstat
+5	64	fstat			sys_newfstat
+6	64	lstat			sys_newlstat
+7	64	poll			sys_poll
+8	64	lseek			sys_lseek
+9	64	mmap			sys_mmap
+10	64	mprotect		sys_mprotect
+11	64	munmap			sys_munmap
+12	64	brk			sys_brk
+13	64	rt_sigaction		sys_rt_sigaction
+14	64	rt_sigprocmask		sys_rt_sigprocmask
+15	64	rt_sigreturn		stub_rt_sigreturn
+16	64	ioctl			sys_ioctl
+17	64	pread64			sys_pread64
+18	64	pwrite64		sys_pwrite64
+19	64	readv			sys_readv
+20	64	writev			sys_writev
+21	64	access			sys_access
+22	64	pipe			sys_pipe
+23	64	select			sys_select
+24	64	sched_yield		sys_sched_yield
+25	64	mremap			sys_mremap
+26	64	msync			sys_msync
+27	64	mincore			sys_mincore
+28	64	madvise			sys_madvise
+29	64	shmget			sys_shmget
+30	64	shmat			sys_shmat
+31	64	shmctl			sys_shmctl
+32	64	dup			sys_dup
+33	64	dup2			sys_dup2
+34	64	pause			sys_pause
+35	64	nanosleep		sys_nanosleep
+36	64	getitimer		sys_getitimer
+37	64	alarm			sys_alarm
+38	64	setitimer		sys_setitimer
+39	64	getpid			sys_getpid
+40	64	sendfile		sys_sendfile64
+41	64	socket			sys_socket
+42	64	connect			sys_connect
+43	64	accept			sys_accept
+44	64	sendto			sys_sendto
+45	64	recvfrom		sys_recvfrom
+46	64	sendmsg			sys_sendmsg
+47	64	recvmsg			sys_recvmsg
+48	64	shutdown		sys_shutdown
+49	64	bind			sys_bind
+50	64	listen			sys_listen
+51	64	getsockname		sys_getsockname
+52	64	getpeername		sys_getpeername
+53	64	socketpair		sys_socketpair
+54	64	setsockopt		sys_setsockopt
+55	64	getsockopt		sys_getsockopt
+56	64	clone			stub_clone
+57	64	fork			stub_fork
+58	64	vfork			stub_vfork
+59	64	execve			stub_execve
+60	64	exit			sys_exit
+61	64	wait4			sys_wait4
+62	64	kill			sys_kill
+63	64	uname			sys_newuname
+64	64	semget			sys_semget
+65	64	semop			sys_semop
+66	64	semctl			sys_semctl
+67	64	shmdt			sys_shmdt
+68	64	msgget			sys_msgget
+69	64	msgsnd			sys_msgsnd
+70	64	msgrcv			sys_msgrcv
+71	64	msgctl			sys_msgctl
+72	64	fcntl			sys_fcntl
+73	64	flock			sys_flock
+74	64	fsync			sys_fsync
+75	64	fdatasync		sys_fdatasync
+76	64	truncate		sys_truncate
+77	64	ftruncate		sys_ftruncate
+78	64	getdents		sys_getdents
+79	64	getcwd			sys_getcwd
+80	64	chdir			sys_chdir
+81	64	fchdir			sys_fchdir
+82	64	rename			sys_rename
+83	64	mkdir			sys_mkdir
+84	64	rmdir			sys_rmdir
+85	64	creat			sys_creat
+86	64	link			sys_link
+87	64	unlink			sys_unlink
+88	64	symlink			sys_symlink
+89	64	readlink		sys_readlink
+90	64	chmod			sys_chmod
+91	64	fchmod			sys_fchmod
+92	64	chown			sys_chown
+93	64	fchown			sys_fchown
+94	64	lchown			sys_lchown
+95	64	umask			sys_umask
+96	64	gettimeofday		sys_gettimeofday
+97	64	getrlimit		sys_getrlimit
+98	64	getrusage		sys_getrusage
+99	64	sysinfo			sys_sysinfo
+100	64	times			sys_times
+101	64	ptrace			sys_ptrace
+102	64	getuid			sys_getuid
+103	64	syslog			sys_syslog
+104	64	getgid			sys_getgid
+105	64	setuid			sys_setuid
+106	64	setgid			sys_setgid
+107	64	geteuid			sys_geteuid
+108	64	getegid			sys_getegid
+109	64	setpgid			sys_setpgid
+110	64	getppid			sys_getppid
+111	64	getpgrp			sys_getpgrp
+112	64	setsid			sys_setsid
+113	64	setreuid		sys_setreuid
+114	64	setregid		sys_setregid
+115	64	getgroups		sys_getgroups
+116	64	setgroups		sys_setgroups
+117	64	setresuid		sys_setresuid
+118	64	getresuid		sys_getresuid
+119	64	setresgid		sys_setresgid
+120	64	getresgid		sys_getresgid
+121	64	getpgid			sys_getpgid
+122	64	setfsuid		sys_setfsuid
+123	64	setfsgid		sys_setfsgid
+124	64	getsid			sys_getsid
+125	64	capget			sys_capget
+126	64	capset			sys_capset
+127	64	rt_sigpending		sys_rt_sigpending
+128	64	rt_sigtimedwait		sys_rt_sigtimedwait
+129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
+130	64	rt_sigsuspend		sys_rt_sigsuspend
+131	64	sigaltstack		stub_sigaltstack
+132	64	utime			sys_utime
+133	64	mknod			sys_mknod
+134	64	uselib
+135	64	personality		sys_personality
+136	64	ustat			sys_ustat
+137	64	statfs			sys_statfs
+138	64	fstatfs			sys_fstatfs
+139	64	sysfs			sys_sysfs
+140	64	getpriority		sys_getpriority
+141	64	setpriority		sys_setpriority
+142	64	sched_setparam		sys_sched_setparam
+143	64	sched_getparam		sys_sched_getparam
+144	64	sched_setscheduler	sys_sched_setscheduler
+145	64	sched_getscheduler	sys_sched_getscheduler
+146	64	sched_get_priority_max	sys_sched_get_priority_max
+147	64	sched_get_priority_min	sys_sched_get_priority_min
+148	64	sched_rr_get_interval	sys_sched_rr_get_interval
+149	64	mlock			sys_mlock
+150	64	munlock			sys_munlock
+151	64	mlockall		sys_mlockall
+152	64	munlockall		sys_munlockall
+153	64	vhangup			sys_vhangup
+154	64	modify_ldt		sys_modify_ldt
+155	64	pivot_root		sys_pivot_root
+156	64	_sysctl			sys_sysctl
+157	64	prctl			sys_prctl
+158	64	arch_prctl		sys_arch_prctl
+159	64	adjtimex		sys_adjtimex
+160	64	setrlimit		sys_setrlimit
+161	64	chroot			sys_chroot
+162	64	sync			sys_sync
+163	64	acct			sys_acct
+164	64	settimeofday		sys_settimeofday
+165	64	mount			sys_mount
+166	64	umount2			sys_umount
+167	64	swapon			sys_swapon
+168	64	swapoff			sys_swapoff
+169	64	reboot			sys_reboot
+170	64	sethostname		sys_sethostname
+171	64	setdomainname		sys_setdomainname
+172	64	iopl			stub_iopl
+173	64	ioperm			sys_ioperm
+174	64	create_module
+175	64	init_module		sys_init_module
+176	64	delete_module		sys_delete_module
+177	64	get_kernel_syms
+178	64	query_module
+179	64	quotactl		sys_quotactl
+180	64	nfsservctl
+181	64	getpmsg
+182	64	putpmsg
+183	64	afs_syscall
+184	64	tuxcall
+185	64	security
+186	64	gettid			sys_gettid
+187	64	readahead		sys_readahead
+188	64	setxattr		sys_setxattr
+189	64	lsetxattr		sys_lsetxattr
+190	64	fsetxattr		sys_fsetxattr
+191	64	getxattr		sys_getxattr
+192	64	lgetxattr		sys_lgetxattr
+193	64	fgetxattr		sys_fgetxattr
+194	64	listxattr		sys_listxattr
+195	64	llistxattr		sys_llistxattr
+196	64	flistxattr		sys_flistxattr
+197	64	removexattr		sys_removexattr
+198	64	lremovexattr		sys_lremovexattr
+199	64	fremovexattr		sys_fremovexattr
+200	64	tkill			sys_tkill
+201	64	time			sys_time
+202	64	futex			sys_futex
+203	64	sched_setaffinity	sys_sched_setaffinity
+204	64	sched_getaffinity	sys_sched_getaffinity
+205	64	set_thread_area
+206	64	io_setup		sys_io_setup
+207	64	io_destroy		sys_io_destroy
+208	64	io_getevents		sys_io_getevents
+209	64	io_submit		sys_io_submit
+210	64	io_cancel		sys_io_cancel
+211	64	get_thread_area
+212	64	lookup_dcookie		sys_lookup_dcookie
+213	64	epoll_create		sys_epoll_create
+214	64	epoll_ctl_old
+215	64	epoll_wait_old
+216	64	remap_file_pages	sys_remap_file_pages
+217	64	getdents64		sys_getdents64
+218	64	set_tid_address		sys_set_tid_address
+219	64	restart_syscall		sys_restart_syscall
+220	64	semtimedop		sys_semtimedop
+221	64	fadvise64		sys_fadvise64
+222	64	timer_create		sys_timer_create
+223	64	timer_settime		sys_timer_settime
+224	64	timer_gettime		sys_timer_gettime
+225	64	timer_getoverrun	sys_timer_getoverrun
+226	64	timer_delete		sys_timer_delete
+227	64	clock_settime		sys_clock_settime
+228	64	clock_gettime		sys_clock_gettime
+229	64	clock_getres		sys_clock_getres
+230	64	clock_nanosleep		sys_clock_nanosleep
+231	64	exit_group		sys_exit_group
+232	64	epoll_wait		sys_epoll_wait
+233	64	epoll_ctl		sys_epoll_ctl
+234	64	tgkill			sys_tgkill
+235	64	utimes			sys_utimes
+236	64	vserver
+237	64	mbind			sys_mbind
+238	64	set_mempolicy		sys_set_mempolicy
+239	64	get_mempolicy		sys_get_mempolicy
+240	64	mq_open			sys_mq_open
+241	64	mq_unlink		sys_mq_unlink
+242	64	mq_timedsend		sys_mq_timedsend
+243	64	mq_timedreceive		sys_mq_timedreceive
+244	64	mq_notify		sys_mq_notify
+245	64	mq_getsetattr		sys_mq_getsetattr
+246	64	kexec_load		sys_kexec_load
+247	64	waitid			sys_waitid
+248	64	add_key			sys_add_key
+249	64	request_key		sys_request_key
+250	64	keyctl			sys_keyctl
+251	64	ioprio_set		sys_ioprio_set
+252	64	ioprio_get		sys_ioprio_get
+253	64	inotify_init		sys_inotify_init
+254	64	inotify_add_watch	sys_inotify_add_watch
+255	64	inotify_rm_watch	sys_inotify_rm_watch
+256	64	migrate_pages		sys_migrate_pages
+257	64	openat			sys_openat
+258	64	mkdirat			sys_mkdirat
+259	64	mknodat			sys_mknodat
+260	64	fchownat		sys_fchownat
+261	64	futimesat		sys_futimesat
+262	64	newfstatat		sys_newfstatat
+263	64	unlinkat		sys_unlinkat
+264	64	renameat		sys_renameat
+265	64	linkat			sys_linkat
+266	64	symlinkat		sys_symlinkat
+267	64	readlinkat		sys_readlinkat
+268	64	fchmodat		sys_fchmodat
+269	64	faccessat		sys_faccessat
+270	64	pselect6		sys_pselect6
+271	64	ppoll			sys_ppoll
+272	64	unshare			sys_unshare
+273	64	set_robust_list		sys_set_robust_list
+274	64	get_robust_list		sys_get_robust_list
+275	64	splice			sys_splice
+276	64	tee			sys_tee
+277	64	sync_file_range		sys_sync_file_range
+278	64	vmsplice		sys_vmsplice
+279	64	move_pages		sys_move_pages
+280	64	utimensat		sys_utimensat
+281	64	epoll_pwait		sys_epoll_pwait
+282	64	signalfd		sys_signalfd
+283	64	timerfd_create		sys_timerfd_create
+284	64	eventfd			sys_eventfd
+285	64	fallocate		sys_fallocate
+286	64	timerfd_settime		sys_timerfd_settime
+287	64	timerfd_gettime		sys_timerfd_gettime
+288	64	accept4			sys_accept4
+289	64	signalfd4		sys_signalfd4
+290	64	eventfd2		sys_eventfd2
+291	64	epoll_create1		sys_epoll_create1
+292	64	dup3			sys_dup3
+293	64	pipe2			sys_pipe2
+294	64	inotify_init1		sys_inotify_init1
+295	64	preadv			sys_preadv
+296	64	pwritev			sys_pwritev
+297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
+298	64	perf_event_open		sys_perf_event_open
+299	64	recvmmsg		sys_recvmmsg
+300	64	fanotify_init		sys_fanotify_init
+301	64	fanotify_mark		sys_fanotify_mark
+302	64	prlimit64		sys_prlimit64
+303	64	name_to_handle_at	sys_name_to_handle_at
+304	64	open_by_handle_at	sys_open_by_handle_at
+305	64	clock_adjtime		sys_clock_adjtime
+306	64	syncfs			sys_syncfs
+307	64	sendmmsg		sys_sendmmsg
+308	64	setns			sys_setns
+309	64	getcpu			sys_getcpu
+310	64	process_vm_readv	sys_process_vm_readv
+311	64	process_vm_writev	sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
new file mode 100644
index 000000000000..0d473ff12eaf
--- /dev/null
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+my_abis=`echo "$3" | tr ',' ' '`
+prefix="$4"
+offset="$5"
+
+fileguard=_ASM_X86_`basename "$out" | sed \
+    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+
+in_list () {
+    local x
+    for x in $1; do
+	if [ x"$x" = x"$2" ]; then
+	    return 0
+	fi
+    done
+    return 1
+}
+
+grep '^[0-9]' "$in" | sort -n | (
+    echo "#ifndef ${fileguard}"
+    echo "#define ${fileguard} 1"
+    echo ""
+
+    while read nr abi name entry ; do
+	if in_list "$my_abis" "$abi"; then
+	    echo "#define __NR_${prefix}${name}" $((nr+offset))
+        fi
+    done
+
+    echo ""
+    echo "#endif /* ${fileguard} */"
+) > "$out"
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh
new file mode 100644
index 000000000000..0e7f8ec071e7
--- /dev/null
+++ b/arch/x86/syscalls/syscalltbl.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+
+grep '^[0-9]' "$in" | sort -n | (
+    while read nr abi name entry compat; do
+	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
+	if [ -n "$compat" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
+	elif [ -n "$entry" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+	fi
+    done
+) > "$out"
-- 
cgit v1.2.3


From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 16:07:41 -0800
Subject: x86: Generate system call tables and unistd_*.h from tables

Generate system call tables and unistd_*.h automatically from the
tables in arch/x86/syscalls.  All other information, like NR_syscalls,
is auto-generated, some of which is in asm-offsets_*.c.

This allows us to keep all the system call information in one place,
and allows for kernel space and user space to see different
information; this is currently used for the ia32 system call numbers
when building the 64-bit kernel, but will be used by the x32 ABI in
the near future.

This also removes some gratuitious differences between i386, x86-64
and ia32; in particular, now all system call tables are generated with
the same mechanism.

Cc: H. J. Lu <hjl.tools@gmail.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Makefile                  |   6 +
 arch/x86/ia32/Makefile             |   2 +-
 arch/x86/ia32/ia32entry.S          | 356 ------------------
 arch/x86/ia32/syscall_ia32.c       |  25 ++
 arch/x86/include/asm/Kbuild        |   5 +-
 arch/x86/include/asm/ia32_unistd.h |  13 +-
 arch/x86/include/asm/unistd.h      |  54 ++-
 arch/x86/include/asm/unistd_32.h   | 401 --------------------
 arch/x86/include/asm/unistd_64.h   | 732 -------------------------------------
 arch/x86/kernel/Makefile           |   3 +-
 arch/x86/kernel/asm-offsets_32.c   |   8 +
 arch/x86/kernel/asm-offsets_64.c   |  19 +-
 arch/x86/kernel/entry_32.S         |  37 +-
 arch/x86/kernel/syscall_32.c       |  25 ++
 arch/x86/kernel/syscall_64.c       |  14 +-
 arch/x86/kernel/syscall_table_32.S | 350 ------------------
 16 files changed, 154 insertions(+), 1896 deletions(-)
 create mode 100644 arch/x86/ia32/syscall_ia32.c
 delete mode 100644 arch/x86/include/asm/unistd_32.h
 delete mode 100644 arch/x86/include/asm/unistd_64.h
 create mode 100644 arch/x86/kernel/syscall_32.c
 delete mode 100644 arch/x86/kernel/syscall_table_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b02e509072a7..209ba1294592 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -117,6 +117,12 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
 
+###
+# Syscall table generation
+
+archheaders:
+	$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+
 ###
 # Kernel objects
 
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index eea9a1c77d38..455646e0e532 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59538a777695..72f853aea478 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -27,8 +27,6 @@
 
 	.section .entry.text, "ax"
 
-#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
-
 	.macro IA32_ARG_FIXUP noebp=0
 	movl	%edi,%r8d
 	.if \noebp
@@ -496,357 +494,3 @@ ENTRY(ia32_ptregs_common)
 	jmp  ia32_sysret	/* misbalances the return cache */
 	CFI_ENDPROC
 END(ia32_ptregs_common)
-
-	.section .rodata,"a"
-	.align 8
-ia32_sys_call_table:
-	.quad sys_restart_syscall
-	.quad sys_exit
-	.quad stub32_fork
-	.quad sys_read
-	.quad sys_write
-	.quad compat_sys_open		/* 5 */
-	.quad sys_close
-	.quad sys32_waitpid
-	.quad sys_creat
-	.quad sys_link
-	.quad sys_unlink		/* 10 */
-	.quad stub32_execve
-	.quad sys_chdir
-	.quad compat_sys_time
-	.quad sys_mknod
-	.quad sys_chmod		/* 15 */
-	.quad sys_lchown16
-	.quad quiet_ni_syscall			/* old break syscall holder */
-	.quad sys_stat
-	.quad sys32_lseek
-	.quad sys_getpid		/* 20 */
-	.quad compat_sys_mount	/* mount  */
-	.quad sys_oldumount	/* old_umount  */
-	.quad sys_setuid16
-	.quad sys_getuid16
-	.quad compat_sys_stime	/* stime */		/* 25 */
-	.quad compat_sys_ptrace	/* ptrace */
-	.quad sys_alarm
-	.quad sys_fstat	/* (old)fstat */
-	.quad sys_pause
-	.quad compat_sys_utime	/* 30 */
-	.quad quiet_ni_syscall	/* old stty syscall holder */
-	.quad quiet_ni_syscall	/* old gtty syscall holder */
-	.quad sys_access
-	.quad sys_nice	
-	.quad quiet_ni_syscall	/* 35 */	/* old ftime syscall holder */
-	.quad sys_sync
-	.quad sys32_kill
-	.quad sys_rename
-	.quad sys_mkdir
-	.quad sys_rmdir		/* 40 */
-	.quad sys_dup
-	.quad sys_pipe
-	.quad compat_sys_times
-	.quad quiet_ni_syscall			/* old prof syscall holder */
-	.quad sys_brk		/* 45 */
-	.quad sys_setgid16
-	.quad sys_getgid16
-	.quad sys_signal
-	.quad sys_geteuid16
-	.quad sys_getegid16	/* 50 */
-	.quad sys_acct
-	.quad sys_umount			/* new_umount */
-	.quad quiet_ni_syscall			/* old lock syscall holder */
-	.quad compat_sys_ioctl
-	.quad compat_sys_fcntl64		/* 55 */
-	.quad quiet_ni_syscall			/* old mpx syscall holder */
-	.quad sys_setpgid
-	.quad quiet_ni_syscall			/* old ulimit syscall holder */
-	.quad sys_olduname
-	.quad sys_umask		/* 60 */
-	.quad sys_chroot
-	.quad compat_sys_ustat
-	.quad sys_dup2
-	.quad sys_getppid
-	.quad sys_getpgrp		/* 65 */
-	.quad sys_setsid
-	.quad sys32_sigaction
-	.quad sys_sgetmask
-	.quad sys_ssetmask
-	.quad sys_setreuid16	/* 70 */
-	.quad sys_setregid16
-	.quad sys32_sigsuspend
-	.quad compat_sys_sigpending
-	.quad sys_sethostname
-	.quad compat_sys_setrlimit	/* 75 */
-	.quad compat_sys_old_getrlimit	/* old_getrlimit */
-	.quad compat_sys_getrusage
-	.quad compat_sys_gettimeofday
-	.quad compat_sys_settimeofday
-	.quad sys_getgroups16	/* 80 */
-	.quad sys_setgroups16
-	.quad compat_sys_old_select
-	.quad sys_symlink
-	.quad sys_lstat
-	.quad sys_readlink		/* 85 */
-	.quad sys_uselib
-	.quad sys_swapon
-	.quad sys_reboot
-	.quad compat_sys_old_readdir
-	.quad sys32_mmap		/* 90 */
-	.quad sys_munmap
-	.quad sys_truncate
-	.quad sys_ftruncate
-	.quad sys_fchmod
-	.quad sys_fchown16		/* 95 */
-	.quad sys_getpriority
-	.quad sys_setpriority
-	.quad quiet_ni_syscall			/* old profil syscall holder */
-	.quad compat_sys_statfs
-	.quad compat_sys_fstatfs		/* 100 */
-	.quad sys_ioperm
-	.quad compat_sys_socketcall
-	.quad sys_syslog
-	.quad compat_sys_setitimer
-	.quad compat_sys_getitimer	/* 105 */
-	.quad compat_sys_newstat
-	.quad compat_sys_newlstat
-	.quad compat_sys_newfstat
-	.quad sys_uname
-	.quad stub32_iopl		/* 110 */
-	.quad sys_vhangup
-	.quad quiet_ni_syscall	/* old "idle" system call */
-	.quad sys32_vm86_warning	/* vm86old */ 
-	.quad compat_sys_wait4
-	.quad sys_swapoff		/* 115 */
-	.quad compat_sys_sysinfo
-	.quad sys32_ipc
-	.quad sys_fsync
-	.quad stub32_sigreturn
-	.quad stub32_clone		/* 120 */
-	.quad sys_setdomainname
-	.quad sys_newuname
-	.quad sys_modify_ldt
-	.quad compat_sys_adjtimex
-	.quad sys32_mprotect		/* 125 */
-	.quad compat_sys_sigprocmask
-	.quad quiet_ni_syscall		/* create_module */
-	.quad sys_init_module
-	.quad sys_delete_module
-	.quad quiet_ni_syscall		/* 130  get_kernel_syms */
-	.quad sys32_quotactl
-	.quad sys_getpgid
-	.quad sys_fchdir
-	.quad quiet_ni_syscall	/* bdflush */
-	.quad sys_sysfs		/* 135 */
-	.quad sys_personality
-	.quad quiet_ni_syscall	/* for afs_syscall */
-	.quad sys_setfsuid16
-	.quad sys_setfsgid16
-	.quad sys_llseek		/* 140 */
-	.quad compat_sys_getdents
-	.quad compat_sys_select
-	.quad sys_flock
-	.quad sys_msync
-	.quad compat_sys_readv		/* 145 */
-	.quad compat_sys_writev
-	.quad sys_getsid
-	.quad sys_fdatasync
-	.quad compat_sys_sysctl	/* sysctl */
-	.quad sys_mlock		/* 150 */
-	.quad sys_munlock
-	.quad sys_mlockall
-	.quad sys_munlockall
-	.quad sys_sched_setparam
-	.quad sys_sched_getparam   /* 155 */
-	.quad sys_sched_setscheduler
-	.quad sys_sched_getscheduler
-	.quad sys_sched_yield
-	.quad sys_sched_get_priority_max
-	.quad sys_sched_get_priority_min  /* 160 */
-	.quad sys32_sched_rr_get_interval
-	.quad compat_sys_nanosleep
-	.quad sys_mremap
-	.quad sys_setresuid16
-	.quad sys_getresuid16	/* 165 */
-	.quad sys32_vm86_warning	/* vm86 */ 
-	.quad quiet_ni_syscall	/* query_module */
-	.quad sys_poll
-	.quad quiet_ni_syscall /* old nfsservctl */
-	.quad sys_setresgid16	/* 170 */
-	.quad sys_getresgid16
-	.quad sys_prctl
-	.quad stub32_rt_sigreturn
-	.quad sys32_rt_sigaction
-	.quad sys32_rt_sigprocmask	/* 175 */
-	.quad sys32_rt_sigpending
-	.quad compat_sys_rt_sigtimedwait
-	.quad sys32_rt_sigqueueinfo
-	.quad sys_rt_sigsuspend
-	.quad sys32_pread		/* 180 */
-	.quad sys32_pwrite
-	.quad sys_chown16
-	.quad sys_getcwd
-	.quad sys_capget
-	.quad sys_capset
-	.quad stub32_sigaltstack
-	.quad sys32_sendfile
-	.quad quiet_ni_syscall		/* streams1 */
-	.quad quiet_ni_syscall		/* streams2 */
-	.quad stub32_vfork            /* 190 */
-	.quad compat_sys_getrlimit
-	.quad sys_mmap_pgoff
-	.quad sys32_truncate64
-	.quad sys32_ftruncate64
-	.quad sys32_stat64		/* 195 */
-	.quad sys32_lstat64
-	.quad sys32_fstat64
-	.quad sys_lchown
-	.quad sys_getuid
-	.quad sys_getgid		/* 200 */
-	.quad sys_geteuid
-	.quad sys_getegid
-	.quad sys_setreuid
-	.quad sys_setregid
-	.quad sys_getgroups	/* 205 */
-	.quad sys_setgroups
-	.quad sys_fchown
-	.quad sys_setresuid
-	.quad sys_getresuid
-	.quad sys_setresgid	/* 210 */
-	.quad sys_getresgid
-	.quad sys_chown
-	.quad sys_setuid
-	.quad sys_setgid
-	.quad sys_setfsuid		/* 215 */
-	.quad sys_setfsgid
-	.quad sys_pivot_root
-	.quad sys_mincore
-	.quad sys_madvise
-	.quad compat_sys_getdents64	/* 220 getdents64 */
-	.quad compat_sys_fcntl64	
-	.quad quiet_ni_syscall		/* tux */
-	.quad quiet_ni_syscall    	/* security */
-	.quad sys_gettid	
-	.quad sys32_readahead	/* 225 */
-	.quad sys_setxattr
-	.quad sys_lsetxattr
-	.quad sys_fsetxattr
-	.quad sys_getxattr
-	.quad sys_lgetxattr	/* 230 */
-	.quad sys_fgetxattr
-	.quad sys_listxattr
-	.quad sys_llistxattr
-	.quad sys_flistxattr
-	.quad sys_removexattr	/* 235 */
-	.quad sys_lremovexattr
-	.quad sys_fremovexattr
-	.quad sys_tkill
-	.quad sys_sendfile64 
-	.quad compat_sys_futex		/* 240 */
-	.quad compat_sys_sched_setaffinity
-	.quad compat_sys_sched_getaffinity
-	.quad sys_set_thread_area
-	.quad sys_get_thread_area
-	.quad compat_sys_io_setup	/* 245 */
-	.quad sys_io_destroy
-	.quad compat_sys_io_getevents
-	.quad compat_sys_io_submit
-	.quad sys_io_cancel
-	.quad sys32_fadvise64		/* 250 */
-	.quad quiet_ni_syscall 	/* free_huge_pages */
-	.quad sys_exit_group
-	.quad sys32_lookup_dcookie
-	.quad sys_epoll_create
-	.quad sys_epoll_ctl		/* 255 */
-	.quad sys_epoll_wait
-	.quad sys_remap_file_pages
-	.quad sys_set_tid_address
-	.quad compat_sys_timer_create
-	.quad compat_sys_timer_settime	/* 260 */
-	.quad compat_sys_timer_gettime
-	.quad sys_timer_getoverrun
-	.quad sys_timer_delete
-	.quad compat_sys_clock_settime
-	.quad compat_sys_clock_gettime	/* 265 */
-	.quad compat_sys_clock_getres
-	.quad compat_sys_clock_nanosleep
-	.quad compat_sys_statfs64
-	.quad compat_sys_fstatfs64
-	.quad sys_tgkill		/* 270 */
-	.quad compat_sys_utimes
-	.quad sys32_fadvise64_64
-	.quad quiet_ni_syscall	/* sys_vserver */
-	.quad sys_mbind
-	.quad compat_sys_get_mempolicy	/* 275 */
-	.quad sys_set_mempolicy
-	.quad compat_sys_mq_open
-	.quad sys_mq_unlink
-	.quad compat_sys_mq_timedsend
-	.quad compat_sys_mq_timedreceive	/* 280 */
-	.quad compat_sys_mq_notify
-	.quad compat_sys_mq_getsetattr
-	.quad compat_sys_kexec_load	/* reserved for kexec */
-	.quad compat_sys_waitid
-	.quad quiet_ni_syscall		/* 285: sys_altroot */
-	.quad sys_add_key
-	.quad sys_request_key
-	.quad sys_keyctl
-	.quad sys_ioprio_set
-	.quad sys_ioprio_get		/* 290 */
-	.quad sys_inotify_init
-	.quad sys_inotify_add_watch
-	.quad sys_inotify_rm_watch
-	.quad sys_migrate_pages
-	.quad compat_sys_openat		/* 295 */
-	.quad sys_mkdirat
-	.quad sys_mknodat
-	.quad sys_fchownat
-	.quad compat_sys_futimesat
-	.quad sys32_fstatat		/* 300 */
-	.quad sys_unlinkat
-	.quad sys_renameat
-	.quad sys_linkat
-	.quad sys_symlinkat
-	.quad sys_readlinkat		/* 305 */
-	.quad sys_fchmodat
-	.quad sys_faccessat
-	.quad compat_sys_pselect6
-	.quad compat_sys_ppoll
-	.quad sys_unshare		/* 310 */
-	.quad compat_sys_set_robust_list
-	.quad compat_sys_get_robust_list
-	.quad sys_splice
-	.quad sys32_sync_file_range
-	.quad sys_tee			/* 315 */
-	.quad compat_sys_vmsplice
-	.quad compat_sys_move_pages
-	.quad sys_getcpu
-	.quad sys_epoll_pwait
-	.quad compat_sys_utimensat	/* 320 */
-	.quad compat_sys_signalfd
-	.quad sys_timerfd_create
-	.quad sys_eventfd
-	.quad sys32_fallocate
-	.quad compat_sys_timerfd_settime	/* 325 */
-	.quad compat_sys_timerfd_gettime
-	.quad compat_sys_signalfd4
-	.quad sys_eventfd2
-	.quad sys_epoll_create1
-	.quad sys_dup3				/* 330 */
-	.quad sys_pipe2
-	.quad sys_inotify_init1
-	.quad compat_sys_preadv
-	.quad compat_sys_pwritev
-	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
-	.quad sys_perf_event_open
-	.quad compat_sys_recvmmsg
-	.quad sys_fanotify_init
-	.quad sys32_fanotify_mark
-	.quad sys_prlimit64		/* 340 */
-	.quad sys_name_to_handle_at
-	.quad compat_sys_open_by_handle_at
-	.quad compat_sys_clock_adjtime
-	.quad sys_syncfs
-	.quad compat_sys_sendmmsg	/* 345 */
-	.quad sys_setns
-	.quad compat_sys_process_vm_readv
-	.quad compat_sys_process_vm_writev
-ia32_syscall_end:
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
new file mode 100644
index 000000000000..d04d3dbc47d4
--- /dev/null
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -0,0 +1,25 @@
+/* System call table for ia32 emulation. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void compat_ni_syscall(void);
+
+const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 6fa90a845e4c..b57e6a43a37a 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -19,7 +19,8 @@ header-y += processor-flags.h
 header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
-header-y += unistd_32.h
-header-y += unistd_64.h
 header-y += vm86.h
 header-y += vsyscall.h
+
+genhdr-y += unistd_32.h
+genhdr-y += unistd_64.h
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ecd2ce6..b0d5716ca1e4 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -2,17 +2,10 @@
 #define _ASM_X86_IA32_UNISTD_H
 
 /*
- * This file contains the system call numbers of the ia32 port,
+ * This file contains the system call numbers of the ia32 compat ABI,
  * this is for the kernel only.
- * Only add syscalls here where some part of the kernel needs to know
- * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
  */
-
-#define __NR_ia32_restart_syscall 0
-#define __NR_ia32_exit		  1
-#define __NR_ia32_read		  3
-#define __NR_ia32_write		  4
-#define __NR_ia32_sigreturn	119
-#define __NR_ia32_rt_sigreturn	173
+#define __SYSCALL_ia32_NR(x) (x)
+#include <asm/unistd_32_ia32.h>
 
 #endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3e51d8..b4a3db7ce140 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,59 @@
+#ifndef _ASM_X86_UNISTD_H
+#define _ASM_X86_UNISTD_H 1
+
 #ifdef __KERNEL__
 # ifdef CONFIG_X86_32
-#  include "unistd_32.h"
+
+#  include <asm/unistd_32.h>
+#  define __ARCH_WANT_IPC_PARSE_VERSION
+#  define __ARCH_WANT_STAT64
+#  define __ARCH_WANT_SYS_OLD_MMAP
+#  define __ARCH_WANT_SYS_OLD_SELECT
+
 # else
-#  include "unistd_64.h"
+
+#  include <asm/unistd_64.h>
+#  define __ARCH_WANT_COMPAT_SYS_TIME
+
 # endif
+
+# define __ARCH_WANT_OLD_READDIR
+# define __ARCH_WANT_OLD_STAT
+# define __ARCH_WANT_SYS_ALARM
+# define __ARCH_WANT_SYS_FADVISE64
+# define __ARCH_WANT_SYS_GETHOSTNAME
+# define __ARCH_WANT_SYS_GETPGRP
+# define __ARCH_WANT_SYS_LLSEEK
+# define __ARCH_WANT_SYS_NICE
+# define __ARCH_WANT_SYS_OLDUMOUNT
+# define __ARCH_WANT_SYS_OLD_GETRLIMIT
+# define __ARCH_WANT_SYS_OLD_UNAME
+# define __ARCH_WANT_SYS_PAUSE
+# define __ARCH_WANT_SYS_RT_SIGACTION
+# define __ARCH_WANT_SYS_RT_SIGSUSPEND
+# define __ARCH_WANT_SYS_SGETMASK
+# define __ARCH_WANT_SYS_SIGNAL
+# define __ARCH_WANT_SYS_SIGPENDING
+# define __ARCH_WANT_SYS_SIGPROCMASK
+# define __ARCH_WANT_SYS_SOCKETCALL
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_SYS_WAITPID
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+
 #else
 # ifdef __i386__
-#  include "unistd_32.h"
+#  include <asm/unistd_32.h>
 # else
-#  include "unistd_64.h"
+#  include <asm/unistd_64.h>
 # endif
 #endif
+
+#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 599c77d38f33..000000000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,401 +0,0 @@
-#ifndef _ASM_X86_UNISTD_32_H
-#define _ASM_X86_UNISTD_32_H
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_restart_syscall      0
-#define __NR_exit		  1
-#define __NR_fork		  2
-#define __NR_read		  3
-#define __NR_write		  4
-#define __NR_open		  5
-#define __NR_close		  6
-#define __NR_waitpid		  7
-#define __NR_creat		  8
-#define __NR_link		  9
-#define __NR_unlink		 10
-#define __NR_execve		 11
-#define __NR_chdir		 12
-#define __NR_time		 13
-#define __NR_mknod		 14
-#define __NR_chmod		 15
-#define __NR_lchown		 16
-#define __NR_break		 17
-#define __NR_oldstat		 18
-#define __NR_lseek		 19
-#define __NR_getpid		 20
-#define __NR_mount		 21
-#define __NR_umount		 22
-#define __NR_setuid		 23
-#define __NR_getuid		 24
-#define __NR_stime		 25
-#define __NR_ptrace		 26
-#define __NR_alarm		 27
-#define __NR_oldfstat		 28
-#define __NR_pause		 29
-#define __NR_utime		 30
-#define __NR_stty		 31
-#define __NR_gtty		 32
-#define __NR_access		 33
-#define __NR_nice		 34
-#define __NR_ftime		 35
-#define __NR_sync		 36
-#define __NR_kill		 37
-#define __NR_rename		 38
-#define __NR_mkdir		 39
-#define __NR_rmdir		 40
-#define __NR_dup		 41
-#define __NR_pipe		 42
-#define __NR_times		 43
-#define __NR_prof		 44
-#define __NR_brk		 45
-#define __NR_setgid		 46
-#define __NR_getgid		 47
-#define __NR_signal		 48
-#define __NR_geteuid		 49
-#define __NR_getegid		 50
-#define __NR_acct		 51
-#define __NR_umount2		 52
-#define __NR_lock		 53
-#define __NR_ioctl		 54
-#define __NR_fcntl		 55
-#define __NR_mpx		 56
-#define __NR_setpgid		 57
-#define __NR_ulimit		 58
-#define __NR_oldolduname	 59
-#define __NR_umask		 60
-#define __NR_chroot		 61
-#define __NR_ustat		 62
-#define __NR_dup2		 63
-#define __NR_getppid		 64
-#define __NR_getpgrp		 65
-#define __NR_setsid		 66
-#define __NR_sigaction		 67
-#define __NR_sgetmask		 68
-#define __NR_ssetmask		 69
-#define __NR_setreuid		 70
-#define __NR_setregid		 71
-#define __NR_sigsuspend		 72
-#define __NR_sigpending		 73
-#define __NR_sethostname	 74
-#define __NR_setrlimit		 75
-#define __NR_getrlimit		 76   /* Back compatible 2Gig limited rlimit */
-#define __NR_getrusage		 77
-#define __NR_gettimeofday	 78
-#define __NR_settimeofday	 79
-#define __NR_getgroups		 80
-#define __NR_setgroups		 81
-#define __NR_select		 82
-#define __NR_symlink		 83
-#define __NR_oldlstat		 84
-#define __NR_readlink		 85
-#define __NR_uselib		 86
-#define __NR_swapon		 87
-#define __NR_reboot		 88
-#define __NR_readdir		 89
-#define __NR_mmap		 90
-#define __NR_munmap		 91
-#define __NR_truncate		 92
-#define __NR_ftruncate		 93
-#define __NR_fchmod		 94
-#define __NR_fchown		 95
-#define __NR_getpriority	 96
-#define __NR_setpriority	 97
-#define __NR_profil		 98
-#define __NR_statfs		 99
-#define __NR_fstatfs		100
-#define __NR_ioperm		101
-#define __NR_socketcall		102
-#define __NR_syslog		103
-#define __NR_setitimer		104
-#define __NR_getitimer		105
-#define __NR_stat		106
-#define __NR_lstat		107
-#define __NR_fstat		108
-#define __NR_olduname		109
-#define __NR_iopl		110
-#define __NR_vhangup		111
-#define __NR_idle		112
-#define __NR_vm86old		113
-#define __NR_wait4		114
-#define __NR_swapoff		115
-#define __NR_sysinfo		116
-#define __NR_ipc		117
-#define __NR_fsync		118
-#define __NR_sigreturn		119
-#define __NR_clone		120
-#define __NR_setdomainname	121
-#define __NR_uname		122
-#define __NR_modify_ldt		123
-#define __NR_adjtimex		124
-#define __NR_mprotect		125
-#define __NR_sigprocmask	126
-#define __NR_create_module	127
-#define __NR_init_module	128
-#define __NR_delete_module	129
-#define __NR_get_kernel_syms	130
-#define __NR_quotactl		131
-#define __NR_getpgid		132
-#define __NR_fchdir		133
-#define __NR_bdflush		134
-#define __NR_sysfs		135
-#define __NR_personality	136
-#define __NR_afs_syscall	137 /* Syscall for Andrew File System */
-#define __NR_setfsuid		138
-#define __NR_setfsgid		139
-#define __NR__llseek		140
-#define __NR_getdents		141
-#define __NR__newselect		142
-#define __NR_flock		143
-#define __NR_msync		144
-#define __NR_readv		145
-#define __NR_writev		146
-#define __NR_getsid		147
-#define __NR_fdatasync		148
-#define __NR__sysctl		149
-#define __NR_mlock		150
-#define __NR_munlock		151
-#define __NR_mlockall		152
-#define __NR_munlockall		153
-#define __NR_sched_setparam		154
-#define __NR_sched_getparam		155
-#define __NR_sched_setscheduler		156
-#define __NR_sched_getscheduler		157
-#define __NR_sched_yield		158
-#define __NR_sched_get_priority_max	159
-#define __NR_sched_get_priority_min	160
-#define __NR_sched_rr_get_interval	161
-#define __NR_nanosleep		162
-#define __NR_mremap		163
-#define __NR_setresuid		164
-#define __NR_getresuid		165
-#define __NR_vm86		166
-#define __NR_query_module	167
-#define __NR_poll		168
-#define __NR_nfsservctl		169
-#define __NR_setresgid		170
-#define __NR_getresgid		171
-#define __NR_prctl              172
-#define __NR_rt_sigreturn	173
-#define __NR_rt_sigaction	174
-#define __NR_rt_sigprocmask	175
-#define __NR_rt_sigpending	176
-#define __NR_rt_sigtimedwait	177
-#define __NR_rt_sigqueueinfo	178
-#define __NR_rt_sigsuspend	179
-#define __NR_pread64		180
-#define __NR_pwrite64		181
-#define __NR_chown		182
-#define __NR_getcwd		183
-#define __NR_capget		184
-#define __NR_capset		185
-#define __NR_sigaltstack	186
-#define __NR_sendfile		187
-#define __NR_getpmsg		188	/* some people actually want streams */
-#define __NR_putpmsg		189	/* some people actually want streams */
-#define __NR_vfork		190
-#define __NR_ugetrlimit		191	/* SuS compliant getrlimit */
-#define __NR_mmap2		192
-#define __NR_truncate64		193
-#define __NR_ftruncate64	194
-#define __NR_stat64		195
-#define __NR_lstat64		196
-#define __NR_fstat64		197
-#define __NR_lchown32		198
-#define __NR_getuid32		199
-#define __NR_getgid32		200
-#define __NR_geteuid32		201
-#define __NR_getegid32		202
-#define __NR_setreuid32		203
-#define __NR_setregid32		204
-#define __NR_getgroups32	205
-#define __NR_setgroups32	206
-#define __NR_fchown32		207
-#define __NR_setresuid32	208
-#define __NR_getresuid32	209
-#define __NR_setresgid32	210
-#define __NR_getresgid32	211
-#define __NR_chown32		212
-#define __NR_setuid32		213
-#define __NR_setgid32		214
-#define __NR_setfsuid32		215
-#define __NR_setfsgid32		216
-#define __NR_pivot_root		217
-#define __NR_mincore		218
-#define __NR_madvise		219
-#define __NR_madvise1		219	/* delete when C lib stub is removed */
-#define __NR_getdents64		220
-#define __NR_fcntl64		221
-/* 223 is unused */
-#define __NR_gettid		224
-#define __NR_readahead		225
-#define __NR_setxattr		226
-#define __NR_lsetxattr		227
-#define __NR_fsetxattr		228
-#define __NR_getxattr		229
-#define __NR_lgetxattr		230
-#define __NR_fgetxattr		231
-#define __NR_listxattr		232
-#define __NR_llistxattr		233
-#define __NR_flistxattr		234
-#define __NR_removexattr	235
-#define __NR_lremovexattr	236
-#define __NR_fremovexattr	237
-#define __NR_tkill		238
-#define __NR_sendfile64		239
-#define __NR_futex		240
-#define __NR_sched_setaffinity	241
-#define __NR_sched_getaffinity	242
-#define __NR_set_thread_area	243
-#define __NR_get_thread_area	244
-#define __NR_io_setup		245
-#define __NR_io_destroy		246
-#define __NR_io_getevents	247
-#define __NR_io_submit		248
-#define __NR_io_cancel		249
-#define __NR_fadvise64		250
-/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
-#define __NR_exit_group		252
-#define __NR_lookup_dcookie	253
-#define __NR_epoll_create	254
-#define __NR_epoll_ctl		255
-#define __NR_epoll_wait		256
-#define __NR_remap_file_pages	257
-#define __NR_set_tid_address	258
-#define __NR_timer_create	259
-#define __NR_timer_settime	(__NR_timer_create+1)
-#define __NR_timer_gettime	(__NR_timer_create+2)
-#define __NR_timer_getoverrun	(__NR_timer_create+3)
-#define __NR_timer_delete	(__NR_timer_create+4)
-#define __NR_clock_settime	(__NR_timer_create+5)
-#define __NR_clock_gettime	(__NR_timer_create+6)
-#define __NR_clock_getres	(__NR_timer_create+7)
-#define __NR_clock_nanosleep	(__NR_timer_create+8)
-#define __NR_statfs64		268
-#define __NR_fstatfs64		269
-#define __NR_tgkill		270
-#define __NR_utimes		271
-#define __NR_fadvise64_64	272
-#define __NR_vserver		273
-#define __NR_mbind		274
-#define __NR_get_mempolicy	275
-#define __NR_set_mempolicy	276
-#define __NR_mq_open 		277
-#define __NR_mq_unlink		(__NR_mq_open+1)
-#define __NR_mq_timedsend	(__NR_mq_open+2)
-#define __NR_mq_timedreceive	(__NR_mq_open+3)
-#define __NR_mq_notify		(__NR_mq_open+4)
-#define __NR_mq_getsetattr	(__NR_mq_open+5)
-#define __NR_kexec_load		283
-#define __NR_waitid		284
-/* #define __NR_sys_setaltroot	285 */
-#define __NR_add_key		286
-#define __NR_request_key	287
-#define __NR_keyctl		288
-#define __NR_ioprio_set		289
-#define __NR_ioprio_get		290
-#define __NR_inotify_init	291
-#define __NR_inotify_add_watch	292
-#define __NR_inotify_rm_watch	293
-#define __NR_migrate_pages	294
-#define __NR_openat		295
-#define __NR_mkdirat		296
-#define __NR_mknodat		297
-#define __NR_fchownat		298
-#define __NR_futimesat		299
-#define __NR_fstatat64		300
-#define __NR_unlinkat		301
-#define __NR_renameat		302
-#define __NR_linkat		303
-#define __NR_symlinkat		304
-#define __NR_readlinkat		305
-#define __NR_fchmodat		306
-#define __NR_faccessat		307
-#define __NR_pselect6		308
-#define __NR_ppoll		309
-#define __NR_unshare		310
-#define __NR_set_robust_list	311
-#define __NR_get_robust_list	312
-#define __NR_splice		313
-#define __NR_sync_file_range	314
-#define __NR_tee		315
-#define __NR_vmsplice		316
-#define __NR_move_pages		317
-#define __NR_getcpu		318
-#define __NR_epoll_pwait	319
-#define __NR_utimensat		320
-#define __NR_signalfd		321
-#define __NR_timerfd_create	322
-#define __NR_eventfd		323
-#define __NR_fallocate		324
-#define __NR_timerfd_settime	325
-#define __NR_timerfd_gettime	326
-#define __NR_signalfd4		327
-#define __NR_eventfd2		328
-#define __NR_epoll_create1	329
-#define __NR_dup3		330
-#define __NR_pipe2		331
-#define __NR_inotify_init1	332
-#define __NR_preadv		333
-#define __NR_pwritev		334
-#define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_event_open	336
-#define __NR_recvmmsg		337
-#define __NR_fanotify_init	338
-#define __NR_fanotify_mark	339
-#define __NR_prlimit64		340
-#define __NR_name_to_handle_at	341
-#define __NR_open_by_handle_at  342
-#define __NR_clock_adjtime	343
-#define __NR_syncfs             344
-#define __NR_sendmmsg		345
-#define __NR_setns		346
-#define __NR_process_vm_readv	347
-#define __NR_process_vm_writev	348
-
-#ifdef __KERNEL__
-
-#define NR_syscalls 349
-
-#define __ARCH_WANT_IPC_PARSE_VERSION
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_IPC
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLD_MMAP
-#define __ARCH_WANT_SYS_OLD_SELECT
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#ifndef cond_syscall
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 0431f193c3f2..000000000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,732 +0,0 @@
-#ifndef _ASM_X86_UNISTD_64_H
-#define _ASM_X86_UNISTD_64_H
-
-#ifndef __SYSCALL
-#define __SYSCALL(a, b)
-#endif
-
-/*
- * This file contains the system call numbers.
- *
- * Note: holes are not allowed.
- */
-
-/* at least 8 syscall per cacheline */
-#define __NR_read				0
-__SYSCALL(__NR_read, sys_read)
-#define __NR_write				1
-__SYSCALL(__NR_write, sys_write)
-#define __NR_open				2
-__SYSCALL(__NR_open, sys_open)
-#define __NR_close				3
-__SYSCALL(__NR_close, sys_close)
-#define __NR_stat				4
-__SYSCALL(__NR_stat, sys_newstat)
-#define __NR_fstat				5
-__SYSCALL(__NR_fstat, sys_newfstat)
-#define __NR_lstat				6
-__SYSCALL(__NR_lstat, sys_newlstat)
-#define __NR_poll				7
-__SYSCALL(__NR_poll, sys_poll)
-
-#define __NR_lseek				8
-__SYSCALL(__NR_lseek, sys_lseek)
-#define __NR_mmap				9
-__SYSCALL(__NR_mmap, sys_mmap)
-#define __NR_mprotect				10
-__SYSCALL(__NR_mprotect, sys_mprotect)
-#define __NR_munmap				11
-__SYSCALL(__NR_munmap, sys_munmap)
-#define __NR_brk				12
-__SYSCALL(__NR_brk, sys_brk)
-#define __NR_rt_sigaction			13
-__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
-#define __NR_rt_sigprocmask			14
-__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
-#define __NR_rt_sigreturn			15
-__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
-
-#define __NR_ioctl				16
-__SYSCALL(__NR_ioctl, sys_ioctl)
-#define __NR_pread64				17
-__SYSCALL(__NR_pread64, sys_pread64)
-#define __NR_pwrite64				18
-__SYSCALL(__NR_pwrite64, sys_pwrite64)
-#define __NR_readv				19
-__SYSCALL(__NR_readv, sys_readv)
-#define __NR_writev				20
-__SYSCALL(__NR_writev, sys_writev)
-#define __NR_access				21
-__SYSCALL(__NR_access, sys_access)
-#define __NR_pipe				22
-__SYSCALL(__NR_pipe, sys_pipe)
-#define __NR_select				23
-__SYSCALL(__NR_select, sys_select)
-
-#define __NR_sched_yield			24
-__SYSCALL(__NR_sched_yield, sys_sched_yield)
-#define __NR_mremap				25
-__SYSCALL(__NR_mremap, sys_mremap)
-#define __NR_msync				26
-__SYSCALL(__NR_msync, sys_msync)
-#define __NR_mincore				27
-__SYSCALL(__NR_mincore, sys_mincore)
-#define __NR_madvise				28
-__SYSCALL(__NR_madvise, sys_madvise)
-#define __NR_shmget				29
-__SYSCALL(__NR_shmget, sys_shmget)
-#define __NR_shmat				30
-__SYSCALL(__NR_shmat, sys_shmat)
-#define __NR_shmctl				31
-__SYSCALL(__NR_shmctl, sys_shmctl)
-
-#define __NR_dup				32
-__SYSCALL(__NR_dup, sys_dup)
-#define __NR_dup2				33
-__SYSCALL(__NR_dup2, sys_dup2)
-#define __NR_pause				34
-__SYSCALL(__NR_pause, sys_pause)
-#define __NR_nanosleep				35
-__SYSCALL(__NR_nanosleep, sys_nanosleep)
-#define __NR_getitimer				36
-__SYSCALL(__NR_getitimer, sys_getitimer)
-#define __NR_alarm				37
-__SYSCALL(__NR_alarm, sys_alarm)
-#define __NR_setitimer				38
-__SYSCALL(__NR_setitimer, sys_setitimer)
-#define __NR_getpid				39
-__SYSCALL(__NR_getpid, sys_getpid)
-
-#define __NR_sendfile				40
-__SYSCALL(__NR_sendfile, sys_sendfile64)
-#define __NR_socket				41
-__SYSCALL(__NR_socket, sys_socket)
-#define __NR_connect				42
-__SYSCALL(__NR_connect, sys_connect)
-#define __NR_accept				43
-__SYSCALL(__NR_accept, sys_accept)
-#define __NR_sendto				44
-__SYSCALL(__NR_sendto, sys_sendto)
-#define __NR_recvfrom				45
-__SYSCALL(__NR_recvfrom, sys_recvfrom)
-#define __NR_sendmsg				46
-__SYSCALL(__NR_sendmsg, sys_sendmsg)
-#define __NR_recvmsg				47
-__SYSCALL(__NR_recvmsg, sys_recvmsg)
-
-#define __NR_shutdown				48
-__SYSCALL(__NR_shutdown, sys_shutdown)
-#define __NR_bind				49
-__SYSCALL(__NR_bind, sys_bind)
-#define __NR_listen				50
-__SYSCALL(__NR_listen, sys_listen)
-#define __NR_getsockname			51
-__SYSCALL(__NR_getsockname, sys_getsockname)
-#define __NR_getpeername			52
-__SYSCALL(__NR_getpeername, sys_getpeername)
-#define __NR_socketpair				53
-__SYSCALL(__NR_socketpair, sys_socketpair)
-#define __NR_setsockopt				54
-__SYSCALL(__NR_setsockopt, sys_setsockopt)
-#define __NR_getsockopt				55
-__SYSCALL(__NR_getsockopt, sys_getsockopt)
-
-#define __NR_clone				56
-__SYSCALL(__NR_clone, stub_clone)
-#define __NR_fork				57
-__SYSCALL(__NR_fork, stub_fork)
-#define __NR_vfork				58
-__SYSCALL(__NR_vfork, stub_vfork)
-#define __NR_execve				59
-__SYSCALL(__NR_execve, stub_execve)
-#define __NR_exit				60
-__SYSCALL(__NR_exit, sys_exit)
-#define __NR_wait4				61
-__SYSCALL(__NR_wait4, sys_wait4)
-#define __NR_kill				62
-__SYSCALL(__NR_kill, sys_kill)
-#define __NR_uname				63
-__SYSCALL(__NR_uname, sys_newuname)
-
-#define __NR_semget				64
-__SYSCALL(__NR_semget, sys_semget)
-#define __NR_semop				65
-__SYSCALL(__NR_semop, sys_semop)
-#define __NR_semctl				66
-__SYSCALL(__NR_semctl, sys_semctl)
-#define __NR_shmdt				67
-__SYSCALL(__NR_shmdt, sys_shmdt)
-#define __NR_msgget				68
-__SYSCALL(__NR_msgget, sys_msgget)
-#define __NR_msgsnd				69
-__SYSCALL(__NR_msgsnd, sys_msgsnd)
-#define __NR_msgrcv				70
-__SYSCALL(__NR_msgrcv, sys_msgrcv)
-#define __NR_msgctl				71
-__SYSCALL(__NR_msgctl, sys_msgctl)
-
-#define __NR_fcntl				72
-__SYSCALL(__NR_fcntl, sys_fcntl)
-#define __NR_flock				73
-__SYSCALL(__NR_flock, sys_flock)
-#define __NR_fsync				74
-__SYSCALL(__NR_fsync, sys_fsync)
-#define __NR_fdatasync				75
-__SYSCALL(__NR_fdatasync, sys_fdatasync)
-#define __NR_truncate				76
-__SYSCALL(__NR_truncate, sys_truncate)
-#define __NR_ftruncate				77
-__SYSCALL(__NR_ftruncate, sys_ftruncate)
-#define __NR_getdents				78
-__SYSCALL(__NR_getdents, sys_getdents)
-#define __NR_getcwd				79
-__SYSCALL(__NR_getcwd, sys_getcwd)
-
-#define __NR_chdir				80
-__SYSCALL(__NR_chdir, sys_chdir)
-#define __NR_fchdir				81
-__SYSCALL(__NR_fchdir, sys_fchdir)
-#define __NR_rename				82
-__SYSCALL(__NR_rename, sys_rename)
-#define __NR_mkdir				83
-__SYSCALL(__NR_mkdir, sys_mkdir)
-#define __NR_rmdir				84
-__SYSCALL(__NR_rmdir, sys_rmdir)
-#define __NR_creat				85
-__SYSCALL(__NR_creat, sys_creat)
-#define __NR_link				86
-__SYSCALL(__NR_link, sys_link)
-#define __NR_unlink				87
-__SYSCALL(__NR_unlink, sys_unlink)
-
-#define __NR_symlink				88
-__SYSCALL(__NR_symlink, sys_symlink)
-#define __NR_readlink				89
-__SYSCALL(__NR_readlink, sys_readlink)
-#define __NR_chmod				90
-__SYSCALL(__NR_chmod, sys_chmod)
-#define __NR_fchmod				91
-__SYSCALL(__NR_fchmod, sys_fchmod)
-#define __NR_chown				92
-__SYSCALL(__NR_chown, sys_chown)
-#define __NR_fchown				93
-__SYSCALL(__NR_fchown, sys_fchown)
-#define __NR_lchown				94
-__SYSCALL(__NR_lchown, sys_lchown)
-#define __NR_umask				95
-__SYSCALL(__NR_umask, sys_umask)
-
-#define __NR_gettimeofday			96
-__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
-#define __NR_getrlimit				97
-__SYSCALL(__NR_getrlimit, sys_getrlimit)
-#define __NR_getrusage				98
-__SYSCALL(__NR_getrusage, sys_getrusage)
-#define __NR_sysinfo				99
-__SYSCALL(__NR_sysinfo, sys_sysinfo)
-#define __NR_times				100
-__SYSCALL(__NR_times, sys_times)
-#define __NR_ptrace				101
-__SYSCALL(__NR_ptrace, sys_ptrace)
-#define __NR_getuid				102
-__SYSCALL(__NR_getuid, sys_getuid)
-#define __NR_syslog				103
-__SYSCALL(__NR_syslog, sys_syslog)
-
-/* at the very end the stuff that never runs during the benchmarks */
-#define __NR_getgid				104
-__SYSCALL(__NR_getgid, sys_getgid)
-#define __NR_setuid				105
-__SYSCALL(__NR_setuid, sys_setuid)
-#define __NR_setgid				106
-__SYSCALL(__NR_setgid, sys_setgid)
-#define __NR_geteuid				107
-__SYSCALL(__NR_geteuid, sys_geteuid)
-#define __NR_getegid				108
-__SYSCALL(__NR_getegid, sys_getegid)
-#define __NR_setpgid				109
-__SYSCALL(__NR_setpgid, sys_setpgid)
-#define __NR_getppid				110
-__SYSCALL(__NR_getppid, sys_getppid)
-#define __NR_getpgrp				111
-__SYSCALL(__NR_getpgrp, sys_getpgrp)
-
-#define __NR_setsid				112
-__SYSCALL(__NR_setsid, sys_setsid)
-#define __NR_setreuid				113
-__SYSCALL(__NR_setreuid, sys_setreuid)
-#define __NR_setregid				114
-__SYSCALL(__NR_setregid, sys_setregid)
-#define __NR_getgroups				115
-__SYSCALL(__NR_getgroups, sys_getgroups)
-#define __NR_setgroups				116
-__SYSCALL(__NR_setgroups, sys_setgroups)
-#define __NR_setresuid				117
-__SYSCALL(__NR_setresuid, sys_setresuid)
-#define __NR_getresuid				118
-__SYSCALL(__NR_getresuid, sys_getresuid)
-#define __NR_setresgid				119
-__SYSCALL(__NR_setresgid, sys_setresgid)
-
-#define __NR_getresgid				120
-__SYSCALL(__NR_getresgid, sys_getresgid)
-#define __NR_getpgid				121
-__SYSCALL(__NR_getpgid, sys_getpgid)
-#define __NR_setfsuid				122
-__SYSCALL(__NR_setfsuid, sys_setfsuid)
-#define __NR_setfsgid				123
-__SYSCALL(__NR_setfsgid, sys_setfsgid)
-#define __NR_getsid				124
-__SYSCALL(__NR_getsid, sys_getsid)
-#define __NR_capget				125
-__SYSCALL(__NR_capget, sys_capget)
-#define __NR_capset				126
-__SYSCALL(__NR_capset, sys_capset)
-
-#define __NR_rt_sigpending			127
-__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
-#define __NR_rt_sigtimedwait			128
-__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
-#define __NR_rt_sigqueueinfo			129
-__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
-#define __NR_rt_sigsuspend			130
-__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
-#define __NR_sigaltstack			131
-__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
-#define __NR_utime				132
-__SYSCALL(__NR_utime, sys_utime)
-#define __NR_mknod				133
-__SYSCALL(__NR_mknod, sys_mknod)
-
-/* Only needed for a.out */
-#define __NR_uselib				134
-__SYSCALL(__NR_uselib, sys_ni_syscall)
-#define __NR_personality			135
-__SYSCALL(__NR_personality, sys_personality)
-
-#define __NR_ustat				136
-__SYSCALL(__NR_ustat, sys_ustat)
-#define __NR_statfs				137
-__SYSCALL(__NR_statfs, sys_statfs)
-#define __NR_fstatfs				138
-__SYSCALL(__NR_fstatfs, sys_fstatfs)
-#define __NR_sysfs				139
-__SYSCALL(__NR_sysfs, sys_sysfs)
-
-#define __NR_getpriority			140
-__SYSCALL(__NR_getpriority, sys_getpriority)
-#define __NR_setpriority			141
-__SYSCALL(__NR_setpriority, sys_setpriority)
-#define __NR_sched_setparam			142
-__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
-#define __NR_sched_getparam			143
-__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
-#define __NR_sched_setscheduler			144
-__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
-#define __NR_sched_getscheduler			145
-__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
-#define __NR_sched_get_priority_max		146
-__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
-#define __NR_sched_get_priority_min		147
-__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
-#define __NR_sched_rr_get_interval		148
-__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
-
-#define __NR_mlock				149
-__SYSCALL(__NR_mlock, sys_mlock)
-#define __NR_munlock				150
-__SYSCALL(__NR_munlock, sys_munlock)
-#define __NR_mlockall				151
-__SYSCALL(__NR_mlockall, sys_mlockall)
-#define __NR_munlockall				152
-__SYSCALL(__NR_munlockall, sys_munlockall)
-
-#define __NR_vhangup				153
-__SYSCALL(__NR_vhangup, sys_vhangup)
-
-#define __NR_modify_ldt				154
-__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
-
-#define __NR_pivot_root				155
-__SYSCALL(__NR_pivot_root, sys_pivot_root)
-
-#define __NR__sysctl				156
-__SYSCALL(__NR__sysctl, sys_sysctl)
-
-#define __NR_prctl				157
-__SYSCALL(__NR_prctl, sys_prctl)
-#define __NR_arch_prctl				158
-__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
-
-#define __NR_adjtimex				159
-__SYSCALL(__NR_adjtimex, sys_adjtimex)
-
-#define __NR_setrlimit				160
-__SYSCALL(__NR_setrlimit, sys_setrlimit)
-
-#define __NR_chroot				161
-__SYSCALL(__NR_chroot, sys_chroot)
-
-#define __NR_sync				162
-__SYSCALL(__NR_sync, sys_sync)
-
-#define __NR_acct				163
-__SYSCALL(__NR_acct, sys_acct)
-
-#define __NR_settimeofday			164
-__SYSCALL(__NR_settimeofday, sys_settimeofday)
-
-#define __NR_mount				165
-__SYSCALL(__NR_mount, sys_mount)
-#define __NR_umount2				166
-__SYSCALL(__NR_umount2, sys_umount)
-
-#define __NR_swapon				167
-__SYSCALL(__NR_swapon, sys_swapon)
-#define __NR_swapoff				168
-__SYSCALL(__NR_swapoff, sys_swapoff)
-
-#define __NR_reboot				169
-__SYSCALL(__NR_reboot, sys_reboot)
-
-#define __NR_sethostname			170
-__SYSCALL(__NR_sethostname, sys_sethostname)
-#define __NR_setdomainname			171
-__SYSCALL(__NR_setdomainname, sys_setdomainname)
-
-#define __NR_iopl				172
-__SYSCALL(__NR_iopl, stub_iopl)
-#define __NR_ioperm				173
-__SYSCALL(__NR_ioperm, sys_ioperm)
-
-#define __NR_create_module			174
-__SYSCALL(__NR_create_module, sys_ni_syscall)
-#define __NR_init_module			175
-__SYSCALL(__NR_init_module, sys_init_module)
-#define __NR_delete_module			176
-__SYSCALL(__NR_delete_module, sys_delete_module)
-#define __NR_get_kernel_syms			177
-__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
-#define __NR_query_module			178
-__SYSCALL(__NR_query_module, sys_ni_syscall)
-
-#define __NR_quotactl				179
-__SYSCALL(__NR_quotactl, sys_quotactl)
-
-#define __NR_nfsservctl				180
-__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
-
-/* reserved for LiS/STREAMS */
-#define __NR_getpmsg				181
-__SYSCALL(__NR_getpmsg, sys_ni_syscall)
-#define __NR_putpmsg				182
-__SYSCALL(__NR_putpmsg, sys_ni_syscall)
-
-/* reserved for AFS */
-#define __NR_afs_syscall			183
-__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
-
-/* reserved for tux */
-#define __NR_tuxcall				184
-__SYSCALL(__NR_tuxcall, sys_ni_syscall)
-
-#define __NR_security				185
-__SYSCALL(__NR_security, sys_ni_syscall)
-
-#define __NR_gettid				186
-__SYSCALL(__NR_gettid, sys_gettid)
-
-#define __NR_readahead				187
-__SYSCALL(__NR_readahead, sys_readahead)
-#define __NR_setxattr				188
-__SYSCALL(__NR_setxattr, sys_setxattr)
-#define __NR_lsetxattr				189
-__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
-#define __NR_fsetxattr				190
-__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
-#define __NR_getxattr				191
-__SYSCALL(__NR_getxattr, sys_getxattr)
-#define __NR_lgetxattr				192
-__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
-#define __NR_fgetxattr				193
-__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
-#define __NR_listxattr				194
-__SYSCALL(__NR_listxattr, sys_listxattr)
-#define __NR_llistxattr				195
-__SYSCALL(__NR_llistxattr, sys_llistxattr)
-#define __NR_flistxattr				196
-__SYSCALL(__NR_flistxattr, sys_flistxattr)
-#define __NR_removexattr			197
-__SYSCALL(__NR_removexattr, sys_removexattr)
-#define __NR_lremovexattr			198
-__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
-#define __NR_fremovexattr			199
-__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
-#define __NR_tkill				200
-__SYSCALL(__NR_tkill, sys_tkill)
-#define __NR_time				201
-__SYSCALL(__NR_time, sys_time)
-#define __NR_futex				202
-__SYSCALL(__NR_futex, sys_futex)
-#define __NR_sched_setaffinity			203
-__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
-#define __NR_sched_getaffinity			204
-__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
-#define __NR_set_thread_area			205
-__SYSCALL(__NR_set_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_io_setup				206
-__SYSCALL(__NR_io_setup, sys_io_setup)
-#define __NR_io_destroy				207
-__SYSCALL(__NR_io_destroy, sys_io_destroy)
-#define __NR_io_getevents			208
-__SYSCALL(__NR_io_getevents, sys_io_getevents)
-#define __NR_io_submit				209
-__SYSCALL(__NR_io_submit, sys_io_submit)
-#define __NR_io_cancel				210
-__SYSCALL(__NR_io_cancel, sys_io_cancel)
-#define __NR_get_thread_area			211
-__SYSCALL(__NR_get_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_lookup_dcookie			212
-__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
-#define __NR_epoll_create			213
-__SYSCALL(__NR_epoll_create, sys_epoll_create)
-#define __NR_epoll_ctl_old			214
-__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
-#define __NR_epoll_wait_old			215
-__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
-#define __NR_remap_file_pages			216
-__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
-#define __NR_getdents64				217
-__SYSCALL(__NR_getdents64, sys_getdents64)
-#define __NR_set_tid_address			218
-__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
-#define __NR_restart_syscall			219
-__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
-#define __NR_semtimedop				220
-__SYSCALL(__NR_semtimedop, sys_semtimedop)
-#define __NR_fadvise64				221
-__SYSCALL(__NR_fadvise64, sys_fadvise64)
-#define __NR_timer_create			222
-__SYSCALL(__NR_timer_create, sys_timer_create)
-#define __NR_timer_settime			223
-__SYSCALL(__NR_timer_settime, sys_timer_settime)
-#define __NR_timer_gettime			224
-__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
-#define __NR_timer_getoverrun			225
-__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
-#define __NR_timer_delete			226
-__SYSCALL(__NR_timer_delete, sys_timer_delete)
-#define __NR_clock_settime			227
-__SYSCALL(__NR_clock_settime, sys_clock_settime)
-#define __NR_clock_gettime			228
-__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
-#define __NR_clock_getres			229
-__SYSCALL(__NR_clock_getres, sys_clock_getres)
-#define __NR_clock_nanosleep			230
-__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
-#define __NR_exit_group				231
-__SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_epoll_wait				232
-__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
-#define __NR_epoll_ctl				233
-__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
-#define __NR_tgkill				234
-__SYSCALL(__NR_tgkill, sys_tgkill)
-#define __NR_utimes				235
-__SYSCALL(__NR_utimes, sys_utimes)
-#define __NR_vserver				236
-__SYSCALL(__NR_vserver, sys_ni_syscall)
-#define __NR_mbind				237
-__SYSCALL(__NR_mbind, sys_mbind)
-#define __NR_set_mempolicy			238
-__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
-#define __NR_get_mempolicy			239
-__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
-#define __NR_mq_open				240
-__SYSCALL(__NR_mq_open, sys_mq_open)
-#define __NR_mq_unlink				241
-__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
-#define __NR_mq_timedsend			242
-__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
-#define __NR_mq_timedreceive			243
-__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
-#define __NR_mq_notify				244
-__SYSCALL(__NR_mq_notify, sys_mq_notify)
-#define __NR_mq_getsetattr			245
-__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
-#define __NR_kexec_load				246
-__SYSCALL(__NR_kexec_load, sys_kexec_load)
-#define __NR_waitid				247
-__SYSCALL(__NR_waitid, sys_waitid)
-#define __NR_add_key				248
-__SYSCALL(__NR_add_key, sys_add_key)
-#define __NR_request_key			249
-__SYSCALL(__NR_request_key, sys_request_key)
-#define __NR_keyctl				250
-__SYSCALL(__NR_keyctl, sys_keyctl)
-#define __NR_ioprio_set				251
-__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
-#define __NR_ioprio_get				252
-__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
-#define __NR_inotify_init			253
-__SYSCALL(__NR_inotify_init, sys_inotify_init)
-#define __NR_inotify_add_watch			254
-__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
-#define __NR_inotify_rm_watch			255
-__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
-#define __NR_migrate_pages			256
-__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
-#define __NR_openat				257
-__SYSCALL(__NR_openat, sys_openat)
-#define __NR_mkdirat				258
-__SYSCALL(__NR_mkdirat, sys_mkdirat)
-#define __NR_mknodat				259
-__SYSCALL(__NR_mknodat, sys_mknodat)
-#define __NR_fchownat				260
-__SYSCALL(__NR_fchownat, sys_fchownat)
-#define __NR_futimesat				261
-__SYSCALL(__NR_futimesat, sys_futimesat)
-#define __NR_newfstatat				262
-__SYSCALL(__NR_newfstatat, sys_newfstatat)
-#define __NR_unlinkat				263
-__SYSCALL(__NR_unlinkat, sys_unlinkat)
-#define __NR_renameat				264
-__SYSCALL(__NR_renameat, sys_renameat)
-#define __NR_linkat				265
-__SYSCALL(__NR_linkat, sys_linkat)
-#define __NR_symlinkat				266
-__SYSCALL(__NR_symlinkat, sys_symlinkat)
-#define __NR_readlinkat				267
-__SYSCALL(__NR_readlinkat, sys_readlinkat)
-#define __NR_fchmodat				268
-__SYSCALL(__NR_fchmodat, sys_fchmodat)
-#define __NR_faccessat				269
-__SYSCALL(__NR_faccessat, sys_faccessat)
-#define __NR_pselect6				270
-__SYSCALL(__NR_pselect6, sys_pselect6)
-#define __NR_ppoll				271
-__SYSCALL(__NR_ppoll,	sys_ppoll)
-#define __NR_unshare				272
-__SYSCALL(__NR_unshare,	sys_unshare)
-#define __NR_set_robust_list			273
-__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
-#define __NR_get_robust_list			274
-__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
-#define __NR_splice				275
-__SYSCALL(__NR_splice, sys_splice)
-#define __NR_tee				276
-__SYSCALL(__NR_tee, sys_tee)
-#define __NR_sync_file_range			277
-__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
-#define __NR_vmsplice				278
-__SYSCALL(__NR_vmsplice, sys_vmsplice)
-#define __NR_move_pages				279
-__SYSCALL(__NR_move_pages, sys_move_pages)
-#define __NR_utimensat				280
-__SYSCALL(__NR_utimensat, sys_utimensat)
-#define __NR_epoll_pwait			281
-__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
-#define __NR_signalfd				282
-__SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_timerfd_create			283
-__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
-#define __NR_eventfd				284
-__SYSCALL(__NR_eventfd, sys_eventfd)
-#define __NR_fallocate				285
-__SYSCALL(__NR_fallocate, sys_fallocate)
-#define __NR_timerfd_settime			286
-__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
-#define __NR_timerfd_gettime			287
-__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_accept4				288
-__SYSCALL(__NR_accept4, sys_accept4)
-#define __NR_signalfd4				289
-__SYSCALL(__NR_signalfd4, sys_signalfd4)
-#define __NR_eventfd2				290
-__SYSCALL(__NR_eventfd2, sys_eventfd2)
-#define __NR_epoll_create1			291
-__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
-#define __NR_dup3				292
-__SYSCALL(__NR_dup3, sys_dup3)
-#define __NR_pipe2				293
-__SYSCALL(__NR_pipe2, sys_pipe2)
-#define __NR_inotify_init1			294
-__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-#define __NR_preadv				295
-__SYSCALL(__NR_preadv, sys_preadv)
-#define __NR_pwritev				296
-__SYSCALL(__NR_pwritev, sys_pwritev)
-#define __NR_rt_tgsigqueueinfo			297
-__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_event_open			298
-__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
-#define __NR_recvmmsg				299
-__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
-#define __NR_fanotify_init			300
-__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
-#define __NR_fanotify_mark			301
-__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
-#define __NR_prlimit64				302
-__SYSCALL(__NR_prlimit64, sys_prlimit64)
-#define __NR_name_to_handle_at			303
-__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
-#define __NR_open_by_handle_at			304
-__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
-#define __NR_clock_adjtime			305
-__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
-#define __NR_syncfs                             306
-__SYSCALL(__NR_syncfs, sys_syncfs)
-#define __NR_sendmmsg				307
-__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
-#define __NR_setns				308
-__SYSCALL(__NR_setns, sys_setns)
-#define __NR_getcpu				309
-__SYSCALL(__NR_getcpu, sys_getcpu)
-#define __NR_process_vm_readv			310
-__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
-#define __NR_process_vm_writev			311
-__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
-
-#ifndef __NO_STUBS
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_COMPAT_SYS_TIME
-#endif	/* __NO_STUBS */
-
-#ifdef __KERNEL__
-
-#ifndef COMPILE_OFFSETS
-#include <asm/asm-offsets.h>
-#define NR_syscalls (__NR_syscall_max + 1)
-#endif
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif	/* __KERNEL__ */
-
-#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..8c473d9d0b83 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-y			+= syscall_$(BITS).o
+obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e68067..85d98ab15cdc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
 
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
 
@@ -76,4 +81,7 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
+	BLANK();
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af22..834e897b1e25 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
 #include <asm/ia32.h>
 
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
 };
 
 int main(void)
@@ -72,7 +73,11 @@ int main(void)
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	BLANK();
 
-	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+	DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+	DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
 
 	return 0;
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..1ffcda22c2f6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -81,8 +81,6 @@
  * enough to patch inline, increasing performance.
  */
 
-#define nr_syscalls ((syscall_table_size)/4)
-
 #ifdef CONFIG_PREEMPT
 #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
@@ -423,7 +421,7 @@ sysenter_past_esp:
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
 sysenter_do_call:
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
@@ -504,7 +502,7 @@ ENTRY(system_call)
 					# system call tracing in operation / emulation
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -650,7 +648,7 @@ syscall_trace_entry:
 	movl %esp, %eax
 	call syscall_trace_enter
 	/* What it returned is what we'll actually use.  */
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
 END(syscall_trace_entry)
@@ -690,29 +688,28 @@ END(syscall_badsys)
  * System calls that need a pt_regs pointer.
  */
 #define PTREGSCALL0(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ;  \
 	leal 4(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL1(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL2(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%ecx; \
 	movl (PT_ECX+4)(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL3(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	CFI_STARTPROC; \
 	leal 4(%esp),%eax; \
 	pushl_cfi %eax; \
@@ -737,8 +734,7 @@ PTREGSCALL2(vm86)
 PTREGSCALL1(vm86old)
 
 /* Clone is an oddball.  The 4th arg is in %edi */
-	ALIGN;
-ptregs_clone:
+ENTRY(ptregs_clone)
 	CFI_STARTPROC
 	leal 4(%esp),%eax
 	pushl_cfi %eax
@@ -1209,11 +1205,6 @@ return_to_handler:
 	jmp *%ecx
 #endif
 
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
 /*
  * Some functions should be protected against kprobes
  */
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 000000000000..b37a57336609
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+
+typedef asmlinkage void (*sys_call_ptr_t)(void);
+
+extern asmlinkage void sys_ni_syscall(void);
+
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 0edfafa1b269..7ac7943be02c 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __NO_STUBS
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
+#undef __SYSCALL_64
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
@@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
+#include <asm/syscalls_64.h>
 };
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e31293920..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
-ENTRY(sys_call_table)
-	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
-	.long sys_exit
-	.long ptregs_fork
-	.long sys_read
-	.long sys_write
-	.long sys_open		/* 5 */
-	.long sys_close
-	.long sys_waitpid
-	.long sys_creat
-	.long sys_link
-	.long sys_unlink	/* 10 */
-	.long ptregs_execve
-	.long sys_chdir
-	.long sys_time
-	.long sys_mknod
-	.long sys_chmod		/* 15 */
-	.long sys_lchown16
-	.long sys_ni_syscall	/* old break syscall holder */
-	.long sys_stat
-	.long sys_lseek
-	.long sys_getpid	/* 20 */
-	.long sys_mount
-	.long sys_oldumount
-	.long sys_setuid16
-	.long sys_getuid16
-	.long sys_stime		/* 25 */
-	.long sys_ptrace
-	.long sys_alarm
-	.long sys_fstat
-	.long sys_pause
-	.long sys_utime		/* 30 */
-	.long sys_ni_syscall	/* old stty syscall holder */
-	.long sys_ni_syscall	/* old gtty syscall holder */
-	.long sys_access
-	.long sys_nice
-	.long sys_ni_syscall	/* 35 - old ftime syscall holder */
-	.long sys_sync
-	.long sys_kill
-	.long sys_rename
-	.long sys_mkdir
-	.long sys_rmdir		/* 40 */
-	.long sys_dup
-	.long sys_pipe
-	.long sys_times
-	.long sys_ni_syscall	/* old prof syscall holder */
-	.long sys_brk		/* 45 */
-	.long sys_setgid16
-	.long sys_getgid16
-	.long sys_signal
-	.long sys_geteuid16
-	.long sys_getegid16	/* 50 */
-	.long sys_acct
-	.long sys_umount	/* recycled never used phys() */
-	.long sys_ni_syscall	/* old lock syscall holder */
-	.long sys_ioctl
-	.long sys_fcntl		/* 55 */
-	.long sys_ni_syscall	/* old mpx syscall holder */
-	.long sys_setpgid
-	.long sys_ni_syscall	/* old ulimit syscall holder */
-	.long sys_olduname
-	.long sys_umask		/* 60 */
-	.long sys_chroot
-	.long sys_ustat
-	.long sys_dup2
-	.long sys_getppid
-	.long sys_getpgrp	/* 65 */
-	.long sys_setsid
-	.long sys_sigaction
-	.long sys_sgetmask
-	.long sys_ssetmask
-	.long sys_setreuid16	/* 70 */
-	.long sys_setregid16
-	.long sys_sigsuspend
-	.long sys_sigpending
-	.long sys_sethostname
-	.long sys_setrlimit	/* 75 */
-	.long sys_old_getrlimit
-	.long sys_getrusage
-	.long sys_gettimeofday
-	.long sys_settimeofday
-	.long sys_getgroups16	/* 80 */
-	.long sys_setgroups16
-	.long sys_old_select
-	.long sys_symlink
-	.long sys_lstat
-	.long sys_readlink	/* 85 */
-	.long sys_uselib
-	.long sys_swapon
-	.long sys_reboot
-	.long sys_old_readdir
-	.long sys_old_mmap	/* 90 */
-	.long sys_munmap
-	.long sys_truncate
-	.long sys_ftruncate
-	.long sys_fchmod
-	.long sys_fchown16	/* 95 */
-	.long sys_getpriority
-	.long sys_setpriority
-	.long sys_ni_syscall	/* old profil syscall holder */
-	.long sys_statfs
-	.long sys_fstatfs	/* 100 */
-	.long sys_ioperm
-	.long sys_socketcall
-	.long sys_syslog
-	.long sys_setitimer
-	.long sys_getitimer	/* 105 */
-	.long sys_newstat
-	.long sys_newlstat
-	.long sys_newfstat
-	.long sys_uname
-	.long ptregs_iopl	/* 110 */
-	.long sys_vhangup
-	.long sys_ni_syscall	/* old "idle" system call */
-	.long ptregs_vm86old
-	.long sys_wait4
-	.long sys_swapoff	/* 115 */
-	.long sys_sysinfo
-	.long sys_ipc
-	.long sys_fsync
-	.long ptregs_sigreturn
-	.long ptregs_clone	/* 120 */
-	.long sys_setdomainname
-	.long sys_newuname
-	.long sys_modify_ldt
-	.long sys_adjtimex
-	.long sys_mprotect	/* 125 */
-	.long sys_sigprocmask
-	.long sys_ni_syscall	/* old "create_module" */
-	.long sys_init_module
-	.long sys_delete_module
-	.long sys_ni_syscall	/* 130:	old "get_kernel_syms" */
-	.long sys_quotactl
-	.long sys_getpgid
-	.long sys_fchdir
-	.long sys_bdflush
-	.long sys_sysfs		/* 135 */
-	.long sys_personality
-	.long sys_ni_syscall	/* reserved for afs_syscall */
-	.long sys_setfsuid16
-	.long sys_setfsgid16
-	.long sys_llseek	/* 140 */
-	.long sys_getdents
-	.long sys_select
-	.long sys_flock
-	.long sys_msync
-	.long sys_readv		/* 145 */
-	.long sys_writev
-	.long sys_getsid
-	.long sys_fdatasync
-	.long sys_sysctl
-	.long sys_mlock		/* 150 */
-	.long sys_munlock
-	.long sys_mlockall
-	.long sys_munlockall
-	.long sys_sched_setparam
-	.long sys_sched_getparam   /* 155 */
-	.long sys_sched_setscheduler
-	.long sys_sched_getscheduler
-	.long sys_sched_yield
-	.long sys_sched_get_priority_max
-	.long sys_sched_get_priority_min  /* 160 */
-	.long sys_sched_rr_get_interval
-	.long sys_nanosleep
-	.long sys_mremap
-	.long sys_setresuid16
-	.long sys_getresuid16	/* 165 */
-	.long ptregs_vm86
-	.long sys_ni_syscall	/* Old sys_query_module */
-	.long sys_poll
-	.long sys_ni_syscall	/* Old nfsservctl */
-	.long sys_setresgid16	/* 170 */
-	.long sys_getresgid16
-	.long sys_prctl
-	.long ptregs_rt_sigreturn
-	.long sys_rt_sigaction
-	.long sys_rt_sigprocmask	/* 175 */
-	.long sys_rt_sigpending
-	.long sys_rt_sigtimedwait
-	.long sys_rt_sigqueueinfo
-	.long sys_rt_sigsuspend
-	.long sys_pread64	/* 180 */
-	.long sys_pwrite64
-	.long sys_chown16
-	.long sys_getcwd
-	.long sys_capget
-	.long sys_capset	/* 185 */
-	.long ptregs_sigaltstack
-	.long sys_sendfile
-	.long sys_ni_syscall	/* reserved for streams1 */
-	.long sys_ni_syscall	/* reserved for streams2 */
-	.long ptregs_vfork	/* 190 */
-	.long sys_getrlimit
-	.long sys_mmap_pgoff
-	.long sys_truncate64
-	.long sys_ftruncate64
-	.long sys_stat64	/* 195 */
-	.long sys_lstat64
-	.long sys_fstat64
-	.long sys_lchown
-	.long sys_getuid
-	.long sys_getgid	/* 200 */
-	.long sys_geteuid
-	.long sys_getegid
-	.long sys_setreuid
-	.long sys_setregid
-	.long sys_getgroups	/* 205 */
-	.long sys_setgroups
-	.long sys_fchown
-	.long sys_setresuid
-	.long sys_getresuid
-	.long sys_setresgid	/* 210 */
-	.long sys_getresgid
-	.long sys_chown
-	.long sys_setuid
-	.long sys_setgid
-	.long sys_setfsuid	/* 215 */
-	.long sys_setfsgid
-	.long sys_pivot_root
-	.long sys_mincore
-	.long sys_madvise
-	.long sys_getdents64	/* 220 */
-	.long sys_fcntl64
-	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
-	.long sys_gettid
-	.long sys_readahead	/* 225 */
-	.long sys_setxattr
-	.long sys_lsetxattr
-	.long sys_fsetxattr
-	.long sys_getxattr
-	.long sys_lgetxattr	/* 230 */
-	.long sys_fgetxattr
-	.long sys_listxattr
-	.long sys_llistxattr
-	.long sys_flistxattr
-	.long sys_removexattr	/* 235 */
-	.long sys_lremovexattr
-	.long sys_fremovexattr
-	.long sys_tkill
-	.long sys_sendfile64
-	.long sys_futex		/* 240 */
-	.long sys_sched_setaffinity
-	.long sys_sched_getaffinity
-	.long sys_set_thread_area
-	.long sys_get_thread_area
-	.long sys_io_setup	/* 245 */
-	.long sys_io_destroy
-	.long sys_io_getevents
-	.long sys_io_submit
-	.long sys_io_cancel
-	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
-	.long sys_exit_group
-	.long sys_lookup_dcookie
-	.long sys_epoll_create
-	.long sys_epoll_ctl	/* 255 */
-	.long sys_epoll_wait
- 	.long sys_remap_file_pages
- 	.long sys_set_tid_address
- 	.long sys_timer_create
- 	.long sys_timer_settime		/* 260 */
- 	.long sys_timer_gettime
- 	.long sys_timer_getoverrun
- 	.long sys_timer_delete
- 	.long sys_clock_settime
- 	.long sys_clock_gettime		/* 265 */
- 	.long sys_clock_getres
- 	.long sys_clock_nanosleep
-	.long sys_statfs64
-	.long sys_fstatfs64
-	.long sys_tgkill	/* 270 */
-	.long sys_utimes
- 	.long sys_fadvise64_64
-	.long sys_ni_syscall	/* sys_vserver */
-	.long sys_mbind
-	.long sys_get_mempolicy
-	.long sys_set_mempolicy
-	.long sys_mq_open
-	.long sys_mq_unlink
-	.long sys_mq_timedsend
-	.long sys_mq_timedreceive	/* 280 */
-	.long sys_mq_notify
-	.long sys_mq_getsetattr
-	.long sys_kexec_load
-	.long sys_waitid
-	.long sys_ni_syscall		/* 285 */ /* available */
-	.long sys_add_key
-	.long sys_request_key
-	.long sys_keyctl
-	.long sys_ioprio_set
-	.long sys_ioprio_get		/* 290 */
-	.long sys_inotify_init
-	.long sys_inotify_add_watch
-	.long sys_inotify_rm_watch
-	.long sys_migrate_pages
-	.long sys_openat		/* 295 */
-	.long sys_mkdirat
-	.long sys_mknodat
-	.long sys_fchownat
-	.long sys_futimesat
-	.long sys_fstatat64		/* 300 */
-	.long sys_unlinkat
-	.long sys_renameat
-	.long sys_linkat
-	.long sys_symlinkat
-	.long sys_readlinkat		/* 305 */
-	.long sys_fchmodat
-	.long sys_faccessat
-	.long sys_pselect6
-	.long sys_ppoll
-	.long sys_unshare		/* 310 */
-	.long sys_set_robust_list
-	.long sys_get_robust_list
-	.long sys_splice
-	.long sys_sync_file_range
-	.long sys_tee			/* 315 */
-	.long sys_vmsplice
-	.long sys_move_pages
-	.long sys_getcpu
-	.long sys_epoll_pwait
-	.long sys_utimensat		/* 320 */
-	.long sys_signalfd
-	.long sys_timerfd_create
-	.long sys_eventfd
-	.long sys_fallocate
-	.long sys_timerfd_settime	/* 325 */
-	.long sys_timerfd_gettime
-	.long sys_signalfd4
-	.long sys_eventfd2
-	.long sys_epoll_create1
-	.long sys_dup3			/* 330 */
-	.long sys_pipe2
-	.long sys_inotify_init1
-	.long sys_preadv
-	.long sys_pwritev
-	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_event_open
-	.long sys_recvmmsg
-	.long sys_fanotify_init
-	.long sys_fanotify_mark
-	.long sys_prlimit64		/* 340 */
-	.long sys_name_to_handle_at
-	.long sys_open_by_handle_at
-	.long sys_clock_adjtime
-	.long sys_syncfs
-	.long sys_sendmmsg		/* 345 */
-	.long sys_setns
-	.long sys_process_vm_readv
-	.long sys_process_vm_writev
-- 
cgit v1.2.3


From f14525f9e033f344996905744f41680ea2b877ce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 16:03:27 -0800
Subject: x86: Simplify syscallhdr.sh

Simplify syscallhdr.sh by letting grep sort out the ABIs that we want,
rather than relying on manual list matching.  This is safe since the
ABI strings already have to consist only of characters which are valid in C
macro names.

Suggested-by: Matt Helsley <matthltc@us.ibm.com>
Link: http://lkml.kernel.org/r/20111118221558.GA6408@count0.beaverton.ibm.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index 0d473ff12eaf..b3c593072785 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -2,33 +2,20 @@
 
 in="$1"
 out="$2"
-my_abis=`echo "$3" | tr ',' ' '`
+my_abis=`echo "($3)" | tr ',' '|'`
 prefix="$4"
 offset="$5"
 
 fileguard=_ASM_X86_`basename "$out" | sed \
     -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
     -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-
-in_list () {
-    local x
-    for x in $1; do
-	if [ x"$x" = x"$2" ]; then
-	    return 0
-	fi
-    done
-    return 1
-}
-
-grep '^[0-9]' "$in" | sort -n | (
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo "#ifndef ${fileguard}"
     echo "#define ${fileguard} 1"
     echo ""
 
     while read nr abi name entry ; do
-	if in_list "$my_abis" "$abi"; then
-	    echo "#define __NR_${prefix}${name}" $((nr+offset))
-        fi
+	echo "#define __NR_${prefix}${name}" $((nr+offset))
     done
 
     echo ""
-- 
cgit v1.2.3


From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 16:25:07 -0800
Subject: x86, syscall: Re-fix typo in comment

Fix the same typo as was fixed in:

b7641d2c x86-64, syscall: Adjust comment spacing and remove typo

... for the new versions of this file (32-bit and IA32 compat).

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com
---
 arch/x86/ia32/syscall_ia32.c | 2 +-
 arch/x86/kernel/syscall_32.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
index d04d3dbc47d4..4754ba0f5d9f 100644
--- a/arch/x86/ia32/syscall_ia32.c
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -17,7 +17,7 @@ extern void compat_ni_syscall(void);
 
 const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index b37a57336609..147fcd4941c4 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-- 
cgit v1.2.3


From 3f86886c72fb68088162c7e08cc7f85282f1860c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 17:01:19 -0800
Subject: x86, syscall: Allow syscall offset to be symbolic

Allow the specified syscall offset to be symbolic, e.g. a macro.  For
offset system calls, this if nothing else makes the generated code
easier to read.

Suggested-by: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1321569446-20433-7-git-send-email-hpa@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index b3c593072785..31fd5f1f38f7 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -15,7 +15,11 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo ""
 
     while read nr abi name entry ; do
-	echo "#define __NR_${prefix}${name}" $((nr+offset))
+	if [ -z "$offset" ]; then
+	    echo "#define __NR_${prefix}${name} $nr"
+	else
+	    echo "#define __NR_${prefix}${name} ($offset + $nr)"
+        fi
     done
 
     echo ""
-- 
cgit v1.2.3


From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:29 +0900
Subject: x86: Check stack overflow in detail

Currently, only kernel stack is checked for the overflow, which
is not sufficient for systems that need a high reliability. To
enhance it, it is required to check the IRQ and exception
stacks, as well.

This patch checks all the stack types and will cause messages of
stacks in detail when free stack space drops below a certain
limit except user stack.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/Kconfig.debug   |  7 +++++--
 arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..4caec1261f12 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
 	---help---
-	  This option will cause messages to be printed if free stack space
-	  drops below a certain limit.
+	  Say Y here if you want to check the overflows of kernel, IRQ
+	  and exception stacks. This option will cause messages of the
+	  stacks in detail when free stack space drops below a certain
+	  limit.
+	  If in doubt, say "N".
 
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca468c47a..928a7e909619 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+	struct orig_ist *oist;
+	u64 irq_stack_top, irq_stack_bottom;
+	u64 estack_top, estack_bottom;
 	u64 curbase = (u64)task_stack_page(current);
 
 	if (user_mode_vm(regs))
 		return;
 
-	WARN_ONCE(regs->sp >= curbase &&
-		  regs->sp <= curbase + THREAD_SIZE &&
-		  regs->sp <  curbase + sizeof(struct thread_info) +
-					sizeof(struct pt_regs) + 128,
+	if (regs->sp >= curbase &&
+	    regs->sp <= curbase + THREAD_SIZE &&
+	    regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128)
+		return;
+
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+		return;
+
+	oist = &__get_cpu_var(orig_ist);
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+		return;
 
-		  "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
-			current->comm, curbase, regs->sp);
+	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+		current->comm, curbase, regs->sp,
+		irq_stack_top, irq_stack_bottom,
+		estack_top, estack_bottom);
 #endif
 }
 
-- 
cgit v1.2.3


From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:36 +0900
Subject: x86: Panic on detection of stack overflow

Currently, messages are just output on the detection of stack
overflow, which is not sufficient for systems that need a
high reliability. This is because in general the overflow may
corrupt data, and the additional corruption may occur due to
reading them unless systems stop.

This patch adds the sysctl parameter
kernel.panic_on_stackoverflow and causes a panic when detecting
the overflows of kernel, IRQ and exception stacks except user
stack according to the parameter. It is disabled by default.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 arch/x86/kernel/irq_32.c        |  2 ++
 arch/x86/kernel/irq_64.c        |  5 +++++
 include/linux/kernel.h          |  1 +
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 31 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1f2463671a1a..6d8cd8b2c30d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
 - panic
 - panic_on_oops
 - panic_on_unrecovered_nmi
+- panic_on_stackoverflow
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -393,6 +394,19 @@ Controls the kernel's behaviour when an oops or BUG is encountered.
 
 ==============================================================
 
+panic_on_stackoverflow:
+
+Controls the kernel's behavior when detecting the overflows of
+kernel, IRQ and exception stacks except a user stack.
+This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
+
+0: try to continue operation.
+
+1: panic immediately.
+
+==============================================================
+
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..e16e99ebd7ad 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -43,6 +43,8 @@ static void print_stack_overflow(void)
 {
 	printk(KERN_WARNING "low stack detected by irq handler\n");
 	dump_stack();
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 }
 
 #else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 928a7e909619..42552b0dce6a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
+int sysctl_panic_on_stackoverflow;
+
 /*
  * Probabilistic stack overflow check:
  *
@@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		current->comm, curbase, regs->sp,
 		irq_stack_top, irq_stack_bottom,
 		estack_top, estack_bottom);
+
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 #endif
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e8b1597b5cf2..ff83683c0b9d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -341,6 +341,7 @@ extern int panic_timeout;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
+extern int sysctl_panic_on_stackoverflow;
 extern const char *print_tainted(void);
 extern void add_taint(unsigned flag);
 extern int test_taint(unsigned flag);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	{
+		.procname	= "panic_on_stackoverflow",
+		.data		= &sysctl_panic_on_stackoverflow,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
-- 
cgit v1.2.3


From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:45 +0900
Subject: x86: Clean up the range of stack overflow checking

The overflow checking of kernel stack checks if the stack
pointer points to the available kernel stack range, which is
derived from the original overflow checking.

It is clear that curbase address is always less than low
boundary of available kernel stack. So, this patch removes the
first condition that checks if the pointer is higher than
curbase.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/kernel/irq_64.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 42552b0dce6a..54e2b2b2e250 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	if (user_mode_vm(regs))
 		return;
 
-	if (regs->sp >= curbase &&
-	    regs->sp <= curbase + THREAD_SIZE &&
-	    regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128)
+	if (regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128 &&
+	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
 	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
-- 
cgit v1.2.3


From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:25 -0400
Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus

A recent discussion started talking about the locking on the
pstore fs and how it relates to the kmsg infrastructure.  We
noticed it was possible for userspace to r/w to the pstore fs
(grabbing the locks in the process) and block the panic path
from r/w to the same fs.

The reason was the cpu with the lock could be doing work while
the crashing cpu is panic'ing.  Busting those spinlocks might
cause those cpus to step on each other's data.  Fine, fair
enough.

It was suggested it would be nice to serialize the panic path
(ie stop the other cpus) and have only one cpu running.  This
would allow us to bust the spinlocks and not worry about another
cpu stepping on the data.

Of course, smp_send_stop() does this in the panic case.
kmsg_dump() would have to be moved to be called after it.  Easy
enough.

The only problem is on x86 the smp_send_stop() function calls
the REBOOT_VECTOR.  Any cpu with irqs disabled (which pstore and
its backend ERST would do), block this IPI and thus do not stop.
 This makes it difficult to reliably log data to the pstore fs.

The patch below switches from the REBOOT_VECTOR to NMI (and
mimics what kdump does).  Switching to NMI allows us to deliver
the IPI when irqs are disabled, increasing the reliability of
this function.

However, Andi carefully noted that on some machines this
approach does not work because of broken BIOSes or whatever.

To help accomodate this, the next couple of patches will run a
selftest and provide a knob to disable.

V2:
  uses atomic ops to serialize the cpu that shuts everyone down
V3:
  comment cleanup

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..e72b1754a2d7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	/* We are registered on stopping cpu too, avoid spurious NMI */
+	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+		return NMI_HANDLED;
+
+	stop_this_cpu(NULL);
+
+	return NMI_HANDLED;
+}
+
+static void native_nmi_stop_other_cpus(int wait)
+{
+	unsigned long flags;
+	unsigned long timeout;
+
+	if (reboot_force)
+		return;
+
+	/*
+	 * Use an own vector here because smp_call_function
+	 * does lots of things not suitable in a panic situation.
+	 */
+	if (num_online_cpus() > 1) {
+		/* did someone beat us here? */
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+			return;
+
+		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+					 NMI_FLAG_FIRST, "smp_stop"))
+			/* Note: we ignore failures here */
+			return;
+
+		/* sync above data before sending NMI */
+		wmb();
+
+		apic->send_IPI_allbutself(NMI_VECTOR);
+
+		/*
+		 * Don't wait longer than a second if the caller
+		 * didn't ask us to wait.
+		 */
+		timeout = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && (wait || timeout--))
+			udelay(1);
+	}
+
+	local_irq_save(flags);
+	disable_local_APIC();
+	local_irq_restore(flags);
+}
+
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
 	irq_exit();
 }
 
-static void native_stop_other_cpus(int wait)
+static void native_irq_stop_other_cpus(int wait)
 {
 	unsigned long flags;
 	unsigned long timeout;
@@ -230,7 +285,7 @@ struct smp_ops smp_ops = {
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
 	.smp_cpus_done		= native_smp_cpus_done,
 
-	.stop_other_cpus	= native_stop_other_cpus,
+	.stop_other_cpus	= native_nmi_stop_other_cpus,
 	.smp_send_reschedule	= native_smp_send_reschedule,
 
 	.cpu_up			= native_cpu_up,
-- 
cgit v1.2.3


From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:26 -0400
Subject: x86, NMI: Add NMI IPI selftest

The previous patch modified the stop cpus path to use NMI
instead of IRQ as the way to communicate to the other cpus to
shutdown.  There were some concerns that various machines may
have problems with using an NMI IPI.

This patch creates a selftest to check if NMI is working at
boot. The idea is to help catch any issues before the machine
panics and we learn the hard way.

Loosely based on the locking-selftest.c file, this separate file
runs a couple of simple tests and reports the results.  The
output looks like:

...
Brought up 4 CPUs
----------------
| NMI testsuite:
--------------------
  remote IPI:  ok  |
   local IPI:  ok  |
--------------------
Good, all   2 testcases passed! |
---------------------------------
Total of 4 processors activated (21330.61 BogoMIPS).
...

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug         |  12 +++
 arch/x86/include/asm/smp.h     |   6 ++
 arch/x86/kernel/Makefile       |   1 +
 arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c      |   1 +
 5 files changed, 199 insertions(+)
 create mode 100644 arch/x86/kernel/nmi_selftest.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 4caec1261f12..97da3c17b424 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -287,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 	  If unsure, or if you run an older (pre 4.4) gcc, say N.
 
+config DEBUG_NMI_SELFTEST
+	bool "NMI Selftest"
+	depends on DEBUG_KERNEL
+	---help---
+	  Enabling this option turns on a quick NMI selftest to verify
+	  that the NMI behaves correctly.
+
+	  This might help diagnose strange hangs that rely on NMI to
+	  function properly.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..02b2f05b371e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 obj-$(CONFIG_AMD_NB)		+= amd_nb.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..572adb622251
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,179 @@
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ *   Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS		0
+#define FAILURE		1
+#define TIMEOUT		2
+
+static int nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+
+static int testcase_total;
+static int testcase_successes;
+static int expected_testcase_failures;
+static int unexpected_testcase_failures;
+static int unexpected_testcase_unknowns;
+
+static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+	unexpected_testcase_unknowns++;
+	return NMI_HANDLED;
+}
+
+static void init_nmi_testsuite(void)
+{
+	/* trap all the unknown NMIs we may generate */
+	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+}
+
+static void cleanup_nmi_testsuite(void)
+{
+	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+        int cpu = raw_smp_processor_id();
+
+        if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+                return NMI_HANDLED;
+
+        return NMI_DONE;
+}
+
+static void test_nmi_ipi(struct cpumask *mask)
+{
+	unsigned long timeout;
+
+	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+				 NMI_FLAG_FIRST, "nmi_selftest")) {
+		nmi_fail = FAILURE;
+		return;
+	}
+
+	/* sync above data before sending NMI */
+	wmb();
+
+	apic->send_IPI_mask(mask, NMI_VECTOR);
+
+	/* Don't wait longer than a second */
+	timeout = USEC_PER_SEC;
+	while (!cpumask_empty(mask) && timeout--)
+	        udelay(1);
+
+	/* What happens if we timeout, do we still unregister?? */
+	unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+	if (!timeout)
+		nmi_fail = TIMEOUT;
+	return;
+}
+
+static void remote_ipi(void)
+{
+	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void local_ipi(void)
+{
+	cpumask_clear(to_cpumask(nmi_ipi_mask));
+	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void reset_nmi(void)
+{
+	nmi_fail = 0;
+}
+
+static void dotest(void (*testcase_fn)(void), int expected)
+{
+	testcase_fn();
+	/*
+	 * Filter out expected failures:
+	 */
+	if (nmi_fail != expected) {
+		unexpected_testcase_failures++;
+
+		if (nmi_fail == FAILURE)
+			printk("FAILED |");
+		else if (nmi_fail == TIMEOUT)
+			printk("TIMEOUT|");
+		else
+			printk("ERROR  |");
+		dump_stack();
+	} else {
+		testcase_successes++;
+		printk("  ok  |");
+	}
+	testcase_total++;
+
+	reset_nmi();
+}
+
+static inline void print_testname(const char *testname)
+{
+	printk("%12s:", testname);
+}
+
+void nmi_selftest(void)
+{
+	init_nmi_testsuite();
+
+        /*
+	 * Run the testsuite:
+	 */
+	printk("----------------\n");
+	printk("| NMI testsuite:\n");
+	printk("--------------------\n");
+
+	print_testname("remote IPI");
+	dotest(remote_ipi, SUCCESS);
+	printk("\n");
+	print_testname("local IPI");
+	dotest(local_ipi, SUCCESS);
+	printk("\n");
+
+	cleanup_nmi_testsuite();
+
+	if (unexpected_testcase_failures) {
+		printk("--------------------\n");
+		printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+			unexpected_testcase_failures, testcase_total);
+		printk("-----------------------------------------------------------------\n");
+	} else if (expected_testcase_failures && testcase_successes) {
+		printk("--------------------\n");
+		printk("%3d out of %3d testcases failed, as expected. |\n",
+			expected_testcase_failures, testcase_total);
+		printk("----------------------------------------------------\n");
+	} else if (expected_testcase_failures && !testcase_successes) {
+		printk("--------------------\n");
+		printk("All %3d testcases failed, as expected. |\n",
+			expected_testcase_failures);
+		printk("----------------------------------------\n");
+	} else {
+		printk("--------------------\n");
+		printk("Good, all %3d testcases passed! |\n",
+			testcase_successes);
+		printk("---------------------------------\n");
+	}
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..19277817effa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done.\n");
 
+	nmi_selftest();
 	impress_friends();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
-- 
cgit v1.2.3


From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:27 -0400
Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus

Some machines may exhibit problems using the NMI to stop other
cpus. This knob just allows one to revert back to the original
behaviour to help diagnose the problem.

V2:
  make function static

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  4 ++++
 arch/x86/kernel/smp.c               | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f4fce6..b4339e5a50da 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1796,6 +1796,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	nomfgpt		[X86-32] Disable Multi-Function General Purpose
 			Timer usage (for AMD Geode machines).
 
+	nonmi_ipi	[X86] Disable using NMI IPIs during panic/reboot to
+			shutdown the other cpus.  Instead use the REBOOT_VECTOR
+			irq.
+
 	nopat		[X86] Disable PAT (page attribute table extension of
 			pagetables) support.
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e72b1754a2d7..113acda5879e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait)
 	local_irq_restore(flags);
 }
 
+static void native_smp_disable_nmi_ipi(void)
+{
+	smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
+}
+
 /*
  * Reschedule call back.
  */
@@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+static int __init nonmi_ipi_setup(char *str)
+{
+        native_smp_disable_nmi_ipi();
+        return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
-- 
cgit v1.2.3


From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 5 Dec 2011 12:25:44 +0100
Subject: x86: Fix the 32-bit stackoverflow-debug build

The panic_on_stackoverflow variable needs to be avilable
on the 32-bit side as well ...

Cc: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index e16e99ebd7ad..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
 /* Debugging check for stack overflow: is there less than 1KB free? */
 static int check_stack_overflow(void)
 {
-- 
cgit v1.2.3


From 54b0264ec8c6e90f0413ad30e2f91c65e7844613 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 16 Nov 2011 18:17:40 +0000
Subject: x86/sfi: Kill the IRQ as id hack

Nothing should now need it so take it out

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index b1489a06a49d..6a21f603bd78 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -802,8 +802,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
 	if (mrst_has_msic())
 		return;
 
-	/* ID as IRQ is a hack that will go away */
-	pdev = platform_device_alloc(entry->name, entry->irq);
+	pdev = platform_device_alloc(entry->name, 0);
 	if (pdev == NULL) {
 		pr_err("out of memory for SFI platform device '%s'.\n",
 			entry->name);
-- 
cgit v1.2.3


From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 10 Nov 2011 13:29:14 +0000
Subject: x86/config: Revamp configuration for MID devices

This follows on from the patch applied in 3.2rc1 which creates
an INTEL_MID configuration. We can now add the entry for
Medfield specific code. After this is merged the final patch
will be submitted which moves the rest of the device Kconfig
dependancies to MRST/MEDFIELD/INTEL_MID as appropriate.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 17 +++++++++++++++++
 arch/x86/Kconfig.debug          |  6 +++---
 arch/x86/kernel/early_printk.c  |  2 +-
 arch/x86/platform/mrst/Makefile |  2 +-
 4 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a1044a771..9e7a361423d6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -419,6 +419,23 @@ config X86_MRST
 	  nor standard legacy replacement devices/features. e.g. Moorestown does
 	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
 
+config X86_MDFLD
+       bool "Medfield MID platform"
+	depends on PCI
+	depends on PCI_GOANY
+	depends on X86_IO_APIC
+	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
+	---help---
+	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
+	  Internet Device(MID) platform. 
+	  Unlike standard x86 PCs, Medfield does not have many legacy devices
+	  nor standard legacy replacement devices/features. e.g. Medfield does
+	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+
 endif
 
 config X86_RDC321X
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..28c3c73ab208 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
-config EARLY_PRINTK_MRST
-	bool "Early printk for MRST platform support"
-	depends on EARLY_PRINTK && X86_MRST
+config EARLY_PRINTK_INTEL_MID
+	bool "Early printk for Intel MID platform support"
+	depends on EARLY_PRINTK && X86_INTEL_MID
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..7a53da03086f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
 		if (!strncmp(buf, "xen", 3))
 			early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
 		if (!strncmp(buf, "mrst", 4)) {
 			mrst_early_console_init();
 			early_console_register(&early_mrst_console, keep);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea38775a6d3..ddeec7300464 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-$(CONFIG_X86_MRST)		+= vrtc.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
+obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
-- 
cgit v1.2.3


From 9af0c7a6fa860698d080481f24a342ba74b68982 Mon Sep 17 00:00:00 2001
From: Ludwig Nussel <ludwig.nussel@suse.de>
Date: Tue, 15 Nov 2011 14:46:46 -0800
Subject: x86: Fix mmap random address range

On x86_32 casting the unsigned int result of get_random_int() to
long may result in a negative value.  On x86_32 the range of
mmap_rnd() therefore was -255 to 255.  The 32bit mode on x86_64
used 0 to 255 as intended.

The bug was introduced by 675a081 ("x86: unify mmap_{32|64}.c")
in January 2008.

Signed-off-by: Ludwig Nussel <ludwig.nussel@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: harvey.harrison@gmail.com
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/201111152246.pAFMklOB028527@wpaz5.hot.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/mmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
 	*/
 	if (current->flags & PF_RANDOMIZE) {
 		if (mmap_is_ia32())
-			rnd = (long)get_random_int() % (1<<8);
+			rnd = get_random_int() % (1<<8);
 		else
-			rnd = (long)(get_random_int() % (1<<28));
+			rnd = get_random_int() % (1<<28);
 	}
 	return rnd << PAGE_SHIFT;
 }
-- 
cgit v1.2.3


From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001
From: Mike Ditto <mditto@google.com>
Date: Tue, 15 Nov 2011 14:46:50 -0800
Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from
 sanitize_e820_map()

Replace the bubble sort in sanitize_e820_map() with a call to
the generic kernel sort function to avoid pathological
performance with large maps.

On large (thousands of entries) E820 maps, the previous code
took minutes to run; with this change it's now milliseconds.

Signed-off-by: Mike Ditto <mditto@google.com>
Cc: sassmann@kpanic.de
Cc: yuenn@google.com
Cc: Stefan Assmann <sassmann@kpanic.de>
Cc: Nancy Yuen <yuenn@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..f655f802260d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
 #include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/memblock.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
  *	   ____________________33__
  *	   ______________________4_
  */
+struct change_member {
+	struct e820entry *pbios; /* pointer to original bios entry */
+	unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+	struct change_member * const *app = a, * const *bpp = b;
+	const struct change_member *ap = *app, *bp = *bpp;
+
+	/*
+	 * Inputs are pointers to two elements of change_point[].  If their
+	 * addresses are unequal, their difference dominates.  If the addresses
+	 * are equal, then consider one that represents the end of its region
+	 * to be greater than one that does not.
+	 */
+	if (ap->addr != bp->addr)
+		return ap->addr > bp->addr ? 1 : -1;
+
+	return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 			     u32 *pnr_map)
 {
-	struct change_member {
-		struct e820entry *pbios; /* pointer to original bios entry */
-		unsigned long long addr; /* address for this change point */
-	};
 	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
 	static struct change_member *change_point[2*E820_X_MAX] __initdata;
 	static struct e820entry *overlap_list[E820_X_MAX] __initdata;
 	static struct e820entry new_bios[E820_X_MAX] __initdata;
-	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
-	int chgidx, still_changing;
+	int chgidx;
 	int overlap_entries;
 	int new_bios_entry;
 	int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i = 1; i < chg_nr; i++)  {
-			unsigned long long curaddr, lastaddr;
-			unsigned long long curpbaddr, lastpbaddr;
-
-			curaddr = change_point[i]->addr;
-			lastaddr = change_point[i - 1]->addr;
-			curpbaddr = change_point[i]->pbios->addr;
-			lastpbaddr = change_point[i - 1]->pbios->addr;
-
-			/*
-			 * swap entries, when:
-			 *
-			 * curaddr > lastaddr or
-			 * curaddr == lastaddr and curaddr == curpbaddr and
-			 * lastaddr != lastpbaddr
-			 */
-			if (curaddr < lastaddr ||
-			    (curaddr == lastaddr && curaddr == curpbaddr &&
-			     lastaddr != lastpbaddr)) {
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing = 1;
-			}
-		}
-	}
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.2.3


From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 15 Nov 2011 14:48:56 -0800
Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as
 NULL pointer

The last parameter to sort() is a pointer to the function used
to swap items.  This parameter should be NULL, not 0, when not
used.  This quiets the following sparse warning:

 warning: Using plain integer as NULL pointer

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: hartleys@visionengravers.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f655f802260d..d6bd85352c81 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.2.3


From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 15 Nov 2011 15:33:56 -0800
Subject: x86: Reduce clock calibration time during slave cpu startup

Reduce the startup time for slave cpus.

Adds hooks for an arch-specific function for clock calibration.
These hooks are used on x86.  If a newly started cpu has the
same phys_proc_id as a core already active, uses the TSC for the
delay loop and has a CONSTANT_TSC, use the already-calculated
value of loops_per_jiffy.

This patch reduces the time required to start slave cpus on a
4096 cpu system from: 465 sec OLD 62 sec NEW

This reduces boot time on a 4096p system by almost 7 minutes.
Nice...

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: John Stultz <john.stultz@linaro.org>
[fix CONFIG_SMP=n build]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 16 +++++++++++-----
 arch/x86/kernel/tsc.c     | 20 ++++++++++++++++++++
 init/calibrate.c          | 15 +++++++++++++++
 3 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..00eef55c8327 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,22 +207,28 @@ static void __cpuinit smp_callin(void)
 	 * Need to setup vector mappings before we enable interrupts.
 	 */
 	setup_vector_irq(smp_processor_id());
+
+	/*
+	 * Save our processor parameters. Note: this information
+	 * is needed for clock calibration.
+	 */
+	smp_store_cpu_info(cpuid);
+
 	/*
 	 * Get our bogomips.
+	 * Update loops_per_jiffy in cpu_data. Previous call to
+	 * smp_store_cpu_info() stored a value that is close but not as
+	 * accurate as the value just calculated.
 	 *
 	 * Need to enable IRQs because it can take longer and then
 	 * the NMI watchdog might kill us.
 	 */
 	local_irq_enable();
 	calibrate_delay();
+	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
 	local_irq_disable();
 	pr_debug("Stack at about %p\n", &cpuid);
 
-	/*
-	 * Save our processor parameters
-	 */
-	smp_store_cpu_info(cpuid);
-
 	/*
 	 * This must be done before setting cpu_online_mask
 	 * or calling notify_cpu_starting.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..490fb330be87 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -995,3 +995,23 @@ void __init tsc_init(void)
 	check_system_tsc_reliable();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+	int i, cpu = smp_processor_id();
+
+	if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	for_each_online_cpu(i)
+		if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+			return cpu_data(i).loops_per_jiffy;
+	return 0;
+}
+#endif
diff --git a/init/calibrate.c b/init/calibrate.c
index 24df7976816c..5f117ca9e069 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -246,6 +246,19 @@ recalibrate:
 
 static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 };
 
+/*
+ * Check if cpu calibration delay is already known. For example,
+ * some processors with multi-core sockets may have all cores
+ * with the same calibration delay.
+ *
+ * Architectures should override this function if a faster calibration
+ * method is available.
+ */
+unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void)
+{
+	return 0;
+}
+
 void __cpuinit calibrate_delay(void)
 {
 	unsigned long lpj;
@@ -265,6 +278,8 @@ void __cpuinit calibrate_delay(void)
 		lpj = lpj_fine;
 		pr_info("Calibrating delay loop (skipped), "
 			"value calculated using timer frequency.. ");
+	} else if ((lpj = calibrate_delay_is_known())) {
+		;
 	} else if ((lpj = calibrate_delay_direct()) != 0) {
 		if (!printed)
 			pr_info("Calibrating delay using timer "
-- 
cgit v1.2.3


From f9b15df466ba923a5832c9121ad8327ccf5483ef Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Sat, 29 Oct 2011 00:48:42 +0200
Subject: x86/Kconfig: Cyclone-timer depends on x86-summit

CONFIG_X86_CYCLONE_TIMER depends on CONFIG_X86_32_NON_STANDARD,
which forces drivers/clocksource/cyclone.c to be compiled. The
file doesn't do anything unless enabled by
arch/x86/kernel/apic/summit_32.c

Make CONFIG_X86_CYCLONE_TIMER depend by X86_SUMMIT instead, to
avoid unnecessary code in other non-standard systems.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20111028224842.GA7582@mail.gnudd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9e7a361423d6..faf39a0d6242 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -633,7 +633,7 @@ config X86_SUMMIT_NUMA
 
 config X86_CYCLONE_TIMER
 	def_bool y
-	depends on X86_32_NON_STANDARD
+	depends on X86_SUMMIT
 
 source "arch/x86/Kconfig.cpu"
 
-- 
cgit v1.2.3


From 45db1c6176c8171d9ae6fa6d82e07d115a5950ca Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 5 Dec 2011 16:08:49 -0800
Subject: x86, um: Use the same style generated syscall tables as native

Now when the native kernel uses a single style of generated system
call table, follow suite for UML and implement the same style, all in
C.  This requires __NR_syscall_max and NR_syscalls to be generated; on
native this is done in asm-headers.h but that file is common to all
UML architectures; therefore put it in user-headers.h instead which
already have accommodations for architecture-specific values.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/um/Makefile            |  3 ++-
 arch/x86/um/sys_call_table_32.S | 26 -------------------
 arch/x86/um/sys_call_table_32.c | 55 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/um/sys_call_table_64.c | 31 +++++++++--------------
 arch/x86/um/user-offsets.c      | 15 +++++++++++
 5 files changed, 84 insertions(+), 46 deletions(-)
 delete mode 100644 arch/x86/um/sys_call_table_32.S
 create mode 100644 arch/x86/um/sys_call_table_32.c

(limited to 'arch/x86')

diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8fb58400e415..5d065b2222d3 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o
 USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
 
 extra-y += user-offsets.s
-$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS)
+$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
+	-Iarch/x86/include/generated
 
 UNPROFILE_OBJS := stub_segv.o
 CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S
deleted file mode 100644
index a7ca80d2dceb..000000000000
--- a/arch/x86/um/sys_call_table_32.S
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <linux/linkage.h>
-/* Steal i386 syscall table for our purposes, but with some slight changes.*/
-
-#define sys_iopl sys_ni_syscall
-#define sys_ioperm sys_ni_syscall
-
-#define sys_vm86old sys_ni_syscall
-#define sys_vm86 sys_ni_syscall
-
-#define old_mmap sys_old_mmap
-
-#define ptregs_fork sys_fork
-#define ptregs_execve sys_execve
-#define ptregs_iopl sys_iopl
-#define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
-#define ptregs_vm86 sys_vm86
-#define ptregs_sigaltstack sys_sigaltstack
-#define ptregs_vfork sys_vfork
-
-.section .rodata,"a"
-
-#include "../kernel/syscall_table_32.S"
-
-ENTRY(syscall_table_size)
-.long .-sys_call_table
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 000000000000..b897fcae6205
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,55 @@
+/*
+ * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define stub_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+#define sys_vm86old sys_ni_syscall
+#define sys_vm86 sys_ni_syscall
+
+#define old_mmap sys_old_mmap
+
+#define ptregs_fork sys_fork
+#define ptregs_execve sys_execve
+#define ptregs_iopl sys_iopl
+#define ptregs_vm86old sys_vm86old
+#define ptregs_clone sys_clone
+#define ptregs_vm86 sys_vm86
+#define ptregs_sigaltstack sys_sigaltstack
+#define ptregs_vfork sys_vfork
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+
+#undef __SYSCALL_I386
+#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void sys_ni_syscall(void);
+
+sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 99522f78b162..797a639bcca5 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -1,11 +1,12 @@
 /*
- * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
+ * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
  * with some changes for UML.
  */
 
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <generated/user_constants.h>
 
 #define __NO_STUBS
 
@@ -34,31 +35,23 @@
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include "../../x86/include/asm/unistd_64.h"
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
 
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#undef __SYSCALL_64
+#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-/*
- * We used to have a trick here which made sure that holes in the
- * x86_64 table were filled in with sys_ni_syscall, but a comment in
- * unistd_64.h says that holes aren't allowed, so the trick was
- * removed.
- * The trick looked like this
- *	[0 ... UM_NR_syscall_max] = &sys_ni_syscall
- * before including unistd_64.h - the later initializations overwrote
- * the sys_ni_syscall filler.
- */
-
 sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
-#include <asm/unistd_64.h>
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
 };
 
 int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ca49be8ddd0c..5edf4f4bbf53 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,6 +8,18 @@
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
+#ifdef __i386__
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+#else
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_64.h>
+};
+#endif
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -77,4 +89,7 @@ void foo(void)
 	DEFINE(UM_PROT_READ, PROT_READ);
 	DEFINE(UM_PROT_WRITE, PROT_WRITE);
 	DEFINE(UM_PROT_EXEC, PROT_EXEC);
+
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
-- 
cgit v1.2.3


From a074335a370eca6d72f2ec890e4ae22923a2aea4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 5 Dec 2011 22:48:49 -0800
Subject: x86, um: Mark system call tables readonly

Mark the system call tables readonly, as they already are on native,
and the 32-bit UM version was in the previous assembly version.  The
32-bit version lost it due to copy and paste from the 64-bit version,
which was missing the const.

Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Link: http://lkml.kernel.org/r/tip-45db1c6176c8171d9ae6fa6d82e07d115a5950ca@git.kernel.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 arch/x86/um/sys_call_table_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index b897fcae6205..0606aa3e92ae 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -43,7 +43,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 797a639bcca5..fe626c3ba01b 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -45,7 +45,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
-- 
cgit v1.2.3


From 855c743a27bb58a9a521bdc485ef5acfdb69badc Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 6 Dec 2011 09:08:34 +0100
Subject: x86/mm: Initialize high mem before free_all_bootmem()

Patch fixes a boot crash with pagealloc debugging enabled:

  Initializing HighMem for node 0 (000377fe:0003fff0)
  BUG: unable to handle kernel paging request at f6fefe80
  IP: [<c1621ab5>] find_range_array+0x5e/0x69
  [...]
  Call Trace:
   [<c1622064>] __get_free_all_memory_range+0x39/0xb4
   [<c1620dd0>] add_highpages_with_active_regions+0x18/0x9b
   [<c1621a2e>] set_highmem_pages_init+0x70/0x90
   [<c162122b>] mem_init+0x50/0x21b
   [<c16155bd>] start_kernel+0x1bf/0x31c
   [<c1615065>] i386_start_kernel+0x65/0x67

The crash happens when memblock wants to allocate big area for
temporary "struct range" array and reuses pages from top of low
memory, which were already passed to the buddy allocator.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: linux-mm@kvack.org
Cc: Mel Gorman <mgorman@suse.de>
Link: http://lkml.kernel.org/r/20111206080833.GB3105@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3bebaed5021c..a2fecb1611cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -744,6 +744,17 @@ void __init mem_init(void)
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
+	/*
+	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+	 * be done before free_all_bootmem(). Memblock use free low memory for
+	 * temporary data (see find_range_array()) and for this purpose can use
+	 * pages that was already passed to the buddy allocator, hence marked as
+	 * not accessible in the page tables when compiled with
+	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+	 * important here.
+	 */
+	set_highmem_pages_init();
+
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
@@ -755,8 +766,6 @@ void __init mem_init(void)
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
 
-	set_highmem_pages_init();
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-- 
cgit v1.2.3


From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 29 Nov 2011 17:05:11 +0100
Subject: mm, x86: Remove debug_pagealloc_enabled

When (no)bootmem finish operation, it pass pages to buddy
allocator. Since debug_pagealloc_enabled is not set, we will do
not protect pages, what is not what we want with
CONFIG_DEBUG_PAGEALLOC=y.

To fix remove debug_pagealloc_enabled. That variable was
introduced by commit 12d6f21e "x86: do not PSE on
CONFIG_DEBUG_PAGEALLOC=y" to get more CPA (change page
attribude) code testing. But currently we have CONFIG_CPA_DEBUG,
which test CPA.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c |  6 ------
 include/linux/mm.h     | 10 ----------
 init/main.c            |  5 -----
 mm/debug-pagealloc.c   |  3 ---
 4 files changed, 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e526742fa1..5031eefa051f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1333,12 +1333,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 					   numpages * PAGE_SIZE);
 	}
 
-	/*
-	 * If page allocator is not up yet then do not call c_p_a():
-	 */
-	if (!debug_pagealloc_enabled)
-		return;
-
 	/*
 	 * The return value is ignored as the calls cannot fail.
 	 * Large pages for identity mappings are not used at boot time
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dc3a8c2c485..0a22db144753 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1537,23 +1537,13 @@ static inline void vm_stat_account(struct mm_struct *mm,
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-extern int debug_pagealloc_enabled;
-
 extern void kernel_map_pages(struct page *page, int numpages, int enable);
-
-static inline void enable_debug_pagealloc(void)
-{
-	debug_pagealloc_enabled = 1;
-}
 #ifdef CONFIG_HIBERNATION
 extern bool kernel_page_present(struct page *page);
 #endif /* CONFIG_HIBERNATION */
 #else
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable) {}
-static inline void enable_debug_pagealloc(void)
-{
-}
 #ifdef CONFIG_HIBERNATION
 static inline bool kernel_page_present(struct page *page) { return true; }
 #endif /* CONFIG_HIBERNATION */
diff --git a/init/main.c b/init/main.c
index 217ed23e9487..99c4ba30ba7e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -282,10 +282,6 @@ static int __init unknown_bootoption(char *param, char *val)
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-int __read_mostly debug_pagealloc_enabled = 0;
-#endif
-
 static int __init init_setup(char *str)
 {
 	unsigned int i;
@@ -597,7 +593,6 @@ asmlinkage void __init start_kernel(void)
 	}
 #endif
 	page_cgroup_init();
-	enable_debug_pagealloc();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 7cea557407f4..789ff70c8a4a 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -95,9 +95,6 @@ static void unpoison_pages(struct page *page, int n)
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
 {
-	if (!debug_pagealloc_enabled)
-		return;
-
 	if (enable)
 		unpoison_pages(page, numpages);
 	else
-- 
cgit v1.2.3


From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 6 Dec 2011 13:08:59 -0500
Subject: x86, NMI: NMI-selftest should handle the UP case properly

If no remote cpus are online, then just quietly skip the remote
IPI test for now.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: andi@firstfloor.org
Cc: torvalds@linux-foundation.org
Cc: peterz@infradead.org
Cc: robert.richter@amd.com
Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 572adb622251..1e42a23c1f2a 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,8 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+	if (!cpumask_empty(nmi_ipi_mask))
+		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
 static void local_ipi(void)
-- 
cgit v1.2.3


From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Wed, 7 Dec 2011 17:29:10 +0900
Subject: x86: Add stack top margin for stack overflow checking

It seems that a margin for stack overflow checking is added to
top of a kernel stack but is not added to IRQ and exception
stacks in stack_overflow_check(). Therefore, the overflows of
IRQ and exception stacks are always detected only after they
actually occurred and data corruption might occur due to them.

This patch adds the margin to top of IRQ and exception stacks
as well as a kernel stack to enhance reliability.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp
[ removed the #undef - we typically don't do that for uncommon names ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_64.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 54e2b2b2e250..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow;
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN	128
 	struct orig_ist *oist;
 	u64 irq_stack_top, irq_stack_bottom;
 	u64 estack_top, estack_bottom;
@@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		return;
 
 	if (regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128 &&
+				  sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
 	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
-	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+			STACK_TOP_MARGIN;
 	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
 	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
 		return;
 
 	oist = &__get_cpu_var(orig_ist);
-	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
 	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
 	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
 		return;
-- 
cgit v1.2.3


From 4f941c57fe7e04e38c2401d53516bfd16038c9ab Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Wed, 7 Dec 2011 16:06:30 -0500
Subject: x86, NMI: NMI selftest depends on the local apic

The selftest doesn't work with out a local apic for now.

Reported-by: Randy Durlap <rdunlap@xenotime.net>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: http://lkml.kernel.org/r/20111207210630.GI1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 97da3c17b424..aa4158f3ce62 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -289,7 +289,7 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 config DEBUG_NMI_SELFTEST
 	bool "NMI Selftest"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && X86_LOCAL_APIC
 	---help---
 	  Enabling this option turns on a quick NMI selftest to verify
 	  that the NMI behaves correctly.
-- 
cgit v1.2.3


From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2011 14:06:12 +0300
Subject: x86, NMI: Add to_cpumask() to silence compile warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gcc complains if we don't cast this to a struct cpumask pointer.
arch/x86/kernel/nmi_selftest.c:93:2:

	warning: passing argument 1 of ‘cpumask_empty’ from
	incompatible pointer type [enabled by default]

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 1e42a23c1f2a..0d01a8ea4e11 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,7 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	if (!cpumask_empty(nmi_ipi_mask))
+	if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
 		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
-- 
cgit v1.2.3


From 54eed6cb16ec315565aaaf8e34252ca253a68b7b Mon Sep 17 00:00:00 2001
From: Petr Holasek <pholasek@redhat.com>
Date: Thu, 8 Dec 2011 13:16:41 +0100
Subject: x86/numa: Add constraints check for nid parameters

This patch adds constraint checks to the numa_set_distance()
function.

When the check triggers (this should not happen normally) it
emits a warning and avoids a store to a negative index in
numa_distance[] array - i.e. avoids memory corruption.

Negative ids can be passed when the pxm-to-nids mapping is not
properly filled while parsing the SRAT.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Anton Arapov <anton@redhat.com>
Link: http://lkml.kernel.org/r/20111208121640.GA2229@dhcp-27-244.brq.redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fbeaaf416610..cdc00543d375 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -430,8 +430,9 @@ static int __init numa_alloc_distance(void)
  * calls are ignored until the distance table is reset with
  * numa_reset_distance().
  *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
  * This is to allow simplification of specific NUMA config implementations.
  */
 void __init numa_set_distance(int from, int to, int distance)
@@ -439,8 +440,9 @@ void __init numa_set_distance(int from, int to, int distance)
 	if (!numa_distance && numa_alloc_distance() < 0)
 		return;
 
-	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
-		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+			from < 0 || to < 0) {
+		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
 			    from, to, distance);
 		return;
 	}
-- 
cgit v1.2.3


From 47db9e7c808a45b1f86971f25eca5e38fa95ab86 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 9 Dec 2011 11:13:59 -0800
Subject: x86, um: Fix typo in 32-bit system call modifications

We override sys_iopl(), not stub_iopl(); the latter is a 64-bitism
that doesn't apply to i386 in the first place.

Reported-by: Richard Weinberger <richard@nod.at>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 0606aa3e92ae..416bd40c0eba 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 #define sys_vm86old sys_ni_syscall
-- 
cgit v1.2.3


From 8af21e7e71d1ac56d9b66fb787a14fd66af7f5f7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Sat, 27 Aug 2011 09:35:45 +0100
Subject: x86: Add missing bzImage fields to struct setup_header

commit 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new
bzImage fields") introduced some new fields into the bzImage header
but struct setup_header was not updated accordingly. Add the missing
'pref_address' and 'init_size' fields.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/bootparam.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88ec02d..2f90c51cc49d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
 	__u32	payload_offset;
 	__u32	payload_length;
 	__u64	setup_data;
+	__u64	pref_address;
+	__u32	init_size;
 } __attribute__((packed));
 
 struct sys_desc_table {
-- 
cgit v1.2.3


From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 15 Nov 2011 12:56:14 +0000
Subject: x86: Don't use magic strings for EFI loader signature

Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic
strings, which also helps to reduce the amount of ifdeffery.

Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/efi.h | 4 ++++
 arch/x86/kernel/setup.c    | 7 +------
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index b8d8bfcd44a9..26d8c18d5faa 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
 
 #ifdef CONFIG_X86_32
 
+#define EFI_LOADER_SIGNATURE	"EL32"
+
 extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #define efi_call_phys0(f)		efi_call_phys(f)
@@ -35,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
+#define EFI_LOADER_SIGNATURE	"EL64"
+
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9a9e40fb091c..4d5243c31ac4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
-		     "EL32",
-#else
-		     "EL64",
-#endif
-	 4)) {
+		     EFI_LOADER_SIGNATURE, 4)) {
 		efi_enabled = 1;
 		efi_memblock_x86_reserve_range();
 	}
-- 
cgit v1.2.3


From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Mon, 12 Dec 2011 21:27:52 +0000
Subject: x86, efi: EFI boot stub support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is currently a large divide between kernel development and the
development of EFI boot loaders. The idea behind this patch is to give
the kernel developers full control over the EFI boot process. As
H. Peter Anvin put it,

"The 'kernel carries its own stub' approach been very successful in
dealing with BIOS, and would make a lot of sense to me for EFI as
well."

This patch introduces an EFI boot stub that allows an x86 bzImage to
be loaded and executed by EFI firmware. The bzImage appears to the
firmware as an EFI application. Luckily there are enough free bits
within the bzImage header so that it can masquerade as an EFI
application, thereby coercing the EFI firmware into loading it and
jumping to its entry point. The beauty of this masquerading approach
is that both BIOS and EFI boot loaders can still load and run the same
bzImage, thereby allowing a single kernel image to work in any boot
environment.

The EFI boot stub supports multiple initrds, but they must exist on
the same partition as the bzImage. Command-line arguments for the
kernel can be appended after the bzImage name when run from the EFI
shell, e.g.

Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img

v7:
 - Fix checkpatch warnings.

v6:

 - Try to allocate initrd memory just below hdr->inird_addr_max.

v5:

 - load_options_size is UTF-16, which needs dividing by 2 to convert
   to the corresponding ASCII size.

v4:

 - Don't read more than image->load_options_size

v3:

 - Fix following warnings when compiling CONFIG_EFI_STUB=n

   arch/x86/boot/tools/build.c: In function ‘main’:
   arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’
   arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’

 - As reported by Matthew Garrett, some Apple machines have GOPs that
   don't have hardware attached. We need to weed these out by
   searching for ones that handle the PCIIO protocol.

 - Don't allocate memory if no initrds are on cmdline
 - Don't trust image->load_options_size

Maarten Lankhorst noted:
 - Don't strip first argument when booted from efibootmgr
 - Don't allocate too much memory for cmdline
 - Don't update cmdline_size, the kernel considers it read-only
 - Don't accept '\n' for initrd names

v2:

 - File alignment was too large, was 8192 should be 512. Reported by
   Maarten Lankhorst on LKML.
 - Added UGA support for graphics
 - Use VIDEO_TYPE_EFI instead of hard-coded number.
 - Move linelength assignment until after we've assigned depth
 - Dynamically fill out AddressOfEntryPoint in tools/build.c
 - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen
 - The bzImage may need to be relocated as it may have been loaded at
   a high address address by the firmware. This was required to get my
   macbook booting because the firmware loaded it at 0x7cxxxxxx, which
   triggers this error in decompress_kernel(),

	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
		error("Destination address too large");

Cc: Mike Waychison <mikew@google.com>
Cc: Matthew Garrett <mjg@redhat.com>
Tested-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig                       |    7 +
 arch/x86/boot/compressed/Makefile      |   10 +-
 arch/x86/boot/compressed/eboot.c       | 1014 ++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/eboot.h       |   60 ++
 arch/x86/boot/compressed/efi_stub_32.S |   86 +++
 arch/x86/boot/compressed/efi_stub_64.S |    1 +
 arch/x86/boot/compressed/head_32.S     |   22 +
 arch/x86/boot/compressed/head_64.S     |   20 +
 arch/x86/boot/compressed/string.c      |    9 +
 arch/x86/boot/header.S                 |  158 +++++
 arch/x86/boot/string.c                 |   35 ++
 arch/x86/boot/tools/build.c            |   39 ++
 arch/x86/kernel/asm-offsets.c          |    2 +
 13 files changed, 1462 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/boot/compressed/eboot.c
 create mode 100644 arch/x86/boot/compressed/eboot.h
 create mode 100644 arch/x86/boot/compressed/efi_stub_32.S
 create mode 100644 arch/x86/boot/compressed/efi_stub_64.S

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb42949cc09..d71b656bcb97 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1478,6 +1478,13 @@ config EFI
 	  resultant kernel should continue to boot on existing non-EFI
 	  platforms.
 
+config EFI_STUB
+       bool "EFI stub support"
+       depends on EFI
+       ---help---
+          This kernel feature allows a bzImage to be loaded directly
+	  by EFI firmware without the use of a bootloader.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664efb9cee..b123b9a8f5b3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
 
 hostprogs-y	:= mkpiggy
 
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
+VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
+	$(obj)/piggy.o
+
+ifeq ($(CONFIG_EFI_STUB), y)
+	VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+endif
+
+$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000000..4055e63d0b04
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1014 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+#include "eboot.h"
+
+static efi_system_table_t *sys_table;
+
+static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
+			      unsigned long *desc_size)
+{
+	efi_memory_desc_t *m = NULL;
+	efi_status_t status;
+	unsigned long key;
+	u32 desc_version;
+
+	*map_size = sizeof(*m) * 32;
+again:
+	/*
+	 * Add an additional efi_memory_desc_t because we're doing an
+	 * allocation which may be in a new descriptor region.
+	 */
+	*map_size += sizeof(*m);
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, *map_size, (void **)&m);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
+				m, &key, desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+
+fail:
+	*map = m;
+	return status;
+}
+
+/*
+ * Allocate at the highest possible address that is not above 'max'.
+ */
+static efi_status_t high_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr, unsigned long max)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	u64 max_addr = 0;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+again:
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		if ((start + size) > end || (start + size) > max)
+			continue;
+
+		if (end - size > max)
+			end = max;
+
+		if (round_down(end - size, align) < start)
+			continue;
+
+		start = round_down(end - size, align);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL.
+		 */
+		if (start == 0x0)
+			continue;
+
+		if (start > max_addr)
+			max_addr = start;
+	}
+
+	if (!max_addr)
+		status = EFI_NOT_FOUND;
+	else {
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &max_addr);
+		if (status != EFI_SUCCESS) {
+			max = max_addr;
+			max_addr = 0;
+			goto again;
+		}
+
+		*addr = max_addr;
+	}
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+
+fail:
+	return status;
+}
+
+/*
+ * Allocate at the lowest possible address.
+ */
+static efi_status_t low_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL. Skip the first 8
+		 * bytes so we start at a nice even number.
+		 */
+		if (start == 0x0)
+			start += 8;
+
+		start = round_up(start, align);
+		if ((start + size) > end)
+			continue;
+
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &start);
+		if (status == EFI_SUCCESS) {
+			*addr = start;
+			break;
+		}
+	}
+
+	if (i == map_size / desc_size)
+		status = EFI_NOT_FOUND;
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+fail:
+	return status;
+}
+
+static void low_free(unsigned long size, unsigned long addr)
+{
+	unsigned long nr_pages;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	efi_call_phys2(sys_table->boottime->free_pages, addr, size);
+}
+
+static void find_bits(unsigned long mask, u8 *pos, u8 *size)
+{
+	u8 first, len;
+
+	first = 0;
+	len = 0;
+
+	if (mask) {
+		while (!(mask & 0x1)) {
+			mask = mask >> 1;
+			first++;
+		}
+
+		while (mask & 0x1) {
+			mask = mask >> 1;
+			len++;
+		}
+	}
+
+	*pos = first;
+	*size = len;
+}
+
+/*
+ * See if we have Graphics Output Protocol
+ */
+static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
+			      unsigned long size)
+{
+	struct efi_graphics_output_protocol *gop, *first_gop;
+	struct efi_pixel_bitmask pixel_info;
+	unsigned long nr_gops;
+	efi_status_t status;
+	void **gop_handle;
+	u16 width, height;
+	u32 fb_base, fb_size;
+	u32 pixels_per_scan_line;
+	int pixel_format;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &gop_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, proto,
+				NULL, &size, gop_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_gop = NULL;
+
+	nr_gops = size / sizeof(void *);
+	for (i = 0; i < nr_gops; i++) {
+		struct efi_graphics_output_mode_info *info;
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *pciio;
+		void *h = gop_handle[i];
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					h, proto, &gop);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       h, &pciio_proto, &pciio);
+
+		status = efi_call_phys4(gop->query_mode, gop,
+					gop->mode->mode, &size, &info);
+		if (status == EFI_SUCCESS && (!first_gop || pciio)) {
+			/*
+			 * Apple provide GOPs that are not backed by
+			 * real hardware (they're used to handle
+			 * multiple displays). The workaround is to
+			 * search for a GOP implementing the PCIIO
+			 * protocol, and if one isn't found, to just
+			 * fallback to the first GOP.
+			 */
+			width = info->horizontal_resolution;
+			height = info->vertical_resolution;
+			fb_base = gop->mode->frame_buffer_base;
+			fb_size = gop->mode->frame_buffer_size;
+			pixel_format = info->pixel_format;
+			pixel_info = info->pixel_information;
+			pixels_per_scan_line = info->pixels_per_scan_line;
+
+			/*
+			 * Once we've found a GOP supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_gop = gop;
+		}
+	}
+
+	/* Did we find any GOPs? */
+	if (!first_gop)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_width = width;
+	si->lfb_height = height;
+	si->lfb_base = fb_base;
+	si->lfb_size = fb_size;
+	si->pages = 1;
+
+	if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 0;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 16;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 16;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 0;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BIT_MASK) {
+		find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
+		find_bits(pixel_info.green_mask, &si->green_pos,
+			  &si->green_size);
+		find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
+		find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
+			  &si->rsvd_size);
+		si->lfb_depth = si->red_size + si->green_size +
+			si->blue_size + si->rsvd_size;
+		si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+	} else {
+		si->lfb_depth = 4;
+		si->lfb_linelength = si->lfb_width / 2;
+		si->red_size = 0;
+		si->red_pos = 0;
+		si->green_size = 0;
+		si->green_pos = 0;
+		si->blue_size = 0;
+		si->blue_pos = 0;
+		si->rsvd_size = 0;
+		si->rsvd_pos = 0;
+	}
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
+	return status;
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+			      unsigned long size)
+{
+	struct efi_uga_draw_protocol *uga, *first_uga;
+	unsigned long nr_ugas;
+	efi_status_t status;
+	u32 width, height;
+	void **uga_handle = NULL;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &uga_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, uga_proto,
+				NULL, &size, uga_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_uga = NULL;
+
+	nr_ugas = size / sizeof(void *);
+	for (i = 0; i < nr_ugas; i++) {
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *handle = uga_handle[i];
+		u32 w, h, depth, refresh;
+		void *pciio;
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					handle, uga_proto, &uga);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       handle, &pciio_proto, &pciio);
+
+		status = efi_call_phys5(uga->get_mode, uga, &w, &h,
+					&depth, &refresh);
+		if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+			width = w;
+			height = h;
+
+			/*
+			 * Once we've found a UGA supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_uga = uga;
+		}
+	}
+
+	if (!first_uga)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_depth = 32;
+	si->lfb_width = width;
+	si->lfb_height = height;
+
+	si->red_size = 8;
+	si->red_pos = 16;
+	si->green_size = 8;
+	si->green_pos = 8;
+	si->blue_size = 8;
+	si->blue_pos = 0;
+	si->rsvd_size = 8;
+	si->rsvd_pos = 24;
+
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
+	return status;
+}
+
+void setup_graphics(struct boot_params *boot_params)
+{
+	efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+	struct screen_info *si;
+	efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+	efi_status_t status;
+	unsigned long size;
+	void **gop_handle = NULL;
+	void **uga_handle = NULL;
+
+	si = &boot_params->screen_info;
+	memset(si, 0, sizeof(*si));
+
+	size = 0;
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
+				NULL, &size, gop_handle);
+	if (status == EFI_BUFFER_TOO_SMALL)
+		status = setup_gop(si, &graphics_proto, size);
+
+	if (status != EFI_SUCCESS) {
+		size = 0;
+		status = efi_call_phys5(sys_table->boottime->locate_handle,
+					EFI_LOCATE_BY_PROTOCOL, &uga_proto,
+					NULL, &size, uga_handle);
+		if (status == EFI_BUFFER_TOO_SMALL)
+			setup_uga(si, &uga_proto, size);
+	}
+}
+
+struct initrd {
+	efi_file_handle_t *handle;
+	u64 size;
+};
+
+/*
+ * Check the cmdline for a LILO-style initrd= arguments.
+ *
+ * We only support loading an initrd from the same filesystem as the
+ * kernel image.
+ */
+static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
+				    struct setup_header *hdr)
+{
+	struct initrd *initrds;
+	unsigned long initrd_addr;
+	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+	u64 initrd_total;
+	efi_file_io_interface_t *io;
+	efi_file_handle_t *fh;
+	efi_status_t status;
+	int nr_initrds;
+	char *str;
+	int i, j, k;
+
+	initrd_addr = 0;
+	initrd_total = 0;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+
+	j = 0;			/* See close_handles */
+
+	if (!str || !*str)
+		return EFI_SUCCESS;
+
+	for (nr_initrds = 0; *str; nr_initrds++) {
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n')
+			str++;
+	}
+
+	if (!nr_initrds)
+		return EFI_SUCCESS;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA,
+				nr_initrds * sizeof(*initrds),
+				&initrds);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+	for (i = 0; i < nr_initrds; i++) {
+		struct initrd *initrd;
+		efi_file_handle_t *h;
+		efi_file_info_t *info;
+		efi_char16_t filename[256];
+		unsigned long info_sz;
+		efi_guid_t info_guid = EFI_FILE_INFO_ID;
+		efi_char16_t *p;
+		u64 file_sz;
+
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		initrd = &initrds[i];
+		p = filename;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n') {
+			if (p >= filename + sizeof(filename))
+				break;
+
+			*p++ = *str++;
+		}
+
+		*p = '\0';
+
+		/* Only open the volume once. */
+		if (!i) {
+			efi_boot_services_t *boottime;
+
+			boottime = sys_table->boottime;
+
+			status = efi_call_phys3(boottime->handle_protocol,
+					image->device_handle, &fs_proto, &io);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+
+			status = efi_call_phys2(io->open_volume, io, &fh);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+		}
+
+		status = efi_call_phys5(fh->open, fh, &h, filename,
+					EFI_FILE_MODE_READ, (u64)0);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->handle = h;
+
+		info_sz = 0;
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, NULL);
+		if (status != EFI_BUFFER_TOO_SMALL)
+			goto close_handles;
+
+grow:
+		status = efi_call_phys3(sys_table->boottime->allocate_pool,
+					EFI_LOADER_DATA, info_sz, &info);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, info);
+		if (status == EFI_BUFFER_TOO_SMALL) {
+			efi_call_phys1(sys_table->boottime->free_pool, info);
+			goto grow;
+		}
+
+		file_sz = info->file_size;
+		efi_call_phys1(sys_table->boottime->free_pool, info);
+
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->size = file_sz;
+		initrd_total += file_sz;
+	}
+
+	if (initrd_total) {
+		unsigned long addr;
+
+		/*
+		 * Multiple initrd's need to be at consecutive
+		 * addresses in memory, so allocate enough memory for
+		 * all the initrd's.
+		 */
+		status = high_alloc(initrd_total, 0x1000,
+				   &initrd_addr, hdr->initrd_addr_max);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		/* We've run out of free low memory. */
+		if (initrd_addr > hdr->initrd_addr_max) {
+			status = EFI_INVALID_PARAMETER;
+			goto free_initrd_total;
+		}
+
+		addr = initrd_addr;
+		for (j = 0; j < nr_initrds; j++) {
+			u64 size;
+
+			size = initrds[j].size;
+			status = efi_call_phys3(fh->read, initrds[j].handle,
+						&size, addr);
+			if (status != EFI_SUCCESS)
+				goto free_initrd_total;
+
+			efi_call_phys1(fh->close, initrds[j].handle);
+
+			addr += size;
+		}
+
+	}
+
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+
+	hdr->ramdisk_image = initrd_addr;
+	hdr->ramdisk_size = initrd_total;
+
+	return status;
+
+free_initrd_total:
+	low_free(initrd_total, initrd_addr);
+
+close_handles:
+	for (k = j; k < nr_initrds; k++)
+		efi_call_phys1(fh->close, initrds[k].handle);
+free_initrds:
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+fail:
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	return status;
+}
+
+/*
+ * Because the x86 boot code expects to be passed a boot_params we
+ * need to create one ourselves (usually the bootloader would create
+ * one for us).
+ */
+static efi_status_t make_boot_params(struct boot_params *boot_params,
+				     efi_loaded_image_t *image,
+				     void *handle)
+{
+	struct efi_info *efi = &boot_params->efi_info;
+	struct apm_bios_info *bi = &boot_params->apm_bios_info;
+	struct sys_desc_table *sdt = &boot_params->sys_desc_table;
+	struct e820entry *e820_map = &boot_params->e820_map[0];
+	struct e820entry *prev = NULL;
+	struct setup_header *hdr = &boot_params->hdr;
+	unsigned long size, key, desc_size, _size;
+	efi_memory_desc_t *mem_map;
+	void *options = image->load_options;
+	u32 load_options_size = image->load_options_size / 2; /* ASCII */
+	int options_size = 0;
+	efi_status_t status;
+	__u32 desc_version;
+	unsigned long cmdline;
+	u8 nr_entries;
+	u16 *s2;
+	u8 *s1;
+	int i;
+
+	hdr->type_of_loader = 0x21;
+
+	/* Convert unicode cmdline to ascii */
+	cmdline = 0;
+	s2 = (u16 *)options;
+
+	if (s2) {
+		while (*s2 && *s2 != '\n' && options_size < load_options_size) {
+			s2++;
+			options_size++;
+		}
+
+		if (options_size) {
+			if (options_size > hdr->cmdline_size)
+				options_size = hdr->cmdline_size;
+
+			options_size++;	/* NUL termination */
+
+			status = low_alloc(options_size, 1, &cmdline);
+			if (status != EFI_SUCCESS)
+				goto fail;
+
+			s1 = (u8 *)(unsigned long)cmdline;
+			s2 = (u16 *)options;
+
+			for (i = 0; i < options_size - 1; i++)
+				*s1++ = *s2++;
+
+			*s1 = '\0';
+		}
+	}
+
+	hdr->cmd_line_ptr = cmdline;
+
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	status = handle_ramdisks(image, hdr);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	setup_graphics(boot_params);
+
+	/* Clear APM BIOS info */
+	memset(bi, 0, sizeof(*bi));
+
+	memset(sdt, 0, sizeof(*sdt));
+
+	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+
+	size = sizeof(*mem_map) * 32;
+
+again:
+	size += sizeof(*mem_map);
+	_size = size;
+	status = low_alloc(size, 1, (unsigned long *)&mem_map);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
+				mem_map, &key, &desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		low_free(_size, (unsigned long)mem_map);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	efi->efi_systab = (unsigned long)sys_table;
+	efi->efi_memdesc_size = desc_size;
+	efi->efi_memdesc_version = desc_version;
+	efi->efi_memmap = (unsigned long)mem_map;
+	efi->efi_memmap_size = size;
+
+#ifdef CONFIG_X86_64
+	efi->efi_systab_hi = (unsigned long)sys_table >> 32;
+	efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+#endif
+
+	/* Might as well exit boot services now */
+	status = efi_call_phys2(sys_table->boottime->exit_boot_services,
+				handle, key);
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	/* Historic? */
+	boot_params->alt_mem_k = 32 * 1024;
+
+	/*
+	 * Convert the EFI memory map to E820.
+	 */
+	nr_entries = 0;
+	for (i = 0; i < size / desc_size; i++) {
+		efi_memory_desc_t *d;
+		unsigned int e820_type = 0;
+		unsigned long m = (unsigned long)mem_map;
+
+		d = (efi_memory_desc_t *)(m + (i * desc_size));
+		switch (d->type) {
+		case EFI_RESERVED_TYPE:
+		case EFI_RUNTIME_SERVICES_CODE:
+		case EFI_RUNTIME_SERVICES_DATA:
+		case EFI_MEMORY_MAPPED_IO:
+		case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+		case EFI_PAL_CODE:
+			e820_type = E820_RESERVED;
+			break;
+
+		case EFI_UNUSABLE_MEMORY:
+			e820_type = E820_UNUSABLE;
+			break;
+
+		case EFI_ACPI_RECLAIM_MEMORY:
+			e820_type = E820_ACPI;
+			break;
+
+		case EFI_LOADER_CODE:
+		case EFI_LOADER_DATA:
+		case EFI_BOOT_SERVICES_CODE:
+		case EFI_BOOT_SERVICES_DATA:
+		case EFI_CONVENTIONAL_MEMORY:
+			e820_type = E820_RAM;
+			break;
+
+		case EFI_ACPI_MEMORY_NVS:
+			e820_type = E820_NVS;
+			break;
+
+		default:
+			continue;
+		}
+
+		/* Merge adjacent mappings */
+		if (prev && prev->type == e820_type &&
+		    (prev->addr + prev->size) == d->phys_addr)
+			prev->size += d->num_pages << 12;
+		else {
+			e820_map->addr = d->phys_addr;
+			e820_map->size = d->num_pages << 12;
+			e820_map->type = e820_type;
+			prev = e820_map++;
+			nr_entries++;
+		}
+	}
+
+	boot_params->e820_entries = nr_entries;
+
+	return EFI_SUCCESS;
+
+free_mem_map:
+	low_free(_size, (unsigned long)mem_map);
+free_cmdline:
+	if (options_size)
+		low_free(options_size, hdr->cmd_line_ptr);
+fail:
+	return status;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
+{
+	struct boot_params *boot_params;
+	unsigned long start, nr_pages;
+	struct desc_ptr *gdt, *idt;
+	efi_loaded_image_t *image;
+	struct setup_header *hdr;
+	efi_status_t status;
+	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
+	struct desc_struct *desc;
+
+	sys_table = _table;
+
+	/* Check if we were booted by the EFI firmware */
+	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->handle_protocol,
+				handle, &proto, (void *)&image);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset(boot_params, 0x0, 0x4000);
+
+	/* Copy first two sectors to boot_params */
+	memcpy(boot_params, image->image_base, 1024);
+
+	hdr = &boot_params->hdr;
+
+	/*
+	 * The EFI firmware loader could have placed the kernel image
+	 * anywhere in memory, but the kernel has various restrictions
+	 * on the max physical address it can run at. Attempt to move
+	 * the kernel to boot_params.pref_address, or as low as
+	 * possible.
+	 */
+	start = hdr->pref_address;
+	nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+
+	status = efi_call_phys4(sys_table->boottime->allocate_pages,
+				EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+				nr_pages, &start);
+	if (status != EFI_SUCCESS) {
+		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
+				   &start);
+		if (status != EFI_SUCCESS)
+			goto fail;
+	}
+
+	hdr->code32_start = (__u32)start;
+	hdr->pref_address = (__u64)(unsigned long)image->image_base;
+
+	memcpy((void *)start, image->image_base, image->image_size);
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*gdt),
+				(void **)&gdt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	gdt->size = 0x800;
+	status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*idt),
+				(void **)&idt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	idt->size = 0;
+	idt->address = 0;
+
+	status = make_boot_params(boot_params, image, handle);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset((char *)gdt->address, 0x0, gdt->size);
+	desc = (struct desc_struct *)gdt->address;
+
+	/* The first GDT is a dummy and the second is unused. */
+	desc += 2;
+
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+	desc++;
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+#ifdef CONFIG_X86_64
+	/* Task segment value */
+	desc++;
+	desc->limit0 = 0x0000;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_TSS;
+	desc->s = 0;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0x0;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = 0;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+#endif /* CONFIG_X86_64 */
+
+	asm volatile ("lidt %0" : : "m" (*idt));
+	asm volatile ("lgdt %0" : : "m" (*gdt));
+
+	asm volatile("cli");
+
+	return boot_params;
+fail:
+	return NULL;
+}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000000..f66d023e91ef
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,60 @@
+#ifndef BOOT_COMPRESSED_EBOOT_H
+#define BOOT_COMPRESSED_EBOOT_H
+
+#define SEG_TYPE_DATA		(0 << 3)
+#define SEG_TYPE_READ_WRITE	(1 << 1)
+#define SEG_TYPE_CODE		(1 << 3)
+#define SEG_TYPE_EXEC_READ	(1 << 1)
+#define SEG_TYPE_TSS		((1 << 3) | (1 << 0))
+#define SEG_OP_SIZE_32BIT	(1 << 0)
+#define SEG_GRANULARITY_4KB	(1 << 0)
+
+#define DESC_TYPE_CODE_DATA	(1 << 0)
+
+#define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+
+#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
+#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
+#define PIXEL_BIT_MASK					2
+#define PIXEL_BLT_ONLY					3
+#define PIXEL_FORMAT_MAX				4
+
+struct efi_pixel_bitmask {
+	u32 red_mask;
+	u32 green_mask;
+	u32 blue_mask;
+	u32 reserved_mask;
+};
+
+struct efi_graphics_output_mode_info {
+	u32 version;
+	u32 horizontal_resolution;
+	u32 vertical_resolution;
+	int pixel_format;
+	struct efi_pixel_bitmask pixel_information;
+	u32 pixels_per_scan_line;
+} __packed;
+
+struct efi_graphics_output_protocol_mode {
+	u32 max_mode;
+	u32 mode;
+	unsigned long info;
+	unsigned long size_of_info;
+	u64 frame_buffer_base;
+	unsigned long frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol {
+	void *query_mode;
+	unsigned long set_mode;
+	unsigned long blt;
+	struct efi_graphics_output_protocol_mode *mode;
+};
+
+struct efi_uga_draw_protocol {
+	void *get_mode;
+	void *set_mode;
+	void *blt;
+};
+
+#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000000..a53440e81d52
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. Note that this implementation is different from the one in
+ * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
+ * mode at this point.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+	/*
+	 * 0. The function can only be called in Linux kernel. So CS has been
+	 * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+	 * the values of these registers are the same. And, the corresponding
+	 * GDT entries are identical. So I will do nothing about segment reg
+	 * and GDT, but change GDT base register in prelog and epilog.
+	 */
+
+	/*
+	 * 1. Because we haven't been relocated by this point we need to
+	 * use relative addressing.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	/*
+	 * 2. Now on the top of stack is the return
+	 * address in the caller of efi_call_phys(), then parameter 1,
+	 * parameter 2, ..., param n. To make things easy, we save the return
+	 * address of efi_call_phys in a global variable.
+	 */
+	popl	%ecx
+	movl	%ecx, saved_return_addr(%edx)
+	/* get the function pointer into ECX*/
+	popl	%ecx
+	movl	%ecx, efi_rt_function_ptr(%edx)
+
+	/*
+	 * 3. Call the physical function.
+	 */
+	call	*%ecx
+
+	/*
+	 * 4. Balance the stack. And because EAX contain the return value,
+	 * we'd better not clobber it. We need to calculate our address
+	 * again because %ecx and %edx are not preserved across EFI function
+	 * calls.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	movl	efi_rt_function_ptr(%edx), %ecx
+	pushl	%ecx
+
+	/*
+	 * 10. Push the saved return address onto the stack and return.
+	 */
+	movl	saved_return_addr(%edx), %ecx
+	pushl	%ecx
+	ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+	.long 0
+efi_rt_function_ptr:
+	.long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000000..cedc60de86eb
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
+#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a39ce4..a0559930a180 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
 
 	__HEAD
 ENTRY(startup_32)
+#ifdef CONFIG_EFI_STUB
+	/*
+	 * We don't need the return address, so set up the stack so
+	 * efi_main() can find its arugments.
+	 */
+	add	$0x4, %esp
+
+	call	efi_main
+	cmpl	$0, %eax
+	je	preferred_addr
+	movl	%eax, %esi
+	call	1f
+1:
+	popl	%eax
+	subl	$1b, %eax
+	subl	BP_pref_address(%esi), %eax
+	add	BP_code32_start(%esi), %eax
+	leal	preferred_addr(%eax), %eax
+	jmp	*%eax
+
+preferred_addr:
+#endif
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d13dc1..558d76ce23bc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
 	 * an identity mapped page table being provied that maps our
 	 * entire text+data+bss and hopefully all of memory.
 	 */
+#ifdef CONFIG_EFI_STUB
+	pushq	%rsi
+	mov	%rcx, %rdi
+	mov	%rdx, %rsi
+	call	efi_main
+	popq	%rsi
+	cmpq	$0,%rax
+	je	preferred_addr
+	movq	%rax,%rsi
+	call	1f
+1:
+	popq	%rax
+	subq	$1b, %rax
+	subq	BP_pref_address(%rsi), %rax
+	add	BP_code32_start(%esi), %eax
+	leaq	preferred_addr(%rax), %rax
+	jmp	*%rax
+
+preferred_addr:
+#endif
 
 	/* Setup data segments. */
 	xorl	%eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e693cd72..ffb9c5c9d748 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
 #include "misc.h"
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+	u8 diff;
+	asm("repe; cmpsb; setnz %0"
+	    : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+	return diff;
+}
+
 #include "../string.c"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d458ec8c..f1bbeeb09148 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 
 	.global bootsect_start
 bootsect_start:
+#ifdef CONFIG_EFI_STUB
+	# "MZ", MS-DOS header
+	.byte 0x4d
+	.byte 0x5a
+#endif
 
 	# Normalize the start address
 	ljmp	$BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
 	# invoke the BIOS reset code...
 	ljmp	$0xf000,$0xfff0
 
+#ifdef CONFIG_EFI_STUB
+	.org	0x3c
+	#
+	# Offset to the PE header.
+	#
+	.long	pe_header
+#endif /* CONFIG_EFI_STUB */
+
 	.section ".bsdata", "a"
 bugger_off_msg:
 	.ascii	"Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
 	.ascii	"Remove disk and press any key to reboot . . .\r\n"
 	.byte	0
 
+#ifdef CONFIG_EFI_STUB
+pe_header:
+	.ascii	"PE"
+	.word 	0
+
+coff_header:
+#ifdef CONFIG_X86_32
+	.word	0x14c				# i386
+#else
+	.word	0x8664				# x86-64
+#endif
+	.word	2				# nr_sections
+	.long	0 				# TimeDateStamp
+	.long	0				# PointerToSymbolTable
+	.long	1				# NumberOfSymbols
+	.word	section_table - optional_header	# SizeOfOptionalHeader
+#ifdef CONFIG_X86_32
+	.word	0x306				# Characteristics.
+						# IMAGE_FILE_32BIT_MACHINE |
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#else
+	.word	0x206				# Characteristics
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#endif
+
+optional_header:
+#ifdef CONFIG_X86_32
+	.word	0x10b				# PE32 format
+#else
+	.word	0x20b 				# PE32+ format
+#endif
+	.byte	0x02				# MajorLinkerVersion
+	.byte	0x14				# MinorLinkerVersion
+
+	# Filled in by build.c
+	.long	0				# SizeOfCode
+
+	.long	0				# SizeOfInitializedData
+	.long	0				# SizeOfUninitializedData
+
+	# Filled in by build.c
+	.long	0x0000				# AddressOfEntryPoint
+
+	.long	0x0000				# BaseOfCode
+#ifdef CONFIG_X86_32
+	.long	0				# data
+#endif
+
+extra_header_fields:
+#ifdef CONFIG_X86_32
+	.long	0				# ImageBase
+#else
+	.quad	0				# ImageBase
+#endif
+	.long	0x1000				# SectionAlignment
+	.long	0x200				# FileAlignment
+	.word	0				# MajorOperatingSystemVersion
+	.word	0				# MinorOperatingSystemVersion
+	.word	0				# MajorImageVersion
+	.word	0				# MinorImageVersion
+	.word	0				# MajorSubsystemVersion
+	.word	0				# MinorSubsystemVersion
+	.long	0				# Win32VersionValue
+
+	#
+	# The size of the bzImage is written in tools/build.c
+	#
+	.long	0				# SizeOfImage
+
+	.long	0x200				# SizeOfHeaders
+	.long	0				# CheckSum
+	.word	0xa				# Subsystem (EFI application)
+	.word	0				# DllCharacteristics
+#ifdef CONFIG_X86_32
+	.long	0				# SizeOfStackReserve
+	.long	0				# SizeOfStackCommit
+	.long	0				# SizeOfHeapReserve
+	.long	0				# SizeOfHeapCommit
+#else
+	.quad	0				# SizeOfStackReserve
+	.quad	0				# SizeOfStackCommit
+	.quad	0				# SizeOfHeapReserve
+	.quad	0				# SizeOfHeapCommit
+#endif
+	.long	0				# LoaderFlags
+	.long	0x1				# NumberOfRvaAndSizes
+
+	.quad	0				# ExportTable
+	.quad	0				# ImportTable
+	.quad	0				# ResourceTable
+	.quad	0				# ExceptionTable
+	.quad	0				# CertificationTable
+	.quad	0				# BaseRelocationTable
+
+	# Section table
+section_table:
+	.ascii	".text"
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0				# startup_{32,64}
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0				# startup_{32,64}
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x60500020			# Characteristics (section flags)
+
+	#
+	# The EFI application loader requires a relocation section
+	# because EFI applications are relocatable and not having
+	# this section seems to confuse it. But since we don't need
+	# the loader to fixup any relocs for us just fill it with a
+	# single dummy reloc.
+	#
+	.ascii	".reloc"
+	.byte	0
+	.byte	0
+	.long	reloc_end - reloc_start
+	.long	reloc_start
+	.long	reloc_end - reloc_start		# SizeOfRawData
+	.long	reloc_start			# PointerToRawData
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x42100040			# Characteristics (section flags)
+#endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup.  This is part 1 of the
 	# header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
 setup_corrupt:
 	.byte	7
 	.string	"No setup signature found...\n"
+
+	.data
+dummy:	.long	0
+
+	.section .reloc
+reloc_start:
+	.long	dummy - reloc_start
+	.long	10
+	.word	0
+reloc_end:
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc4058dd26..574dedfe2890 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
 
 	return result;
 }
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+	const char *sc;
+
+	for (sc = s; *sc != '\0'; ++sc)
+		/* nothing */;
+	return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+	size_t l1, l2;
+
+	l2 = strlen(s2);
+	if (!l2)
+		return (char *)s1;
+	l1 = strlen(s1);
+	while (l1 >= l2) {
+		l1--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0b3c20..4e9bd6bcafa6 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
 
 int main(int argc, char ** argv)
 {
+#ifdef CONFIG_EFI_STUB
+	unsigned int file_sz, pe_header;
+#endif
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
 	buf[0x1f6] = sys_size >> 16;
 	buf[0x1f7] = sys_size >> 24;
 
+#ifdef CONFIG_EFI_STUB
+	file_sz = sz + i + ((sys_size * 16) - sz);
+
+	pe_header = *(unsigned int *)&buf[0x3c];
+
+	/* Size of code */
+	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+
+	/* Size of image */
+	*(unsigned int *)&buf[pe_header + 0x50] = file_sz;
+
+#ifdef CONFIG_X86_32
+	/* Address of entry point */
+	*(unsigned int *)&buf[pe_header + 0x28] = i;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+#else
+	/*
+	 * Address of entry point. startup_32 is at the beginning and
+	 * the 64-bit entry point (startup_64) is always 512 bytes
+	 * after.
+	 */
+	*(unsigned int *)&buf[pe_header + 0x28] = i + 512;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_EFI_STUB */
+
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, stdout) != i)
 		die("Writing setup failed");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
 	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+	OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+	OFFSET(BP_code32_start, boot_params, hdr.code32_start);
 }
-- 
cgit v1.2.3


From 3e8f9451d3db669d7c0d8b330d4f5770149d90d5 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 15 Dec 2011 22:19:41 +0000
Subject: x86: Fix INTEL_MID silly

Doh.. pass the brown paper bags - preferably filled with mince
pies..

This fixes occasional build failures.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-r0oc1knlvzuqr69artaeq8s8@git.kernel.org
[ extended the changelog a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faf39a0d6242..2b54a2fb3ab0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,6 +429,7 @@ config X86_MDFLD
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
+	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.2.3


From 2d2da60fb40a80cc59383121ccf763e0e0e8a42a Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Date: Fri, 16 Dec 2011 13:30:58 +0100
Subject: x86, efi: Break up large initrd reads

The efi boot stub tries to read the entire initrd in 1 go, however
some efi implementations hang if too much if asked to read too much
data at the same time. After some experimentation I found out that my
asrock p67 board will hang if asked to read chunks of 4MiB, so use a
safe value.

elilo reads in chunks of 16KiB, but since that requires many read
calls I use a value of 1 MiB.  hpa suggested adding individual
blacklists for when systems are found where this value causes a crash.

Signed-off-by: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Link: http://lkml.kernel.org/r/4EEB3A02.3090201@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/eboot.c | 20 ++++++++++++++------
 arch/x86/boot/compressed/eboot.h |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4055e63d0b04..fec216f4fbc3 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -643,14 +643,22 @@ grow:
 			u64 size;
 
 			size = initrds[j].size;
-			status = efi_call_phys3(fh->read, initrds[j].handle,
-						&size, addr);
-			if (status != EFI_SUCCESS)
-				goto free_initrd_total;
+			while (size) {
+				u64 chunksize;
+				if (size > EFI_READ_CHUNK_SIZE)
+					chunksize = EFI_READ_CHUNK_SIZE;
+				else
+					chunksize = size;
+				status = efi_call_phys3(fh->read,
+							initrds[j].handle,
+							&chunksize, addr);
+				if (status != EFI_SUCCESS)
+					goto free_initrd_total;
+				addr += chunksize;
+				size -= chunksize;
+			}
 
 			efi_call_phys1(fh->close, initrds[j].handle);
-
-			addr += size;
 		}
 
 	}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index f66d023e91ef..39251663e65b 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -12,6 +12,7 @@
 #define DESC_TYPE_CODE_DATA	(1 << 0)
 
 #define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+#define EFI_READ_CHUNK_SIZE	(1024 * 1024)
 
 #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
-- 
cgit v1.2.3


From a0c3832a578c84d4a93c61e22cb09c99fa9447ea Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 17 Dec 2011 21:57:25 +0000
Subject: x86/apb: Fix configuration constraints

The APB timer requires SFI, SCU and MID support

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217215719.3743.93550.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2b54a2fb3ab0..ca4ee7644855 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -665,6 +665,7 @@ config APB_TIMER
        def_bool y if MRST
        prompt "Langwell APB Timer Support" if X86_MRST
        select DW_APB_TIMER
+       depends on X86_INTEL_MID && SFI
        help
          APB timer is the replacement for 8254, HPET on X86 MID platforms.
          The APBT provides a stable time base on SMP
-- 
cgit v1.2.3


From 933b9463a0ef75da681747b2dac06c1754465372 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 17 Dec 2011 17:43:40 +0000
Subject: x86/intel config: Revamp configuration to allow for Moorestown and
 Medfield

This sets all up the other bits that need to be INTEL_MID
specific rather than Moorestown specific.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217174318.7207.91543.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 4 ++--
 arch/x86/include/asm/fixmap.h   | 2 +-
 arch/x86/include/asm/setup.h    | 2 +-
 arch/x86/pci/Makefile           | 2 +-
 arch/x86/platform/mrst/Makefile | 4 ++--
 drivers/rtc/Kconfig             | 6 +++---
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ca4ee7644855..c3c9343e4498 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -662,8 +662,8 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
 
 config APB_TIMER
-       def_bool y if MRST
-       prompt "Langwell APB Timer Support" if X86_MRST
+       def_bool y if X86_INTEL_MID
+       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
        select DW_APB_TIMER
        depends on X86_INTEL_MID && SFI
        help
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e4852c..4da3c0c4c974 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
 #endif
 	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
 	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
-#ifdef	CONFIG_X86_MRST
+#ifdef	CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
 	__end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551ec760..d0f19f9fb846 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
 extern void i386_reserve_resources(void);
 extern void setup_default_timer_irq(void);
 
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
 extern void x86_mrst_early_setup(void);
 #else
 static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..75b06f34b1f2 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_X86_VISWS)		+= visws.o
 
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 
-obj-$(CONFIG_X86_MRST)		+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index ddeec7300464..7baed5135e0f 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_MRST)		+= mrst.o
-obj-$(CONFIG_X86_MRST)		+= vrtc.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 53eb4e55b289..3a125b835546 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -498,9 +498,9 @@ config RTC_DRV_CMOS
 	  will be called rtc-cmos.
 
 config RTC_DRV_VRTC
-	tristate "Virtual RTC for Moorestown platforms"
-	depends on X86_MRST
-	default y if X86_MRST
+	tristate "Virtual RTC for Intel MID platforms"
+	depends on X86_INTEL_MID
+	default y if X86_INTEL_MID
 
 	help
 	Say "yes" here to get direct support for the real time clock
-- 
cgit v1.2.3


From d79a8869d8a4b565b12a88faeff834b09a36368c Mon Sep 17 00:00:00 2001
From: Michael Demeter <michael.demeter@intel.com>
Date: Thu, 15 Dec 2011 22:31:23 +0000
Subject: x86/mrst: Add additional debug prints for pb_keys

Added additional debug output that we always seem to add
during power ons to validate firmware operation.

Signed-off-by: Michael Demeter <michael.demeter@intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111215223116.10166.50803.stgit@bob.linux.org.uk
[ fixed line breaks, formatting and commit title. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 6a21f603bd78..b6a33d2bd4d6 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -983,6 +983,7 @@ static int __init pb_keys_init(void)
 	num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
 	for (i = 0; i < num; i++) {
 		gb[i].gpio = get_gpio_by_name(gb[i].desc);
+		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
 		if (gb[i].gpio == -1)
 			continue;
 
-- 
cgit v1.2.3


From 35edc2a5095efb189e60dc32bbb9d2663aec6d24 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 20 Nov 2011 20:36:02 +0100
Subject: perf, arch: Rework perf_event_index()

Put the logic to compute the event index into a per pmu method. This
is required because the x86 rules are weird and wonderful and don't
match the capabilities of the current scheme.

AFAIK only powerpc actually has a usable userspace read of the PMCs
but I'm not at all sure anybody actually used that.

ARM is restored to the default since it currently does not support
userspace access at all. And all software events are provided with a
method that reports their index as 0 (disabled).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Michael Cree <mcree@orcon.net.nz>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arun Sharma <asharma@fb.com>
Link: http://lkml.kernel.org/n/tip-dfydxodki16lylkt3gl2j7cw@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/include/asm/perf_event.h            |  4 ----
 arch/frv/include/asm/perf_event.h            |  2 --
 arch/hexagon/include/asm/perf_event.h        |  2 --
 arch/powerpc/include/asm/perf_event_server.h |  2 --
 arch/powerpc/kernel/perf_event.c             |  6 ++++++
 arch/s390/include/asm/perf_event.h           |  1 -
 arch/x86/include/asm/perf_event.h            |  2 --
 include/linux/perf_event.h                   |  6 ++++++
 kernel/events/core.c                         | 27 ++++++++++++++++++++++-----
 kernel/events/hw_breakpoint.c                |  7 +++++++
 10 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 0f8e3827a89b..08f94d8fc04c 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,10 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/* ARM performance counters start from 1 (in the cp15 accesses) so use the
- * same indexes here for consistency. */
-#define PERF_EVENT_INDEX_OFFSET 1
-
 /* ARM perf PMU IDs for use by internal perf clients. */
 enum arm_perf_pmu_ids {
 	ARM_PERF_PMU_ID_XSCALE1	= 0,
diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h
index a69e0155d146..c52ea5546b5b 100644
--- a/arch/frv/include/asm/perf_event.h
+++ b/arch/frv/include/asm/perf_event.h
@@ -12,6 +12,4 @@
 #ifndef _ASM_PERF_EVENT_H
 #define _ASM_PERF_EVENT_H
 
-#define PERF_EVENT_INDEX_OFFSET	0
-
 #endif /* _ASM_PERF_EVENT_H */
diff --git a/arch/hexagon/include/asm/perf_event.h b/arch/hexagon/include/asm/perf_event.h
index 6c2910f91180..8b8526b491c7 100644
--- a/arch/hexagon/include/asm/perf_event.h
+++ b/arch/hexagon/include/asm/perf_event.h
@@ -19,6 +19,4 @@
 #ifndef _ASM_PERF_EVENT_H
 #define _ASM_PERF_EVENT_H
 
-#define PERF_EVENT_INDEX_OFFSET	0
-
 #endif /* _ASM_PERF_EVENT_H */
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 8f1df1208d23..1a8093fa8f71 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -61,8 +61,6 @@ struct pt_regs;
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 
-#define PERF_EVENT_INDEX_OFFSET	1
-
 /*
  * Only override the default definitions in include/linux/perf_event.h
  * if we have hardware PMU support.
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 10a140f82cb8..d614ab57ccca 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -1187,6 +1187,11 @@ static int power_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
+static int power_pmu_event_idx(struct perf_event *event)
+{
+	return event->hw.idx;
+}
+
 struct pmu power_pmu = {
 	.pmu_enable	= power_pmu_enable,
 	.pmu_disable	= power_pmu_disable,
@@ -1199,6 +1204,7 @@ struct pmu power_pmu = {
 	.start_txn	= power_pmu_start_txn,
 	.cancel_txn	= power_pmu_cancel_txn,
 	.commit_txn	= power_pmu_commit_txn,
+	.event_idx	= power_pmu_event_idx,
 };
 
 /*
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index a75f168d2718..4eb444edbe49 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -6,4 +6,3 @@
 
 /* Empty, just to avoid compiling error */
 
-#define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 096c975e099f..9b922c136254 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -188,8 +188,6 @@ extern u32 get_ibs_caps(void);
 #ifdef CONFIG_PERF_EVENTS
 extern void perf_events_lapic_init(void);
 
-#define PERF_EVENT_INDEX_OFFSET			0
-
 /*
  * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
  * This flag is otherwise unused and ABI specified to be 0, so nobody should
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 08855613ceb3..02545e6df95b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -680,6 +680,12 @@ struct pmu {
 	 * for each successful ->add() during the transaction.
 	 */
 	void (*cancel_txn)		(struct pmu *pmu); /* optional */
+
+	/*
+	 * Will return the value for perf_event_mmap_page::index for this event,
+	 * if no implementation is provided it will default to: event->hw.idx + 1.
+	 */
+	int (*event_idx)		(struct perf_event *event); /*optional */
 };
 
 /**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0ca1f648ac08..3894309c41a2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3208,10 +3208,6 @@ int perf_event_task_disable(void)
 	return 0;
 }
 
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
 static int perf_event_index(struct perf_event *event)
 {
 	if (event->hw.state & PERF_HES_STOPPED)
@@ -3220,7 +3216,7 @@ static int perf_event_index(struct perf_event *event)
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return 0;
 
-	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+	return event->pmu->event_idx(event);
 }
 
 static void calc_timer_values(struct perf_event *event,
@@ -4992,6 +4988,11 @@ static int perf_swevent_init(struct perf_event *event)
 	return 0;
 }
 
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+	return 0;
+}
+
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
@@ -5001,6 +5002,8 @@ static struct pmu perf_swevent = {
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
+
+	.event_idx	= perf_swevent_event_idx,
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -5087,6 +5090,8 @@ static struct pmu perf_tracepoint = {
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
+
+	.event_idx	= perf_swevent_event_idx,
 };
 
 static inline void perf_tp_register(void)
@@ -5306,6 +5311,8 @@ static struct pmu perf_cpu_clock = {
 	.start		= cpu_clock_event_start,
 	.stop		= cpu_clock_event_stop,
 	.read		= cpu_clock_event_read,
+
+	.event_idx	= perf_swevent_event_idx,
 };
 
 /*
@@ -5378,6 +5385,8 @@ static struct pmu perf_task_clock = {
 	.start		= task_clock_event_start,
 	.stop		= task_clock_event_stop,
 	.read		= task_clock_event_read,
+
+	.event_idx	= perf_swevent_event_idx,
 };
 
 static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5405,6 +5414,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
 	perf_pmu_enable(pmu);
 }
 
+static int perf_event_idx_default(struct perf_event *event)
+{
+	return event->hw.idx + 1;
+}
+
 /*
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
@@ -5594,6 +5608,9 @@ got_cpu_context:
 		pmu->pmu_disable = perf_pmu_nop_void;
 	}
 
+	if (!pmu->event_idx)
+		pmu->event_idx = perf_event_idx_default;
+
 	list_add_rcu(&pmu->entry, &pmus);
 	ret = 0;
 unlock:
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38bf..b0309f76d777 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -613,6 +613,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
 	bp->hw.state = PERF_HES_STOPPED;
 }
 
+static int hw_breakpoint_event_idx(struct perf_event *bp)
+{
+	return 0;
+}
+
 static struct pmu perf_breakpoint = {
 	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */
 
@@ -622,6 +627,8 @@ static struct pmu perf_breakpoint = {
 	.start		= hw_breakpoint_start,
 	.stop		= hw_breakpoint_stop,
 	.read		= hw_breakpoint_pmu_read,
+
+	.event_idx	= hw_breakpoint_event_idx,
 };
 
 int __init init_hw_breakpoint(void)
-- 
cgit v1.2.3


From fe4a330885aee20f233de36085fb15c38094e635 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 20 Nov 2011 20:44:06 +0100
Subject: perf, x86: Implement user-space RDPMC support, to allow fast,
 user-space access to self-monitoring counters

Implement a correct pmu::event_idx for the x86 counter index rules and
set CR4.PCE on CPU_STARTING.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arun Sharma <asharma@fb.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/n/tip-mwxab34dibqgzk5zywutfnha@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..53b569910175 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1210,6 +1210,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_STARTING:
+		set_in_cr4(X86_CR4_PCE);
 		if (x86_pmu.cpu_starting)
 			x86_pmu.cpu_starting(cpu);
 		break;
@@ -1542,6 +1543,18 @@ static int x86_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+	int idx = event->hw.idx;
+
+	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+		idx -= X86_PMC_IDX_FIXED;
+		idx |= 1 << 30;
+	}
+
+	return idx + 1;
+}
+
 static struct pmu pmu = {
 	.pmu_enable	= x86_pmu_enable,
 	.pmu_disable	= x86_pmu_disable,
@@ -1557,6 +1570,8 @@ static struct pmu pmu = {
 	.start_txn	= x86_pmu_start_txn,
 	.cancel_txn	= x86_pmu_cancel_txn,
 	.commit_txn	= x86_pmu_commit_txn,
+
+	.event_idx	= x86_pmu_event_idx,
 };
 
 /*
-- 
cgit v1.2.3


From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 20 Nov 2011 23:30:47 +0100
Subject: perf, x86: Provide means for disabling userspace RDPMC

Allow the disabling of RDPMC via a pmu specific attribute:

  echo 0 > /sys/bus/event_source/devices/cpu/rdpmc

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arun Sharma <asharma@fb.com>
Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/perf_event.h |  8 ++++++
 include/linux/perf_event.h       |  1 +
 kernel/events/core.c             |  1 +
 4 files changed, 64 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 53b569910175..116b040a73a8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
+#include <linux/device.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_STARTING:
-		set_in_cr4(X86_CR4_PCE);
+		if (x86_pmu.attr_rdpmc)
+			set_in_cr4(X86_CR4_PCE);
 		if (x86_pmu.cpu_starting)
 			x86_pmu.cpu_starting(cpu);
 		break;
@@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void)
 		}
 	}
 
+	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
 	pr_info("... version:                %d\n",     x86_pmu.version);
 	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
 	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
@@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event)
 	return idx + 1;
 }
 
+static ssize_t get_attr_rdpmc(struct device *cdev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+	bool enable = !!(unsigned long)info;
+
+	if (enable)
+		set_in_cr4(X86_CR4_PCE);
+	else
+		clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	unsigned long val = simple_strtoul(buf, NULL, 0);
+
+	if (!!val != !!x86_pmu.attr_rdpmc) {
+		x86_pmu.attr_rdpmc = !!val;
+		smp_call_function(change_rdpmc, (void *)val, 1);
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+	&dev_attr_rdpmc.attr,
+	NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+	.attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+	&x86_pmu_attr_group,
+	NULL,
+};
+
 static struct pmu pmu = {
 	.pmu_enable	= x86_pmu_enable,
 	.pmu_disable	= x86_pmu_disable,
 
+	.attr_groups	= x86_pmu_attr_groups,
+
 	.event_init	= x86_pmu_event_init,
 
 	.add		= x86_pmu_add,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..513d617b93c4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -307,6 +307,14 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 
+	/*
+	 * sysfs attrs
+	 */
+	int		attr_rdpmc;
+
+	/*
+	 * CPU Hotplug hooks
+	 */
 	int		(*cpu_prepare)(int cpu);
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 02545e6df95b..5311b79fe62c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -615,6 +615,7 @@ struct pmu {
 	struct list_head		entry;
 
 	struct device			*dev;
+	const struct attribute_group	**attr_groups;
 	char				*name;
 	int				type;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 05affc3878ff..dcd4049e92fc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (!pmu->dev)
 		goto out;
 
+	pmu->dev->groups = pmu->attr_groups;
 	device_initialize(pmu->dev);
 	ret = dev_set_name(pmu->dev, "%s", pmu->name);
 	if (ret)
-- 
cgit v1.2.3


From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Nov 2011 11:43:53 +0100
Subject: perf: Extend the mmap control page with time (TSC) fields

Extend the mmap control page with fields so that userspace can compute
time deltas relative to the provided time fields.

Currently only implemented for x86 with constant and nonstop TSC.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arun Sharma <asharma@fb.com>
Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++
 include/linux/perf_event.h       |  4 +++-
 kernel/events/core.c             | 21 ++++++++++++++-------
 3 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 116b040a73a8..f8bddb5b0600 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -32,6 +32,7 @@
 #include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/timer.h>
 
 #include "perf_event.h"
 
@@ -1627,6 +1628,19 @@ static struct pmu pmu = {
 	.event_idx	= x86_pmu_event_idx,
 };
 
+void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+		return;
+
+	userpg->time_mult = this_cpu_read(cyc2ns);
+	userpg->time_shift = CYC2NS_SCALE_FACTOR;
+	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
 /*
  * callchain support
  */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5311b79fe62c..0b91db2522cc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -291,12 +291,14 @@ struct perf_event_mmap_page {
 	__s64	offset;			/* add to hardware event value */
 	__u64	time_enabled;		/* time event active */
 	__u64	time_running;		/* time event on cpu */
+	__u32	time_mult, time_shift;
+	__u64	time_offset;
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u64	__reserved[123];	/* align to 1k */
+	__u64	__reserved[121];	/* align to 1k */
 
 	/*
 	 * Control data for the mmap() data buffer.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dcd4049e92fc..3a9c7d81afbf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event)
 }
 
 static void calc_timer_values(struct perf_event *event,
+				u64 *now,
 				u64 *enabled,
 				u64 *running)
 {
-	u64 now, ctx_time;
+	u64 ctx_time;
 
-	now = perf_clock();
-	ctx_time = event->shadow_ctx_time + now;
+	*now = perf_clock();
+	ctx_time = event->shadow_ctx_time + *now;
 	*enabled = ctx_time - event->tstamp_enabled;
 	*running = ctx_time - event->tstamp_running;
 }
 
+void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event)
 {
 	struct perf_event_mmap_page *userpg;
 	struct ring_buffer *rb;
-	u64 enabled, running;
+	u64 enabled, running, now;
 
 	rcu_read_lock();
 	/*
@@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	 * because of locking issue as we can be called in
 	 * NMI context
 	 */
-	calc_timer_values(event, &enabled, &running);
+	calc_timer_values(event, &now, &enabled, &running);
 	rb = rcu_dereference(event->rb);
 	if (!rb)
 		goto unlock;
@@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->time_running = running +
 			atomic64_read(&event->child_total_time_running);
 
+	perf_update_user_clock(userpg, now);
+
 	barrier();
 	++userpg->lock;
 	preempt_enable();
@@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 static void perf_output_read(struct perf_output_handle *handle,
 			     struct perf_event *event)
 {
-	u64 enabled = 0, running = 0;
+	u64 enabled = 0, running = 0, now;
 	u64 read_format = event->attr.read_format;
 
 	/*
@@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle,
 	 * NMI context
 	 */
 	if (read_format & PERF_FORMAT_TOTAL_TIMES)
-		calc_timer_values(event, &enabled, &running);
+		calc_timer_values(event, &now, &enabled, &running);
 
 	if (event->attr.read_format & PERF_FORMAT_GROUP)
 		perf_output_read_group(handle, event, enabled, running);
-- 
cgit v1.2.3


From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 29 Nov 2011 12:44:55 -0800
Subject: x86: Do not schedule while still in NMI context

The NMI handler uses the paranoid_exit routine that checks the
NEED_RESCHED flag, and if it is set and the return is for userspace,
then interrupts are enabled, the stack is swapped to the thread's stack,
and schedule is called. The problem with this is that we are still in an
NMI context until an iret is executed. This means that any new NMIs are
now starved until an interrupt or exception occurs and does the iret.

As NMIs can not be masked and can interrupt any location, they are
treated as a special case. NEED_RESCHED should not be set in an NMI
handler. The interruption by the NMI should not disturb the work flow
for scheduling. Any IPI sent to a processor after sending the
NEED_RESCHED would have to wait for the NMI anyway, and after the IPI
finishes the schedule would be called as required.

There is no reason to do anything special leaving an NMI. Remove the
call to paranoid_exit and do a simple return. This not only fixes the
bug of starved NMIs, but it also cleans up the code.

Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com

Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..3819ea907339 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1489,46 +1489,14 @@ ENTRY(nmi)
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* paranoidexit; without TRACE_IRQS_OFF */
-	/* ebx:	no swapgs flag */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
-	testl $3,CS(%rsp)
-	jnz nmi_userspace
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
 	jmp irq_return
-nmi_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz nmi_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz nmi_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	jmp nmi_userspace
-nmi_schedule:
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	call schedule
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	jmp nmi_userspace
 	CFI_ENDPROC
-#else
-	jmp paranoid_exit
-	CFI_ENDPROC
-#endif
 END(nmi)
 
 ENTRY(ignore_sysret)
-- 
cgit v1.2.3


From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 8 Dec 2011 12:32:27 -0500
Subject: x86: Document the NMI handler about not using paranoid_exit

Linus cleaned up the NMI handler but it still needs some comments to
explain why it uses save_paranoid but not paranoid_exit. Just to keep
others from adding that in the future, document why it's not used.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3819ea907339..d1d5434e7f6a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1480,9 +1480,16 @@ END(error_exit)
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	/*
+	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context.
+	 * Even with normal interrupts enabled. An NMI should not be
+	 * setting NEED_RESCHED or anything that normal interrupts and
+	 * exceptions might do.
+	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-- 
cgit v1.2.3


From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 8 Dec 2011 12:36:23 -0500
Subject: x86: Add workaround to NMI iret woes

In x86, when an NMI goes off, the CPU goes into an NMI context that
prevents other NMIs to trigger on that CPU. If an NMI is suppose to
trigger, it has to wait till the previous NMI leaves NMI context.
At that time, the next NMI can trigger (note, only one more NMI will
trigger, as only one can be latched at a time).

The way x86 gets out of NMI context is by calling iret. The problem
with this is that this causes problems if the NMI handle either
triggers an exception, or a breakpoint. Both the exception and the
breakpoint handlers will finish with an iret. If this happens while
in NMI context, the CPU will leave NMI context and a new NMI may come
in. As NMI handlers are not made to be re-entrant, this can cause
havoc with the system, not to mention, the nested NMI will write
all over the previous NMI's stack.

Linus Torvalds proposed the following workaround to this problem:

https://lkml.org/lkml/2010/7/14/264

"In fact, I wonder if we couldn't just do a software NMI disable
instead? Hav ea per-cpu variable (in the _core_ percpu areas that get
allocated statically) that points to the NMI stack frame, and just
make the NMI code itself do something like

 NMI entry:
 - load percpu NMI stack frame pointer
 - if non-zero we know we're nested, and should ignore this NMI:
    - we're returning to kernel mode, so return immediately by using
"popf/ret", which also keeps NMI's disabled in the hardware until the
"real" NMI iret happens.
    - before the popf/iret, use the NMI stack pointer to make the NMI
return stack be invalid and cause a fault
  - set the NMI stack pointer to the current stack pointer

 NMI exit (not the above "immediate exit because we nested"):
   clear the percpu NMI stack pointer
   Just do the iret.

Now, the thing is, now the "iret" is atomic. If we had a nested NMI,
we'll take a fault, and that re-does our "delayed" NMI - and NMI's
will stay masked.

And if we didn't have a nested NMI, that iret will now unmask NMI's,
and everything is happy."

I first tried to follow this advice but as I started implementing this
code, a few gotchas showed up.

One, is accessing per-cpu variables in the NMI handler.

The problem is that per-cpu variables use the %gs register to get the
variable for the given CPU. But as the NMI may happen in userspace,
we must first perform a SWAPGS to get to it. The NMI handler already
does this later in the code, but its too late as we have saved off
all the registers and we don't want to do that for a disabled NMI.

Peter Zijlstra suggested to keep all variables on the stack. This
simplifies things greatly and it has the added benefit of cache locality.

Two, faulting on the iret.

I really wanted to make this work, but it was becoming very hacky, and
I never got it to be stable. The iret already had a fault handler for
userspace faulting with bad segment registers, and getting NMI to trigger
a fault and detect it was very tricky. But for strange reasons, the system
would usually take a double fault and crash. I never figured out why
and decided to go with a simple "jmp" approach. The new approach I took
also simplified things.

Finally, the last problem with Linus's approach was to have the nested
NMI handler do a ret instead of an iret to give the first NMI NMI-context
again.

The problem is that ret is much more limited than an iret. I couldn't figure
out how to get the stack back where it belonged. I could have copied the
current stack, pushed the return onto it, but my fear here is that there
may be some place that writes data below the stack pointer. I know that
is not something code should depend on, but I don't want to chance it.
I may add this feature later, but for now, an NMI handler that loses NMI
context will not get it back.

Here's what is done:

When an NMI comes in, the HW pushes the interrupt stack frame onto the
per cpu NMI stack that is selected by the IST.

A special location on the NMI stack holds a variable that is set when
the first NMI handler runs. If this variable is set then we know that
this is a nested NMI and we process the nested NMI code.

There is still a race when this variable is cleared and an NMI comes
in just before the first NMI does the return. For this case, if the
variable is cleared, we also check if the interrupted stack is the
NMI stack. If it is, then we process the nested NMI code.

Why the two tests and not just test the interrupted stack?

If the first NMI hits a breakpoint and loses NMI context, and then it
hits another breakpoint and while processing that breakpoint we get a
nested NMI. When processing a breakpoint, the stack changes to the
breakpoint stack. If another NMI comes in here we can't rely on the
interrupted stack to be the NMI stack.

If the variable is not set and the interrupted task's stack is not the
NMI stack, then we know this is the first NMI and we can process things
normally. But in order to do so, we need to do a few things first.

1) Set the stack variable that tells us that we are in an NMI handler

2) Make two copies of the interrupt stack frame.
   One copy is used to return on iret
   The other is used to restore the first one if we have a nested NMI.

This is what the stack will look like:

	  +-------------------------+
	  | original SS             |
	  | original Return RSP     |
	  | original RFLAGS         |
	  | original CS             |
	  | original RIP            |
	  +-------------------------+
	  | temp storage for rdx    |
	  +-------------------------+
	  | NMI executing variable  |
	  +-------------------------+
	  | Saved SS                |
	  | Saved Return RSP        |
	  | Saved RFLAGS            |
	  | Saved CS                |
	  | Saved RIP               |
	  +-------------------------+
	  | copied SS               |
	  | copied Return RSP       |
	  | copied RFLAGS           |
	  | copied CS               |
	  | copied RIP              |
	  +-------------------------+
	  | pt_regs                 |
	  +-------------------------+

The original stack frame contains what the HW put in when we entered
the NMI.

We store %rdx as a temp variable to use. Both the original HW stack
frame and this %rdx storage will be clobbered by nested NMIs so we
can not rely on them later in the first NMI handler.

The next item is the special stack variable that is set when we execute
the rest of the NMI handler.

Then we have two copies of the interrupt stack. The second copy is
modified by any nested NMIs to let the first NMI know that we triggered
a second NMI (latched) and that we should repeat the NMI handler.

If the first NMI hits an exception or breakpoint that takes it out of
NMI context, if a second NMI comes in before the first one finishes,
it will update the copied interrupt stack to point to a fix up location
to trigger another NMI.

When the first NMI calls iret, it will instead jump to the fix up
location. This fix up location will copy the saved interrupt stack back
to the copy and execute the nmi handler again.

Note, the nested NMI knows enough to check if it preempted a previous
NMI handler while it is in the fixup location. If it has, it will not
modify the copied interrupt stack and will just leave as if nothing
happened. As the NMI handle is about to execute again, there's no reason
to latch now.

To test all this, I forced the NMI handler to call iret and take itself
out of NMI context. I also added assemble code to write to the serial to
make sure that it hits the nested path as well as the fix up path.
Everything seems to be working fine.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d1d5434e7f6a..b62aa298df7f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1475,11 +1475,166 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+	.macro test_in_nmi reg stack nmi_ret normal_ret
+	cmpq %\reg, \stack
+	ja \normal_ret
+	subq $EXCEPTION_STKSZ, %\reg
+	cmpq %\reg, \stack
+	jb \normal_ret
+	jmp \nmi_ret
+	.endm
 
 	/* runs on exception stack */
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	/*
+	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
+	 * the iretq it performs will take us out of NMI context.
+	 * This means that we can have nested NMIs where the next
+	 * NMI is using the top of the stack of the previous NMI. We
+	 * can't let it execute because the nested NMI will corrupt the
+	 * stack of the previous NMI. NMI handlers are not re-entrant
+	 * anyway.
+	 *
+	 * To handle this case we do the following:
+	 *  Check the a special location on the stack that contains
+	 *  a variable that is set when NMIs are executing.
+	 *  The interrupted task's stack is also checked to see if it
+	 *  is an NMI stack.
+	 *  If the variable is not set and the stack is not the NMI
+	 *  stack then:
+	 *    o Set the special variable on the stack
+	 *    o Copy the interrupt frame into a "saved" location on the stack
+	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Continue processing the NMI
+	 *  If the variable is set or the previous stack is the NMI stack:
+	 *    o Modify the "copy" location to jump to the repeate_nmi
+	 *    o return back to the first NMI
+	 *
+	 * Now on exit of the first NMI, we first clear the stack variable
+	 * The NMI stack will tell any nested NMIs at that point that it is
+	 * nested. Then we pop the stack normally with iret, and if there was
+	 * a nested NMI that updated the copy interrupt stack frame, a
+	 * jump will be made to the repeat_nmi code that will handle the second
+	 * NMI.
+	 */
+
+	/* Use %rdx as out temp variable throughout */
+	pushq_cfi %rdx
+
+	/*
+	 * Check the special variable on the stack to see if NMIs are
+	 * executing.
+	 */
+	cmp $1, -8(%rsp)
+	je nested_nmi
+
+	/*
+	 * Now test if the previous stack was an NMI stack.
+	 * We need the double check. We check the NMI stack to satisfy the
+	 * race when the first NMI clears the variable before returning.
+	 * We check the variable because the first NMI could be in a
+	 * breakpoint routine using a breakpoint stack.
+	 */
+	lea 6*8(%rsp), %rdx
+	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+	/*
+	 * Do nothing if we interrupted the fixup in repeat_nmi.
+	 * It's about to repeat the NMI handler, so we are fine
+	 * with ignoring this one.
+	 */
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+
+1:
+	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
+	leaq -6*8(%rsp), %rdx
+	movq %rdx, %rsp
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
+	pushq_cfi $repeat_nmi
+
+	/* Put stack back */
+	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+	popq_cfi %rdx
+
+	/* No need to check faults here */
+	INTERRUPT_RETURN
+
+first_nmi:
+	/*
+	 * Because nested NMIs will use the pushed location that we
+	 * stored in rdx, we must keep that space available.
+	 * Here's what our stack frame will look like:
+	 * +-------------------------+
+	 * | original SS             |
+	 * | original Return RSP     |
+	 * | original RFLAGS         |
+	 * | original CS             |
+	 * | original RIP            |
+	 * +-------------------------+
+	 * | temp storage for rdx    |
+	 * +-------------------------+
+	 * | NMI executing variable  |
+	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
+	 * | copied SS               |
+	 * | copied Return RSP       |
+	 * | copied RFLAGS           |
+	 * | copied CS               |
+	 * | copied RIP              |
+	 * +-------------------------+
+	 * | pt_regs                 |
+	 * +-------------------------+
+	 *
+	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * NMI may zero out. The original stack frame and the temp storage
+	 * is also used by nested NMIs and can not be trusted on exit.
+	 */
+	/* Set the NMI executing variable on the stack. */
+	pushq_cfi $1
+
+	/* Copy the stack frame to the Saved frame */
+	.rept 5
+	pushq_cfi 6*8(%rsp)
+	.endr
+
+	/* Make another copy, this one may be modified by nested NMIs */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq 11*8(%rsp), %rdx
+
+	/*
+	 * Everything below this point can be preempted by a nested
+	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * caused by an exception and nested NMI will start here, and
+	 * can still be preempted by another NMI.
+	 */
+restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1502,10 +1657,32 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+	/* Clear the NMI executing stack variable */
+	movq $0, 10*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
 
+	/*
+	 * If an NMI hit an iret because of an exception or breakpoint,
+	 * it can lose its NMI context, and a nested NMI may come in.
+	 * In that case, the nested NMI will change the preempted NMI's
+	 * stack to jump to here when it does the final iret.
+	 */
+repeat_nmi:
+	INTR_FRAME
+	/* Update the stack variable to say we are still in NMI */
+	movq $1, 5*8(%rsp)
+
+	/* copy the saved stack back to copy stack */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	jmp restart_nmi
+	CFI_ENDPROC
+end_repeat_nmi:
+
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
-- 
cgit v1.2.3


From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 9 Dec 2011 03:02:19 -0500
Subject: x86: Keep current stack in NMI breakpoints

We want to allow NMI handlers to have breakpoints to be able to
remove stop_machine from ftrace, kprobes and jump_labels. But if
an NMI interrupts a current breakpoint, and then it triggers a
breakpoint itself, it will switch to the breakpoint stack and
corrupt the data on it for the breakpoint processing that it
interrupted.

Instead, have the NMI check if it interrupted breakpoint processing
by checking if the stack that is currently used is a breakpoint
stack. If it is, then load a special IDT that changes the IST
for the debug exception to keep the same stack in kernel context.
When the NMI is done, it puts it back.

This way, if the NMI does trigger a breakpoint, it will keep
using the same stack and not stomp on the breakpoint data for
the breakpoint it interrupted.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/desc.h      | 12 ++++++++++++
 arch/x86/include/asm/processor.h |  6 ++++++
 arch/x86/kernel/cpu/common.c     | 22 ++++++++++++++++++++++
 arch/x86/kernel/head_64.S        |  4 ++++
 arch/x86/kernel/nmi.c            | 15 +++++++++++++++
 arch/x86/kernel/traps.c          |  6 ++++++
 6 files changed, 65 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdfc..e95822d683f4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
 
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit = (limit >> 16) & 0xf;
 }
 
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435ffb53..4b39d6d7e3a1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -416,6 +419,9 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..caa404556b9c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) nmi_idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1092,24 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+
+int is_debug_stack(unsigned long addr)
+{
+	return addr <= __get_cpu_var(debug_stack_addr) &&
+		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+}
+
+void debug_stack_set_zero(void)
+{
+	load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+	load_idt((const struct desc_ptr *)&idt_descr);
+}
+
 #else	/* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
+			if (v == DEBUG_STACK-1)
+				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
+	.align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+	.skip IDT_ENTRIES * 16
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58ddd..de8d4b333f40 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	int update_debug_stack = 0;
+
+	/*
+	 * If we interrupted a breakpoint, it is possible that
+	 * the nmi handler will have breakpoints too. We need to
+	 * change the IDT such that breakpoints that happen here
+	 * continue to use the NMI stack.
+	 */
+	if (unlikely(is_debug_stack(regs->sp))) {
+		debug_stack_set_zero();
+		update_debug_stack = 1;
+	}
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code)
 		default_do_nmi(regs);
 
 	nmi_exit();
+
+	if (unlikely(update_debug_stack))
+		debug_stack_reset();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..a93c5cabc36a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -723,4 +723,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }
-- 
cgit v1.2.3


From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 13 Dec 2011 16:44:16 -0500
Subject: x86: Allow NMIs to hit breakpoints in i386

With i386, NMIs and breakpoints use the current stack and they
do not reset the stack pointer to a fix point that might corrupt
a previous NMI or breakpoint (as it does in x86_64). But NMIs are
still not made to be re-entrant, and need to prevent the case that
an NMI hitting a breakpoint (which does an iret), doesn't allow
another NMI to run.

The fix is to let the NMI be in 3 different states:

1) not running
2) executing
3) latched

When no NMI is executing on a given CPU, the state is "not running".
When the first NMI comes in, the state is switched to "executing".
On exit of that NMI, a cmpxchg is performed to switch the state
back to "not running" and if that fails, the NMI is restarted.

If a breakpoint is hit and does an iret, which re-enables NMIs,
and another NMI comes in before the first NMI finished, it will
detect that the state is not in the "not running" state and the
current NMI is nested. In this case, the state is switched to "latched"
to let the interrupted NMI know to restart the NMI handler, and
the nested NMI exits without doing anything.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index de8d4b333f40..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 }
 
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	int update_debug_stack = 0;
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ *  1) not running
+ *  2) executing
+ *  3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ *  when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+	NMI_NOT_RUNNING,
+	NMI_EXECUTING,
+	NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs)					\
+	do {								\
+		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
+			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+			return;						\
+		}							\
+	nmi_restart:							\
+		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
+	} while (0)
+
+#define nmi_nesting_postprocess()					\
+	do {								\
+		if (cmpxchg(&__get_cpu_var(nmi_state),			\
+		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+			goto nmi_restart;				\
+	} while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
 
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
 	/*
 	 * If we interrupted a breakpoint, it is possible that
 	 * the nmi handler will have breakpoints too. We need to
@@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code)
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
 		debug_stack_set_zero();
-		update_debug_stack = 1;
+		__get_cpu_var(update_debug_stack) = 1;
 	}
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+	if (unlikely(__get_cpu_var(update_debug_stack)))
+		debug_stack_reset();
+}
+#endif
+
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_nesting_preprocess(regs);
+
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	if (unlikely(update_debug_stack))
-		debug_stack_reset();
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
-- 
cgit v1.2.3


From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 11:43:02 -0500
Subject: x86: Add counter when debug stack is used with interrupts enabled

Mathieu Desnoyers pointed out a case that can cause issues with
NMIs running on the debug stack:

  int3 -> interrupt -> NMI -> int3

Because the interrupt changes the stack, the NMI will not see that
it preempted the debug stack. Looking deeper at this case,
interrupts only happen when the int3 is from userspace or in
an a location in the exception table (fixup).

  userspace -> int3 -> interurpt -> NMI -> int3

All other int3s that happen in the kernel should be processed
without ever enabling interrupts, as the do_trap() call will
panic the kernel if it is called to process any other location
within the kernel.

Adding a counter around the sections that enable interrupts while
using the debug stack allows the NMI to also check that case.
If the NMI sees that it either interrupted a task using the debug
stack or the debug counter is non-zero, then it will have to
change the IDT table to make the int3 not change stacks (which will
corrupt the stack if it does).

Note, I had to move the debug_usage functions out of processor.h
and into debugreg.h because of the static inlined functions to
inc and dec the debug_usage counter. __get_cpu_var() requires
smp.h which includes processor.h, and would fail to build.

Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/debugreg.h  | 22 ++++++++++++++++++++++
 arch/x86/include/asm/processor.h |  6 ------
 arch/x86/kernel/cpu/common.c     |  6 ++++--
 arch/x86/kernel/traps.c          | 14 ++++++++++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc6..b903d5ea3941 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4b39d6d7e3a1..b650435ffb53 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
-int is_debug_stack(unsigned long addr);
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -419,9 +416,6 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
-static inline int is_debug_stack(unsigned long addr) { return 0; }
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index caa404556b9c..266e4649b1da 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,11 +1093,13 @@ unsigned long kernel_eflags;
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
 static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
 
 int is_debug_stack(unsigned long addr)
 {
-	return addr <= __get_cpu_var(debug_stack_addr) &&
-		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+	return __get_cpu_var(debug_stack_usage) ||
+		(addr <= __get_cpu_var(debug_stack_addr) &&
+		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
 void debug_stack_set_zero(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a93c5cabc36a..0072b38e3ea1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
-- 
cgit v1.2.3


From 7c9c3a1e5fc8728e948b8fa3cbcfcfb86db3afda Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 29 Dec 2011 14:43:16 +0000
Subject: x86/intel config: Fix the APB_TIMER selection

Seems Kconfig SELECT isn't selecting things hierarchically when
selected.

config APB_TIMER
       def_bool y if X86_INTEL_MID
       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
       select DW_APB_TIMER
       depends on X86_INTEL_MID && SFI

when we select APB_TIMER doesn't select DW_APB_TIMER so do it by
hand.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-kpnaimplltk6d1lolusqj3ae@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 07620bc913db..78fbb346959b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -409,12 +409,14 @@ config X86_MRST
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -428,12 +430,14 @@ config X86_MDFLD
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.2.3


From cd42f4a3b2b1c4cbd997363dc57821953d73fd87 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 15 Dec 2011 10:48:12 -0800
Subject: HWPOISON: Clean up memory_failure() vs. __memory_failure()

There is only one caller of memory_failure(), all other users call
__memory_failure() and pass in the flags argument explicitly. The
lone user of memory_failure() will soon need to pass flags too.

Add flags argument to the callsite in mce.c. Delete the old memory_failure()
function, and then rename __memory_failure() without the leading "__".

Provide clearer message when action optional memory errors are ignored.

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 12 +++++++----
 drivers/base/memory.c            |  2 +-
 include/linux/mm.h               |  3 +--
 mm/hwpoison-inject.c             |  4 ++--
 mm/madvise.c                     |  2 +-
 mm/memory-failure.c              | 46 ++++++++++++++++++----------------------
 6 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2af127d4c3d1..1a08ce5f345f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1046,11 +1046,15 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+	return 0;
 }
+#endif
 
 /*
  * Called after mce notification in process context. This code
@@ -1068,7 +1072,7 @@ void mce_notify_process(void)
 	unsigned long pfn;
 	mce_notify_irq();
 	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 static void mce_process_work(struct work_struct *dummy)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8272d92d22c0..9a924440053f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -474,7 +474,7 @@ store_hard_offline_page(struct class *class,
 	if (strict_strtoull(buf, 0, &pfn) < 0)
 		return -EINVAL;
 	pfn >>= PAGE_SHIFT;
-	ret = __memory_failure(pfn, 0, 0);
+	ret = memory_failure(pfn, 0, 0);
 	return ret ? ret : count;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4baadd18f4ad..bcc523474724 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1607,8 +1607,7 @@ void vmemmap_populate_print_last(void);
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
 };
-extern void memory_failure(unsigned long pfn, int trapno);
-extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c7fc7fd00e32..cc448bb983ba 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val)
 	 * do a racy check with elevated page count, to make sure PG_hwpoison
 	 * will only be set for the targeted owner (or on a free page).
 	 * We temporarily take page lock for try_get_mem_cgroup_from_page().
-	 * __memory_failure() will redo the check reliably inside page lock.
+	 * memory_failure() will redo the check reliably inside page lock.
 	 */
 	lock_page(hpage);
 	err = hwpoison_filter(hpage);
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
 
 inject:
 	printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
-	return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
+	return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 }
 
 static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193eff04..f5ab745672b7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -251,7 +251,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
 		       page_to_pfn(p), start);
 		/* Ignore return value for now */
-		__memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
 	}
 	return ret;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 06d3479513aa..ab259bb0adc5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -984,7 +984,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
 		ClearPageHWPoison(hpage + i);
 }
 
-int __memory_failure(unsigned long pfn, int trapno, int flags)
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
 {
 	struct page_state *ps;
 	struct page *p;
@@ -1156,29 +1174,7 @@ out:
 	unlock_page(hpage);
 	return res;
 }
-EXPORT_SYMBOL_GPL(__memory_failure);
-
-/**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
- * @trapno: Trap number reported in the signal to user space.
- *
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
- *
- * The function is primarily of use for corruptions that
- * happen outside the current execution context (e.g. when
- * detected by a background scrubber)
- *
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
- */
-void memory_failure(unsigned long pfn, int trapno)
-{
-	__memory_failure(pfn, trapno, 0);
-}
+EXPORT_SYMBOL_GPL(memory_failure);
 
 #define MEMORY_FAILURE_FIFO_ORDER	4
 #define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)
@@ -1251,7 +1247,7 @@ static void memory_failure_work_func(struct work_struct *work)
 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 		if (!gotten)
 			break;
-		__memory_failure(entry.pfn, entry.trapno, entry.flags);
+		memory_failure(entry.pfn, entry.trapno, entry.flags);
 	}
 }
 
-- 
cgit v1.2.3


From 85f92694affa7dba7f1978666a69552b5dfc628e Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 13 Dec 2011 09:48:13 -0800
Subject: x86/mce: Create helper function to save addr/misc when needed

The MCI_STATUS_MISCV and MCI_STATUS_ADDRV bits in the bank status
registers define whether the MISC and ADDR registers respectively
contain valid data - provide a helper function to check these bits
and read the registers when needed.

In addition, processors that support software error recovery (as
indicated by the MCG_SER_P bit in the MCG_CAP register) may include
some undefined bits in the ADDR register - mask these out.

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1a08ce5f345f..2f1c200f05e6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -492,6 +492,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -542,10 +563,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -981,10 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (severity == MCE_AR_SEVERITY)
 			kill_it = 1;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
-- 
cgit v1.2.3


From af104e394e17e328df85c25a9e21448539725b67 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 14 Dec 2011 15:55:20 -0800
Subject: x86/mce: Add mechanism to safely save information in MCE handler

Machine checks on Intel cpus interrupt execution on all cpus, regardless
of interrupt masking.  We have a need to save some data about the cause
of the machine check (physical address) in the machine check handler that
can be retrieved later to attempt recovery in a more flexible execution
state.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2f1c200f05e6..e1579c5a71da 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -886,6 +886,49 @@ static void mce_clear_state(unsigned long *toclear)
 	}
 }
 
+/*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define	MCE_INFO_MAX	16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
-- 
cgit v1.2.3


From a8c321fbf9aeced45519248e5901af8cbc240510 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 3 Jan 2012 11:45:45 -0800
Subject: x86/mce: Handle "action required" errors

All non-urgent actions (reporting low severity errors and handling
"action-optional" errors) are now handled by a work queue. This
means that TIF_MCE_NOTIFY can be used to block execution for a
thread experiencing an "action-required" fault until we get all
cpus out of the machine check handler (and the thread that hit
the fault into mce_notify_process().

We use the new mce_{save,find,clear}_info() API to get information
from do_machine_check() to mce_notify_process(), and then use the
newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle
the error (possibly signalling the process).

Update some comments to make the new code flows clearer.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 95 ++++++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 42 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e1579c5a71da..56e4e79387c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
@@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
 		mce_read_aux(&m, i);
 
 		/*
@@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
 		mce_clear_state(toclear);
 
@@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done.  Try to kill as little as possible.  If we can kill just
-	 * one task, do that.  If the user has set the tolerance very
-	 * high, don't try to do anything at all.
-	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check);
 #ifndef CONFIG_MEMORY_FAILURE
 int memory_failure(unsigned long pfn, int vector, int flags)
 {
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
 	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
 		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
 
@@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 #endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
  */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR, 0);
+	struct mce_info *mi = mce_find_info();
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
@@ -1225,8 +1238,6 @@ int mce_notify_irq(void)
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
 		/* wake processes polling /dev/mcelog */
 		wake_up_interruptible(&mce_chrdev_wait);
-- 
cgit v1.2.3


From 5f7b88d51e89771f64c15903b96b5933dd0bc6d8 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 3 Jan 2012 11:48:04 -0800
Subject: x86/mce: Recognise machine check bank signature for data path error

Action required data path signature is defined in table 15-19 of SDM:

+-----------------------------------------------------------------------------+
| SRAR Error | Valid | OVER | UC | EN | MISCV | ADDRV | PCC | S | AR | MCACOD |
| Data Load  |     1 |    0 |  1 |  1 |     1 |     1 |   0 | 1 |  1 |  0x134 |
+-----------------------------------------------------------------------------+

Recognise this, and pass MCE_AR_SEVERITY code back to do_machine_check() if
we have the action handler configured (CONFIG_MEMORY_FAILURE=y)

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..f6c92f99efa0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,6 +54,7 @@ static struct severity {
 #define  MASK(x, y)	.mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
 
 	MCESEV(
@@ -102,11 +103,24 @@ static struct severity {
 		SER, BITCLR(MCI_STATUS_S)
 		),
 
-	/* AR add known MCACODs here */
 	MCESEV(
 		PANIC, "Action required with lost events",
 		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
 		),
+
+	/* known AR MCACODs: */
+#ifdef	CONFIG_MEMORY_FAILURE
+	MCESEV(
+		KEEP, "HT thread notices Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134),
+		MCGMASK(MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		AR, "Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134),
+		USER
+		),
+#endif
 	MCESEV(
 		PANIC, "Action required: unknown MCACOD",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
-- 
cgit v1.2.3


From 426932909093e4e7729777a0e2beed4b54911361 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 5 Jan 2012 16:12:25 +0000
Subject: x86-64: Slightly shorten copy_page()

%r13 got saved and restored without ever getting touched, so
there's no need to do so.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F05D9F9020000780006AA0D@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/copy_page_64.S | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 01c805ba5359..6b34d04d096a 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -20,14 +20,12 @@ ENDPROC(copy_page_c)
 
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 3*8
+	subq	$2*8,%rsp
+	CFI_ADJUST_CFA_OFFSET 2*8
 	movq	%rbx,(%rsp)
 	CFI_REL_OFFSET rbx, 0
 	movq	%r12,1*8(%rsp)
 	CFI_REL_OFFSET r12, 1*8
-	movq	%r13,2*8(%rsp)
-	CFI_REL_OFFSET r13, 2*8
 
 	movl	$(4096/64)-5,%ecx
 	.p2align 4
@@ -91,10 +89,8 @@ ENTRY(copy_page)
 	CFI_RESTORE rbx
 	movq	1*8(%rsp),%r12
 	CFI_RESTORE r12
-	movq	2*8(%rsp),%r13
-	CFI_RESTORE r13
-	addq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET -3*8
+	addq	$2*8,%rsp
+	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
-- 
cgit v1.2.3


From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 6 Jan 2012 11:17:51 -0500
Subject: x86, reboot: Fix typo in nmi reboot path

It was brought to my attention that my x86 change to use NMI in
the reboot path broke Intel Nehalem and Westmere boxes when
using kexec.

I realized I had mistyped the if statement in commit
3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in
the wrong spot.  Putting it in the right spot fixes kexec again.

Doh.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 113acda5879e..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait)
 	 */
 	if (num_online_cpus() > 1) {
 		/* did someone beat us here? */
-		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
 			return;
 
 		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
-- 
cgit v1.2.3


From 72142fd4109105c6bd21658966ca5e93c1684081 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 7 Jan 2012 14:10:18 -0800
Subject: x86: Move <asm/asm-offsets.h> from trace_syscalls.c to asm/syscall.h

This reverts commit d5e553d6e0a4bdea43adae7373e3fa144b9a1aaa, which
caused large numbers of build warnings on PowerPC.

This moves the #include <asm/asm-offsets.h> to <asm/syscall.h>, which
makes some kind of sense since NR_syscalls is syscalls related.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111214181545.6e13bc954cb7ddce9086e861@canb.auug.org.au
---
 arch/x86/include/asm/syscall.h | 1 +
 kernel/trace/trace_syscalls.c  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c4a348f7bd43..d962e5652a73 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <asm/asm-offsets.h>	/* For NR_syscalls */
 
 extern const unsigned long sys_call_table[];
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5f35f6f15d99..cb654542c1a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -6,7 +6,6 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
-#include <asm/asm-offsets.h>
 
 #include "trace_output.h"
 #include "trace.h"
-- 
cgit v1.2.3


From da517a08ac5913cd80ce3507cddd00f2a091b13c Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 6 Jan 2012 13:19:00 -0600
Subject: x86, UV: Update Boot messages for SGI UV2 platform

SGI UV systems print a message during boot:

	UV: Found <num> blades

Due to packaging changes, the blade count is not accurate for
on the next generation of the platform. This patch corrects the
count.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20120106191900.GA19772@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 9d59bbacd4e3..79b05b88aa19 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -769,7 +769,12 @@ void __init uv_system_init(void)
 	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
 		uv_possible_blades +=
 		  hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
-	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
+
+	/* uv_num_possible_blades() is really the hub count */
+	printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+			is_uv1_hub() ? uv_num_possible_blades() :
+			(uv_num_possible_blades() + 1) / 2,
+			uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kzalloc(bytes, GFP_KERNEL);
-- 
cgit v1.2.3


From 8030c36d13f030103356709e63638678fdc66fdc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 9 Jan 2012 19:33:24 -0800
Subject: x86, atomic: atomic64_read() take a const pointer

atomic64_read() doesn't actually write anything (as far as the C
environment is concerned... the CPU does actually write but that's an
implementation quirk), so it should take a const pointer.

This does NOT mean that it is safe to use atomic64_read() on an object
in readonly storage (it will trap!)

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20120109165859.1879abda.akpm@linux-foundation.org
---
 arch/x86/include/asm/atomic64_32.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 24098aafce0d..fa13f0ec2874 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -82,7 +82,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
  *
  * Atomically reads the value of @v and returns it.
  */
-static inline long long atomic64_read(atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
 	long long r;
 	asm volatile(ATOMIC64_ALTERNATIVE(read)
-- 
cgit v1.2.3


From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001
From: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Date: Thu, 12 Jan 2012 15:44:47 +1030
Subject: lguest: Make sure interrupt is allocated ok by lguest_setup_irq

Make sure the interrupt is allocated correctly by lguest_setup_irq (check the
return value of irq_alloc_desc_at for -ENOMEM)

Signed-off-by: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (cleanups and commentry)
---
 arch/x86/lguest/boot.c         | 21 +++++++++++++--------
 drivers/lguest/lguest_device.c | 10 +++++++---
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603ba866f..642d8805bc1b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
 }
 
 /*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
- * pass that up!
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
  */
-void lguest_setup_irq(unsigned int irq)
+int lguest_setup_irq(unsigned int irq)
 {
-	irq_alloc_desc_at(irq, 0);
+	int err;
+
+	/* Returns -ve error or vector number. */
+	err = irq_alloc_desc_at(irq, 0);
+	if (err < 0 && err != -EEXIST)
+		return err;
+
 	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
+	return 0;
 }
 
 /*
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 6a1d6447b864..9e8388efd88e 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -241,7 +241,7 @@ static void lg_notify(struct virtqueue *vq)
 }
 
 /* An extern declaration inside a C file is bad form.  Don't do it. */
-extern void lguest_setup_irq(unsigned int irq);
+extern int lguest_setup_irq(unsigned int irq);
 
 /*
  * This routine finds the Nth virtqueue described in the configuration of
@@ -304,7 +304,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	}
 
 	/* Make sure the interrupt is allocated. */
-	lguest_setup_irq(lvq->config.irq);
+	err = lguest_setup_irq(lvq->config.irq);
+	if (err)
+		goto destroy_vring;
 
 	/*
 	 * Tell the interrupt for this virtqueue to go to the virtio_ring
@@ -317,7 +319,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
 			  dev_name(&vdev->dev), vq);
 	if (err)
-		goto destroy_vring;
+		goto free_desc;
 
 	/*
 	 * Last of all we hook up our 'struct lguest_vq_info" to the
@@ -326,6 +328,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	vq->priv = lvq;
 	return vq;
 
+free_desc:
+	irq_free_desc(lvq->config.irq);
 destroy_vring:
 	vring_del_virtqueue(vq);
 unmap:
-- 
cgit v1.2.3


From 5cf9a4e69c1ff0ccdd1d2b7404f95c0531355274 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 12 Jan 2012 08:01:40 -0700
Subject: x86/PCI: build amd_bus.o only when CONFIG_AMD_NB=y

We only need amd_bus.o for AMD systems with PCI.  arch/x86/pci/Makefile
already depends on CONFIG_PCI=y, so this patch just adds the dependency
on CONFIG_AMD_NB.

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org	# 2.6.34+ (needs adjustment for k8 -> amd rename)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/pci/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 75b06f34b1f2..e76e18c94a3c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
-obj-y				+= amd_bus.o bus_numa.o
+obj-y				+= bus_numa.o
 
+obj-$(CONFIG_AMD_NB)		+= amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
 
 ifeq ($(CONFIG_PCI_DEBUG),y)
-- 
cgit v1.2.3


From bccd17294a26b67a8a19aaa120e3eeaa7da49281 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <cbouatmailru@gmail.com>
Date: Wed, 11 Jan 2012 05:11:46 +0400
Subject: x86: Get rid of 'dubious one-bit signed bitfield' sprase warning

This very noisy sparse warning appears on almost every file in the
kernel:

  CHECK   init/main.c
  arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield
  arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield

This patch changes sig_on_uaccess_error and uaccess_err flags to unsigned
type and thus fixes the warning.

Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
Acked-by: Andy Lutomirski <luto@mit.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 74047159d0ab..bc817cd8b443 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,8 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			sig_on_uaccess_error:1;
-	int			uaccess_err:1;	/* uaccess failed */
+	unsigned int		sig_on_uaccess_error:1;
+	unsigned int		uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
-- 
cgit v1.2.3


From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:18 +1030
Subject: module_param: make bool parameters really bool (arch)

module_param(bool) used to counter-intuitively take an int.  In
fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy
trick.

It's time to remove the int/unsigned int option.  For this version
it'll simply give a warning, but it'll break next kernel version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/ia64/hp/common/aml_nfw.c  |  2 +-
 arch/x86/kernel/apm_32.c       | 16 ++++++++--------
 arch/x86/kvm/mmu.c             |  2 +-
 arch/x86/kvm/vmx.c             | 18 +++++++++---------
 arch/x86/kvm/x86.c             |  4 ++--
 arch/x86/mm/mmio-mod.c         |  4 ++--
 arch/x86/platform/geode/alix.c |  2 +-
 arch/x86/platform/iris/iris.c  |  2 +-
 8 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c
index 22078486d35d..6192f7188654 100644
--- a/arch/ia64/hp/common/aml_nfw.c
+++ b/arch/ia64/hp/common/aml_nfw.c
@@ -31,7 +31,7 @@ MODULE_AUTHOR("Bjorn Helgaas <bjorn.helgaas@hp.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("ACPI opregion handler for native firmware calls");
 
-static int force_register;
+static bool force_register;
 module_param_named(force, force_register, bool, 0);
 MODULE_PARM_DESC(force, "Install opregion handler even without HPQ5001 device");
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
 static int ignore_normal_resume;
 static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
 
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
 static int apm_disabled = -1;
 #ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
 #else
-static int power_off = 1;
+static bool power_off = 1;
 #endif
-static int realmode_power_off;
+static bool realmode_power_off;
 #ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
 #else
-static int allow_ints;
+static bool allow_ints;
 #endif
-static int broken_psr;
+static bool broken_psr;
 
 static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
 static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a2a9b40db19..224b02c3cda9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -74,7 +74,7 @@ enum {
 #endif
 
 #ifdef MMU_DEBUG
-static int dbg = 0;
+static bool dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 906a7e84200f..d29216c462b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,29 +51,29 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly enable_vpid = 1;
+static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
 			enable_unrestricted_guest, bool, S_IRUGO);
 
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static int __read_mostly yield_on_hlt = 1;
+static bool __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
 /*
@@ -81,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  */
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1171def5f96b..14d6cadc4ba6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,8 +88,8 @@ static void process_nmi(struct kvm_vcpu *vcpu);
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b278a7..dc0b727742f4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list);		/* struct remap_trace */
 
 /* module parameters */
 static unsigned long	filter_offset;
-static int		nommiotrace;
-static int		trace_pc;
+static bool		nommiotrace;
+static bool		trace_pc;
 
 module_param(filter_offset, ulong, 0);
 module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca1973699d3d..dc5f1d32aced 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
 
 #include <asm/geode.h>
 
-static int force = 0;
+static bool force = 0;
 module_param(force, bool, 0444);
 /* FIXME: Award bios is not automatically detected as Alix platform */
 MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5ed8c9b..5917eb56b313 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
 MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
 MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
 
-static int force;
+static bool force;
 
 module_param(force, bool, 0);
 MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
-- 
cgit v1.2.3


From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:27 -0800
Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL

While implementing cmpxchg_double() on s390 I realized that we don't set
CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it.

However setting that option will increase the size of struct page by
eight bytes on 64 bit, which we certainly do not want.  Also, it doesn't
make sense that a present cpu feature should increase the size of struct
page.

Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and
that it should depend on CMPXCHG_DOUBLE instead.

This patch:

If an architecture supports CMPXCHG_LOCAL this shouldn't result
automatically in larger struct pages if the SLUB allocator is used.
Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which
can be selected if a double word aligned struct page is required.  Also
update x86 Kconfig so that it should work as before.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig             | 8 ++++++++
 arch/x86/Kconfig         | 1 +
 include/linux/mm_types.h | 9 ++++-----
 mm/slub.c                | 6 +++---
 4 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 2505740b81d2..a2c5c077c32d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -185,4 +185,12 @@ config HAVE_RCU_TABLE_FREE
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
+config HAVE_ALIGNED_STRUCT_PAGE
+	bool
+	help
+	  This makes sure that struct pages are double word aligned and that
+	  e.g. the SLUB allocator can perform double word atomic operations
+	  on a struct page for better performance. However selecting this
+	  might increase the size of a struct page by a word.
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a150f4c35e94..5201a2c27239 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
 	select PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
+	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5b42f1b34eb7..3cc3062b3767 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -151,12 +151,11 @@ struct page {
 #endif
 }
 /*
- * If another subsystem starts using the double word pairing for atomic
- * operations on struct page then it must change the #if to ensure
- * proper alignment of the page struct.
+ * The struct page can be forced to be double word aligned so that atomic ops
+ * on double words work. The SLUB allocator can make use of such a feature.
  */
-#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL)
-	__attribute__((__aligned__(2*sizeof(unsigned long))))
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+	__aligned(2 * sizeof(unsigned long))
 #endif
 ;
 
diff --git a/mm/slub.c b/mm/slub.c
index 5d37b5e44140..72aa84134609 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 		const char *n)
 {
 	VM_BUG_ON(!irqs_disabled());
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -400,7 +400,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -3014,7 +3014,7 @@ static int kmem_cache_open(struct kmem_cache *s,
 		}
 	}
 
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
-- 
cgit v1.2.3


From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:30 -0800
Subject: mm,x86,um: move CMPXCHG_LOCAL config option

Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         | 3 +++
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 mm/vmstat.c          | 2 +-
 5 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index a2c5c077c32d..22182a8cc62c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -193,4 +193,7 @@ config HAVE_ALIGNED_STRUCT_PAGE
 	  on a struct page for better performance. However selecting this
 	  might increase the size of a struct page by a word.
 
+config HAVE_CMPXCHG_LOCAL
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5201a2c27239..59717fd17bc7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -61,6 +61,7 @@ config X86
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+	select HAVE_CMPXCHG_LOCAL if !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..99d2ab8b7795 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-	def_bool X86_64 || (X86_32 && !M386)
-
 config CMPXCHG_DOUBLE
 	def_bool y
 
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..a62bfc66239e 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
 config CMPXCHG_DOUBLE
 	bool
 	default n
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8fd603b1665e..f600557a7659 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
-#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
-- 
cgit v1.2.3


From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:33 -0800
Subject: mm,x86,um: move CMPXCHG_DOUBLE config option

Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         | 3 +++
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 mm/slub.c            | 9 ++++++---
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 22182a8cc62c..4f55c736be11 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,4 +196,7 @@ config HAVE_ALIGNED_STRUCT_PAGE
 config HAVE_CMPXCHG_LOCAL
 	bool
 
+config HAVE_CMPXCHG_DOUBLE
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 59717fd17bc7..6c14ecd851d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_CMPXCHG_LOCAL if !M386
+	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 99d2ab8b7795..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_DOUBLE
-	def_bool y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index a62bfc66239e..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
diff --git a/mm/slub.c b/mm/slub.c
index 72aa84134609..4907563ef7ff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 		const char *n)
 {
 	VM_BUG_ON(!irqs_disabled());
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
 		}
 	}
 
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
-- 
cgit v1.2.3


From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Thu, 12 Jan 2012 17:20:09 -0800
Subject: cpumask: update setup_node_to_cpumask_map() comments

node_to_cpumask() has been replaced by cpumask_of_node(), and wholly
removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask
now everyone uses cpumask_of_node").

So update the comments for setup_node_to_cpumask_map().

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/numa.c | 2 +-
 arch/x86/mm/numa.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4ff3d8e411a7..3feefc3842a8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  */
 static void __init setup_node_to_cpumask_map(void)
 {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 020cd2e80873..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
-- 
cgit v1.2.3


From a522ee85ba979e7897a75b1c97db1b0304b68b5c Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 20 Dec 2011 12:20:16 +0200
Subject: crypto: twofish-x86_64-3way - blacklist pentium4 and atom

Performance of twofish-x86_64-3way on Intel Pentium 4 and Atom is lower than
of twofish-x86_64 module. So blacklist these CPUs.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 47 +++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 7fee8c152f93..0afd134d8c9c 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -25,6 +25,7 @@
  *
  */
 
+#include <asm/processor.h>
 #include <linux/crypto.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -637,10 +638,56 @@ static struct crypto_alg blk_xts_alg = {
 	},
 };
 
+static bool is_blacklisted_cpu(void)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return false;
+
+	if (boot_cpu_data.x86 == 0x06 &&
+		(boot_cpu_data.x86_model == 0x1c ||
+		 boot_cpu_data.x86_model == 0x26 ||
+		 boot_cpu_data.x86_model == 0x36)) {
+		/*
+		 * On Atom, twofish-3way is slower than original assembler
+		 * implementation. Twofish-3way trades off some performance in
+		 * storing blocks in 64bit registers to allow three blocks to
+		 * be processed parallel. Parallel operation then allows gaining
+		 * more performance than was trade off, on out-of-order CPUs.
+		 * However Atom does not benefit from this parallellism and
+		 * should be blacklisted.
+		 */
+		return true;
+	}
+
+	if (boot_cpu_data.x86 == 0x0f) {
+		/*
+		 * On Pentium 4, twofish-3way is slower than original assembler
+		 * implementation because excessive uses of 64bit rotate and
+		 * left-shifts (which are really slow on P4) needed to store and
+		 * handle 128bit block in two 64bit registers.
+		 */
+		return true;
+	}
+
+	return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
 int __init init(void)
 {
 	int err;
 
+	if (!force && is_blacklisted_cpu()) {
+		printk(KERN_INFO
+			"twofish-x86_64-3way: performance on this CPU "
+			"would be suboptimal: disabling "
+			"twofish-x86_64-3way.\n");
+		return -ENODEV;
+	}
+
 	err = crypto_register_alg(&blk_ecb_alg);
 	if (err)
 		goto ecb_err;
-- 
cgit v1.2.3


From 4c58464b8034cef4317593bf4ccbfc19d5bb3a77 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 20 Dec 2011 12:20:21 +0200
Subject: crypto: blowfish-x86_64 - blacklist Pentium 4

Implementation in blowfish-x86_64 uses 64bit rotations which are slow on P4,
making blowfish-x86_64 slower than generic C implementation. Therefore
blacklist P4.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/blowfish_glue.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index b05aa163d55a..2970110d2cea 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -25,6 +25,7 @@
  *
  */
 
+#include <asm/processor.h>
 #include <crypto/blowfish.h>
 #include <linux/crypto.h>
 #include <linux/init.h>
@@ -446,10 +447,39 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
+static bool is_blacklisted_cpu(void)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return false;
+
+	if (boot_cpu_data.x86 == 0x0f) {
+		/*
+		 * On Pentium 4, blowfish-x86_64 is slower than generic C
+		 * implementation because use of 64bit rotates (which are really
+		 * slow on P4). Therefore blacklist P4s.
+		 */
+		return true;
+	}
+
+	return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
 static int __init init(void)
 {
 	int err;
 
+	if (!force && is_blacklisted_cpu()) {
+		printk(KERN_INFO
+			"blowfish-x86_64: performance on this CPU "
+			"would be suboptimal: disabling "
+			"blowfish-x86_64.\n");
+		return -ENODEV;
+	}
+
 	err = crypto_register_alg(&bf_alg);
 	if (err)
 		goto bf_err;
-- 
cgit v1.2.3


From 847cb7ef565d31484f426677e0bea081bfd2acd9 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 20 Dec 2011 12:58:06 +0200
Subject: crypto: serpent-sse2 - change transpose_4x4 to only use integer
 instructions

Matrix transpose macro in serpent-sse2 uses mix of SSE2 integer and SSE floating
point instructions, which might cause performance penality on some CPUs.

This patch replaces transpose_4x4 macro with version that uses only SSE2
integer instructions.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent-sse2-i586-asm_32.S   | 29 +++++++++++++---------------
 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 29 +++++++++++++---------------
 2 files changed, 26 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 4e37677ca851..c00053d42f99 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -463,23 +463,20 @@
 	pand x0,		x4; \
 	pxor x2,		x4;
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-	movdqa x2,		t3; \
-	movdqa x0,		t1; \
-	unpcklps x3,		t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	movdqa x0,		t2; \
-	unpcklps x1,		t1; \
-	unpckhps x1,		t2; \
-	movdqa t3,		x1; \
-	unpckhps x3,		x2; \
-	movdqa t1,		x0; \
-	movhlps t1,		x1; \
-	movdqa t2,		t1; \
-	movlhps t3,		x0; \
-	movlhps x2,		t1; \
-	movhlps t2,		x2; \
-	movdqa x2,		x3; \
-	movdqa t1,		x2;
+	punpckldq x1,		x0; \
+	punpckhdq x1,		t2; \
+	movdqa x2,		t1; \
+	punpckhdq x3,		x2; \
+	punpckldq x3,		t1; \
+	movdqa x0,		x1; \
+	punpcklqdq t1,		x0; \
+	punpckhqdq t1,		x1; \
+	movdqa t2,		x3; \
+	punpcklqdq x2,		t2; \
+	punpckhqdq x2,		x3; \
+	movdqa t2,		x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 	movdqu (0*4*4)(in),	x0; \
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index 7f24a1540821..3ee1ff04d3e9 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -585,23 +585,20 @@
 	get_key(i, 1, RK1); \
 	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-	movdqa x2,		t3; \
-	movdqa x0,		t1; \
-	unpcklps x3,		t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	movdqa x0,		t2; \
-	unpcklps x1,		t1; \
-	unpckhps x1,		t2; \
-	movdqa t3,		x1; \
-	unpckhps x3,		x2; \
-	movdqa t1,		x0; \
-	movhlps t1,		x1; \
-	movdqa t2,		t1; \
-	movlhps t3,		x0; \
-	movlhps x2,		t1; \
-	movhlps t2,		x2; \
-	movdqa x2,		x3; \
-	movdqa t1,		x2;
+	punpckldq x1,		x0; \
+	punpckhdq x1,		t2; \
+	movdqa x2,		t1; \
+	punpckhdq x3,		x2; \
+	punpckldq x3,		t1; \
+	movdqa x0,		x1; \
+	punpcklqdq t1,		x0; \
+	punpckhqdq t1,		x1; \
+	movdqa t2,		x3; \
+	punpcklqdq x2,		t2; \
+	punpckhqdq x2,		x3; \
+	movdqa t2,		x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 	movdqu (0*4*4)(in),	x0; \
-- 
cgit v1.2.3


From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Sat, 14 Jan 2012 08:11:31 +0530
Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE

Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class
to a regular subsystem") changed how things are dealt with in the MCE
subsystem.  Some of the things that got broken due to this are CPU
hotplug and suspend/hibernate.

MCE uses per_cpu allocations of struct device.  So, when a CPU goes
offline and comes back online, in order to ensure that we start from a
clean slate with respect to the MCE subsystem, zero out the entire
per_cpu device structure to 0 before using it.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f22a9f7f6390..29ba3297e480 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&dev->kobj, 0, sizeof(struct kobject));
+	memset(dev, 0, sizeof(struct device));
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
 
-- 
cgit v1.2.3


From 8d973b624ece3b85cfae9474935795d034f72faf Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@gmail.com>
Date: Sun, 15 Jan 2012 19:40:24 -0500
Subject: x86/kprobes: Fix typo transferred from Intel manual

The arch/x86/lib/x86-opcode-map.txt file [used by the
kprobes instruction decoder] contains the line:

  af: SCAS/W/D/Q rAX,Xv

This is what the Intel manuals show, but it's not correct.
The 'X' stands for:

  Memory addressed by the DS:rSI register pair (for example, MOVS, CMPS, OUTS, or LODS).

On the other hand 'Y' means (also see the ae byte entry for
SCASB):

  Memory addressed by the ES:rDI register pair (for example, MOVS, CMPS, INS, STOS, or SCAS).

Signed-off-by: Ulrich Drepper <drepper@gmail.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Link: http://lkml.kernel.org/r/CAOPLpQfytPyDEBF1Hbkpo7ovUerEsstVGxBr%3DEpDL-BKEMaqLA@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/x86-opcode-map.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index a793da5e560e..8641bbb8e006 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -210,7 +210,9 @@ ab: STOS/W/D/Q Yv,rAX
 ac: LODS/B AL,Xb
 ad: LODS/W/D/Q rAX,Xv
 ae: SCAS/B AL,Yb
-af: SCAS/W/D/Q rAX,Xv
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
 # 0xb0 - 0xbf
 b0: MOV AL/R8L,Ib
 b1: MOV CL/R9L,Ib
-- 
cgit v1.2.3


From a1c611745c8c4e8996c1877d4e5d0fc95f227c38 Mon Sep 17 00:00:00 2001
From: "xiyou.wangcong@gmail.com" <xiyou.wangcong@gmail.com>
Date: Sun, 15 Jan 2012 20:02:17 +0800
Subject: x86/kprobes: Add arch/x86/tools/insn_sanity to .gitignore

After compiling the kernel, I got:

	% git status
	# On branch master
	# Untracked files:
	#   (use "git add <file>..." to include in what will be committed)
	#
	#	arch/x86/tools/insn_sanity
	nothing added to commit but untracked files present (use "git add" to track)

it should be added to .gitignore.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Link: http://lkml.kernel.org/r/1326628937-27609-1-git-send-email-xiyou.wangcong@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index 028079065af6..7cab8c08e6d1 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,3 +1,4 @@
 boot/compressed/vmlinux
 tools/test_get_len
+tools/insn_sanity
 
-- 
cgit v1.2.3


From f10448689d95b9516c656ccd4078839e656656e7 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <cbouatmailru@gmail.com>
Date: Wed, 11 Jan 2012 05:11:46 +0400
Subject: x86: Get rid of dubious one-bit signed bitfield

This very noisy sparse warning appears on almost every file in
the kernel:

  CHECK   init/main.c
  arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit
  signed bitfield arch/x86/include/asm/thread_info.h:44:46: error:
  dubious one-bit signed bitfield

Sparse is right and this patch changes sig_on_uaccess_error and
uaccess_err flags to unsigned type and thus fixes the warning.

Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
Acked-by: Andy Lutomirski <luto@mit.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Dan Carpenter <error27@gmail.com>
Link: http://lkml.kernel.org/r/20120111011146.GA30428@oksana.dev.rtsoft.ru
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 185b719ec61a..56a63ff7665e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,8 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			sig_on_uaccess_error:1;
-	int			uaccess_err:1;	/* uaccess failed */
+	unsigned int		sig_on_uaccess_error:1;
+	unsigned int		uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
-- 
cgit v1.2.3


From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 16 Jan 2012 14:40:28 -0800
Subject: mce: fix warning messages about static struct mce_device

When suspending, there was a large list of warnings going something like:

	Device 'machinecheck1' does not have a release() function, it is broken and must be fixed

This patch turns the static mce_devices into dynamically allocated, and
properly frees them when they are removed from the system.  It solves
the warning messages on my laptop here.

Reported-by: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Djalal Harouni <tixxdz@opendz.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@amd64.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mce.h           |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c     | 18 ++++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++-------
 3 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f35ce43c1a77..6aefb14cbbc5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct device, mce_device);
+extern struct device *mce_device[CONFIG_NR_CPUS];
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 29ba3297e480..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = {
 	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct device, mce_device);
+struct device *mce_device[CONFIG_NR_CPUS];
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = {
 
 static cpumask_var_t mce_device_initialized;
 
+static void mce_device_release(struct device *dev)
+{
+	kfree(dev);
+}
+
 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
 static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev;
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(dev, 0, sizeof(struct device));
+	dev = kzalloc(sizeof *dev, GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
+	dev->release = &mce_device_release;
 
 	err = device_register(dev);
 	if (err)
@@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = dev;
 
 	return 0;
 error2:
@@ -2046,7 +2055,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev = mce_device[cpu];
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ba0b94a7e204..786e76a86322 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
+	struct device *dev = mce_device[cpu];
 	char name[32];
 
 	sprintf(name, "threshold_bank%i", bank);
@@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
-					b->kobj, name);
+		err = sysfs_create_link(&dev->kobj, b->kobj, name);
 		if (err)
 			goto out;
 
@@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &dev->kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
-					b->kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			err = sysfs_create_link(&dev->kobj,b->kobj, name);
 		if (err)
 			goto out;
 
@@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	struct threshold_bank *b;
+	struct device *dev;
 	char name[32];
 	int i = 0;
 
@@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
+		sysfs_remove_link(&mce_device[cpu]->kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
-- 
cgit v1.2.3


From da87c937e5a2374686edd58df06cfd5050b125fa Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:17:50 -0600
Subject: x86/UV2: Fix new UV2 hardware by using native UV2 broadcast mode

Update the use of the Broadcast Assist Unit on SGI Altix UV2 to
the use of native UV2 mode on new hardware (not the legacy mode).

UV2 native mode has a different format for a broadcast message.
We also need quick differentiaton between UV1 and UV2.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116211750.GA5767@sgi.com
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h | 93 +++++++++++++++++++++++++++++++++++++---
 arch/x86/platform/uv/tlb_uv.c    | 88 +++++++++++++++++++++++++++----------
 2 files changed, 151 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 8e862aaf0d90..4a46b27ee9a0 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -65,7 +65,7 @@
  * UV2: Bit 19 selects between
  *  (0): 10 microsecond timebase and
  *  (1): 80 microseconds
- *  we're using 655us, similar to UV1: 65 units of 10us
+ *  we're using 560us, similar to UV1: 65 units of 10us
  */
 #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
 #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
@@ -235,10 +235,10 @@ struct bau_msg_payload {
 
 
 /*
- * Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * UV1 Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
-struct bau_msg_header {
+struct uv1_bau_msg_header {
 	unsigned int	dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
 	unsigned int	base_dest_nasid:15;	/* nasid of the first bit */
@@ -317,20 +317,88 @@ struct bau_msg_header {
 	/* bits 127:107 */
 };
 
+/*
+ * UV2 Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see figure 9-2 of harp_sys.pdf
+ */
+struct uv2_bau_msg_header {
+	unsigned int	base_dest_nasid:15;	/* nasid of the first bit */
+	/* bits 14:0 */				/* in uvhub map */
+	unsigned int	dest_subnodeid:5;	/* must be 0x10, for the LB */
+	/* bits 19:15 */
+	unsigned int	rsvd_1:1;		/* must be zero */
+	/* bit 20 */
+	/* Address bits 59:21 */
+	/* bits 25:2 of address (44:21) are payload */
+	/* these next 24 bits become bytes 12-14 of msg */
+	/* bits 28:21 land in byte 12 */
+	unsigned int	replied_to:1;		/* sent as 0 by the source to
+						   byte 12 */
+	/* bit 21 */
+	unsigned int	msg_type:3;		/* software type of the
+						   message */
+	/* bits 24:22 */
+	unsigned int	canceled:1;		/* message canceled, resource
+						   is to be freed*/
+	/* bit 25 */
+	unsigned int	payload_1:3;		/* not currently used */
+	/* bits 28:26 */
+
+	/* bits 36:29 land in byte 13 */
+	unsigned int	payload_2a:3;		/* not currently used */
+	unsigned int	payload_2b:5;		/* not currently used */
+	/* bits 36:29 */
+
+	/* bits 44:37 land in byte 14 */
+	unsigned int	payload_3:8;		/* not currently used */
+	/* bits 44:37 */
+
+	unsigned int	rsvd_2:7;		/* reserved */
+	/* bits 51:45 */
+	unsigned int	swack_flag:1;		/* software acknowledge flag */
+	/* bit 52 */
+	unsigned int	rsvd_3a:3;		/* must be zero */
+	unsigned int	rsvd_3b:8;		/* must be zero */
+	unsigned int	rsvd_3c:8;		/* must be zero */
+	unsigned int	rsvd_3d:3;		/* must be zero */
+	/* bits 74:53 */
+	unsigned int	fairness:3;		/* usually zero */
+	/* bits 77:75 */
+
+	unsigned int	sequence:16;		/* message sequence number */
+	/* bits 93:78  Suppl_A  */
+	unsigned int	chaining:1;		/* next descriptor is part of
+						   this activation*/
+	/* bit 94 */
+	unsigned int	multilevel:1;		/* multi-level multicast
+						   format */
+	/* bit 95 */
+	unsigned int	rsvd_4:24;		/* ordered / source node /
+						   source subnode / aging
+						   must be zero */
+	/* bits 119:96 */
+	unsigned int	command:8;		/* message type */
+	/* bits 127:120 */
+};
+
 /*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct pnmask			distribution;
+	struct pnmask				distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
-	struct bau_msg_header		header;
-	struct bau_msg_payload		payload;
+	union bau_msg_header {
+		struct uv1_bau_msg_header	uv1_hdr;
+		struct uv2_bau_msg_header	uv2_hdr;
+	} header;
+
+	struct bau_msg_payload			payload;
 };
-/*
+/* UV1:
  *   -payload--    ---------header------
  *   bytes 0-11    bits 41-56  bits 58-81
  *       A           B  (2)      C (3)
@@ -340,6 +408,16 @@ struct bau_desc {
  *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
  *   ------------payload queue-----------
  */
+/* UV2:
+ *   -payload--    ---------header------
+ *   bytes 0-11    bits 70-78  bits 21-44
+ *       A           B  (2)      C (3)
+ *
+ *            A/B/C are moved to:
+ *       A            C          B
+ *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
+ *   ------------payload queue-----------
+ */
 
 /*
  * The payload queue on the destination side is an array of these.
@@ -511,6 +589,7 @@ struct bau_control {
 	short			osnode;
 	short			uvhub_cpu;
 	short			uvhub;
+	short			uvhub_version;
 	short			cpus_in_socket;
 	short			cpus_in_uvhub;
 	short			partition_base_pnode;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 5b552198f774..1341a2e06542 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -573,7 +573,7 @@ static int wait_completion(struct bau_desc *bau_desc,
 		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
 	}
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1)
 		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
 								bcp, try);
 	else
@@ -757,15 +757,22 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 {
 	int seq_number = 0;
 	int completion_stat = 0;
+	int uv1 = 0;
 	long try = 0;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster = bcp->uvhub_master;
+	struct uv1_bau_msg_header *uv1_hdr = NULL;
+	struct uv2_bau_msg_header *uv2_hdr = NULL;
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1) {
+		uv1 = 1;
 		uv1_throttle(hmaster, stat);
+		uv1_hdr = &bau_desc->header.uv1_hdr;
+	} else
+		uv2_hdr = &bau_desc->header.uv2_hdr;
 
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
@@ -773,14 +780,23 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	time1 = get_cycles();
 	do {
 		if (try == 0) {
-			bau_desc->header.msg_type = MSG_REGULAR;
+			if (uv1)
+				uv1_hdr->msg_type = MSG_REGULAR;
+			else
+				uv2_hdr->msg_type = MSG_REGULAR;
 			seq_number = bcp->message_number++;
 		} else {
-			bau_desc->header.msg_type = MSG_RETRY;
+			if (uv1)
+				uv1_hdr->msg_type = MSG_RETRY;
+			else
+				uv2_hdr->msg_type = MSG_RETRY;
 			stat->s_retry_messages++;
 		}
 
-		bau_desc->header.sequence = seq_number;
+		if (uv1)
+			uv1_hdr->sequence = seq_number;
+		else
+			uv2_hdr->sequence = seq_number;
 		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
 		bcp->send_message = get_cycles();
 
@@ -967,7 +983,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
+	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
@@ -1083,7 +1099,7 @@ static void __init enable_timeouts(void)
 		 */
 		mmr_image |= (1L << SOFTACK_MSHIFT);
 		if (is_uv2_hub()) {
-			mmr_image |= (1L << UV2_LEG_SHFT);
+			mmr_image &= ~(1L << UV2_LEG_SHFT);
 			mmr_image |= (1L << UV2_EXT_SHFT);
 		}
 		write_mmr_misc_control(pnode, mmr_image);
@@ -1432,12 +1448,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 {
 	int i;
 	int cpu;
+	int uv1 = 0;
 	unsigned long gpa;
 	unsigned long m;
 	unsigned long n;
 	size_t dsize;
 	struct bau_desc *bau_desc;
 	struct bau_desc *bd2;
+	struct uv1_bau_msg_header *uv1_hdr;
+	struct uv2_bau_msg_header *uv2_hdr;
 	struct bau_control *bcp;
 
 	/*
@@ -1451,6 +1470,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 	gpa = uv_gpa(bau_desc);
 	n = uv_gpa_to_gnode(gpa);
 	m = uv_gpa_to_offset(gpa);
+	if (is_uv1_hub())
+		uv1 = 1;
 
 	/* the 14-bit pnode */
 	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
@@ -1461,21 +1482,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 	 */
 	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
 		memset(bd2, 0, sizeof(struct bau_desc));
-		bd2->header.swack_flag =	1;
-		/*
-		 * The base_dest_nasid set in the message header is the nasid
-		 * of the first uvhub in the partition. The bit map will
-		 * indicate destination pnode numbers relative to that base.
-		 * They may not be consecutive if nasid striding is being used.
-		 */
-		bd2->header.base_dest_nasid =	UV_PNODE_TO_NASID(base_pnode);
-		bd2->header.dest_subnodeid =	UV_LB_SUBNODEID;
-		bd2->header.command =		UV_NET_ENDPOINT_INTD;
-		bd2->header.int_both =		1;
-		/*
-		 * all others need to be set to zero:
-		 *   fairness chaining multilevel count replied_to
-		 */
+		if (uv1) {
+			uv1_hdr = &bd2->header.uv1_hdr;
+			uv1_hdr->swack_flag =	1;
+			/*
+			 * The base_dest_nasid set in the message header
+			 * is the nasid of the first uvhub in the partition.
+			 * The bit map will indicate destination pnode numbers
+			 * relative to that base. They may not be consecutive
+			 * if nasid striding is being used.
+			 */
+			uv1_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv1_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
+			uv1_hdr->command =		UV_NET_ENDPOINT_INTD;
+			uv1_hdr->int_both =		1;
+			/*
+			 * all others need to be set to zero:
+			 *   fairness chaining multilevel count replied_to
+			 */
+		} else {
+			uv2_hdr = &bd2->header.uv2_hdr;
+			uv2_hdr->swack_flag =	1;
+			uv2_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv2_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
+			uv2_hdr->command =		UV_NET_ENDPOINT_INTD;
+		}
 	}
 	for_each_present_cpu(cpu) {
 		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
@@ -1728,6 +1761,14 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
 		bcp->cpus_in_socket = sdp->num_cpus;
 		bcp->socket_master = *smasterp;
 		bcp->uvhub = bdp->uvhub;
+		if (is_uv1_hub())
+			bcp->uvhub_version = 1;
+		else if (is_uv2_hub())
+			bcp->uvhub_version = 2;
+		else {
+			printk(KERN_EMERG "uvhub version not 1 or 2\n");
+			return 1;
+		}
 		bcp->uvhub_master = *hmasterp;
 		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
 		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
@@ -1867,7 +1908,8 @@ static int __init uv_bau_init(void)
 			val = 1L << 63;
 			write_gmmr_activation(pnode, val);
 			mmr = 1; /* should be 1 to broadcast to both sockets */
-			write_mmr_data_broadcast(pnode, mmr);
+			if (!is_uv1_hub())
+				write_mmr_data_broadcast(pnode, mmr);
 		}
 	}
 
-- 
cgit v1.2.3


From d059f9fa84a30e04279c6ff615e9e2cf3b260191 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:18:48 -0600
Subject: x86/UV2: Fix BAU destination timeout initialization

Move the call to enable_timeouts() forward so that
BAU_MISC_CONTROL is initialized before using it in
calculate_destination_timeout().

Fix the calculation of a BAU destination timeout
for UV2 (in calculate_destination_timeout()).

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116211848.GB5767@sgi.com
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/uv/tlb_uv.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 1341a2e06542..c425ff1a9cc3 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1617,14 +1617,14 @@ static int calculate_destination_timeout(void)
 		ts_ns = base * mult1 * mult2;
 		ret = ts_ns / 1000;
 	} else {
-		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
-		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
+		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
 		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
 		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
-			mult1 = 80;
+			base = 80;
 		else
-			mult1 = 10;
-		base = mmr_image & UV2_ACK_MASK;
+			base = 10;
+		mult1 = mmr_image & UV2_ACK_MASK;
 		ret = mult1 * base;
 	}
 	return ret;
@@ -1886,6 +1886,8 @@ static int __init uv_bau_init(void)
 			uv_base_pnode = uv_blade_to_pnode(uvhub);
 	}
 
+	enable_timeouts();
+
 	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 		nobau = 1;
 		return 0;
@@ -1896,7 +1898,6 @@ static int __init uv_bau_init(void)
 		if (uv_blade_nr_possible_cpus(uvhub))
 			init_uvhub(uvhub, vector, uv_base_pnode);
 
-	enable_timeouts();
 	alloc_intr_gate(vector, uv_bau_message_intr1);
 
 	for_each_possible_blade(uvhub) {
-- 
cgit v1.2.3


From c5d35d399e685acccc85a675e8765c26b2a9813a Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:19:47 -0600
Subject: x86/UV2: Work around BAU bug

This patch implements a workaround for a UV2 hardware bug.
The bug is a non-atomic update of a memory-mapped register. When
hardware message delivery and software message acknowledge occur
simultaneously the pending message acknowledge for the arriving
message may be lost.  This causes the sender's message status to
stay busy.

Part of the workaround is to not acknowledge a completed message
until it is verified that no other message is actually using the
resource that is mistakenly recorded in the completed message.

Part of the workaround is to test for long elapsed time in such
a busy condition, then handle it by using a spare sending
descriptor. The stay-busy condition is eventually timed out by
hardware, and then the original sending descriptor can be
re-used. Most of that logic change is in keeping track of the
current descriptor and the state of the spares.

The occurrences of the workaround are added to the BAU
statistics.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116211947.GC5767@sgi.com
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  13 +-
 arch/x86/platform/uv/tlb_uv.c    | 274 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 254 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 4a46b27ee9a0..1b82f7e87393 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -167,6 +167,7 @@
 #define FLUSH_RETRY_TIMEOUT		2
 #define FLUSH_GIVEUP			3
 #define FLUSH_COMPLETE			4
+#define FLUSH_RETRY_BUSYBUG		5
 
 /*
  * tuning the action when the numalink network is extremely delayed
@@ -463,7 +464,6 @@ struct bau_pq_entry {
 struct msg_desc {
 	struct bau_pq_entry	*msg;
 	int			msg_slot;
-	int			swack_slot;
 	struct bau_pq_entry	*queue_first;
 	struct bau_pq_entry	*queue_last;
 };
@@ -517,6 +517,9 @@ struct ptc_stats {
 	unsigned long	s_retry_messages;	/* retry broadcasts */
 	unsigned long	s_bau_reenabled;	/* for bau enable/disable */
 	unsigned long	s_bau_disabled;		/* for bau enable/disable */
+	unsigned long	s_uv2_wars;		/* uv2 workaround, perm. busy */
+	unsigned long	s_uv2_wars_hw;		/* uv2 workaround, hiwater */
+	unsigned long	s_uv2_war_waits;	/* uv2 workaround, long waits */
 	/* destination statistics */
 	unsigned long	d_alltlb;		/* times all tlb's on this
 						   cpu were flushed */
@@ -593,6 +596,8 @@ struct bau_control {
 	short			cpus_in_socket;
 	short			cpus_in_uvhub;
 	short			partition_base_pnode;
+	short			using_desc; /* an index, like uvhub_cpu */
+	unsigned int		inuse_map;
 	unsigned short		message_number;
 	unsigned short		uvhub_quiesce;
 	short			socket_acknowledge_count[DEST_Q_SIZE];
@@ -610,6 +615,7 @@ struct bau_control {
 	int			cong_response_us;
 	int			cong_reps;
 	int			cong_period;
+	unsigned long		clocks_per_100_usec;
 	cycles_t		period_time;
 	long			period_requests;
 	struct hub_and_pnode	*thp;
@@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr)
 	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
 }
 
+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
 static inline unsigned long read_mmr_sw_ack(void)
 {
 	return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c425ff1a9cc3..9010ca715c03 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	unsigned long dw;
 	struct bau_pq_entry *msg;
 
 	msg = mdp->msg;
-	if (!msg->canceled) {
+	if (!msg->canceled && do_acknowledge) {
 		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
 		write_mmr_sw_ack(dw);
 	}
@@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
 			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
 				unsigned long mr;
 				/*
-				 * is the resource timed out?
-				 * make everyone ignore the cancelled message.
+				 * Is the resource timed out?
+				 * Make everyone ignore the cancelled message.
 				 */
 				msg2->canceled = 1;
 				stat->d_canceled++;
@@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
  * Do all the things a cpu should do for a TLB shootdown message.
  * Other cpu's may come here at the same time for this message.
  */
-static void bau_process_message(struct msg_desc *mdp,
-					struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	short socket_ack_count = 0;
 	short *sp;
@@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp,
 		if (msg_ack_count == bcp->cpus_in_uvhub) {
 			/*
 			 * All cpus in uvhub saw it; reply
+			 * (unless we are in the UV2 workaround)
 			 */
-			reply_to_message(mdp, bcp);
+			reply_to_message(mdp, bcp, do_acknowledge);
 		}
 	}
 
@@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
 /*
  * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
  */
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
 {
 	unsigned long descriptor_status;
 	unsigned long descriptor_status2;
 
 	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
-	descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+	descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
 	descriptor_status = (descriptor_status << 1) | descriptor_status2;
 	return descriptor_status;
 }
 
+/*
+ * Return whether the status of the descriptor that is normally used for this
+ * cpu (the one indexed by its hub-relative cpu number) is busy.
+ * The status of the original 32 descriptors is always reflected in the 64
+ * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
+ * The bit provided by the activation_status_2 register is irrelevant to
+ * the status if it is only being tested for busy or not busy.
+ */
+int normal_busy(struct bau_control *bcp)
+{
+	int cpu = bcp->uvhub_cpu;
+	int mmr_offset;
+	int right_shift;
+
+	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+	right_shift = cpu * UV_ACT_STATUS_SIZE;
+	return (((((read_lmmr(mmr_offset) >> right_shift) &
+				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Workaround the bug.
+ */
+int handle_uv2_busy(struct bau_control *bcp)
+{
+	int busy_one = bcp->using_desc;
+	int normal = bcp->uvhub_cpu;
+	int selected = -1;
+	int i;
+	unsigned long descriptor_status;
+	unsigned long status;
+	int mmr_offset;
+	struct bau_desc *bau_desc_old;
+	struct bau_desc *bau_desc_new;
+	struct bau_control *hmaster = bcp->uvhub_master;
+	struct ptc_stats *stat = bcp->statp;
+	cycles_t ttm;
+
+	stat->s_uv2_wars++;
+	spin_lock(&hmaster->uvhub_lock);
+	/* try for the original first */
+	if (busy_one != normal) {
+		if (!normal_busy(bcp))
+			selected = normal;
+	}
+	if (selected < 0) {
+		/* can't use the normal, select an alternate */
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		descriptor_status = read_lmmr(mmr_offset);
+
+		/* scan available descriptors 32-63 */
+		for (i = 0; i < UV_CPUS_PER_AS; i++) {
+			if ((hmaster->inuse_map & (1 << i)) == 0) {
+				status = ((descriptor_status >>
+						(i * UV_ACT_STATUS_SIZE)) &
+						UV_ACT_STATUS_MASK) << 1;
+				if (status != UV2H_DESC_BUSY) {
+					selected = i + UV_CPUS_PER_AS;
+					break;
+				}
+			}
+		}
+	}
+
+	if (busy_one != normal)
+		/* mark the busy alternate as not in-use */
+		hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
+
+	if (selected >= 0) {
+		/* switch to the selected descriptor */
+		if (selected != normal) {
+			/* set the selected alternate as in-use */
+			hmaster->inuse_map |=
+					(1 << (selected - UV_CPUS_PER_AS));
+			if (selected > stat->s_uv2_wars_hw)
+				stat->s_uv2_wars_hw = selected;
+		}
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * busy_one);
+		bcp->using_desc = selected;
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * selected);
+		*bau_desc_new = *bau_desc_old;
+	} else {
+		/*
+		 * All are busy. Wait for the normal one for this cpu to
+		 * free up.
+		 */
+		stat->s_uv2_war_waits++;
+		spin_unlock(&hmaster->uvhub_lock);
+		ttm = get_cycles();
+		do {
+			cpu_relax();
+		} while (normal_busy(bcp));
+		spin_lock(&hmaster->uvhub_lock);
+		/* switch to the original descriptor */
+		bcp->using_desc = normal;
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
+		bcp->using_desc = (ITEMS_PER_DESC * normal);
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * normal);
+		*bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
+	}
+	spin_unlock(&hmaster->uvhub_lock);
+	return FLUSH_RETRY_BUSYBUG;
+}
+
 static int uv2_wait_completion(struct bau_desc *bau_desc,
 				unsigned long mmr_offset, int right_shift,
 				struct bau_control *bcp, long try)
 {
 	unsigned long descriptor_stat;
 	cycles_t ttm;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
+	long busy_reps = 0;
 	struct ptc_stats *stat = bcp->statp;
 
-	descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
+			busy_reps++;
+			if (busy_reps > 1000000) {
+				/* not to hammer on the clock */
+				busy_reps = 0;
+				ttm = get_cycles();
+				if ((ttm - bcp->send_message) >
+					(bcp->clocks_per_100_usec)) {
+					return handle_uv2_busy(bcp);
+				}
+			}
 			/*
 			 * descriptor_stat is still BUSY
 			 */
 			cpu_relax();
 		}
-		descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+		descriptor_stat = uv2_read_status(mmr_offset, right_shift,
+									desc);
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
@@ -563,14 +687,14 @@ static int wait_completion(struct bau_desc *bau_desc,
 {
 	int right_shift;
 	unsigned long mmr_offset;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
 
-	if (cpu < UV_CPUS_PER_AS) {
+	if (desc < UV_CPUS_PER_AS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-		right_shift = cpu * UV_ACT_STATUS_SIZE;
+		right_shift = desc * UV_ACT_STATUS_SIZE;
 	} else {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
 	}
 
 	if (bcp->uvhub_version == 1)
@@ -752,8 +876,7 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
  * Returns 1 if it gives up entirely and the original cpu mask is to be
  * returned to the kernel.
  */
-int uv_flush_send_and_wait(struct bau_desc *bau_desc,
-			struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 {
 	int seq_number = 0;
 	int completion_stat = 0;
@@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	struct bau_control *hmaster = bcp->uvhub_master;
 	struct uv1_bau_msg_header *uv1_hdr = NULL;
 	struct uv2_bau_msg_header *uv2_hdr = NULL;
+	struct bau_desc *bau_desc;
 
-	if (bcp->uvhub_version == 1) {
-		uv1 = 1;
+	if (bcp->uvhub_version == 1)
 		uv1_throttle(hmaster, stat);
-		uv1_hdr = &bau_desc->header.uv1_hdr;
-	} else
-		uv2_hdr = &bau_desc->header.uv2_hdr;
 
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 
 	time1 = get_cycles();
 	do {
-		if (try == 0) {
+		bau_desc = bcp->descriptor_base;
+		bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+		if (bcp->uvhub_version == 1) {
+			uv1 = 1;
+			uv1_hdr = &bau_desc->header.uv1_hdr;
+		} else
+			uv2_hdr = &bau_desc->header.uv2_hdr;
+		if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
 			if (uv1)
 				uv1_hdr->msg_type = MSG_REGULAR;
 			else
@@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			uv1_hdr->sequence = seq_number;
 		else
 			uv2_hdr->sequence = seq_number;
-		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+		index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
 		bcp->send_message = get_cycles();
 
 		write_mmr_activation(index);
 
 		try++;
 		completion_stat = wait_completion(bau_desc, bcp, try);
+		/* UV2: wait_completion() may change the bcp->using_desc */
 
 		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
 
@@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 		}
 		cpu_relax();
 	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+		 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
 		 (completion_stat == FLUSH_RETRY_TIMEOUT));
 
 	time2 = get_cycles();
@@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	record_send_stats(time1, time2, bcp, stat, completion_stat, try);
 
 	if (completion_stat == FLUSH_GIVEUP)
+		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
 		return 1;
 	return 0;
 }
@@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
+	bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
@@ -996,12 +1126,85 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
 	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+	if (!uv_flush_send_and_wait(flush_mask, bcp))
 		return NULL;
 	else
 		return cpumask;
 }
 
+/*
+ * Search the message queue for any 'other' message with the same software
+ * acknowledge resource bit vector.
+ */
+struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+			struct bau_control *bcp, unsigned char swack_vec)
+{
+	struct bau_pq_entry *msg_next = msg + 1;
+
+	if (msg_next > bcp->queue_last)
+		msg_next = bcp->queue_first;
+	while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
+		if (msg_next->swack_vec == swack_vec)
+			return msg_next;
+		msg_next++;
+		if (msg_next > bcp->queue_last)
+			msg_next = bcp->queue_first;
+	}
+	return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * Such a message must be ignored.
+ */
+void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+	unsigned long mmr_image;
+	unsigned char swack_vec;
+	struct bau_pq_entry *msg = mdp->msg;
+	struct bau_pq_entry *other_msg;
+
+	mmr_image = read_mmr_sw_ack();
+	swack_vec = msg->swack_vec;
+
+	if ((swack_vec & mmr_image) == 0) {
+		/*
+		 * This message was assigned a swack resource, but no
+		 * reserved acknowlegment is pending.
+		 * The bug has prevented this message from setting the MMR.
+		 * And no other message has used the same sw_ack resource.
+		 * Do the requested shootdown but do not reply to the msg.
+		 * (the 0 means make no acknowledge)
+		 */
+		bau_process_message(mdp, bcp, 0);
+		return;
+	}
+
+	/*
+	 * Some message has set the MMR 'pending' bit; it might have been
+	 * another message.  Look for that message.
+	 */
+	other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
+	if (other_msg) {
+		/* There is another.  Do not ack the current one. */
+		bau_process_message(mdp, bcp, 0);
+		/*
+		 * Let the natural processing of that message acknowledge
+		 * it. Don't get the processing of sw_ack's out of order.
+		 */
+		return;
+	}
+
+	/*
+	 * There is no other message using this sw_ack, so it is safe to
+	 * acknowledge it.
+	 */
+	bau_process_message(mdp, bcp, 1);
+
+	return;
+}
+
 /*
  * The BAU message interrupt comes here. (registered by set_intr_gate)
  * See entry_64.S
@@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 		count++;
 
 		msgdesc.msg_slot = msg - msgdesc.queue_first;
-		msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
 		msgdesc.msg = msg;
-		bau_process_message(&msgdesc, bcp);
+		if (bcp->uvhub_version == 2)
+			process_uv2_message(&msgdesc, bcp);
+		else
+			bau_process_message(&msgdesc, bcp, 1);
 
 		msg++;
 		if (msg > msgdesc.queue_last)
@@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 			"all one mult none retry canc nocan reset rcan ");
 		seq_printf(file,
-			"disable enable\n");
+			"disable enable wars warshw warwaits\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
@@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			   stat->d_nocanceled, stat->d_resets,
 			   stat->d_rcanceled);
-		seq_printf(file, "%ld %ld\n",
-			stat->s_bau_disabled, stat->s_bau_reenabled);
+		seq_printf(file, "%ld %ld %ld %ld %ld\n",
+			stat->s_bau_disabled, stat->s_bau_reenabled,
+			stat->s_uv2_wars, stat->s_uv2_wars_hw,
+			stat->s_uv2_war_waits);
 	}
 	return 0;
 }
@@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode)
 	write_mmr_payload_first(pnode, pn_first);
 	write_mmr_payload_tail(pnode, first);
 	write_mmr_payload_last(pnode, last);
+	write_gmmr_sw_ack(pnode, 0xffffUL);
 
 	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables(void)
 		bcp->cong_response_us		= congested_respns_us;
 		bcp->cong_reps			= congested_reps;
 		bcp->cong_period		= congested_period;
+		bcp->clocks_per_100_usec =	usec_2_cycles(100);
 	}
 }
 
@@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
 		}
 		bcp->uvhub_master = *hmasterp;
 		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		bcp->using_desc = bcp->uvhub_cpu;
 		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
 			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
 				bcp->uvhub_cpu);
-- 
cgit v1.2.3


From 478c6e529e7bd7c6ef8994c55bd252c287c35893 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:20:50 -0600
Subject: x86/UV2: Remove stale no-resources test for UV2 BAU

This patch removes an unnecessary test for a
no-destination-resources-available condition that looks like a
destination timeout in UV1, but is separately distinguishable in
UV2.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116212050.GD5767@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/uv/tlb_uv.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 9010ca715c03..affea509c174 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -642,16 +642,6 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 			stat->s_dtimeout++;
 			ttm = get_cycles();
-			/*
-			 * Our retries may be blocked by all destination
-			 * swack resources being consumed, and a timeout
-			 * pending.  In that case hardware returns the
-			 * ERROR that looks like a destination timeout.
-			 */
-			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
-				bcp->conseccompletes = 0;
-				return FLUSH_RETRY_PLUGGED;
-			}
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
-- 
cgit v1.2.3


From 88ed9dd7f63c3ae71c1984d99ee2dced0b386dea Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:21:46 -0600
Subject: x86/UV2: Ack BAU interrupt earlier

This patch moves the ack of the BAU interrupt to the beginning
of  the interrupt handler so that there is less possibility of a
lost interrupt and slower response to a shootdown message.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116212146.GE5767@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/uv/tlb_uv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index affea509c174..4686bf1e56ec 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1218,6 +1218,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 	struct ptc_stats *stat;
 	struct msg_desc msgdesc;
 
+	ack_APIC_irq();
 	time_start = get_cycles();
 
 	bcp = &per_cpu(bau_control, smp_processor_id());
@@ -1247,8 +1248,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 		stat->d_nomsg++;
 	else if (count > 1)
 		stat->d_multmsg++;
-
-	ack_APIC_irq();
 }
 
 /*
-- 
cgit v1.2.3


From b54bd9be35f4084edb3eb9ee054a43f722a67483 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Mon, 16 Jan 2012 15:22:38 -0600
Subject: x86/UV2: Add accounting for BAU strong nacks

This patch adds separate accounting of UV2 message "strong
nack's" in the BAU statistics.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Link: http://lkml.kernel.org/r/20120116212238.GF5767@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  1 +
 arch/x86/platform/uv/tlb_uv.c    | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 1b82f7e87393..becf47b81735 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -483,6 +483,7 @@ struct ptc_stats {
 						   requests */
 	unsigned long	s_stimeout;		/* source side timeouts */
 	unsigned long	s_dtimeout;		/* destination side timeouts */
+	unsigned long	s_strongnacks;		/* number of strong nack's */
 	unsigned long	s_time;			/* time spent in sending side */
 	unsigned long	s_retriesok;		/* successful retries */
 	unsigned long	s_ntargcpu;		/* total number of cpu's
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 4686bf1e56ec..9be4cff00a2d 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -635,13 +635,15 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 		 * our message and its state will stay IDLE.
 		 */
 		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
-		    (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
 		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
+		} else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
+			stat->s_strongnacks++;
+			bcp->conseccompletes = 0;
+			return FLUSH_GIVEUP;
 		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 			stat->s_dtimeout++;
-			ttm = get_cycles();
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
@@ -1346,7 +1348,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-			"numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
+		    "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
 		seq_printf(file,
 			"resetp resett giveup sto bz throt swack recv rtime ");
 		seq_printf(file,
@@ -1364,10 +1366,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 			   stat->s_ntargremotes, stat->s_ntargcpu,
 			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
 			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
-		seq_printf(file, "%ld %ld %ld %ld %ld ",
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
 			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
 			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
-			   stat->s_dtimeout);
+			   stat->s_dtimeout, stat->s_strongnacks);
 		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
 			   stat->s_retry_messages, stat->s_retriesok,
 			   stat->s_resets_plug, stat->s_resets_timeout,
-- 
cgit v1.2.3


From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 8 Dec 2011 11:25:49 +0800
Subject: ACPI, Record ACPI NVS regions

Some firmware will access memory in ACPI NVS region via APEI.  That
is, instructions in APEI ERST/EINJ table will read/write ACPI NVS
region.  The original resource conflict checking in APEI code will
check memory/ioport accessed by APEI via general resource management
mechanism.  But ACPI NVS region is marked as busy already, so that the
false resource conflict will prevent APEI ERST/EINJ to work.

To fix this, this patch record ACPI NVS regions, so that we can avoid
request resources for memory region inside it.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/e820.c |  4 ++--
 drivers/acpi/Makefile  |  3 ++-
 drivers/acpi/nvs.c     | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/acpi.h   | 20 +++++++++++++------
 4 files changed, 70 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..51c3b186e5b9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
-#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ACPI
 /**
  * Mark ACPI NVS memory region, so that we can save/restore it during
  * hibernation and the subsequent resume.
@@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void)
 		struct e820entry *ei = &e820.map[i];
 
 		if (ei->type == E820_NVS)
-			suspend_nvs_register(ei->addr, ei->size);
+			acpi_nvs_register(ei->addr, ei->size);
 	}
 
 	return 0;
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index ecb26b4f29a0..c07f44f05f9d 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -20,11 +20,12 @@ obj-y				+= acpi.o \
 # All the builtin files are in the "acpi." module_param namespace.
 acpi-y				+= osl.o utils.o reboot.o
 acpi-y				+= atomicio.o
+acpi-y				+= nvs.o
 
 # sleep related files
 acpi-y				+= wakeup.o
 acpi-y				+= sleep.o
-acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o nvs.o
+acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o
 
 
 #
diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c
index 096787b43c96..7a2035fa8c71 100644
--- a/drivers/acpi/nvs.c
+++ b/drivers/acpi/nvs.c
@@ -15,6 +15,56 @@
 #include <linux/acpi_io.h>
 #include <acpi/acpiosxf.h>
 
+/* ACPI NVS regions, APEI may use it */
+
+struct nvs_region {
+	__u64 phys_start;
+	__u64 size;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_region_list);
+
+#ifdef CONFIG_ACPI_SLEEP
+static int suspend_nvs_register(unsigned long start, unsigned long size);
+#else
+static inline int suspend_nvs_register(unsigned long a, unsigned long b)
+{
+	return 0;
+}
+#endif
+
+int acpi_nvs_register(__u64 start, __u64 size)
+{
+	struct nvs_region *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+	region->phys_start = start;
+	region->size = size;
+	list_add_tail(&region->node, &nvs_region_list);
+
+	return suspend_nvs_register(start, size);
+}
+
+int acpi_nvs_for_each_region(int (*func)(__u64 start, __u64 size, void *data),
+			     void *data)
+{
+	int rc;
+	struct nvs_region *region;
+
+	list_for_each_entry(region, &nvs_region_list, node) {
+		rc = func(region->phys_start, region->size, data);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+
+#ifdef CONFIG_ACPI_SLEEP
 /*
  * Platforms, like ACPI, may want us to save some memory used by them during
  * suspend and to restore the contents of this memory during the subsequent
@@ -41,7 +91,7 @@ static LIST_HEAD(nvs_list);
  *	things so that the data from page-aligned addresses in this region will
  *	be copied into separate RAM pages.
  */
-int suspend_nvs_register(unsigned long start, unsigned long size)
+static int suspend_nvs_register(unsigned long start, unsigned long size)
 {
 	struct nvs_page *entry, *next;
 
@@ -159,3 +209,4 @@ void suspend_nvs_restore(void)
 		if (entry->data)
 			memcpy(entry->kaddr, entry->data, entry->size);
 }
+#endif
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6001b4da39dd..26b75442ff7a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -306,6 +306,11 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
 					     u32 *mask, u32 req);
 extern void acpi_early_init(void);
 
+extern int acpi_nvs_register(__u64 start, __u64 size);
+
+extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
+				    void *data);
+
 #else	/* !CONFIG_ACPI */
 
 #define acpi_disabled 1
@@ -348,15 +353,18 @@ static inline int acpi_table_parse(char *id,
 {
 	return -1;
 }
-#endif	/* !CONFIG_ACPI */
 
-#ifdef CONFIG_ACPI_SLEEP
-int suspend_nvs_register(unsigned long start, unsigned long size);
-#else
-static inline int suspend_nvs_register(unsigned long a, unsigned long b)
+static inline int acpi_nvs_register(__u64 start, __u64 size)
 {
 	return 0;
 }
-#endif
+
+static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
+					   void *data)
+{
+	return 0;
+}
+
+#endif	/* !CONFIG_ACPI */
 
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3


From cd298f60a2451a16e0f077404bf69b62ec868733 Mon Sep 17 00:00:00 2001
From: Kurt Garloff <kurt@garloff.de>
Date: Tue, 17 Jan 2012 04:20:31 -0500
Subject: ACPI, x86: Use SRAT table rev to use 8bit or 32bit PXM fields
 (x86/x86-64)

In SRAT v1, we had 8bit proximity domain (PXM) fields; SRAT v2 provides
32bits for these. The new fields were reserved before.
According to the ACPI spec, the OS must disregrard reserved fields.

x86/x86-64 was rather inconsistent prior to this patch; it used 8 bits
for the pxm field in cpu_affinity, but 32 bits in mem_affinity.
This patch makes it consistent: Either use 8 bits consistently (SRAT
rev 1 or lower) or 32 bits (SRAT rev 2 or higher).

cc: x86@kernel.org
Signed-off-by: Kurt Garloff <kurt@garloff.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/mm/srat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..7efd0c615d58 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;
 	pxm = pa->proximity_domain_lo;
+	if (acpi_srat_revision >= 2)
+		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	start = ma->base_address;
 	end = start + ma->length;
 	pxm = ma->proximity_domain;
+	if (acpi_srat_revision <= 1)
+		pxm &= 0xff;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
-- 
cgit v1.2.3


From 5ee71535440f034de1196b11f78cef81c4025c2b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 16 Jan 2012 11:57:18 -0800
Subject: x86/kconfig: Move the ZONE_DMA entry under a menu

Move the ZONE_DMA kconfig symbol under a menu item instead
of having it listed before everything else in
"make {xconfig | gconfig | nconfig | menuconfig}".

This drops the first line of the top-level kernel config menu
(in 3.2) below and moves it under "Processor type and features".

          [*] DMA memory allocation support
              General setup  --->
          [*] Enable loadable module support  --->
          [*] Enable the block layer  --->
              Processor type and features  --->
              Power management and ACPI options  --->
              Bus options (PCI etc.)  --->
              Executable file formats / Emulations  --->

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-mm@kvack.org <linux-mm@kvack.org>
Link: http://lkml.kernel.org/r/4F14811E.6090107@xenotime.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Rientjes <rientjes@google.com>
---
 arch/x86/Kconfig | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5731eb70e0a0..db190faffba1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -120,16 +120,6 @@ config HAVE_LATENCYTOP_SUPPORT
 config MMU
 	def_bool y
 
-config ZONE_DMA
-	bool "DMA memory allocation support" if EXPERT
-	default y
-	help
-	  DMA memory allocation support allows devices with less than 32-bit
-	  addressing to allocate within the first 16MB of address space.
-	  Disable if no such devices will be used.
-
-	  If unsure, say Y.
-
 config SBUS
 	bool
 
@@ -253,6 +243,16 @@ source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
+config ZONE_DMA
+	bool "DMA memory allocation support" if EXPERT
+	default y
+	help
+	  DMA memory allocation support allows devices with less than 32-bit
+	  addressing to allocate within the first 16MB of address space.
+	  Disable if no such devices will be used.
+
+	  If unsure, say Y.
+
 source "kernel/time/Kconfig"
 
 config SMP
-- 
cgit v1.2.3


From ce79dac861e0d9a473d9923391bdbaad83c1c57f Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@gmail.com>
Date: Tue, 17 Jan 2012 14:14:02 -0500
Subject: x86, opcode: ANDN and Group 17 in x86-opcode-map.txt

The Intel documentation at

http://software.intel.com/file/36945

shows the ANDN opcode and Group 17 with encoding f2 and f3 encoding
respectively.  The current version of x86-opcode-map.txt shows them
with f3 and f4.  Unless someone can point to documentation which shows
the currently used encoding the following patch be applied.

Signed-off-by: Ulrich Drepper <drepper@gmail.com>
Link: http://lkml.kernel.org/r/CAOPLpQdq5SuVo9=023CYhbFLAX9rONyjmYq7jJkqc5xwctW5eA@mail.gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/lib/x86-opcode-map.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5b83c51c12e0..4c8010d4f5e6 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -729,8 +729,8 @@ de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
 df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
 f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
 f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
-f3: ANDN Gy,By,Ey (v)
-f4: Grp17 (1A)
+f2: ANDN Gy,By,Ey (v)
+f3: Grp17 (1A)
 f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
 f6: MULX By,Gy,rDX,Ey (F2),(v)
 f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
-- 
cgit v1.2.3


From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: Audit: push audit success and retcode into arch ptrace.h

The audit system previously expected arches calling to audit_syscall_exit to
supply as arguments if the syscall was a success and what the return code was.
Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things
by converting from negative retcodes to an audit internal magic value stating
success or failure.  This helper was wrong and could indicate that a valid
pointer returned to userspace was a failed syscall.  The fix is to fix the
layering foolishness.  We now pass audit_syscall_exit a struct pt_reg and it
in turns calls back into arch code to collect the return value and to
determine if the syscall was a success or failure.  We also define a generic
is_syscall_success() macro which determines success/failure based on if the
value is < -MAX_ERRNO.  This works for arches like x86 which do not use a
separate mechanism to indicate syscall failure.

We make both the is_syscall_success() and regs_return_value() static inlines
instead of macros.  The reason is because the audit function must take a void*
for the regs.  (uml calls theirs struct uml_pt_regs instead of just struct
pt_regs so audit_syscall_exit can't take a struct pt_regs).  Since the audit
function takes a void* we need to use static inlines to cast it back to the
arch correct structure to dereference it.

The other major change is that on some arches, like ia64, MIPS and ppc, we
change regs_return_value() to give us the negative value on syscall failure.
THE only other user of this macro, kretprobe_example.c, won't notice and it
makes the value signed consistently for the audit functions across all archs.

In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old
audit code as the return value.  But the ptrace_64.h code defined the macro
regs_return_value() as regs[3].  I have no idea which one is correct, but this
patch now uses the regs_return_value() function, so it now uses regs[3].

For powerpc we previously used regs->result but now use the
regs_return_value() function which uses regs->gprs[3].  regs->gprs[3] is
always positive so the regs_return_value(), much like ia64 makes it negative
before calling the audit code when appropriate.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com> [for x86 portion]
Acked-by: Tony Luck <tony.luck@intel.com> [for ia64]
Acked-by: Richard Weinberger <richard@nod.at> [for uml]
Acked-by: David S. Miller <davem@davemloft.net> [for sparc]
Acked-by: Ralf Baechle <ralf@linux-mips.org> [for mips]
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [for ppc]
---
 arch/ia64/include/asm/ptrace.h       | 13 ++++++++++++-
 arch/ia64/kernel/ptrace.c            |  9 +--------
 arch/microblaze/include/asm/ptrace.h |  5 +++++
 arch/microblaze/kernel/ptrace.c      |  3 +--
 arch/mips/include/asm/ptrace.h       | 14 +++++++++++++-
 arch/mips/kernel/ptrace.c            |  4 +---
 arch/powerpc/include/asm/ptrace.h    | 13 ++++++++++++-
 arch/powerpc/kernel/ptrace.c         |  4 +---
 arch/s390/include/asm/ptrace.h       |  6 +++++-
 arch/s390/kernel/ptrace.c            |  4 +---
 arch/sh/include/asm/ptrace_32.h      |  5 ++++-
 arch/sh/include/asm/ptrace_64.h      |  5 ++++-
 arch/sh/kernel/ptrace_32.c           |  4 +---
 arch/sh/kernel/ptrace_64.c           |  4 +---
 arch/sparc/include/asm/ptrace.h      | 10 +++++++++-
 arch/sparc/kernel/ptrace_64.c        | 11 +----------
 arch/um/kernel/ptrace.c              |  4 ++--
 arch/x86/ia32/ia32entry.S            | 10 +++++-----
 arch/x86/kernel/entry_32.S           |  8 ++++----
 arch/x86/kernel/entry_64.S           | 10 +++++-----
 arch/x86/kernel/ptrace.c             |  3 +--
 arch/x86/kernel/vm86_32.c            |  4 ++--
 arch/x86/um/shared/sysdep/ptrace.h   |  5 +++++
 include/linux/audit.h                | 22 ++++++++++++++--------
 include/linux/ptrace.h               | 10 ++++++++++
 kernel/auditsc.c                     | 16 ++++++++++++----
 26 files changed, 132 insertions(+), 74 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
index f5cb27614e35..68c98f5b3ca6 100644
--- a/arch/ia64/include/asm/ptrace.h
+++ b/arch/ia64/include/asm/ptrace.h
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
 	return regs->ar_bspstore;
 }
 
-#define regs_return_value(regs) ((regs)->r8)
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return regs->r10 != -1;
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->r8;
+	else
+		return -regs->r8;
+}
 
 /* Conserve space in histogram by encoding slot bits in address
  * bits 2 and 3 rather than bits 0 and 1.
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 8848f43d819e..2c154088cce7 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 {
 	int step;
 
-	if (unlikely(current->audit_context)) {
-		int success = AUDITSC_RESULT(regs.r10);
-		long result = regs.r8;
-
-		if (success != AUDITSC_SUCCESS)
-			result = -result;
-		audit_syscall_exit(success, result);
-	}
+	audit_syscall_exit(&regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h
index 816bee64b196..94e92c805859 100644
--- a/arch/microblaze/include/asm/ptrace.h
+++ b/arch/microblaze/include/asm/ptrace.h
@@ -61,6 +61,11 @@ struct pt_regs {
 #define instruction_pointer(regs)	((regs)->pc)
 #define profile_pc(regs)		instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->r3;
+}
+
 #else /* __KERNEL__ */
 
 /* pt_regs offsets used by gdbserver etc in ptrace syscalls */
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 043cb58f9c44..f564b1bfd386 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+	audit_syscall_exit(regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..7d409505df2d 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
  */
 #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER)
 
-#define regs_return_value(_regs) ((_regs)->regs[2])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !regs->regs[7];
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->regs[2];
+	else
+		return -regs->regs[2];
+}
+
 #define instruction_pointer(regs) ((regs)->cp0_epc)
 #define profile_pc(regs) instruction_pointer(regs)
 
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 4e6ea1ffad46..ab0f1963a7bd 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -572,9 +572,7 @@ out:
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]),
-		                   -regs->regs[2]);
+	audit_syscall_exit(regs);
 
 	if (!(current->ptrace & PT_PTRACED))
 		return;
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 48223f9b8728..78a205162fd7 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -86,7 +86,18 @@ struct pt_regs {
 #define instruction_pointer(regs) ((regs)->nip)
 #define user_stack_pointer(regs) ((regs)->gpr[1])
 #define kernel_stack_pointer(regs) ((regs)->gpr[1])
-#define regs_return_value(regs) ((regs)->gpr[3])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->ccr & 0x10000000);
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->gpr[3];
+	else
+		return -regs->gpr[3];
+}
 
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 5de73dbd15c7..09d31c12a5e3 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
-				   regs->result);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->result);
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 56da355678f4..aeb77f017985 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -541,9 +541,13 @@ struct user_regs_struct
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define user_stack_pointer(regs)((regs)->gprs[15])
-#define regs_return_value(regs)((regs)->gprs[2])
 #define profile_pc(regs) instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->gprs[2];
+}
+
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 573bc29551ef..f52758600980 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
-				   regs->gprs[2]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->gprs[2]);
diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h
index 6c2239cca1a2..2d3e906aa722 100644
--- a/arch/sh/include/asm/ptrace_32.h
+++ b/arch/sh/include/asm/ptrace_32.h
@@ -76,7 +76,10 @@ struct pt_dspregs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tra)
-#define regs_return_value(_regs)	((_regs)->regs[0])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[0];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h
index bf9be7764d69..eb3fcceaf64b 100644
--- a/arch/sh/include/asm/ptrace_64.h
+++ b/arch/sh/include/asm/ptrace_64.h
@@ -13,7 +13,10 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tregs[7])
-#define regs_return_value(_regs)	((_regs)->regs[3])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[3];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 92b3c276339a..c0b5c179d27b 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]),
-				   regs->regs[0]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[0]);
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index c8f97649f354..ba720d686435 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -548,9 +548,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]),
-				   regs->regs[9]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[9]);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index a0e1bcf843a1..c00c3b5c2806 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -207,7 +207,15 @@ do {	current_thread_info()->syscall_noerror = 1; \
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
-#define regs_return_value(regs) ((regs)->u_regs[UREG_I0])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY));
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->u_regs[UREG_I0];
+}
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *);
 #else
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 96ee50a80661..c73c8c50f117 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context)) {
-		unsigned long tstate = regs->tstate;
-		int result = AUDITSC_SUCCESS;
+	audit_syscall_exit(regs);
 
-		if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
-			result = AUDITSC_FAILURE;
-
-		audit_syscall_exit(result, regs->u_regs[UREG_I0]);
-	}
-#endif
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->u_regs[UREG_G1]);
 
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index c9da32b0c707..2ccf25c42feb 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 					    UPT_SYSCALL_ARG2(regs),
 					    UPT_SYSCALL_ARG3(regs),
 					    UPT_SYSCALL_ARG4(regs));
-		else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
-					UPT_SYSCALL_RET(regs));
+		else
+			audit_syscall_exit(regs);
 	}
 
 	/* Fake a debug trap */
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3e274564f6bf..64ced0b8f8fd 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -208,12 +209,11 @@ sysexit_from_sys_call:
 	TRACE_IRQS_ON
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
+	call __audit_syscall_exit
+	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	cli
 	TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21b4dd7..a22facf06f0e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -466,11 +467,10 @@ sysexit_audit:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
 	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%eax		/* zero-extend that */
-	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc87..e51393dd93a3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -563,17 +564,16 @@ auditsys:
 	jmp system_call_fastpath
 
 	/*
-	 * Return fast path for syscall audit.  Call audit_syscall_exit()
+	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
 	 */
 sysret_audit:
 	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $0,%rsi		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7b5bb6..8b0218758775 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..af17e1c966dc 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call audit_syscall_exit since we do not exit via the normal paths */
+	/*call __audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(0), 0);
+		__audit_syscall_exit(1, 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b1621747f..5ef9344a8b24 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_SYSCALL_RET(regs);
+}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6e1c533f9b46..3d65e4b3ba06 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/elf-em.h>
+#include <linux/ptrace.h>
 
 /* The netlink messages for the audit system is divided into blocks:
  * 1000 - 1099 are for commanding the audit system
@@ -408,10 +409,6 @@ struct audit_field {
 	void				*lsm_rule;
 };
 
-#define AUDITSC_INVALID 0
-#define AUDITSC_SUCCESS 1
-#define AUDITSC_FAILURE 2
-#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS )
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
@@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task);
 extern void audit_syscall_entry(int arch,
 				int major, unsigned long a0, unsigned long a1,
 				unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(int failed, long return_code);
+extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
@@ -438,6 +435,15 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_exit(void *pt_regs)
+{
+	if (unlikely(current->audit_context)) {
+		int success = is_syscall_success(pt_regs);
+		int return_code = regs_return_value(pt_regs);
+
+		__audit_syscall_exit(success, return_code);
+	}
+}
 static inline void audit_getname(const char *name)
 {
 	if (unlikely(!audit_dummy_context()))
@@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags)
 
 extern int audit_n_rules;
 extern int audit_signals;
-#else
+#else /* CONFIG_AUDITSYSCALL */
 #define audit_finish_fork(t)
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(f,r) do { ; } while (0)
+#define audit_syscall_exit(r) do { ; } while (0)
 #define audit_dummy_context() 1
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
@@ -587,7 +593,7 @@ extern int audit_signals;
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
-#endif
+#endif /* CONFIG_AUDITSYSCALL */
 
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 800f113bea66..dd4cefa6519d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -112,6 +112,7 @@
 
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
+#include <linux/err.h>			/* for IS_ERR_VALUE */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
@@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task)
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef is_syscall_success
+/*
+ * On most systems we can tell if a syscall is a success based on if the retval
+ * is an error value.  On some systems like ia64 and powerpc they have different
+ * indicators of success/failure and must define their own.
+ */
+#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
+#endif
+
 /*
  * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
  *
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e9bcb93800d8..3d2853808185 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,6 +70,11 @@
 
 #include "audit.h"
 
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child)
 
 /**
  * audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @pt_regs: syscall registers
  *
  * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child)
  * message), then write out the syscall information.  In call cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
 {
 	struct task_struct *tsk = current;
 	struct audit_context *context;
 
-	context = audit_get_context(tsk, valid, return_code);
+	if (success)
+		success = AUDITSC_SUCCESS;
+	else
+		success = AUDITSC_FAILURE;
 
+	context = audit_get_context(tsk, success, return_code);
 	if (likely(!context))
 		return;
 
-- 
cgit v1.2.3


From f031cd25568a390dc2c9c3a4015054183753449a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: ia32entry.S sign extend error codes when calling 64 bit code

In the ia32entry syscall exit audit fastpath we have assembly code which calls
__audit_syscall_exit directly.  This code was, however, zeroes the upper 32
bits of the return code.  It then proceeded to call code which expects longs
to be 64bits long.  In order to handle code which expects longs to be 64bit we
sign extend the return code if that code is an error.  Thus the
__audit_syscall_exit function can correctly handle using the values in
snprintf("%ld").  This fixes the regression introduced in 5cbf1565f29eb57a86a.

Old record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=4294967283
New record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=-13

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/ia32entry.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 64ced0b8f8fd..025f0f01d254 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -210,7 +210,9 @@ sysexit_from_sys_call:
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
 	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	setbe %al		/* 1 if so, 0 if not */
+	jbe 1f
+	movslq %eax, %rsi	/* if error sign extend to 64 bits */
+1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
 	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
-- 
cgit v1.2.3


From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: inline audit_syscall_entry to reduce burden on archs

Every arch calls:

if (unlikely(current->audit_context))
	audit_syscall_entry()

which requires knowledge about audit (the existance of audit_context) in
the arch code.  Just do it all in static inline in audit.h so that arch's
can remain blissfully ignorant.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/ia64/kernel/ptrace.c       |  9 +--------
 arch/microblaze/kernel/ptrace.c |  6 ++----
 arch/mips/kernel/ptrace.c       |  7 +++----
 arch/powerpc/kernel/ptrace.c    | 26 ++++++++++++--------------
 arch/s390/kernel/ptrace.c       | 11 +++++------
 arch/sh/kernel/ptrace_32.c      |  7 +++----
 arch/sh/kernel/ptrace_64.c      |  7 +++----
 arch/sparc/kernel/ptrace_64.c   | 17 ++++++++---------
 arch/um/kernel/ptrace.c         | 20 +++++++++-----------
 arch/x86/ia32/ia32entry.S       |  2 +-
 arch/x86/kernel/entry_32.S      |  2 +-
 arch/x86/kernel/entry_64.S      |  4 ++--
 arch/x86/kernel/ptrace.c        | 22 ++++++++++------------
 arch/xtensa/kernel/ptrace.c     |  3 +--
 include/linux/audit.h           | 13 ++++++++++---
 kernel/auditsc.c                |  2 +-
 16 files changed, 72 insertions(+), 86 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 2c154088cce7..dad91661ddf9 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
 	if (test_thread_flag(TIF_RESTORE_RSE))
 		ia64_sync_krbs();
 
-	if (unlikely(current->audit_context)) {
-		long syscall;
-		int arch;
 
-		syscall = regs.r15;
-		arch = AUDIT_ARCH_IA64;
-
-		audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
-	}
+	audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
 
 	return 0;
 }
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index f564b1bfd386..6eb2aa927d89 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 		 */
 		ret = -1L;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(EM_MICROBLAZE, regs->r12,
-				    regs->r5, regs->r6,
-				    regs->r7, regs->r8);
+	audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6,
+			    regs->r7, regs->r8);
 
 	return ret ?: regs->r12;
 }
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index ab0f1963a7bd..7786b608d932 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 	}
 
 out:
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[2],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[2],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 }
 
 /*
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 09d31c12a5e3..5b43325402bc 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gpr[0]);
 
-	if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
-		if (!is_32bit_task())
-			audit_syscall_entry(AUDIT_ARCH_PPC64,
-					    regs->gpr[0],
-					    regs->gpr[3], regs->gpr[4],
-					    regs->gpr[5], regs->gpr[6]);
-		else
+	if (!is_32bit_task())
+		audit_syscall_entry(AUDIT_ARCH_PPC64,
+				    regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+	else
 #endif
-			audit_syscall_entry(AUDIT_ARCH_PPC,
-					    regs->gpr[0],
-					    regs->gpr[3] & 0xffffffff,
-					    regs->gpr[4] & 0xffffffff,
-					    regs->gpr[5] & 0xffffffff,
-					    regs->gpr[6] & 0xffffffff);
-	}
+		audit_syscall_entry(AUDIT_ARCH_PPC,
+				    regs->gpr[0],
+				    regs->gpr[3] & 0xffffffff,
+				    regs->gpr[4] & 0xffffffff,
+				    regs->gpr[5] & 0xffffffff,
+				    regs->gpr[6] & 0xffffffff);
 
 	return ret ?: regs->gpr[0];
 }
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index f52758600980..9d82ed4bcb27 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gprs[2]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(is_compat_task() ?
-					AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
-				    regs->gprs[2], regs->orig_gpr2,
-				    regs->gprs[3], regs->gprs[4],
-				    regs->gprs[5]);
+	audit_syscall_entry(is_compat_task() ?
+				AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+			    regs->gprs[2], regs->orig_gpr2,
+			    regs->gprs[3], regs->gprs[4],
+			    regs->gprs[5]);
 	return ret ?: regs->gprs[2];
 }
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index c0b5c179d27b..a3e651563763 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[0]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[3],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[3],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 
 	return ret ?: regs->regs[0];
 }
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index ba720d686435..3d0080b5c976 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[9]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[1],
-				    regs->regs[2], regs->regs[3],
-				    regs->regs[4], regs->regs[5]);
+	audit_syscall_entry(audit_arch(), regs->regs[1],
+			    regs->regs[2], regs->regs[3],
+			    regs->regs[4], regs->regs[5]);
 
 	return ret ?: regs->regs[9];
 }
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index c73c8c50f117..9388844cd88c 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->u_regs[UREG_G1]);
 
-	if (unlikely(current->audit_context) && !ret)
-		audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
-				     AUDIT_ARCH_SPARC :
-				     AUDIT_ARCH_SPARC64),
-				    regs->u_regs[UREG_G1],
-				    regs->u_regs[UREG_I0],
-				    regs->u_regs[UREG_I1],
-				    regs->u_regs[UREG_I2],
-				    regs->u_regs[UREG_I3]);
+	audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
+			     AUDIT_ARCH_SPARC :
+			     AUDIT_ARCH_SPARC64),
+			    regs->u_regs[UREG_G1],
+			    regs->u_regs[UREG_I0],
+			    regs->u_regs[UREG_I1],
+			    regs->u_regs[UREG_I2],
+			    regs->u_regs[UREG_I3]);
 
 	return ret;
 }
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 2ccf25c42feb..06b190390505 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 	int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
 	int tracesysgood;
 
-	if (unlikely(current->audit_context)) {
-		if (!entryexit)
-			audit_syscall_entry(HOST_AUDIT_ARCH,
-					    UPT_SYSCALL_NR(regs),
-					    UPT_SYSCALL_ARG1(regs),
-					    UPT_SYSCALL_ARG2(regs),
-					    UPT_SYSCALL_ARG3(regs),
-					    UPT_SYSCALL_ARG4(regs));
-		else
-			audit_syscall_exit(regs);
-	}
+	if (!entryexit)
+		audit_syscall_entry(HOST_AUDIT_ARCH,
+				    UPT_SYSCALL_NR(regs),
+				    UPT_SYSCALL_ARG1(regs),
+				    UPT_SYSCALL_ARG2(regs),
+				    UPT_SYSCALL_ARG3(regs),
+				    UPT_SYSCALL_ARG4(regs));
+	else
+		audit_syscall_exit(regs);
 
 	/* Fake a debug trap */
 	if (is_singlestep)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 025f0f01d254..cecfd9a8f734 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -192,7 +192,7 @@ sysexit_from_sys_call:
 	movl %ebx,%edx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%esi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a22facf06f0e..1ccd742eba1b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -456,7 +456,7 @@ sysenter_audit:
 	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%edx			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	pushl_cfi %ebx
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e51393dd93a3..1ca66b650123 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -549,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
-	 * We just call audit_syscall_entry() directly, and then
+	 * We just call __audit_syscall_entry() directly, and then
 	 * jump back to the normal fast path.
 	 */
 auditsys:
@@ -559,7 +559,7 @@ auditsys:
 	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
 	movq %rax,%rsi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	LOAD_ARGS 0		/* reload call-clobbered registers */
 	jmp system_call_fastpath
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8b0218758775..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (unlikely(current->audit_context)) {
-		if (IS_IA32)
-			audit_syscall_entry(AUDIT_ARCH_I386,
-					    regs->orig_ax,
-					    regs->bx, regs->cx,
-					    regs->dx, regs->si);
+	if (IS_IA32)
+		audit_syscall_entry(AUDIT_ARCH_I386,
+				    regs->orig_ax,
+				    regs->bx, regs->cx,
+				    regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-		else
-			audit_syscall_entry(AUDIT_ARCH_X86_64,
-					    regs->orig_ax,
-					    regs->di, regs->si,
-					    regs->dx, regs->r10);
+	else
+		audit_syscall_entry(AUDIT_ARCH_X86_64,
+				    regs->orig_ax,
+				    regs->di, regs->si,
+				    regs->dx, regs->r10);
 #endif
-	}
 
 	return ret ?: regs->orig_ax;
 }
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index a0d042aa2967..2dff698ab02e 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
 		do_syscall_trace();
 
 #if 0
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
+	audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
 #endif
 }
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3d65e4b3ba06..f56ce2669b83 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch);
 extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
 extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(int arch,
-				int major, unsigned long a0, unsigned long a1,
-				unsigned long a2, unsigned long a3);
+extern void __audit_syscall_entry(int arch,
+				  int major, unsigned long a0, unsigned long a1,
+				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
@@ -435,6 +435,13 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+				       unsigned long a1, unsigned long a2,
+				       unsigned long a3)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_syscall_entry(arch, major, a0, a1, a2, a3);
+}
 static inline void audit_syscall_exit(void *pt_regs)
 {
 	if (unlikely(current->audit_context)) {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3d2853808185..b408100dd6ef 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
 			 unsigned long a1, unsigned long a2,
 			 unsigned long a3, unsigned long a4)
 {
-- 
cgit v1.2.3


From 68f30fbee19cc67849b9fa8e153ede70758afe81 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 17 Jan 2012 15:35:37 -0800
Subject: x86, tsc: Fix SMI induced variation in quick_pit_calibrate()

pit_expect_msb() returns success wrongly in the below SMI scenario:

a. pit_verify_msb() has not yet seen the MSB transition.

b. we are close to the MSB transition though and got a SMI immediately after
   returning from pit_verify_msb() which didn't see the MSB transition. PIT MSB
   transition has happened somewhere during SMI execution.

c. returned from SMI and we noted down the 'tsc', saw the pit MSB change now and
   exited the loop to calculate 'deltatsc'. Instead of noting the TSC at the MSB
   transition, we are way off because of the SMI.  And as the SMI happened
   between the pit_verify_msb() and before the 'tsc' is recorded in the
   for loop, 'delattsc' (d1/d2 in quick_pit_calibrate()) will be small and
   quick_pit_calibrate() will not notice this error.

Depending on whether SMI disturbance happens while computing d1 or d2, we will
see the TSC calibrated value smaller or bigger than the expected value. As a
result, in a cluster we were seeing a variation of approximately +/- 20MHz in
the calibrated values, resulting in NTP failures.

  [ As far as the SMI source is concerned, this is a periodic SMI that gets
    disabled after ACPI is enabled by the OS. But the TSC calibration happens
    before the ACPI is enabled. ]

To address this, change pit_expect_msb() so that

 - the 'tsc' is the TSC in between the two reads that read the MSB
change from the PIT (same as before)

 - the 'delta' is the difference in TSC from *before* the MSB changed
to *after* the MSB changed.

Now the delta is twice as big as before (it covers four PIT accesses,
roughly 4us) and quick_pit_calibrate() will loop a bit longer to get
the calibrated value with in the 500ppm precision. As the delta (d1/d2)
covers four PIT accesses, actual calibrated result might be closer to
250ppm precision.

As the loop now takes longer to stabilize, double MAX_QUICK_PIT_MS to 50.

SMI disturbance will showup as much larger delta's and the loop will take
longer than usual for the result to be with in the accepted precision. Or will
fallback to slow PIT calibration if it takes more than 50msec.

Also while we are at this, remove the calibration correction that aims to
get the result to the middle of the error bars. We really don't know which
direction to correct into, so remove it.

Reported-and-tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1326843337.5291.4.camel@sbsiddha-mobl2
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/tsc.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2c9cf0fd78f5..f54694611172 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val)
 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
 	int count;
-	u64 tsc = 0;
+	u64 tsc = 0, prev_tsc = 0;
 
 	for (count = 0; count < 50000; count++) {
 		if (!pit_verify_msb(val))
 			break;
+		prev_tsc = tsc;
 		tsc = get_cycles();
 	}
-	*deltap = get_cycles() - tsc;
+	*deltap = get_cycles() - prev_tsc;
 	*tscp = tsc;
 
 	/*
@@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
  * How many MSB values do we want to see? We aim for
  * a maximum error rate of 500ppm (in practice the
  * real error is much smaller), but refuse to spend
- * more than 25ms on it.
+ * more than 50ms on it.
  */
-#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_MS 50
 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
 
 static unsigned long quick_pit_calibrate(void)
@@ -383,15 +384,12 @@ success:
 	 *
 	 * As a result, we can depend on there not being
 	 * any odd delays anywhere, and the TSC reads are
-	 * reliable (within the error). We also adjust the
-	 * delta to the middle of the error bars, just
-	 * because it looks nicer.
+	 * reliable (within the error).
 	 *
 	 * kHz = ticks / time-in-seconds / 1000;
 	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
 	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
 	 */
-	delta += (long)(d2 - d1)/2;
 	delta *= PIT_TICK_RATE;
 	do_div(delta, i*256*1000);
 	printk("Fast TSC calibration using PIT\n");
-- 
cgit v1.2.3


From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 18 Jan 2012 01:51:22 +0000
Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n

JONGMAN HEO reports:

  With current linus git (commit a25a2b84), I got following build error,

  arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86':
  arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit'
  make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1

OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n)

It's due to commit d7e7528bcd45: "Audit: push audit success and retcode
into arch ptrace.h".

Reported-by: JONGMAN HEO <jongman.heo@samsung.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/vm86_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index af17e1c966dc..b466cab5ba15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 		mark_screen_rdonly(tsk->mm);
 
 	/*call __audit_syscall_exit since we do not exit via the normal paths */
+#ifdef CONFIG_AUDITSYSCALL
 	if (unlikely(current->audit_context))
 		__audit_syscall_exit(1, 0);
+#endif
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
-- 
cgit v1.2.3


From d00a9dd21bdf7908b70866794c8313ee8a5abd5c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Jan 2012 07:21:42 +0000
Subject: net: bpf_jit: fix divide by 0 generation

Several problems fixed in this patch :

1) Target of the conditional jump in case a divide by 0 is performed
   by a bpf is wrong.

2) Must 'generate' the full function prologue/epilogue at pass=0,
   or else we can stop too early in pass=1 if the proglen doesnt change.
   (if the increase of prologue/epilogue equals decrease of all
    instructions length because some jumps are converted to near jumps)

3) Change the wrong length detection at the end of code generation to
   issue a more explicit message, no need for a full stack trace.

Reported-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7b65f752c5f8..7c1b765ecc59 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -151,17 +151,18 @@ void bpf_jit_compile(struct sk_filter *fp)
 	cleanup_addr = proglen; /* epilogue address */
 
 	for (pass = 0; pass < 10; pass++) {
+		u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;
 		/* no prologue/epilogue for trivial filters (RET something) */
 		proglen = 0;
 		prog = temp;
 
-		if (seen) {
+		if (seen_or_pass0) {
 			EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
 			EMIT4(0x48, 0x83, 0xec, 96);	/* subq  $96,%rsp	*/
 			/* note : must save %rbx in case bpf_error is hit */
-			if (seen & (SEEN_XREG | SEEN_DATAREF))
+			if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF))
 				EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
-			if (seen & SEEN_XREG)
+			if (seen_or_pass0 & SEEN_XREG)
 				CLEAR_X(); /* make sure we dont leek kernel memory */
 
 			/*
@@ -170,7 +171,7 @@ void bpf_jit_compile(struct sk_filter *fp)
 			 *  r9 = skb->len - skb->data_len
 			 *  r8 = skb->data
 			 */
-			if (seen & SEEN_DATAREF) {
+			if (seen_or_pass0 & SEEN_DATAREF) {
 				if (offsetof(struct sk_buff, len) <= 127)
 					/* mov    off8(%rdi),%r9d */
 					EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
@@ -260,9 +261,14 @@ void bpf_jit_compile(struct sk_filter *fp)
 			case BPF_S_ALU_DIV_X: /* A /= X; */
 				seen |= SEEN_XREG;
 				EMIT2(0x85, 0xdb);	/* test %ebx,%ebx */
-				if (pc_ret0 != -1)
-					EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
-				else {
+				if (pc_ret0 > 0) {
+					/* addrs[pc_ret0 - 1] is start address of target
+					 * (addrs[i] - 4) is the address following this jmp
+					 * ("xor %edx,%edx; div %ebx" being 4 bytes long)
+					 */
+					EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] -
+								(addrs[i] - 4));
+				} else {
 					EMIT_COND_JMP(X86_JNE, 2 + 5);
 					CLEAR_A();
 					EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
@@ -335,12 +341,12 @@ void bpf_jit_compile(struct sk_filter *fp)
 				}
 				/* fallinto */
 			case BPF_S_RET_A:
-				if (seen) {
+				if (seen_or_pass0) {
 					if (i != flen - 1) {
 						EMIT_JMP(cleanup_addr - addrs[i]);
 						break;
 					}
-					if (seen & SEEN_XREG)
+					if (seen_or_pass0 & SEEN_XREG)
 						EMIT4(0x48, 0x8b, 0x5d, 0xf8);  /* mov  -8(%rbp),%rbx */
 					EMIT1(0xc9);		/* leaveq */
 				}
@@ -483,8 +489,9 @@ common_load:			seen |= SEEN_DATAREF;
 				goto common_load;
 			case BPF_S_LDX_B_MSH:
 				if ((int)K < 0) {
-					if (pc_ret0 != -1) {
-						EMIT_JMP(addrs[pc_ret0] - addrs[i]);
+					if (pc_ret0 > 0) {
+						/* addrs[pc_ret0 - 1] is the start address */
+						EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]);
 						break;
 					}
 					CLEAR_A();
@@ -599,13 +606,14 @@ cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 		 * use it to give the cleanup instruction(s) addr
 		 */
 		cleanup_addr = proglen - 1; /* ret */
-		if (seen)
+		if (seen_or_pass0)
 			cleanup_addr -= 1; /* leaveq */
-		if (seen & SEEN_XREG)
+		if (seen_or_pass0 & SEEN_XREG)
 			cleanup_addr -= 4; /* mov  -8(%rbp),%rbx */
 
 		if (image) {
-			WARN_ON(proglen != oldproglen);
+			if (proglen != oldproglen)
+				pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen);
 			break;
 		}
 		if (proglen == oldproglen) {
-- 
cgit v1.2.3


From 90a4c0f51e8e44111a926be6f4c87af3938a79c3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 18 Jan 2012 19:26:11 -0800
Subject: uml: fix compile for x86-64

Randy Dunlap reports that we get

  arch/x86/um/shared/sysdep/ptrace.h:7:20: error: redefinition of 'regs_return_value'
  arch/x86/um/shared/sysdep/ptrace.h:7:20: note: previous definition of 'regs_return_value' was here

when compiling UML for x86-64.

Stephen Rothwell root-caused it and says:

 "Caused by commit d7e7528bcd45 ("Audit: push audit success and retcode
  into arch ptrace.h") (another patch that was never in linux-next :-().

  This file now needs protection against double inclusion."

so let's do as the man says.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Analyzed-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/um/shared/sysdep/ptrace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 5ef9344a8b24..2bbe1ec2d96a 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,3 +1,6 @@
+#ifndef __SYSDEP_X86_PTRACE_H
+#define __SYSDEP_X86_PTRACE_H
+
 #ifdef __i386__
 #include "ptrace_32.h"
 #else
@@ -8,3 +11,5 @@ static inline long regs_return_value(struct uml_pt_regs *regs)
 {
 	return UPT_SYSCALL_RET(regs);
 }
+
+#endif /* __SYSDEP_X86_PTRACE_H */
-- 
cgit v1.2.3


From 4f2f81a5621de47d42476d0b929be2e0d565df84 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 19 Jan 2012 12:41:25 -0800
Subject: x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits

In checkin

  303395ac3bf3 x86: Generate system call tables and unistd_*.h from tables

the feature macros in <asm/unistd.h> were unified between 32 and 64
bits.  Unfortunately 32 bits requires __ARCH_WANT_SYS_IPC and this was
inadvertently dropped.

Reported-by: Dmitry Kasatkin <dmitry.kasatkin@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/CALLzPKbeXN5gdngo8uYYU8mAow=XhrwBFBhKfG811f37BubQOg@mail.gmail.com
---
 arch/x86/include/asm/unistd.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index b4a3db7ce140..21f77b89e47a 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -7,6 +7,7 @@
 #  include <asm/unistd_32.h>
 #  define __ARCH_WANT_IPC_PARSE_VERSION
 #  define __ARCH_WANT_STAT64
+#  define __ARCH_WANT_SYS_IPC
 #  define __ARCH_WANT_SYS_OLD_MMAP
 #  define __ARCH_WANT_SYS_OLD_SELECT
 
-- 
cgit v1.2.3


From 819165fb34b9777f852429f2c6d6f79fbb71b9eb Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 20 Jan 2012 16:21:41 +0000
Subject: x86: Adjust asm constraints in atomic64 wrappers

Eric pointed out overly restrictive constraints in atomic64_set(), but
there are issues throughout the file. In the cited case, %ebx and %ecx
are inputs only (don't get changed by either of the two low level
implementations). This was also the case elsewhere.

Further in many cases early-clobber indicators were missing.

Finally, the previous implementation rolled a custom alternative
instruction macro from scratch, rather than using alternative_call()
(which was introduced with the commit that the description of the
change in question actually refers to). Adjusting has the benefit of
not hiding referenced symbols from the compiler, which however requires
them to be declared not just in the exporting source file (which, as a
desirable side effect, in turn allows that exporting file to become a
real 5-line stub).

This patch does not eliminate the overly restrictive memory clobbers,
however: Doing so would occasionally make the compiler set up a second
register for accessing the memory object (to satisfy the added "m"
constraint), and it's not clear which of the two non-optimal
alternatives is better.

v2: Re-do the declaration and exporting of the internal symbols.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F19A2A5020000780006E0D9@nat28.tlf.novell.com
Cc: Luca Barbieri <luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/alternative.h |   6 ++
 arch/x86/include/asm/atomic64_32.h | 147 ++++++++++++++++++++-----------------
 arch/x86/lib/atomic64_32.c         |  59 +--------------
 3 files changed, 88 insertions(+), 124 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 37ad100a2210..49331bedc158 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -145,6 +145,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
  */
 #define ASM_OUTPUT2(a...) a
 
+/*
+ * use this macro if you need clobbers but no inputs in
+ * alternative_{input,io,call}()
+ */
+#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+
 struct paravirt_patch_site;
 #ifdef CONFIG_PARAVIRT
 void apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index fa13f0ec2874..908303f68bba 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,13 +14,52 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
+#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
+#ifndef ATOMIC64_EXPORT
+#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
+#else
+#define ATOMIC64_DECL_ONE(sym) __ATOMIC64_DECL(sym); \
+	ATOMIC64_EXPORT(atomic64_##sym)
+#endif
+
 #ifdef CONFIG_X86_CMPXCHG64
-#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
+#define __alternative_atomic64(f, g, out, in...) \
+	asm volatile("call %P[func]" \
+		     : out : [func] "i" (atomic64_##g##_cx8), ## in)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
 #else
-#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
+#define __alternative_atomic64(f, g, out, in...) \
+	alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \
+			 X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \
+	ATOMIC64_DECL_ONE(sym##_386)
+
+ATOMIC64_DECL_ONE(add_386);
+ATOMIC64_DECL_ONE(sub_386);
+ATOMIC64_DECL_ONE(inc_386);
+ATOMIC64_DECL_ONE(dec_386);
 #endif
 
-#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
+#define alternative_atomic64(f, out, in...) \
+	__alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
+
+ATOMIC64_DECL(read);
+ATOMIC64_DECL(set);
+ATOMIC64_DECL(xchg);
+ATOMIC64_DECL(add_return);
+ATOMIC64_DECL(sub_return);
+ATOMIC64_DECL(inc_return);
+ATOMIC64_DECL(dec_return);
+ATOMIC64_DECL(dec_if_positive);
+ATOMIC64_DECL(inc_not_zero);
+ATOMIC64_DECL(add_unless);
+
+#undef ATOMIC64_DECL
+#undef ATOMIC64_DECL_ONE
+#undef __ATOMIC64_DECL
+#undef ATOMIC64_EXPORT
 
 /**
  * atomic64_cmpxchg - cmpxchg atomic64 variable
@@ -50,11 +89,9 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n)
 	long long o;
 	unsigned high = (unsigned)(n >> 32);
 	unsigned low = (unsigned)n;
-	asm volatile(ATOMIC64_ALTERNATIVE(xchg)
-		     : "=A" (o), "+b" (low), "+c" (high)
-		     : "S" (v)
-		     : "memory"
-		     );
+	alternative_atomic64(xchg, "=&A" (o),
+			     "S" (v), "b" (low), "c" (high)
+			     : "memory");
 	return o;
 }
 
@@ -69,11 +106,9 @@ static inline void atomic64_set(atomic64_t *v, long long i)
 {
 	unsigned high = (unsigned)(i >> 32);
 	unsigned low = (unsigned)i;
-	asm volatile(ATOMIC64_ALTERNATIVE(set)
-		     : "+b" (low), "+c" (high)
-		     : "S" (v)
-		     : "eax", "edx", "memory"
-		     );
+	alternative_atomic64(set, /* no output */,
+			     "S" (v), "b" (low), "c" (high)
+			     : "eax", "edx", "memory");
 }
 
 /**
@@ -85,10 +120,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
 static inline long long atomic64_read(const atomic64_t *v)
 {
 	long long r;
-	asm volatile(ATOMIC64_ALTERNATIVE(read)
-		     : "=A" (r), "+c" (v)
-		     : : "memory"
-		     );
+	alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
 	return r;
  }
 
@@ -101,10 +133,9 @@ static inline long long atomic64_read(const atomic64_t *v)
  */
 static inline long long atomic64_add_return(long long i, atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE(add_return)
-		     : "+A" (i), "+c" (v)
-		     : : "memory"
-		     );
+	alternative_atomic64(add_return,
+			     ASM_OUTPUT2("+A" (i), "+c" (v)),
+			     ASM_NO_INPUT_CLOBBER("memory"));
 	return i;
 }
 
@@ -113,32 +144,25 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v)
  */
 static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
-		     : "+A" (i), "+c" (v)
-		     : : "memory"
-		     );
+	alternative_atomic64(sub_return,
+			     ASM_OUTPUT2("+A" (i), "+c" (v)),
+			     ASM_NO_INPUT_CLOBBER("memory"));
 	return i;
 }
 
 static inline long long atomic64_inc_return(atomic64_t *v)
 {
 	long long a;
-	asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
-		     : "=A" (a)
-		     : "S" (v)
-		     : "memory", "ecx"
-		     );
+	alternative_atomic64(inc_return, "=&A" (a),
+			     "S" (v) : "memory", "ecx");
 	return a;
 }
 
 static inline long long atomic64_dec_return(atomic64_t *v)
 {
 	long long a;
-	asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
-		     : "=A" (a)
-		     : "S" (v)
-		     : "memory", "ecx"
-		     );
+	alternative_atomic64(dec_return, "=&A" (a),
+			     "S" (v) : "memory", "ecx");
 	return a;
 }
 
@@ -151,10 +175,9 @@ static inline long long atomic64_dec_return(atomic64_t *v)
  */
 static inline long long atomic64_add(long long i, atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
-		     : "+A" (i), "+c" (v)
-		     : : "memory"
-		     );
+	__alternative_atomic64(add, add_return,
+			       ASM_OUTPUT2("+A" (i), "+c" (v)),
+			       ASM_NO_INPUT_CLOBBER("memory"));
 	return i;
 }
 
@@ -167,10 +190,9 @@ static inline long long atomic64_add(long long i, atomic64_t *v)
  */
 static inline long long atomic64_sub(long long i, atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
-		     : "+A" (i), "+c" (v)
-		     : : "memory"
-		     );
+	__alternative_atomic64(sub, sub_return,
+			       ASM_OUTPUT2("+A" (i), "+c" (v)),
+			       ASM_NO_INPUT_CLOBBER("memory"));
 	return i;
 }
 
@@ -196,10 +218,8 @@ static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
  */
 static inline void atomic64_inc(atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
-		     : : "S" (v)
-		     : "memory", "eax", "ecx", "edx"
-		     );
+	__alternative_atomic64(inc, inc_return, /* no output */,
+			       "S" (v) : "memory", "eax", "ecx", "edx");
 }
 
 /**
@@ -210,10 +230,8 @@ static inline void atomic64_inc(atomic64_t *v)
  */
 static inline void atomic64_dec(atomic64_t *v)
 {
-	asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
-		     : : "S" (v)
-		     : "memory", "eax", "ecx", "edx"
-		     );
+	__alternative_atomic64(dec, dec_return, /* no output */,
+			       "S" (v) : "memory", "eax", "ecx", "edx");
 }
 
 /**
@@ -263,15 +281,16 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v)
  * @u: ...unless v is equal to u.
  *
  * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
+ * Returns non-zero if the add was done, zero otherwise.
  */
 static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
 	unsigned low = (unsigned)u;
 	unsigned high = (unsigned)(u >> 32);
-	asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
-		     : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
-		     : : "memory");
+	alternative_atomic64(add_unless,
+			     ASM_OUTPUT2("+A" (a), "+c" (v),
+					 "+S" (low), "+D" (high)),
+			     ASM_NO_INPUT_CLOBBER("memory"));
 	return (int)a;
 }
 
@@ -279,26 +298,20 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 static inline int atomic64_inc_not_zero(atomic64_t *v)
 {
 	int r;
-	asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
-		     : "=a" (r)
-		     : "S" (v)
-		     : "ecx", "edx", "memory"
-		     );
+	alternative_atomic64(inc_not_zero, "=&a" (r),
+			     "S" (v) : "ecx", "edx", "memory");
 	return r;
 }
 
 static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long long r;
-	asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
-		     : "=A" (r)
-		     : "S" (v)
-		     : "ecx", "memory"
-		     );
+	alternative_atomic64(dec_if_positive, "=&A" (r),
+			     "S" (v) : "ecx", "memory");
 	return r;
 }
 
-#undef ATOMIC64_ALTERNATIVE
-#undef ATOMIC64_ALTERNATIVE_
+#undef alternative_atomic64
+#undef __alternative_atomic64
 
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 042f6826bf57..a0b4a350daa7 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -1,59 +1,4 @@
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/types.h>
+#define ATOMIC64_EXPORT EXPORT_SYMBOL
 
-#include <asm/processor.h>
-#include <asm/cmpxchg.h>
+#include <linux/export.h>
 #include <linux/atomic.h>
-
-long long atomic64_read_cx8(long long, const atomic64_t *v);
-EXPORT_SYMBOL(atomic64_read_cx8);
-long long atomic64_set_cx8(long long, const atomic64_t *v);
-EXPORT_SYMBOL(atomic64_set_cx8);
-long long atomic64_xchg_cx8(long long, unsigned high);
-EXPORT_SYMBOL(atomic64_xchg_cx8);
-long long atomic64_add_return_cx8(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_add_return_cx8);
-long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_sub_return_cx8);
-long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_inc_return_cx8);
-long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_dec_return_cx8);
-long long atomic64_dec_if_positive_cx8(atomic64_t *v);
-EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
-int atomic64_inc_not_zero_cx8(atomic64_t *v);
-EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
-int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
-EXPORT_SYMBOL(atomic64_add_unless_cx8);
-
-#ifndef CONFIG_X86_CMPXCHG64
-long long atomic64_read_386(long long, const atomic64_t *v);
-EXPORT_SYMBOL(atomic64_read_386);
-long long atomic64_set_386(long long, const atomic64_t *v);
-EXPORT_SYMBOL(atomic64_set_386);
-long long atomic64_xchg_386(long long, unsigned high);
-EXPORT_SYMBOL(atomic64_xchg_386);
-long long atomic64_add_return_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_add_return_386);
-long long atomic64_sub_return_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_sub_return_386);
-long long atomic64_inc_return_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_inc_return_386);
-long long atomic64_dec_return_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_dec_return_386);
-long long atomic64_add_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_add_386);
-long long atomic64_sub_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_sub_386);
-long long atomic64_inc_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_inc_386);
-long long atomic64_dec_386(long long a, atomic64_t *v);
-EXPORT_SYMBOL(atomic64_dec_386);
-long long atomic64_dec_if_positive_386(atomic64_t *v);
-EXPORT_SYMBOL(atomic64_dec_if_positive_386);
-int atomic64_inc_not_zero_386(atomic64_t *v);
-EXPORT_SYMBOL(atomic64_inc_not_zero_386);
-int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
-EXPORT_SYMBOL(atomic64_add_unless_386);
-#endif
-- 
cgit v1.2.3


From cb8095bba6d24118135a5683a956f4f4fb5f17bb Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 20 Jan 2012 16:22:04 +0000
Subject: x86: atomic64 assembly improvements

In the "xchg" implementation, %ebx and %ecx don't need to be copied
into %eax and %edx respectively (this is only necessary when desiring
to only read the stored value).

In the "add_unless" implementation, swapping the use of %ecx and %esi
for passing arguments allows %esi to become an input only (i.e.
permitting the register to be re-used to address the same object
without reload).

In "{add,sub}_return", doing the initial read64 through the passed in
%ecx decreases a register dependency.

In "inc_not_zero", a branch can be eliminated by or-ing together the
two halves of the current (64-bit) value, and code size can be further
reduced by adjusting the arithmetic slightly.

v2: Undo the folding of "xchg" and "set".

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F19A2BC020000780006E0DC@nat28.tlf.novell.com
Cc: Luca Barbieri <luca@luca-barbieri.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/atomic64_32.h |  5 ++---
 arch/x86/lib/atomic64_386_32.S     |  6 +++---
 arch/x86/lib/atomic64_cx8_32.S     | 29 +++++++++++------------------
 3 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 908303f68bba..198119910da5 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -288,9 +288,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 	unsigned low = (unsigned)u;
 	unsigned high = (unsigned)(u >> 32);
 	alternative_atomic64(add_unless,
-			     ASM_OUTPUT2("+A" (a), "+c" (v),
-					 "+S" (low), "+D" (high)),
-			     ASM_NO_INPUT_CLOBBER("memory"));
+			     ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)),
+			     "S" (v) : "memory");
 	return (int)a;
 }
 
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index e8e7e0d06f42..00933d5e992f 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -137,13 +137,13 @@ BEGIN(dec_return)
 RET_ENDP
 #undef v
 
-#define v %ecx
+#define v %esi
 BEGIN(add_unless)
-	addl %eax, %esi
+	addl %eax, %ecx
 	adcl %edx, %edi
 	addl  (v), %eax
 	adcl 4(v), %edx
-	cmpl %eax, %esi
+	cmpl %eax, %ecx
 	je 3f
 1:
 	movl %eax,  (v)
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 391a083674b4..f5cc9eb1d51b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -55,8 +55,6 @@ ENDPROC(atomic64_set_cx8)
 ENTRY(atomic64_xchg_cx8)
 	CFI_STARTPROC
 
-	movl %ebx, %eax
-	movl %ecx, %edx
 1:
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
@@ -78,7 +76,7 @@ ENTRY(atomic64_\func\()_return_cx8)
 	movl %edx, %edi
 	movl %ecx, %ebp
 
-	read64 %ebp
+	read64 %ecx
 1:
 	movl %eax, %ebx
 	movl %edx, %ecx
@@ -159,23 +157,22 @@ ENTRY(atomic64_add_unless_cx8)
 	SAVE ebx
 /* these just push these two parameters on the stack */
 	SAVE edi
-	SAVE esi
+	SAVE ecx
 
-	movl %ecx, %ebp
-	movl %eax, %esi
+	movl %eax, %ebp
 	movl %edx, %edi
 
-	read64 %ebp
+	read64 %esi
 1:
 	cmpl %eax, 0(%esp)
 	je 4f
 2:
 	movl %eax, %ebx
 	movl %edx, %ecx
-	addl %esi, %ebx
+	addl %ebp, %ebx
 	adcl %edi, %ecx
 	LOCK_PREFIX
-	cmpxchg8b (%ebp)
+	cmpxchg8b (%esi)
 	jne 1b
 
 	movl $1, %eax
@@ -199,13 +196,13 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
 	read64 %esi
 1:
-	testl %eax, %eax
-	je 4f
-2:
+	movl %eax, %ecx
+	orl %edx, %ecx
+	jz 3f
 	movl %eax, %ebx
-	movl %edx, %ecx
+	xorl %ecx, %ecx
 	addl $1, %ebx
-	adcl $0, %ecx
+	adcl %edx, %ecx
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
@@ -214,9 +211,5 @@ ENTRY(atomic64_inc_not_zero_cx8)
 3:
 	RESTORE ebx
 	ret
-4:
-	testl %edx, %edx
-	jne 2b
-	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
-- 
cgit v1.2.3


From 7a7546b377bdaa25ac77f33d9433c59f259b9688 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 23 Jan 2012 19:32:25 +0000
Subject: x86: xen: size struct xen_spinlock to always fit in arch_spinlock_t

If NR_CPUS < 256 then arch_spinlock_t is only 16 bits wide but struct
xen_spinlock is 32 bits.  When a spin lock is contended and
xl->spinners is modified the two bytes immediately after the spin lock
would be corrupted.

This is a regression caused by 84eb950db13ca40a0572ce9957e14723500943d6
(x86, ticketlock: Clean up types and accessors) which reduced the size
of arch_spinlock_t.

Fix this by making xl->spinners a u8 if NR_CPUS < 256.  A
BUILD_BUG_ON() is also added to check the sizes of the two structures
are compatible.

In many cases this was not noticable as there would often be padding
bytes after the lock (e.g., if any of CONFIG_GENERIC_LOCKBREAK,
CONFIG_DEBUG_SPINLOCK, or CONFIG_DEBUG_LOCK_ALLOC were enabled).

The bnx2 driver is affected. In struct bnx2, phy_lock and
indirect_lock may have no padding after them.  Contention on phy_lock
would corrupt indirect_lock making it appear locked and the driver
would deadlock.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
CC: stable@kernel.org #only 3.2
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/spinlock.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index cc9b1e182fcf..d69cc6c3f808 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -116,9 +116,26 @@ static inline void spin_time_accum_blocked(u64 start)
 }
 #endif  /* CONFIG_XEN_DEBUG_FS */
 
+/*
+ * Size struct xen_spinlock so it's the same as arch_spinlock_t.
+ */
+#if NR_CPUS < 256
+typedef u8 xen_spinners_t;
+# define inc_spinners(xl) \
+	asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
+# define dec_spinners(xl) \
+	asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
+#else
+typedef u16 xen_spinners_t;
+# define inc_spinners(xl) \
+	asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
+# define dec_spinners(xl) \
+	asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
+#endif
+
 struct xen_spinlock {
 	unsigned char lock;		/* 0 -> free; 1 -> locked */
-	unsigned short spinners;	/* count of waiting cpus */
+	xen_spinners_t spinners;	/* count of waiting cpus */
 };
 
 static int xen_spin_is_locked(struct arch_spinlock *lock)
@@ -164,8 +181,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
 
 	wmb();			/* set lock of interest before count */
 
-	asm(LOCK_PREFIX " incw %0"
-	    : "+m" (xl->spinners) : : "memory");
+	inc_spinners(xl);
 
 	return prev;
 }
@@ -176,8 +192,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
  */
 static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
 {
-	asm(LOCK_PREFIX " decw %0"
-	    : "+m" (xl->spinners) : : "memory");
+	dec_spinners(xl);
 	wmb();			/* decrement count before restoring lock */
 	__this_cpu_write(lock_spinners, prev);
 }
@@ -373,6 +388,8 @@ void xen_uninit_lock_cpu(int cpu)
 
 void __init xen_init_spinlocks(void)
 {
+	BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
+
 	pv_lock_ops.spin_is_locked = xen_spin_is_locked;
 	pv_lock_ops.spin_is_contended = xen_spin_is_contended;
 	pv_lock_ops.spin_lock = xen_spin_lock;
-- 
cgit v1.2.3


From 2ed86b16eabe4efbf80cc725a8cbb5310746a2fc Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Wed, 25 Jan 2012 20:02:40 -0600
Subject: irq: make SPARSE_IRQ an optionally hidden option

On ARM, we don't want SPARSE_IRQ to be a user visible option. Make
SPARSE_IRQ visible based on MAY_HAVE_SPARSE_IRQ instead of depending
on HAVE_SPARSE_IRQ.

With this, SPARSE_IRQ is not visible on C6X and ARM.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-c6x-dev@linux-c6x.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org
---
 arch/arm/Kconfig     | 1 -
 arch/c6x/Kconfig     | 2 +-
 arch/powerpc/Kconfig | 2 +-
 arch/sh/Kconfig      | 2 +-
 arch/x86/Kconfig     | 1 -
 kernel/irq/Kconfig   | 5 ++---
 6 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 24626b0419ee..30e7840498ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -28,7 +28,6 @@ config ARM
 	select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_GENERIC_HARDIRQS
-	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_SHOW
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select GENERIC_PCI_IOMAP
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 26e67f0f0051..2f58c61e2812 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -11,7 +11,7 @@ config TMS320C6X
 	select HAVE_DMA_API_DEBUG
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_MEMBLOCK
-	select HAVE_SPARSE_IRQ
+	select SPARSE_IRQ
 	select OF
 	select OF_EARLY_FLATTREE
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1919634a9b32..06c1cf0f24a6 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -133,7 +133,7 @@ config PPC
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 	select HAVE_GENERIC_HARDIRQS
-	select HAVE_SPARSE_IRQ
+	select MAY_HAVE_SPARSE_IRQ
 	select IRQ_PER_CPU
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 3c8db65c89e5..21b82a8cca21 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -22,7 +22,7 @@ config SUPERH
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_GENERIC_HARDIRQS
-	select HAVE_SPARSE_IRQ
+	select MAY_HAVE_SPARSE_IRQ
 	select IRQ_FORCED_THREADING
 	select RTC_LIB
 	select GENERIC_ATOMIC64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 864cc6e6ac8e..fb2da445945f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,6 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
-	select HAVE_SPARSE_IRQ
 	select SPARSE_IRQ
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..1f2dece9ad4c 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
 # Options selectable by the architecture code
 
 # Make sparse irq Kconfig switch below available
-config HAVE_SPARSE_IRQ
+config MAY_HAVE_SPARSE_IRQ
        bool
 
 # Enable the generic irq autoprobe mechanism
@@ -61,8 +61,7 @@ config IRQ_FORCED_THREADING
        bool
 
 config SPARSE_IRQ
-	bool "Support sparse irq numbering"
-	depends on HAVE_SPARSE_IRQ
+	bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
 	---help---
 
 	  Sparse irq numbering is useful for distro kernels that want
-- 
cgit v1.2.3


From 5a51467b146ab7948d2f6812892eac120a30529c Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Wed, 18 Jan 2012 20:07:54 -0600
Subject: x86/uv: Fix uv_gpa_to_soc_phys_ram() shift

uv_gpa_to_soc_phys_ram() was inadvertently ignoring the
shift values.  This fix takes the shift into account.

Signed-off-by: Russ Anderson <rja@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20120119020753.GA7228@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 54a13aaebc40..21f7385badb8 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -318,13 +318,13 @@ uv_gpa_in_mmr_space(unsigned long gpa)
 /* UV global physical address --> socket phys RAM */
 static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
 {
-	unsigned long paddr = gpa & uv_hub_info->gpa_mask;
+	unsigned long paddr;
 	unsigned long remap_base = uv_hub_info->lowmem_remap_base;
 	unsigned long remap_top =  uv_hub_info->lowmem_remap_top;
 
 	gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) |
 		((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val);
-	gpa = gpa & uv_hub_info->gpa_mask;
+	paddr = gpa & uv_hub_info->gpa_mask;
 	if (paddr >= remap_base && paddr < remap_base + remap_top)
 		paddr -= remap_base;
 	return paddr;
-- 
cgit v1.2.3


From d2ebc71d472020bc30e29afe8c4d2a85a5b41f56 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 18 Jan 2012 09:40:47 -0600
Subject: x86/uv: Fix uninitialized spinlocks

Initialize two spinlocks in tlb_uv.c and also properly define/initialize
the uv_irq_lock.

The lack of explicit initialization seems to be functionally
harmless, but it is diagnosed when these are turned on:

        CONFIG_DEBUG_SPINLOCK=y
        CONFIG_DEBUG_MUTEXES=y
        CONFIG_DEBUG_LOCK_ALLOC=y
        CONFIG_LOCKDEP=y

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/E1RnXd1-0003wU-PM@eag09.americas.sgi.com
[ Added the uv_irq_lock initialization fix by Dimitri Sivanich ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/uv/tlb_uv.c | 2 ++
 arch/x86/platform/uv/uv_irq.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 9be4cff00a2d..3ae0e61abd23 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1851,6 +1851,8 @@ static void __init init_per_cpu_tunables(void)
 		bcp->cong_reps			= congested_reps;
 		bcp->cong_period		= congested_period;
 		bcp->clocks_per_100_usec =	usec_2_cycles(100);
+		spin_lock_init(&bcp->queue_lock);
+		spin_lock_init(&bcp->uvhub_lock);
 	}
 }
 
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 374a05d8ad22..f25c2765a5c9 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -25,7 +25,7 @@ struct uv_irq_2_mmr_pnode{
 	int			irq;
 };
 
-static spinlock_t		uv_irq_lock;
+static DEFINE_SPINLOCK(uv_irq_lock);
 static struct rb_root		uv_irq_root;
 
 static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
-- 
cgit v1.2.3


From 3fe54564a61f72982032423d24041dca30617ca2 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@numascale-asia.com>
Date: Wed, 25 Jan 2012 14:35:49 +0800
Subject: x86/numachip: Drop unnecessary conflict with EDAC

EDAC detection no longer crashes multi-node systems, so don't
conflict on it with NumaChip.

Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
Cc: Steffen Persvold <sp@numascale.com>
Link: http://lkml.kernel.org/r/1327473349-28395-1-git-send-email-daniel@numascale-asia.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 864cc6e6ac8e..5bed94e189fa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -360,7 +360,6 @@ config X86_NUMACHIP
 	depends on NUMA
 	depends on SMP
 	depends on X86_X2APIC
-	depends on !EDAC_AMD64
 	---help---
 	  Adds support for Numascale NumaChip large-SMP systems. Needed to
 	  enable more than ~168 cores.
-- 
cgit v1.2.3


From 5067cf53cac9b36d42ebb3a45bb12259d0bc1e68 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 23 Jan 2012 23:34:59 +0100
Subject: x86/boot-image: Don't leak phdrs in
 arch/x86/boot/compressed/misc.c::Parse_elf()

We allocate memory with malloc(), but neglect to free it before
the variable 'phdrs' goes out of scope --> leak.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1201232332590.8772@swampdragon.chaosbits.net
[ Mostly harmless. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/compressed/misc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 3a19d04cebeb..7116dcba0c9e 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -321,6 +321,8 @@ static void parse_elf(void *output)
 		default: /* Ignore other PT_* */ break;
 		}
 	}
+
+	free(phdrs);
 }
 
 asmlinkage void decompress_kernel(void *rmode, memptr heap,
-- 
cgit v1.2.3


From 5d7244e7c984cecead412bde6395ce18618a4a37 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 5 Jan 2012 16:10:42 +0000
Subject: x86-64: Fix memset() to support sizes of 4Gb and above

While currently there doesn't appear to be any reachable in-tree
case where such large memory blocks may be passed to memset()
(alloc_bootmem() being the primary non-reachable one, as it gets
called with suitably large sizes in FLATMEM configurations), we
have recently hit the problem a second time in our Xen kernels.

Rather than working around it a second time, prevent others from
falling into the same trap by fixing this long standing
limitation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/memset_64.S | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 79bd454b78a3..2dcb3808cbda 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -19,16 +19,15 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemset_c:
 	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d
-	movl %edx,%ecx
-	shrl $3,%ecx
+	movq %rdx,%rcx
+	andl $7,%edx
+	shrq $3,%rcx
 	/* expand byte value  */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
+	imulq %rsi,%rax
 	rep stosq
-	movl %r8d,%ecx
+	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -50,7 +49,7 @@
 .Lmemset_c_e:
 	movq %rdi,%r9
 	movb %sil,%al
-	movl %edx,%ecx
+	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -61,12 +60,11 @@ ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
 	movq %rdi,%r10
-	movq %rdx,%r11
 
 	/* expand byte value  */
 	movzbl %sil,%ecx
 	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
+	imulq  %rcx,%rax
 
 	/* align dst */
 	movl  %edi,%r9d
@@ -75,13 +73,13 @@ ENTRY(__memset)
 	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
-	movl %r11d,%ecx
-	shrl $6,%ecx
+	movq  %rdx,%rcx
+	shrq  $6,%rcx
 	jz	 .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
-	decl   %ecx
+	decq  %rcx
 	movq  %rax,(%rdi)
 	movq  %rax,8(%rdi)
 	movq  %rax,16(%rdi)
@@ -97,7 +95,7 @@ ENTRY(__memset)
 	   to predict jump tables. */
 	.p2align 4
 .Lhandle_tail:
-	movl	%r11d,%ecx
+	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
 	jz 		.Lhandle_7
 	shrl	$3,%ecx
@@ -109,12 +107,11 @@ ENTRY(__memset)
 	jnz    .Lloop_8
 
 .Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
+	andl	$7,%edx
 	jz      .Lende
 	.p2align 4
 .Lloop_1:
-	decl    %ecx
+	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
 	jnz     .Lloop_1
@@ -125,13 +122,13 @@ ENTRY(__memset)
 
 	CFI_RESTORE_STATE
 .Lbad_alignment:
-	cmpq $7,%r11
+	cmpq $7,%rdx
 	jbe	.Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
-	subq %r8,%r11
+	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
-- 
cgit v1.2.3


From 652847aa449cfe364d40018849223f57f31a38e2 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 20 Jan 2012 17:38:23 +0100
Subject: x86/amd: Add missing feature flag for fam15h models 10h-1fh
 processors

That is the last one missing for those CPUs.

Others were recently added with commits

 fb215366b3c7320ac25dca766a0152df16534932
 (KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to guest)

and

 commit 969df4b82904a30fef19a67398a0c854d223ea67
 (x86: Report cpb and eff_freq_ro flags correctly)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Link: http://lkml.kernel.org/r/20120120163823.GC24508@alberich.amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 17c5d4bdee5e..8d67d428b0f9 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -159,6 +159,7 @@
 #define X86_FEATURE_WDT		(6*32+13) /* Watchdog timer */
 #define X86_FEATURE_LWP		(6*32+15) /* Light Weight Profiling */
 #define X86_FEATURE_FMA4	(6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE		(6*32+17) /* translation cache extension */
 #define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
 #define X86_FEATURE_TBM		(6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT	(6*32+22) /* topology extensions CPUID leafs */
-- 
cgit v1.2.3


From 5b68edc91cdc972c46f76f85eded7ffddc3ff5c2 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 20 Jan 2012 17:44:12 +0100
Subject: x86/microcode_amd: Add support for CPU family specific container
 files

We've decided to provide CPU family specific container files
(starting with CPU family 15h). E.g. for family 15h we have to
load microcode_amd_fam15h.bin instead of microcode_amd.bin

Rationale is that starting with family 15h patch size is larger
than 2KB which was hard coded as maximum patch size in various
microcode loaders (not just Linux).

Container files which include patches larger than 2KB cause
different kinds of trouble with such old patch loaders. Thus we
have to ensure that the default container file provides only
patches with size less than 2KB.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20120120164412.GD24508@alberich.amd.com
[ documented the naming convention and tidied the code a bit. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index fe86493f3ed1..ac0417be9131 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -311,13 +311,33 @@ out:
 	return state;
 }
 
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ *    amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Starting at family 15h they are in family specific firmware files:
+ *
+ *    amd-ucode/microcode_amd_fam15h.bin
+ *    amd-ucode/microcode_amd_fam16h.bin
+ *    ...
+ *
+ * These might be larger than 2K.
+ */
 static enum ucode_state request_microcode_amd(int cpu, struct device *device)
 {
-	const char *fw_name = "amd-ucode/microcode_amd.bin";
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
 	const struct firmware *fw;
 	enum ucode_state ret = UCODE_NFOUND;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	if (c->x86 >= 0x15)
+		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-	if (request_firmware(&fw, fw_name, device)) {
+	if (request_firmware(&fw, (const char *)fw_name, device)) {
 		pr_err("failed to load file %s\n", fw_name);
 		goto out;
 	}
-- 
cgit v1.2.3


From fc395b9291925b1880e0afc61274fe2f6ddc1269 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 26 Jan 2012 15:47:37 +0000
Subject: x86: Properly parenthesize cmpxchg() macro arguments

Quite oddly, all of the arguments passed through from the top
level macros to the second level which didn't need parentheses
had them, while the only expression (involving a parameter)
needing them didn't.

Very recently I got bitten by the lack thereof when using
something like "array + index" for the first operand, with
"array" being an array more narrow than int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F2183A9020000780006F3E6@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cmpxchg.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 0c9fa2745f13..b3b733262909 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -145,13 +145,13 @@ extern void __add_wrong_size(void)
 
 #ifdef __HAVE_ARCH_CMPXCHG
 #define cmpxchg(ptr, old, new)						\
-	__cmpxchg((ptr), (old), (new), sizeof(*ptr))
+	__cmpxchg(ptr, old, new, sizeof(*(ptr)))
 
 #define sync_cmpxchg(ptr, old, new)					\
-	__sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+	__sync_cmpxchg(ptr, old, new, sizeof(*(ptr)))
 
 #define cmpxchg_local(ptr, old, new)					\
-	__cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
+	__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
 #endif
 
 /*
-- 
cgit v1.2.3


From 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 26 Jan 2012 15:50:55 +0000
Subject: x86-64: Fix memcpy() to support sizes of 4Gb and above

While currently there doesn't appear to be any reachable in-tree
case where such large memory blocks may be passed to memcpy(),
we already had hit the problem in our Xen kernels. Just like
done recently for mmeset(), rather than working around it,
prevent others from falling into the same trap by fixing this
long standing limitation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F21846F020000780006F3FA@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/memcpy_64.S | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index efbf2a0ecdea..1235b04a9a60 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -27,9 +27,8 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemcpy_c:
 	movq %rdi, %rax
-
-	movl %edx, %ecx
-	shrl $3, %ecx
+	movq %rdx, %rcx
+	shrq $3, %rcx
 	andl $7, %edx
 	rep movsq
 	movl %edx, %ecx
@@ -48,8 +47,7 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemcpy_c_e:
 	movq %rdi, %rax
-
-	movl %edx, %ecx
+	movq %rdx, %rcx
 	rep movsb
 	ret
 .Lmemcpy_e_e:
@@ -60,10 +58,7 @@ ENTRY(memcpy)
 	CFI_STARTPROC
 	movq %rdi, %rax
 
-	/*
-	 * Use 32bit CMP here to avoid long NOP padding.
-	 */
-	cmp  $0x20, %edx
+	cmpq $0x20, %rdx
 	jb .Lhandle_tail
 
 	/*
@@ -72,7 +67,7 @@ ENTRY(memcpy)
 	 */
 	cmp  %dil, %sil
 	jl .Lcopy_backward
-	subl $0x20, %edx
+	subq $0x20, %rdx
 .Lcopy_forward_loop:
 	subq $0x20,	%rdx
 
@@ -91,7 +86,7 @@ ENTRY(memcpy)
 	movq %r11,	3*8(%rdi)
 	leaq 4*8(%rdi),	%rdi
 	jae  .Lcopy_forward_loop
-	addq $0x20,	%rdx
+	addl $0x20,	%edx
 	jmp  .Lhandle_tail
 
 .Lcopy_backward:
@@ -123,11 +118,11 @@ ENTRY(memcpy)
 	/*
 	 * Calculate copy position to head.
 	 */
-	addq $0x20,	%rdx
+	addl $0x20,	%edx
 	subq %rdx,	%rsi
 	subq %rdx,	%rdi
 .Lhandle_tail:
-	cmpq $16,	%rdx
+	cmpl $16,	%edx
 	jb   .Lless_16bytes
 
 	/*
@@ -144,7 +139,7 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_16bytes:
-	cmpq $8,	%rdx
+	cmpl $8,	%edx
 	jb   .Lless_8bytes
 	/*
 	 * Move data from 8 bytes to 15 bytes.
@@ -156,7 +151,7 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_8bytes:
-	cmpq $4,	%rdx
+	cmpl $4,	%edx
 	jb   .Lless_3bytes
 
 	/*
-- 
cgit v1.2.3


From 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 26 Jan 2012 15:55:32 +0000
Subject: x86-64: Handle byte-wise tail copying in memcpy() without a loop

While hard to measure, reducing the number of possibly/likely
mis-predicted branches can generally be expected to be slightly
better.

Other than apparent at the first glance, this also doesn't grow
the function size (the alignment gap to the next function just
gets smaller).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/memcpy_64.S | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1235b04a9a60..1c273be7c97e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -164,18 +164,19 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
-	je .Lend
+	subl $1, %edx
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %edx
-	jnz .Lloop_1
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
 	retq
-- 
cgit v1.2.3


From b3eea29c189a0e3e2ac921e85fabfa4989ee58d7 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Thu, 26 Jan 2012 17:32:04 +0000
Subject: x86/ioapic: Use legacy_pic to set correct gsi-irq mapping

Using compile time NR_LEGACY_IRQS causes the wrong gsi-irq
mapping on non-PC platforms, such as Moorestown. This patch uses
legacy_pic abstraction to set the correct number of legacy
interrupts at runtime. For Moorestown, nr_legacy_irqs = 0. We
have 1:1 mapping for gsi-irq even within the legacy irq range.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Link: http://lkml.kernel.org/n/tip-kzvj4xp9tmicuoqoh2w05iay@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1d..9e753663f0d1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 	} else {
 		u32 gsi = gsi_cfg->gsi_base + pin;
 
-		if (gsi >= NR_IRQS_LEGACY)
+		if (gsi >= legacy_pic->nr_legacy_irqs)
 			irq = gsi;
 		else
 			irq = gsi_top + gsi;
@@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_top + NR_IRQS_LEGACY;
+	nr = gsi_top + legacy_pic->nr_legacy_irqs;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.2.3


From d450c088fb00d5a744b1fe8648a488035a10a03c Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Thu, 26 Jan 2012 17:32:33 +0000
Subject: x86/mrst: Set ISA bus type for fake MP IRQs

We use MP IRQs for SFI presented timer interrupts, we should
also set mp_bus_not_pci for MP_ISA_BUS so that pin_2_irq mapping
is correct.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Link: http://lkml.kernel.org/n/tip-8h3rc1igpp8ir94aas69qmhk@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9e753663f0d1..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 	} else {
 		u32 gsi = gsi_cfg->gsi_base + pin;
 
-		if (gsi >= legacy_pic->nr_legacy_irqs)
+		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
 		else
 			irq = gsi_top + gsi;
@@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_top + legacy_pic->nr_legacy_irqs;
+	nr = gsi_top + NR_IRQS_LEGACY;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.2.3


From 1a8359e411eb5055405412a7da812dae63c64a55 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 26 Jan 2012 17:33:30 +0000
Subject: x86/mid: Remove Intel Moorestown

All production devices operate in the Oaktrail configuration
with legacy PC elements present and an ACPI BIOS. Continue
stripping out the Moorestown elements from the tree leaving
Medfield.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: jacob.jun.pan@linux.intel.com
Link: http://lkml.kernel.org/n/tip-fvm1hgpq99jln6l0fbek68ik@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                |  21 --
 arch/x86/include/asm/mrst.h     |   4 +-
 arch/x86/platform/mrst/Makefile |   1 -
 arch/x86/platform/mrst/mrst.c   |  64 ++--
 arch/x86/platform/mrst/pmu.c    | 817 ----------------------------------------
 arch/x86/platform/mrst/pmu.h    | 234 ------------
 6 files changed, 26 insertions(+), 1115 deletions(-)
 delete mode 100644 arch/x86/platform/mrst/pmu.c
 delete mode 100644 arch/x86/platform/mrst/pmu.h

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 864cc6e6ac8e..a13addbcbd5e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -418,27 +418,6 @@ if X86_WANT_INTEL_MID
 config X86_INTEL_MID
 	bool
 
-config X86_MRST
-       bool "Moorestown MID platform"
-	depends on PCI
-	depends on PCI_GOANY
-	depends on X86_IO_APIC
-	select X86_INTEL_MID
-	select SFI
-	select DW_APB_TIMER
-	select APB_TIMER
-	select I2C
-	select SPI
-	select INTEL_SCU_IPC
-	select X86_PLATFORM_DEVICES
-	---help---
-	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
-	  Internet Device(MID) platform. Moorestown consists of two chips:
-	  Lincroft (CPU core, graphics, and memory controller) and Langwell IOH.
-	  Unlike standard x86 PCs, Moorestown does not have many legacy devices
-	  nor standard legacy replacement devices/features. e.g. Moorestown does
-	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
-
 config X86_MDFLD
        bool "Medfield MID platform"
 	depends on PCI
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 0a0a95460434..fc18bf3ce7c8 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -26,8 +26,8 @@ extern struct sfi_rtc_table_entry sfi_mrtc_array[];
  * identified via MSRs.
  */
 enum mrst_cpu_type {
-	MRST_CPU_CHIP_LINCROFT = 1,
-	MRST_CPU_CHIP_PENWELL,
+	/* 1 was Moorestown */
+	MRST_CPU_CHIP_PENWELL = 2,
 };
 
 extern enum mrst_cpu_type __mrst_cpu_chip;
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 7baed5135e0f..af1da7e623f9 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,3 @@
 obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 obj-$(CONFIG_X86_INTEL_MID)	+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
-obj-$(CONFIG_X86_MRST)		+= pmu.o
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 475e2cd0f3c3..6743587575a6 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -78,16 +78,11 @@ int sfi_mrtc_num;
 
 static void mrst_power_off(void)
 {
-	if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
-		intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 1);
 }
 
 static void mrst_reboot(void)
 {
-	if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
-		intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
-	else
-		intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
+	intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
 }
 
 /* parse all the mtimer info to a static mtimer array */
@@ -200,34 +195,28 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 
 static unsigned long __init mrst_calibrate_tsc(void)
 {
-	unsigned long flags, fast_calibrate;
-	if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) {
-		u32 lo, hi, ratio, fsb;
-
-		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-		pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
-		ratio = (hi >> 8) & 0x1f;
-		pr_debug("ratio is %d\n", ratio);
-		if (!ratio) {
-			pr_err("read a zero ratio, should be incorrect!\n");
-			pr_err("force tsc ratio to 16 ...\n");
-			ratio = 16;
-		}
-		rdmsr(MSR_FSB_FREQ, lo, hi);
-		if ((lo & 0x7) == 0x7)
-			fsb = PENWELL_FSB_FREQ_83SKU;
-		else
-			fsb = PENWELL_FSB_FREQ_100SKU;
-		fast_calibrate = ratio * fsb;
-		pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
-		lapic_timer_frequency = fsb * 1000 / HZ;
-		/* mark tsc clocksource as reliable */
-		set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
-	} else {
-		local_irq_save(flags);
-		fast_calibrate = apbt_quick_calibrate();
-		local_irq_restore(flags);
+	unsigned long fast_calibrate;
+	u32 lo, hi, ratio, fsb;
+
+	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+	pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
+	ratio = (hi >> 8) & 0x1f;
+	pr_debug("ratio is %d\n", ratio);
+	if (!ratio) {
+		pr_err("read a zero ratio, should be incorrect!\n");
+		pr_err("force tsc ratio to 16 ...\n");
+		ratio = 16;
 	}
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+	if ((lo & 0x7) == 0x7)
+		fsb = PENWELL_FSB_FREQ_83SKU;
+	else
+		fsb = PENWELL_FSB_FREQ_100SKU;
+	fast_calibrate = ratio * fsb;
+	pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
+	lapic_timer_frequency = fsb * 1000 / HZ;
+	/* mark tsc clocksource as reliable */
+	set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
 	
 	if (fast_calibrate)
 		return fast_calibrate;
@@ -261,16 +250,11 @@ static void __cpuinit mrst_arch_setup(void)
 {
 	if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
 		__mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
-	else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
-		__mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
 	else {
-		pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
+		pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
-		__mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+		__mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
 	}
-	pr_debug("Moorestown CPU %s identified\n",
-		(__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
-		"Lincroft" : "Penwell");
 }
 
 /* MID systems don't have i8042 controller */
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
deleted file mode 100644
index c0ac06da57ac..000000000000
--- a/arch/x86/platform/mrst/pmu.c
+++ /dev/null
@@ -1,817 +0,0 @@
-/*
- * mrst/pmu.c - driver for MRST Power Management Unit
- *
- * Copyright (c) 2011, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#include <linux/cpuidle.h>
-#include <linux/debugfs.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/seq_file.h>
-#include <linux/sfi.h>
-#include <asm/intel_scu_ipc.h>
-#include "pmu.h"
-
-#define IPCMSG_FW_REVISION	0xF4
-
-struct mrst_device {
-	u16 pci_dev_num;	/* DEBUG only */
-	u16 lss;
-	u16 latest_request;
-	unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
-};
-
-/*
- * comlete list of MRST PCI devices
- */
-static struct mrst_device mrst_devs[] = {
-/*  0 */ { 0x0800, LSS_SPI0 },		/* Moorestown SPI Ctrl 0 */
-/*  1 */ { 0x0801, LSS_SPI1 },		/* Moorestown SPI Ctrl 1 */
-/*  2 */ { 0x0802, LSS_I2C0 },		/* Moorestown I2C 0 */
-/*  3 */ { 0x0803, LSS_I2C1 },		/* Moorestown I2C 1 */
-/*  4 */ { 0x0804, LSS_I2C2 },		/* Moorestown I2C 2 */
-/*  5 */ { 0x0805, LSS_KBD },		/* Moorestown Keyboard Ctrl */
-/*  6 */ { 0x0806, LSS_USB_HC },	/* Moorestown USB Ctrl */
-/*  7 */ { 0x0807, LSS_SD_HC0 },	/* Moorestown SD Host Ctrl 0 */
-/*  8 */ { 0x0808, LSS_SD_HC1 },	/* Moorestown SD Host Ctrl 1 */
-/*  9 */ { 0x0809, LSS_NAND },		/* Moorestown NAND Ctrl */
-/* 10 */ { 0x080a, LSS_AUDIO },		/* Moorestown Audio Ctrl */
-/* 11 */ { 0x080b, LSS_IMAGING },	/* Moorestown ISP */
-/* 12 */ { 0x080c, LSS_SECURITY },	/* Moorestown Security Controller */
-/* 13 */ { 0x080d, LSS_DISPLAY },	/* Moorestown External Displays */
-/* 14 */ { 0x080e, 0 },			/* Moorestown SCU IPC */
-/* 15 */ { 0x080f, LSS_GPIO },		/* Moorestown GPIO Controller */
-/* 16 */ { 0x0810, 0 },			/* Moorestown Power Management Unit */
-/* 17 */ { 0x0811, LSS_USB_OTG },	/* Moorestown OTG Ctrl */
-/* 18 */ { 0x0812, LSS_SPI2 },		/* Moorestown SPI Ctrl 2 */
-/* 19 */ { 0x0813, 0 },			/* Moorestown SC DMA */
-/* 20 */ { 0x0814, LSS_AUDIO_LPE },	/* Moorestown LPE DMA */
-/* 21 */ { 0x0815, LSS_AUDIO_SSP },	/* Moorestown SSP0 */
-
-/* 22 */ { 0x084F, LSS_SD_HC2 },	/* Moorestown SD Host Ctrl 2 */
-
-/* 23 */ { 0x4102, 0 },			/* Lincroft */
-/* 24 */ { 0x4110, 0 },			/* Lincroft */
-};
-
-/* n.b. We ignore PCI-id 0x815 in LSS9 b/c Linux has no driver for it */
-static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
-static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
-					0x0804, 0x0805, 0x080f, 0};
-
-/* handle concurrent SMP invokations of pmu_pci_set_power_state() */
-static spinlock_t mrst_pmu_power_state_lock;
-
-static unsigned int wake_counters[MRST_NUM_LSS];	/* DEBUG only */
-static unsigned int pmu_irq_stats[INT_INVALID + 1];	/* DEBUG only */
-
-static int graphics_is_off;
-static int lss_s0i3_enabled;
-static bool mrst_pmu_s0i3_enable;
-
-/*  debug counters */
-static u32 pmu_wait_ready_calls;
-static u32 pmu_wait_ready_udelays;
-static u32 pmu_wait_ready_udelays_max;
-static u32 pmu_wait_done_calls;
-static u32 pmu_wait_done_udelays;
-static u32 pmu_wait_done_udelays_max;
-static u32 pmu_set_power_state_entry;
-static u32 pmu_set_power_state_send_cmd;
-
-static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
-{
-	int index = 0;
-
-	if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
-		index = pci_dev_num - 0x800;
-	else if (pci_dev_num == 0x084F)
-		index = 22;
-	else if (pci_dev_num == 0x4102)
-		index = 23;
-	else if (pci_dev_num == 0x4110)
-		index = 24;
-
-	if (pci_dev_num != mrst_devs[index].pci_dev_num) {
-		WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
-		return 0;
-	}
-
-	return &mrst_devs[index];
-}
-
-/**
- * mrst_pmu_validate_cstates
- * @dev: cpuidle_device
- *
- * Certain states are not appropriate for governor to pick in some cases.
- * This function will be called as cpuidle_device's prepare callback and
- * thus tells governor to ignore such states when selecting the next state
- * to enter.
- */
-
-#define IDLE_STATE4_IS_C6	4
-#define IDLE_STATE5_IS_S0I3	5
-
-int mrst_pmu_invalid_cstates(void)
-{
-	int cpu = smp_processor_id();
-
-	/*
-	 * Demote to C4 if the PMU is busy.
-	 * Since LSS changes leave the busy bit clear...
-	 * busy means either the PMU is waiting for an ACK-C6 that
-	 * isn't coming due to an MWAIT that returned immediately;
-	 * or we returned from S0i3 successfully, and the PMU
-	 * is not done sending us interrupts.
-	 */
-	if (pmu_read_busy_status())
-		return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
-
-	/*
-	 * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
-	 * or if device LSS is insufficient, or the GPU is active,
-	 * or if it has been explicitly disabled.
-	 */
-	if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
-	    !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
-		return 1 << IDLE_STATE5_IS_S0I3;
-	else
-		return 0;
-}
-
-/*
- * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
- * DEBUG only.
- */
-static void pmu_update_wake_counters(void)
-{
-	int lss;
-	u32 wake_status;
-
-	wake_status = pmu_read_wks();
-
-	for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
-		if (wake_status & (1 << lss))
-			wake_counters[lss]++;
-	}
-}
-
-int mrst_pmu_s0i3_entry(void)
-{
-	int status;
-
-	/* Clear any possible error conditions */
-	pmu_write_ics(0x300);
-
-	/* set wake control to current D-states */
-	pmu_write_wssc(S0I3_SSS_TARGET);
-
-	status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
-	pmu_update_wake_counters();
-	return status;
-}
-
-/* poll for maximum of 5ms for busy bit to clear */
-static int pmu_wait_ready(void)
-{
-	int udelays;
-
-	pmu_wait_ready_calls++;
-
-	for (udelays = 0; udelays < 500; ++udelays) {
-		if (udelays > pmu_wait_ready_udelays_max)
-			pmu_wait_ready_udelays_max = udelays;
-
-		if (pmu_read_busy_status() == 0)
-			return 0;
-
-		udelay(10);
-		pmu_wait_ready_udelays++;
-	}
-
-	/*
-	 * if this fires, observe
-	 * /sys/kernel/debug/mrst_pmu_wait_ready_calls
-	 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
-	 */
-	WARN_ONCE(1, "SCU not ready for 5ms");
-	return -EBUSY;
-}
-/* poll for maximum of 50ms us for busy bit to clear */
-static int pmu_wait_done(void)
-{
-	int udelays;
-
-	pmu_wait_done_calls++;
-
-	for (udelays = 0; udelays < 500; ++udelays) {
-		if (udelays > pmu_wait_done_udelays_max)
-			pmu_wait_done_udelays_max = udelays;
-
-		if (pmu_read_busy_status() == 0)
-			return 0;
-
-		udelay(100);
-		pmu_wait_done_udelays++;
-	}
-
-	/*
-	 * if this fires, observe
-	 * /sys/kernel/debug/mrst_pmu_wait_done_calls
-	 * /sys/kernel/debug/mrst_pmu_wait_done_udelays
-	 */
-	WARN_ONCE(1, "SCU not done for 50ms");
-	return -EBUSY;
-}
-
-u32 mrst_pmu_msi_is_disabled(void)
-{
-	return pmu_msi_is_disabled();
-}
-
-void mrst_pmu_enable_msi(void)
-{
-	pmu_msi_enable();
-}
-
-/**
- * pmu_irq - pmu driver interrupt handler
- * Context: interrupt context
- */
-static irqreturn_t pmu_irq(int irq, void *dummy)
-{
-	union pmu_pm_ics pmu_ics;
-
-	pmu_ics.value = pmu_read_ics();
-
-	if (!pmu_ics.bits.pending)
-		return IRQ_NONE;
-
-	switch (pmu_ics.bits.cause) {
-	case INT_SPURIOUS:
-	case INT_CMD_DONE:
-	case INT_CMD_ERR:
-	case INT_WAKE_RX:
-	case INT_SS_ERROR:
-	case INT_S0IX_MISS:
-	case INT_NO_ACKC6:
-		pmu_irq_stats[pmu_ics.bits.cause]++;
-		break;
-	default:
-		pmu_irq_stats[INT_INVALID]++;
-	}
-
-	pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
-
-	return IRQ_HANDLED;
-}
-
-/*
- * Translate PCI power management to MRST LSS D-states
- */
-static int pci_2_mrst_state(int lss, pci_power_t pci_state)
-{
-	switch (pci_state) {
-	case PCI_D0:
-		if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
-			return D0i1;
-		else
-			return D0;
-	case PCI_D1:
-		return D0i1;
-	case PCI_D2:
-		return D0i2;
-	case PCI_D3hot:
-	case PCI_D3cold:
-		return D0i3;
-	default:
-		WARN(1, "pci_state %d\n", pci_state);
-		return 0;
-	}
-}
-
-static int pmu_issue_command(u32 pm_ssc)
-{
-	union pmu_pm_set_cfg_cmd_t command;
-
-	if (pmu_read_busy_status()) {
-		pr_debug("pmu is busy, Operation not permitted\n");
-		return -1;
-	}
-
-	/*
-	 * enable interrupts in PMU so that interrupts are
-	 * propagated when ioc bit for a particular set
-	 * command is set
-	 */
-
-	pmu_irq_enable();
-
-	/* Configure the sub systems for pmu2 */
-
-	pmu_write_ssc(pm_ssc);
-
-	/*
-	 * Send the set config command for pmu its configured
-	 * for mode CM_IMMEDIATE & hence with No Trigger
-	 */
-
-	command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
-	command.pmu2_params.d_param.cfg_delay = 0;
-	command.pmu2_params.d_param.rsvd = 0;
-
-	/* construct the command to send SET_CFG to particular PMU */
-	command.pmu2_params.d_param.cmd = SET_CFG_CMD;
-	command.pmu2_params.d_param.ioc = 0;
-	command.pmu2_params.d_param.mode_id = 0;
-	command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
-
-	/* write the value of PM_CMD into particular PMU */
-	pr_debug("pmu command being written %x\n",
-			command.pmu_pm_set_cfg_cmd_value);
-
-	pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
-
-	return 0;
-}
-
-static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
-{
-	u16 existing_request;
-	int i;
-
-	for (i = 0; ids[i]; ++i) {
-		struct mrst_device *mrst_dev;
-
-		mrst_dev = pci_id_2_mrst_dev(ids[i]);
-		if (unlikely(!mrst_dev))
-			continue;
-
-		existing_request = mrst_dev->latest_request;
-		if (existing_request < pci_state)
-			pci_state = existing_request;
-	}
-	return pci_state;
-}
-
-/**
- * pmu_pci_set_power_state - Callback function is used by all the PCI devices
- *			for a platform  specific device power on/shutdown.
- */
-
-int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
-{
-	u32 old_sss, new_sss;
-	int status = 0;
-	struct mrst_device *mrst_dev;
-
-	pmu_set_power_state_entry++;
-
-	BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
-	BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
-
-	mrst_dev = pci_id_2_mrst_dev(pdev->device);
-	if (unlikely(!mrst_dev))
-		return -ENODEV;
-
-	mrst_dev->pci_state_counts[pci_state]++;	/* count invocations */
-
-	/* PMU driver calls self as part of PCI initialization, ignore */
-	if (pdev->device == PCI_DEV_ID_MRST_PMU)
-		return 0;
-
-	BUG_ON(!pmu_reg); /* SW bug if called before initialized */
-
-	spin_lock(&mrst_pmu_power_state_lock);
-
-	if (pdev->d3_delay) {
-		dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
-			pdev->d3_delay);
-		pdev->d3_delay = 0;
-	}
-	/*
-	 * If Lincroft graphics, simply remember state
-	 */
-	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
-		&& !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
-		if (pci_state == PCI_D0)
-			graphics_is_off = 0;
-		else
-			graphics_is_off = 1;
-		goto ret;
-	}
-
-	if (!mrst_dev->lss)
-		goto ret;	/* device with no LSS */
-
-	if (mrst_dev->latest_request == pci_state)
-		goto ret;	/* no change */
-
-	mrst_dev->latest_request = pci_state;	/* record latest request */
-
-	/*
-	 * LSS9 and LSS10 contain multiple PCI devices.
-	 * Use the lowest numbered (highest power) state in the LSS
-	 */
-	if (mrst_dev->lss == 9)
-		pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
-	else if (mrst_dev->lss == 10)
-		pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
-
-	status = pmu_wait_ready();
-	if (status)
-		goto ret;
-
-	old_sss = pmu_read_sss();
-	new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
-	new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
-			mrst_dev->lss);
-
-	if (new_sss == old_sss)
-		goto ret;	/* nothing to do */
-
-	pmu_set_power_state_send_cmd++;
-
-	status = pmu_issue_command(new_sss);
-
-	if (unlikely(status != 0)) {
-		dev_err(&pdev->dev, "Failed to Issue a PM command\n");
-		goto ret;
-	}
-
-	if (pmu_wait_done())
-		goto ret;
-
-	lss_s0i3_enabled =
-	((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
-ret:
-	spin_unlock(&mrst_pmu_power_state_lock);
-	return status;
-}
-
-#ifdef CONFIG_DEBUG_FS
-static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
-
-static inline const char *d0ix_name(int state)
-{
-	return d0ix_names[(int) state];
-}
-
-static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
-{
-	struct pci_dev *pdev = NULL;
-	u32 cur_pmsss;
-	int lss;
-
-	seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
-
-	cur_pmsss = pmu_read_sss();
-
-	seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
-
-	seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
-	seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
-
-	if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
-		seq_printf(s, "cpu0 is only cpu online\n");
-	else
-		seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
-
-	seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
-
-
-	for_each_pci_dev(pdev) {
-		int pos;
-		u16 pmcsr;
-		struct mrst_device *mrst_dev;
-		int i;
-
-		mrst_dev = pci_id_2_mrst_dev(pdev->device);
-
-		seq_printf(s, "%s %04x/%04X %-16.16s ",
-			dev_name(&pdev->dev),
-			pdev->vendor, pdev->device,
-			dev_driver_string(&pdev->dev));
-
-		if (unlikely (!mrst_dev)) {
-			seq_printf(s, " UNKNOWN\n");
-			continue;
-		}
-
-		if (mrst_dev->lss)
-			seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
-				d0ix_name(((cur_pmsss >>
-					(mrst_dev->lss * 2)) & 0x3)));
-		else
-			seq_printf(s, "            ");
-
-		/* PCI PM config space setting */
-		pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
-		if (pos != 0) {
-			pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
-		seq_printf(s, "PCI-%-4s",
-			pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
-		} else {
-			seq_printf(s, "        ");
-		}
-
-		seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
-		for (i = 0; i <= PCI_D3cold; ++i)
-			seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
-
-		if (mrst_dev->lss) {
-			unsigned int lssmask;
-
-			lssmask = SSMSK(D0i3, mrst_dev->lss);
-
-			if ((lssmask & S0I3_SSS_TARGET) &&
-				((lssmask & cur_pmsss) !=
-					(lssmask & S0I3_SSS_TARGET)))
-						seq_printf(s , "[BLOCKS s0i3]");
-		}
-
-		seq_printf(s, "\n");
-	}
-	seq_printf(s, "Wake Counters:\n");
-	for (lss = 0; lss < MRST_NUM_LSS; ++lss)
-		seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
-
-	seq_printf(s, "Interrupt Counters:\n");
-	seq_printf(s,
-		"INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
-		"INT_CMD_ERR  \t%8u\n" "INT_WAKE_RX  \t%8u\n"
-		"INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
-		"INT_NO_ACKC6 \t%8u\n" "INT_INVALID  \t%8u\n",
-		pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
-		pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
-		pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
-		pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
-
-	seq_printf(s, "mrst_pmu_wait_ready_calls          %8d\n",
-			pmu_wait_ready_calls);
-	seq_printf(s, "mrst_pmu_wait_ready_udelays        %8d\n",
-			pmu_wait_ready_udelays);
-	seq_printf(s, "mrst_pmu_wait_ready_udelays_max    %8d\n",
-			pmu_wait_ready_udelays_max);
-	seq_printf(s, "mrst_pmu_wait_done_calls           %8d\n",
-			pmu_wait_done_calls);
-	seq_printf(s, "mrst_pmu_wait_done_udelays         %8d\n",
-			pmu_wait_done_udelays);
-	seq_printf(s, "mrst_pmu_wait_done_udelays_max     %8d\n",
-			pmu_wait_done_udelays_max);
-	seq_printf(s, "mrst_pmu_set_power_state_entry     %8d\n",
-			pmu_set_power_state_entry);
-	seq_printf(s, "mrst_pmu_set_power_state_send_cmd  %8d\n",
-			pmu_set_power_state_send_cmd);
-	seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
-
-	return 0;
-}
-
-static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, debug_mrst_pmu_show, NULL);
-}
-
-static const struct file_operations devices_state_operations = {
-	.open		= debug_mrst_pmu_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-#endif	/* DEBUG_FS */
-
-/*
- * Validate SCU PCI shim PCI vendor capability byte
- * against LSS hard-coded in mrst_devs[] above.
- * DEBUG only.
- */
-static void pmu_scu_firmware_debug(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	for_each_pci_dev(pdev) {
-		struct mrst_device *mrst_dev;
-		u8 pci_config_lss;
-		int pos;
-
-		mrst_dev = pci_id_2_mrst_dev(pdev->device);
-		if (unlikely(!mrst_dev)) {
-			printk(KERN_ERR FW_BUG "pmu: Unknown "
-				"PCI device 0x%04X\n", pdev->device);
-			continue;
-		}
-
-		if (mrst_dev->lss == 0)
-			continue;	 /* no LSS in our table */
-
-		pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
-		if (!pos != 0) {
-			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
-				"missing PCI Vendor Capability\n",
-				pdev->device);
-			continue;
-		}
-		pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
-		if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
-			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
-				"invalid PCI Vendor Capability 0x%x "
-				" expected LSS 0x%X\n",
-				pdev->device, pci_config_lss, mrst_dev->lss);
-			continue;
-		}
-		pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
-
-		if (mrst_dev->lss == pci_config_lss)
-			continue;
-
-		printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
-			pdev->device, pci_config_lss, mrst_dev->lss);
-	}
-}
-
-/**
- * pmu_probe
- */
-static int __devinit pmu_probe(struct pci_dev *pdev,
-				   const struct pci_device_id *pci_id)
-{
-	int ret;
-	struct mrst_pmu_reg *pmu;
-
-	/* Init the device */
-	ret = pci_enable_device(pdev);
-	if (ret) {
-		dev_err(&pdev->dev, "Unable to Enable PCI device\n");
-		return ret;
-	}
-
-	ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
-		goto out_err1;
-	}
-
-	/* Map the memory of PMU reg base */
-	pmu = pci_iomap(pdev, 0, 0);
-	if (!pmu) {
-		dev_err(&pdev->dev, "Unable to map the PMU address space\n");
-		ret = -ENOMEM;
-		goto out_err2;
-	}
-
-#ifdef CONFIG_DEBUG_FS
-	/* /sys/kernel/debug/mrst_pmu */
-	(void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
-				NULL, NULL, &devices_state_operations);
-#endif
-	pmu_reg = pmu;	/* success */
-
-	if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
-		dev_err(&pdev->dev, "Registering isr has failed\n");
-		ret = -1;
-		goto out_err3;
-	}
-
-	pmu_scu_firmware_debug();
-
-	pmu_write_wkc(S0I3_WAKE_SOURCES);	/* Enable S0i3 wakeup sources */
-
-	pmu_wait_ready();
-
-	pmu_write_ssc(D0I1_ACG_SSS_TARGET);	/* Enable Auto-Clock_Gating */
-	pmu_write_cmd(0x201);
-
-	spin_lock_init(&mrst_pmu_power_state_lock);
-
-	/* Enable the hardware interrupt */
-	pmu_irq_enable();
-	return 0;
-
-out_err3:
-	free_irq(pdev->irq, NULL);
-	pci_iounmap(pdev, pmu_reg);
-	pmu_reg = NULL;
-out_err2:
-	pci_release_region(pdev, 0);
-out_err1:
-	pci_disable_device(pdev);
-	return ret;
-}
-
-static void __devexit pmu_remove(struct pci_dev *pdev)
-{
-	dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
-
-	/* Freeing up the irq */
-	free_irq(pdev->irq, NULL);
-
-	pci_iounmap(pdev, pmu_reg);
-	pmu_reg = NULL;
-
-	/* disable the current PCI device */
-	pci_release_region(pdev, 0);
-	pci_disable_device(pdev);
-}
-
-static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
-	{ PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
-	{ }
-};
-
-MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
-
-static struct pci_driver driver = {
-	.name = MRST_PMU_DRV_NAME,
-	.id_table = pmu_pci_ids,
-	.probe = pmu_probe,
-	.remove = __devexit_p(pmu_remove),
-};
-
-/**
- * pmu_pci_register - register the PMU driver as PCI device
- */
-static int __init pmu_pci_register(void)
-{
-	return pci_register_driver(&driver);
-}
-
-/* Register and probe via fs_initcall() to preceed device_initcall() */
-fs_initcall(pmu_pci_register);
-
-static void __exit mid_pci_cleanup(void)
-{
-	pci_unregister_driver(&driver);
-}
-
-static int ia_major;
-static int ia_minor;
-
-static int pmu_sfi_parse_oem(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-
-	sb = (struct sfi_table_simple *)table;
-	ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
-	ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
-	printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
-		ia_major, ia_minor);
-
-	return 0;
-}
-
-static int __init scu_fw_check(void)
-{
-	int ret;
-	u32 fw_version;
-
-	if (!pmu_reg)
-		return 0;	/* this driver didn't probe-out */
-
-	sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
-
-	if (ia_major < 0x6005 || ia_minor < 0x1525) {
-		WARN(1, "mrst_pmu: IA FW version too old\n");
-		return -1;
-	}
-
-	ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
-					&fw_version, 1);
-
-	if (ret) {
-		WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
-	} else {
-		int scu_major = (fw_version >> 8) & 0xFF;
-		int scu_minor = (fw_version >> 0) & 0xFF;
-
-		printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
-
-		if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
-			printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
-			mrst_pmu_s0i3_enable = true;
-		} else {
-			WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
-					scu_major, scu_minor);
-		}
-	}
-	return 0;
-}
-late_initcall(scu_fw_check);
-module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
deleted file mode 100644
index bfbfe64b167b..000000000000
--- a/arch/x86/platform/mrst/pmu.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
- *
- * Copyright (c) 2011, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#ifndef _MRST_PMU_H_
-#define _MRST_PMU_H_
-
-#define PCI_DEV_ID_MRST_PMU		0x0810
-#define MRST_PMU_DRV_NAME		"mrst_pmu"
-#define	PCI_SUB_CLASS_MASK		0xFF00
-
-#define	PCI_VENDOR_CAP_LOG_ID_MASK	0x7F
-#define PCI_VENDOR_CAP_LOG_SS_MASK	0x80
-
-#define SUB_SYS_ALL_D0I1	0x01155555
-#define S0I3_WAKE_SOURCES	0x00001FFF
-
-#define PM_S0I3_COMMAND					\
-	((0 << 31) |	/* Reserved */			\
-	(0 << 30) |	/* Core must be idle */		\
-	(0xc2 << 22) |	/* ACK C6 trigger */		\
-	(3 << 19) |	/* Trigger on DMI message */	\
-	(3 << 16) |	/* Enter S0i3 */		\
-	(0 << 13) |	/* Numeric mode ID (sw) */	\
-	(3 << 9) |	/* Trigger mode */		\
-	(0 << 8) |	/* Do not interrupt */		\
-	(1 << 0))	/* Set configuration */
-
-#define	LSS_DMI		0
-#define	LSS_SD_HC0	1
-#define	LSS_SD_HC1	2
-#define	LSS_NAND	3
-#define	LSS_IMAGING	4
-#define	LSS_SECURITY	5
-#define	LSS_DISPLAY	6
-#define	LSS_USB_HC	7
-#define	LSS_USB_OTG	8
-#define	LSS_AUDIO	9
-#define	LSS_AUDIO_LPE	9
-#define	LSS_AUDIO_SSP	9
-#define	LSS_I2C0	10
-#define	LSS_I2C1	10
-#define	LSS_I2C2	10
-#define	LSS_KBD		10
-#define	LSS_SPI0	10
-#define	LSS_SPI1	10
-#define	LSS_SPI2	10
-#define	LSS_GPIO	10
-#define	LSS_SRAM	11	/* used by SCU, do not touch */
-#define	LSS_SD_HC2	12
-/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
-#define MRST_NUM_LSS	13
-
-#define MIN(a, b) (((a) < (b)) ? (a) : (b))
-
-#define	SSMSK(mask, lss) ((mask) << ((lss) * 2))
-#define	D0	0
-#define	D0i1	1
-#define	D0i2	2
-#define	D0i3	3
-
-#define S0I3_SSS_TARGET	(		\
-	SSMSK(D0i1, LSS_DMI) |		\
-	SSMSK(D0i3, LSS_SD_HC0) |	\
-	SSMSK(D0i3, LSS_SD_HC1) |	\
-	SSMSK(D0i3, LSS_NAND) |		\
-	SSMSK(D0i3, LSS_SD_HC2) |	\
-	SSMSK(D0i3, LSS_IMAGING) |	\
-	SSMSK(D0i3, LSS_SECURITY) |	\
-	SSMSK(D0i3, LSS_DISPLAY) |	\
-	SSMSK(D0i3, LSS_USB_HC) |	\
-	SSMSK(D0i3, LSS_USB_OTG) |	\
-	SSMSK(D0i3, LSS_AUDIO) |	\
-	SSMSK(D0i1, LSS_I2C0))
-
-/*
- * D0i1 on Langwell is Autonomous Clock Gating (ACG).
- * Enable ACG on every LSS except camera and audio
- */
-#define D0I1_ACG_SSS_TARGET	 \
-	(SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
-
-enum cm_mode {
-	CM_NOP,			/* ignore the config mode value */
-	CM_IMMEDIATE,
-	CM_DELAY,
-	CM_TRIGGER,
-	CM_INVALID
-};
-
-enum sys_state {
-	SYS_STATE_S0I0,
-	SYS_STATE_S0I1,
-	SYS_STATE_S0I2,
-	SYS_STATE_S0I3,
-	SYS_STATE_S3,
-	SYS_STATE_S5
-};
-
-#define SET_CFG_CMD	1
-
-enum int_status {
-	INT_SPURIOUS = 0,
-	INT_CMD_DONE = 1,
-	INT_CMD_ERR = 2,
-	INT_WAKE_RX = 3,
-	INT_SS_ERROR = 4,
-	INT_S0IX_MISS = 5,
-	INT_NO_ACKC6 = 6,
-	INT_INVALID = 7,
-};
-
-/* PMU register interface */
-static struct mrst_pmu_reg {
-	u32 pm_sts;		/* 0x00 */
-	u32 pm_cmd;		/* 0x04 */
-	u32 pm_ics;		/* 0x08 */
-	u32 _resv1;		/* 0x0C */
-	u32 pm_wkc[2];		/* 0x10 */
-	u32 pm_wks[2];		/* 0x18 */
-	u32 pm_ssc[4];		/* 0x20 */
-	u32 pm_sss[4];		/* 0x30 */
-	u32 pm_wssc[4];		/* 0x40 */
-	u32 pm_c3c4;		/* 0x50 */
-	u32 pm_c5c6;		/* 0x54 */
-	u32 pm_msi_disable;	/* 0x58 */
-} *pmu_reg;
-
-static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
-static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
-static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
-static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
-
-static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
-static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
-static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
-static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
-static inline void pmu_write_wssc(u32 arg)
-					{ writel(arg, &pmu_reg->pm_wssc[0]); }
-
-static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
-static inline u32 pmu_msi_is_disabled(void)
-				{ return readl(&pmu_reg->pm_msi_disable); }
-
-union pmu_pm_ics {
-	struct {
-		u32 cause:8;
-		u32 enable:1;
-		u32 pending:1;
-		u32 reserved:22;
-	} bits;
-	u32 value;
-};
-
-static inline void pmu_irq_enable(void)
-{
-	union pmu_pm_ics pmu_ics;
-
-	pmu_ics.value = pmu_read_ics();
-	pmu_ics.bits.enable = 1;
-	pmu_write_ics(pmu_ics.value);
-}
-
-union pmu_pm_status {
-	struct {
-		u32 pmu_rev:8;
-		u32 pmu_busy:1;
-		u32 mode_id:4;
-		u32 Reserved:19;
-	} pmu_status_parts;
-	u32 pmu_status_value;
-};
-
-static inline int pmu_read_busy_status(void)
-{
-	union pmu_pm_status result;
-
-	result.pmu_status_value = pmu_read_sts();
-
-	return result.pmu_status_parts.pmu_busy;
-}
-
-/* pmu set config parameters */
-struct cfg_delay_param_t {
-	u32 cmd:8;
-	u32 ioc:1;
-	u32 cfg_mode:4;
-	u32 mode_id:3;
-	u32 sys_state:3;
-	u32 cfg_delay:8;
-	u32 rsvd:5;
-};
-
-struct cfg_trig_param_t {
-	u32 cmd:8;
-	u32 ioc:1;
-	u32 cfg_mode:4;
-	u32 mode_id:3;
-	u32 sys_state:3;
-	u32 cfg_trig_type:3;
-	u32 cfg_trig_val:8;
-	u32 cmbi:1;
-	u32 rsvd1:1;
-};
-
-union pmu_pm_set_cfg_cmd_t {
-	union {
-		struct cfg_delay_param_t d_param;
-		struct cfg_trig_param_t t_param;
-	} pmu2_params;
-	u32 pmu_pm_set_cfg_cmd_value;
-};
-
-#ifdef FUTURE_PATCH
-extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
-#else
-static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
-#endif
-#endif
-- 
cgit v1.2.3


From 15a713df4145ad2540f8d84c3f4de930806f6151 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 26 Jan 2012 17:35:05 +0000
Subject: x86/config: Select MSIC MFD driver on Intel Medfield platform

On Intel Medfield platform we use MSIC MFD driver to create
necessary platform devices so it is essential to have the driver
compiled into the kernel.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: jacob.jun.pan@linux.intel.com
Link: http://lkml.kernel.org/n/tip-7hp1otk4wf4mg5pqohcwt06w@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a13addbcbd5e..c0d49316a63d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -431,6 +431,7 @@ config X86_MDFLD
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
+	select MFD_INTEL_MSIC
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.2.3


From ecfdb0ac15ba983ba4ff11709fdf8f178c0b8b87 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 26 Jan 2012 17:36:08 +0000
Subject: x86/mrst: Add msic_thermal platform support

This will let the MSIC driver to create platform device for the
thermal driver.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: jacob.jun.pan@linux.intel.com
Link: http://lkml.kernel.org/n/tip-rh1jaft9tjpzfql76gd56h1q@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 6743587575a6..721e65285dce 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -670,6 +670,11 @@ static void *msic_ocd_platform_data(void *info)
 	return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
 }
 
+static void *msic_thermal_platform_data(void *info)
+{
+	return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
+}
+
 static const struct devs_id __initconst device_ids[] = {
 	{"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
 	{"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
@@ -689,6 +694,7 @@ static const struct devs_id __initconst device_ids[] = {
 	{"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data},
 	{"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data},
 	{"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data},
+	{"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data},
 
 	{},
 };
-- 
cgit v1.2.3


From b0f4c4b32c8e3aa0d44fc4dd6c40a9a9a8d66b63 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Thu, 26 Jan 2012 08:55:34 -0500
Subject: bugs, x86: Fix printk levels for panic, softlockups and stack dumps

rsyslog will display KERN_EMERG messages on a connected
terminal.  However, these messages are useless/undecipherable
for a general user.

For example, after a softlockup we get:

 Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ...
 kernel:Stack:

 Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ...
 kernel:Call Trace:

 Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ...
 kernel:Code: ff ff a8 08 75 25 31 d2 48 8d 86 38 e0 ff ff 48 89
 d1 0f 01 c8 0f ae f0 48 8b 86 38 e0 ff ff a8 08 75 08 b1 01 4c 89 e0 0f 01 c9 <e8> ea 69 dd ff 4c 29 e8 48 89 c7 e8 0f bc da ff 49 89 c4 49 89

This happens because the printk levels for these messages are
incorrect. Only an informational message should be displayed on
a terminal.

I modified the printk levels for various messages in the kernel
and tested the output by using the drivers/misc/lkdtm.c kernel
modules (ie, softlockups, panics, hard lockups, etc.) and
confirmed that the console output was still the same and that
the output to the terminals was correct.

For example, in the case of a softlockup we now see the much
more informative:

 Message from syslogd@intel-s3e37-04 at Jan 25 10:18:06 ...
 BUG: soft lockup - CPU4 stuck for 60s!

instead of the above confusing messages.

AFAICT, the messages no longer have to be KERN_EMERG.  In the
most important case of a panic we set console_verbose().  As for
the other less severe cases the correct data is output to the
console and /var/log/messages.

Successfully tested by me using the drivers/misc/lkdtm.c module.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: dzickus@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1327586134-11926-1-git-send-email-prarit@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack.c    | 3 ++-
 arch/x86/kernel/dumpstack_64.c | 6 +++---
 arch/x86/mm/fault.c            | 4 ++--
 kernel/watchdog.c              | 2 +-
 lib/bug.c                      | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1aae78f775fc..4025fe4f928f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	unsigned short ss;
 	unsigned long sp;
 #endif
-	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+	printk(KERN_DEFAULT
+	       "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
 #ifdef CONFIG_PREEMPT
 	printk("PREEMPT ");
 #endif
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6d728d9284bd..42b2bca0b72c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
-		printk(KERN_EMERG "Stack:\n");
+		printk(KERN_DEFAULT "Stack:\n");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-				   0, KERN_EMERG);
+				   0, KERN_DEFAULT);
 
-		printk(KERN_EMERG "Code: ");
+		printk(KERN_DEFAULT "Code: ");
 
 		ip = (u8 *)regs->ip - code_prologue;
 		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d74824a708d..f0b4caf85c1a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 	stackend = end_of_stack(tsk);
 	if (tsk != &init_task && *stackend != STACK_END_MAGIC)
-		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
 	tsk->thread.cr2		= address;
 	tsk->thread.trap_no	= 14;
@@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		sig = 0;
 
 	/* Executive summary in case the body of the oops scrolled away */
-	printk(KERN_EMERG "CR2: %016lx\n", address);
+	printk(KERN_DEFAULT "CR2: %016lx\n", address);
 
 	oops_end(flags, regs, sig);
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1d7bca7f4f52..d117262deba3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		if (__this_cpu_read(soft_watchdog_warn) == true)
 			return HRTIMER_RESTART;
 
-		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
 		print_modules();
diff --git a/lib/bug.c b/lib/bug.c
index 19552096d16b..a28c1415357c 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -169,7 +169,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 		return BUG_TRAP_TYPE_WARN;
 	}
 
-	printk(KERN_EMERG "------------[ cut here ]------------\n");
+	printk(KERN_DEFAULT "------------[ cut here ]------------\n");
 
 	if (file)
 		printk(KERN_CRIT "kernel BUG at %s:%u!\n",
-- 
cgit v1.2.3


From 08dda402d60a721ac94e79efd7646b332be3e3b2 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 26 Jan 2012 16:02:22 -0800
Subject: x86/mce: Replace hard coded hex constants with symbolic defines

Magic constants like 0x0134 in code just invite questions on
where they come from, what they mean, can they be changed.

Provide #defines for the architecturally defined MCACOD values
with a reference to the Intel Software Developers manual which
describes them.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index f6c92f99efa0..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -56,6 +56,12 @@ static struct severity {
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK	0xfff0
+#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
+#define MCACOD_DATA	0x0134	/* Data Load */
+#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 	MCESEV(
 		NO, "Invalid",
@@ -112,12 +118,12 @@ static struct severity {
 #ifdef	CONFIG_MEMORY_FAILURE
 	MCESEV(
 		KEEP, "HT thread notices Action required: data load error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134),
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		MCGMASK(MCG_STATUS_EIPV, 0)
 		),
 	MCESEV(
 		AR, "Action required: data load error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134),
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
 		USER
 		),
 #endif
@@ -129,11 +135,11 @@ static struct severity {
 	/* known AO MCACODs: */
 	MCESEV(
 		AO, "Action optional: memory scrubbing error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
 		),
 	MCESEV(
 		AO, "Action optional: last level cache writeback error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
 		),
 	MCESEV(
 		SOME, "Action optional: unknown MCACOD",
-- 
cgit v1.2.3


From 644e9cbbe3fc032cc92d0936057e166a994dc246 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 26 Jan 2012 00:09:05 +0100
Subject: Add driver auto probing for x86 features v4

There's a growing number of drivers that support a specific x86 feature
or CPU.  Currently loading these drivers currently on a generic
distribution requires various driver specific hacks and it often
doesn't work.

This patch adds auto probing for drivers based on the x86 cpuid
information, in particular based on vendor/family/model number
and also based on CPUID feature bits.

For example a common issue is not loading the SSE 4.2 accelerated
CRC module: this can significantly lower the performance of BTRFS
which relies on fast CRC.

Another issue is loading the right CPUFREQ driver for the current CPU.
Currently distributions often try all all possible driver until
one sticks, which is not really a good way to do this.

It works with existing udev without any changes. The code
exports the x86 information as a generic string in sysfs
that can be matched by udev's pattern matching.

This scheme does not support numeric ranges, so if you want to
handle e.g. ranges of model numbers they have to be encoded
in ASCII or simply all models or families listed. Fixing
that would require changing udev.

Another issue is that udev will happily load all drivers that match,
there is currently no nice way to stop a specific driver from
being loaded if it's not needed (e.g. if you don't need fast CRC)
But there are not that many cpu specific drivers around and they're
all not that bloated, so this isn't a particularly serious issue.

Originally this patch added the modalias to the normal cpu
sysdevs. However sysdevs don't have all the infrastructure
needed for udev, so it couldn't really autoload drivers.
This patch instead adds the CPU modaliases to the cpuid devices,
which are real devices with full support for udev. This implies
that the cpuid driver has to be loaded to use this.

This patch just adds infrastructure, some driver conversions
in followups.

Thanks to Kay for helping with some sysfs magic.

v2: Constifcation, some updates
v4: (trenn@suse.de):
    - Use kzalloc instead of kmalloc to terminate modalias buffer
    - Use uppercase hex values to match correctly against hex values containing
      letters

Cc: Dave Jones <davej@redhat.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jen Axboe <axboe@kernel.dk>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/include/asm/cpu_device_id.h | 13 ++++++++
 arch/x86/kernel/cpu/Makefile         |  1 +
 arch/x86/kernel/cpu/match.c          | 48 +++++++++++++++++++++++++++++
 arch/x86/kernel/cpuid.c              | 59 +++++++++++++++++++++++++++++++++++-
 include/linux/mod_devicetable.h      | 21 +++++++++++++
 scripts/mod/file2alias.c             | 24 +++++++++++++++
 6 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/cpu_device_id.h
 create mode 100644 arch/x86/kernel/cpu/match.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
new file mode 100644
index 000000000000..ff501e511d91
--- /dev/null
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -0,0 +1,13 @@
+#ifndef _CPU_DEVICE_ID
+#define _CPU_DEVICE_ID 1
+
+/*
+ * Declare drivers belonging to specific x86 CPUs
+ * Similar in spirit to pci_device_id and related PCI functions
+ */
+
+#include <linux/mod_devicetable.h>
+
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
+
+#endif
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dccdcfa..6ab6aa2fdfdd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 obj-y			+= rdrand.o
+obj-y			+= match.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..7acc961422e7
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,48 @@
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ *         {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL.  The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+	const struct x86_cpu_id *m;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+		if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+			continue;
+		if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+			continue;
+		if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+			continue;
+		if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+			continue;
+		return m;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353d93f2..7c89880eefd0 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
 #include <linux/notifier.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/slab.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -138,13 +139,57 @@ static const struct file_operations cpuid_fops = {
 	.open = cpuid_open,
 };
 
+static ssize_t print_cpu_modalias(struct device *dev,
+				  struct device_attribute *attr,
+				  char *bufptr)
+{
+	int size = PAGE_SIZE;
+	int i, n;
+	char *buf = bufptr;
+
+	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:"
+		     "%04X:model:%04X:feature:",
+		boot_cpu_data.x86_vendor,
+		boot_cpu_data.x86,
+		boot_cpu_data.x86_model);
+	size -= n;
+	buf += n;
+	size -= 2;
+	for (i = 0; i < NCAPINTS*32; i++) {
+		if (boot_cpu_has(i)) {
+			n = snprintf(buf, size, ",%04X", i);
+			if (n < 0) {
+				WARN(1, "x86 features overflow page\n");
+				break;
+			}
+			size -= n;
+			buf += n;
+		}
+	}
+	*buf++ = ',';
+	*buf++ = '\n';
+	return buf - bufptr;
+}
+
+static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL);
+
 static __cpuinit int cpuid_device_create(int cpu)
 {
 	struct device *dev;
+	int err;
 
 	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
 			    "cpu%d", cpu);
-	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = device_create_file(dev, &dev_attr_modalias);
+	if (err) {
+		/* keep device around on error. attribute is optional. */
+		err = 0;
+	}
+
+	return 0;
 }
 
 static void cpuid_device_destroy(int cpu)
@@ -182,6 +227,17 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
 }
 
+static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf) {
+		print_cpu_modalias(NULL, NULL, buf);
+		add_uevent_var(env, "MODALIAS=%s", buf);
+		kfree(buf);
+	}
+	return 0;
+}
+
 static int __init cpuid_init(void)
 {
 	int i, err = 0;
@@ -200,6 +256,7 @@ static int __init cpuid_init(void)
 		goto out_chrdev;
 	}
 	cpuid_class->devnode = cpuid_devnode;
+	cpuid_class->dev_uevent = cpuid_dev_uevent;
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
 		if (err != 0)
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b29e7f6f8fa5..cff2cc08f45a 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -571,4 +571,25 @@ struct amba_id {
 #endif
 };
 
+/*
+ * Match x86 CPUs for CPU specific drivers.
+ * See documentation of "x86_match_cpu" for details.
+ */
+
+struct x86_cpu_id {
+	__u16 vendor;
+	__u16 family;
+	__u16 model;
+	__u16 feature;	/* bit index */
+	kernel_ulong_t driver_data;
+};
+
+#define X86_FEATURE_MATCH(x) \
+	{ X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x }
+
+#define X86_VENDOR_ANY 0xffff
+#define X86_FAMILY_ANY 0
+#define X86_MODEL_ANY  0
+#define X86_FEATURE_ANY 0	/* Same as FPU, you can't test for that */
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index c0e14b3f2306..026ba38759ca 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1013,6 +1013,30 @@ static int do_amba_entry(const char *filename,
 }
 ADD_TO_DEVTABLE("amba", struct amba_id, do_amba_entry);
 
+/* LOOKS like x86cpu:vendor:VVVV:family:FFFF:model:MMMM:feature:*,FEAT,*
+ * All fields are numbers. It would be nicer to use strings for vendor
+ * and feature, but getting those out of the build system here is too
+ * complicated.
+ */
+
+static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id,
+			   char *alias)
+{
+	id->feature = TO_NATIVE(id->feature);
+	id->family = TO_NATIVE(id->family);
+	id->model = TO_NATIVE(id->model);
+	id->vendor = TO_NATIVE(id->vendor);
+
+	strcpy(alias, "x86cpu:");
+	ADD(alias, "vendor:",  id->vendor != X86_VENDOR_ANY, id->vendor);
+	ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family);
+	ADD(alias, ":model:",  id->model  != X86_MODEL_ANY,  id->model);
+	ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature);
+	strcat(alias, ",*");
+	return 1;
+}
+ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry);
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
-- 
cgit v1.2.3


From 3bd391f056df61e928de1680ff4a3e7e07e5b399 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 26 Jan 2012 00:09:06 +0100
Subject: crypto: Add support for x86 cpuid auto loading for x86 crypto drivers

Add support for auto-loading of crypto drivers based on cpuid features.
This enables auto-loading of the VIA and Intel specific drivers
for AES, hashing and CRCs.

Requires the earlier infrastructure patch to add x86 modinfo.
I kept it all in a single patch for now.

I dropped the printks when the driver cpuid doesn't match (imho
drivers never should print anything in such a case)

One drawback is that udev doesn't know if the drivers are used or not,
so they will be unconditionally loaded at boot up. That's better
than not loading them at all, like it often happens.

Cc: Dave Jones <davej@redhat.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jen Axboe <axboe@kernel.dk>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/crypto/aesni-intel_glue.c         | 12 +++++++++---
 arch/x86/crypto/crc32c-intel.c             | 11 ++++++++---
 arch/x86/crypto/ghash-clmulni-intel_glue.c | 12 ++++++++----
 drivers/crypto/padlock-aes.c               |  9 ++++++++-
 drivers/crypto/padlock-sha.c               | 16 ++++++++--------
 5 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 545d0ce59818..b3350bd32c60 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -28,6 +28,7 @@
 #include <crypto/aes.h>
 #include <crypto/cryptd.h>
 #include <crypto/ctr.h>
+#include <asm/cpu_device_id.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
 #include <crypto/scatterwalk.h>
@@ -1253,14 +1254,19 @@ static struct crypto_alg __rfc4106_alg = {
 };
 #endif
 
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_AES),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
 static int __init aesni_init(void)
 {
 	int err;
 
-	if (!cpu_has_aes) {
-		printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
+	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
-	}
 
 	if ((err = crypto_fpu_init()))
 		goto fpu_err;
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
index b9d00261703c..493f959261f7 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -31,6 +31,7 @@
 #include <crypto/internal/hash.h>
 
 #include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
@@ -173,13 +174,17 @@ static struct shash_alg alg = {
 	}
 };
 
+static const struct x86_cpu_id crc32c_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
 
 static int __init crc32c_intel_mod_init(void)
 {
-	if (cpu_has_xmm4_2)
-		return crypto_register_shash(&alg);
-	else
+	if (!x86_match_cpu(crc32c_cpu_id))
 		return -ENODEV;
+	return crypto_register_shash(&alg);
 }
 
 static void __exit crc32c_intel_mod_fini(void)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 976aa64d9a20..b4bf0a63b520 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -20,6 +20,7 @@
 #include <crypto/gf128mul.h>
 #include <crypto/internal/hash.h>
 #include <asm/i387.h>
+#include <asm/cpu_device_id.h>
 
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
@@ -294,15 +295,18 @@ static struct ahash_alg ghash_async_alg = {
 	},
 };
 
+static const struct x86_cpu_id pcmul_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
 static int __init ghash_pclmulqdqni_mod_init(void)
 {
 	int err;
 
-	if (!cpu_has_pclmulqdq) {
-		printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
-		       " detected.\n");
+	if (!x86_match_cpu(pcmul_cpu_id))
 		return -ENODEV;
-	}
 
 	err = crypto_register_shash(&ghash_alg);
 	if (err)
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 29b9469f8378..37b2e9406af6 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -19,6 +19,7 @@
 #include <linux/percpu.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
+#include <asm/cpu_device_id.h>
 #include <asm/byteorder.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
@@ -503,12 +504,18 @@ static struct crypto_alg cbc_aes_alg = {
 	}
 };
 
+static struct x86_cpu_id padlock_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_XCRYPT),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, padlock_cpu_id);
+
 static int __init padlock_init(void)
 {
 	int ret;
 	struct cpuinfo_x86 *c = &cpu_data(0);
 
-	if (!cpu_has_xcrypt)
+	if (!x86_match_cpu(padlock_cpu_id))
 		return -ENODEV;
 
 	if (!cpu_has_xcrypt_enabled) {
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index 06bdb4b2c6a6..9266c0e25492 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/scatterlist.h>
+#include <asm/cpu_device_id.h>
 #include <asm/i387.h>
 
 struct padlock_sha_desc {
@@ -526,6 +527,12 @@ static struct shash_alg sha256_alg_nano = {
 	}
 };
 
+static struct x86_cpu_id padlock_sha_ids[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_PHE),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, padlock_sha_ids);
+
 static int __init padlock_init(void)
 {
 	int rc = -ENODEV;
@@ -533,15 +540,8 @@ static int __init padlock_init(void)
 	struct shash_alg *sha1;
 	struct shash_alg *sha256;
 
-	if (!cpu_has_phe) {
-		printk(KERN_NOTICE PFX "VIA PadLock Hash Engine not detected.\n");
-		return -ENODEV;
-	}
-
-	if (!cpu_has_phe_enabled) {
-		printk(KERN_NOTICE PFX "VIA PadLock detected, but not enabled. Hmm, strange...\n");
+	if (!x86_match_cpu(padlock_sha_ids) || !cpu_has_phe_enabled)
 		return -ENODEV;
-	}
 
 	/* Register the newly added algorithm module if on *
 	* VIA Nano processor, or else just do as before */
-- 
cgit v1.2.3


From 2f1e097e24defe64a86535b53768f5c8ab0368d1 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 26 Jan 2012 00:09:11 +0100
Subject: X86: Introduce HW-Pstate scattered cpuid feature

It is rather similar to CPB (boot capability) feature
and exists since fam10h (can be looked up in AMD's BKDG).

The feature is needed for powernow-k8 to cleanup init functions and to
provide proper autoloading matching with the new x86cpu modalias
feature.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Borislav Petkov <bp@amd64.org>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 arch/x86/kernel/cpu/scattered.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 17c5d4bdee5e..67b0910ebbb8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,6 +176,7 @@
 #define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */
 #define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */
 #define X86_FEATURE_DTS		(7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_HW_PSTATE	(7*32+ 8) /* AMD HW-PState */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6f537a..addf9e82a7f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
 		{ X86_FEATURE_XSAVEOPT,		CR_EAX,	0, 0x0000000d, 1 },
 		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
+		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
 		{ X86_FEATURE_NPT,		CR_EDX, 0, 0x8000000a, 0 },
 		{ X86_FEATURE_LBRV,		CR_EDX, 1, 0x8000000a, 0 },
 		{ X86_FEATURE_SVML,		CR_EDX, 2, 0x8000000a, 0 },
-- 
cgit v1.2.3


From 78ff123b05fb15beb1ad670372eea0d299d0b8af Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 26 Jan 2012 00:09:13 +0100
Subject: x86: autoload microcode driver on Intel and AMD systems v2

Don't try to describe the actual models for now.

v2: Fix typo: X86_VENDOR_ANY -> X86_FAMILY_ANY (trenn)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/kernel/microcode_core.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fda91c307104..87a0f8688301 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -86,6 +86,7 @@
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
+#include <asm/cpu_device_id.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = {
 	.notifier_call	= mc_cpu_callback,
 };
 
+#ifdef MODULE
+/* Autoload on Intel and AMD systems */
+static const struct x86_cpu_id microcode_id[] = {
+#ifdef CONFIG_MICROCODE_INTEL
+	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+#ifdef CONFIG_MICROCODE_AMD
+	{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, microcode_id);
+#endif
+
 static int __init microcode_init(void)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
-- 
cgit v1.2.3


From fad12ac8c8c2591c7f4e61d19b6a9d76cd49fafa Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 26 Jan 2012 00:09:14 +0100
Subject: CPU: Introduce ARCH_HAS_CPU_AUTOPROBE and X86 parts

This patch is based on Andi Kleen's work:
Implement autoprobing/loading of modules serving CPU
specific features (x86cpu autoloading).

And Kay Siever's work to get rid of sysdev cpu structures
and making use of struct device instead.

Before, the cpuid driver had to be loaded to get the x86cpu
autoloading feature. With this patch autoloading works through
the /sys/devices/system/cpu object

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Len Brown <lenb@kernel.org>
Acked-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/Kconfig            |  3 +++
 arch/x86/kernel/cpu/match.c | 44 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpuid.c     | 59 +--------------------------------------------
 drivers/base/cpu.c          | 11 +++++++++
 include/linux/cpu.h         |  7 ++++++
 5 files changed, 66 insertions(+), 58 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 864cc6e6ac8e..6baa1e66e1bc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -179,6 +179,9 @@ config ARCH_HAS_DEFAULT_IDLE
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
+config ARCH_HAS_CPU_AUTOPROBE
+	def_bool y
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 7acc961422e7..940e2d483076 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -2,6 +2,7 @@
 #include <asm/processor.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 
 /**
  * x86_match_cpu - match current CPU again an array of x86_cpu_ids
@@ -46,3 +47,46 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
 	return NULL;
 }
 EXPORT_SYMBOL(x86_match_cpu);
+
+ssize_t arch_print_cpu_modalias(struct device *dev,
+				struct device_attribute *attr,
+				char *bufptr)
+{
+	int size = PAGE_SIZE;
+	int i, n;
+	char *buf = bufptr;
+
+	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
+		     "model:%04X:feature:",
+		boot_cpu_data.x86_vendor,
+		boot_cpu_data.x86,
+		boot_cpu_data.x86_model);
+	size -= n;
+	buf += n;
+	size -= 2;
+	for (i = 0; i < NCAPINTS*32; i++) {
+		if (boot_cpu_has(i)) {
+			n = snprintf(buf, size, ",%04X", i);
+			if (n < 0) {
+				WARN(1, "x86 features overflow page\n");
+				break;
+			}
+			size -= n;
+			buf += n;
+		}
+	}
+	*buf++ = ',';
+	*buf++ = '\n';
+	return buf - bufptr;
+}
+
+int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf) {
+		arch_print_cpu_modalias(NULL, NULL, buf);
+		add_uevent_var(env, "MODALIAS=%s", buf);
+		kfree(buf);
+	}
+	return 0;
+}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 7c89880eefd0..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,7 +40,6 @@
 #include <linux/notifier.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
-#include <linux/slab.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -139,57 +138,13 @@ static const struct file_operations cpuid_fops = {
 	.open = cpuid_open,
 };
 
-static ssize_t print_cpu_modalias(struct device *dev,
-				  struct device_attribute *attr,
-				  char *bufptr)
-{
-	int size = PAGE_SIZE;
-	int i, n;
-	char *buf = bufptr;
-
-	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:"
-		     "%04X:model:%04X:feature:",
-		boot_cpu_data.x86_vendor,
-		boot_cpu_data.x86,
-		boot_cpu_data.x86_model);
-	size -= n;
-	buf += n;
-	size -= 2;
-	for (i = 0; i < NCAPINTS*32; i++) {
-		if (boot_cpu_has(i)) {
-			n = snprintf(buf, size, ",%04X", i);
-			if (n < 0) {
-				WARN(1, "x86 features overflow page\n");
-				break;
-			}
-			size -= n;
-			buf += n;
-		}
-	}
-	*buf++ = ',';
-	*buf++ = '\n';
-	return buf - bufptr;
-}
-
-static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL);
-
 static __cpuinit int cpuid_device_create(int cpu)
 {
 	struct device *dev;
-	int err;
 
 	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
 			    "cpu%d", cpu);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
-
-	err = device_create_file(dev, &dev_attr_modalias);
-	if (err) {
-		/* keep device around on error. attribute is optional. */
-		err = 0;
-	}
-
-	return 0;
+	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
 }
 
 static void cpuid_device_destroy(int cpu)
@@ -227,17 +182,6 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
 }
 
-static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (buf) {
-		print_cpu_modalias(NULL, NULL, buf);
-		add_uevent_var(env, "MODALIAS=%s", buf);
-		kfree(buf);
-	}
-	return 0;
-}
-
 static int __init cpuid_init(void)
 {
 	int i, err = 0;
@@ -256,7 +200,6 @@ static int __init cpuid_init(void)
 		goto out_chrdev;
 	}
 	cpuid_class->devnode = cpuid_devnode;
-	cpuid_class->dev_uevent = cpuid_dev_uevent;
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
 		if (err != 0)
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index db87e78d7459..2a0c670c281d 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -11,6 +11,7 @@
 #include <linux/device.h>
 #include <linux/node.h>
 #include <linux/gfp.h>
+#include <linux/slab.h>
 #include <linux/percpu.h>
 
 #include "base.h"
@@ -223,6 +224,9 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
 	cpu->node_id = cpu_to_node(num);
 	cpu->dev.id = num;
 	cpu->dev.bus = &cpu_subsys;
+#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE
+	cpu->dev.bus->uevent = arch_cpu_uevent;
+#endif
 	error = device_register(&cpu->dev);
 	if (!error && cpu->hotpluggable)
 		register_cpu_control(cpu);
@@ -247,6 +251,10 @@ struct device *get_cpu_device(unsigned cpu)
 }
 EXPORT_SYMBOL_GPL(get_cpu_device);
 
+#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE
+static DEVICE_ATTR(modalias, 0444, arch_print_cpu_modalias, NULL);
+#endif
+
 static struct attribute *cpu_root_attrs[] = {
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 	&dev_attr_probe.attr,
@@ -257,6 +265,9 @@ static struct attribute *cpu_root_attrs[] = {
 	&cpu_attrs[2].attr.attr,
 	&dev_attr_kernel_max.attr,
 	&dev_attr_offline.attr,
+#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE
+	&dev_attr_modalias.attr,
+#endif
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 1f6587590a1a..6e53b4823d7f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -44,6 +44,13 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
+#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE
+extern int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env);
+extern ssize_t arch_print_cpu_modalias(struct device *dev,
+				       struct device_attribute *attr,
+				       char *bufptr);
+#endif
+
 /*
  * CPU notifier priorities.
  */
-- 
cgit v1.2.3


From d0caf292505d051b1026e85faf3a85e907566f31 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 28 Jan 2012 13:52:46 +0300
Subject: x86/dumpstack: Remove unneeded check in dump_trace()

Smatch complains that we have some inconsistent NULL checking.

If "task" were NULL then it would lead to a NULL dereference
later. We can remove this test because earlier on in the
function we have:

 if (!task)
	task = current;

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Clemens Ladisch <clemens@ladisch.de>
Link: http://lkml.kernel.org/r/20120128105246.GA25092@elgon.mountain
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6d728d9284bd..af7785ff5aa0 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	if (!stack) {
 		if (regs)
 			stack = (unsigned long *)regs->sp;
-		else if (task && task != current)
+		else if (task != current)
 			stack = (unsigned long *)task->thread.sp;
 		else
 			stack = &dummy;
-- 
cgit v1.2.3


From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 29 Jan 2012 20:38:29 +0100
Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices

The current device suspend/resume phases during system-wide power
transitions appear to be insufficient for some platforms that want
to use the same callback routines for saving device states and
related operations during runtime suspend/resume as well as during
system suspend/resume.  In principle, they could point their
.suspend_noirq() and .resume_noirq() to the same callback routines
as their .runtime_suspend() and .runtime_resume(), respectively,
but at least some of them require device interrupts to be enabled
while the code in those routines is running.

It also makes sense to have device suspend-resume callbacks that will
be executed with runtime PM disabled and with device interrupts
enabled in case someone needs to run some special code in that
context during system-wide power transitions.

Apart from this, .suspend_noirq() and .resume_noirq() were introduced
as a workaround for drivers using shared interrupts and failing to
prevent their interrupt handlers from accessing suspended hardware.
It appears to be better not to use them for other porposes, or we may
have to deal with some serious confusion (which seems to be happening
already).

For the above reasons, introduce new device suspend/resume phases,
"late suspend" and "early resume" (and analogously for hibernation)
whose callback will be executed with runtime PM disabled and with
device interrupts enabled and whose callback pointers generally may
point to runtime suspend/resume routines.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reviewed-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 Documentation/power/devices.txt |  93 +++++++++------
 arch/x86/kernel/apm_32.c        |  11 +-
 drivers/base/power/main.c       | 247 ++++++++++++++++++++++++++++++++++++----
 drivers/xen/manage.c            |   6 +-
 include/linux/pm.h              |  43 +++++--
 include/linux/suspend.h         |   4 +
 kernel/kexec.c                  |   8 +-
 kernel/power/hibernate.c        |  24 ++--
 kernel/power/main.c             |   8 +-
 kernel/power/suspend.c          |   4 +-
 10 files changed, 357 insertions(+), 91 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 20af7def23c8..872815cd41d3 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -96,6 +96,12 @@ struct dev_pm_ops {
 	int (*thaw)(struct device *dev);
 	int (*poweroff)(struct device *dev);
 	int (*restore)(struct device *dev);
+	int (*suspend_late)(struct device *dev);
+	int (*resume_early)(struct device *dev);
+	int (*freeze_late)(struct device *dev);
+	int (*thaw_early)(struct device *dev);
+	int (*poweroff_late)(struct device *dev);
+	int (*restore_early)(struct device *dev);
 	int (*suspend_noirq)(struct device *dev);
 	int (*resume_noirq)(struct device *dev);
 	int (*freeze_noirq)(struct device *dev);
@@ -305,7 +311,7 @@ Entering System Suspend
 -----------------------
 When the system goes into the standby or memory sleep state, the phases are:
 
-		prepare, suspend, suspend_noirq.
+		prepare, suspend, suspend_late, suspend_noirq.
 
     1.	The prepare phase is meant to prevent races by preventing new devices
 	from being registered; the PM core would never know that all the
@@ -324,7 +330,12 @@ When the system goes into the standby or memory sleep state, the phases are:
 	appropriate low-power state, depending on the bus type the device is on,
 	and they may enable wakeup events.
 
-    3.	The suspend_noirq phase occurs after IRQ handlers have been disabled,
+    3	For a number of devices it is convenient to split suspend into the
+	"quiesce device" and "save device state" phases, in which cases
+	suspend_late is meant to do the latter.  It is always executed after
+	runtime power management has been disabled for all devices.
+
+    4.	The suspend_noirq phase occurs after IRQ handlers have been disabled,
 	which means that the driver's interrupt handler will not be called while
 	the callback method is running.  The methods should save the values of
 	the device's registers that weren't saved previously and finally put the
@@ -359,7 +370,7 @@ Leaving System Suspend
 ----------------------
 When resuming from standby or memory sleep, the phases are:
 
-		resume_noirq, resume, complete.
+		resume_noirq, resume_early, resume, complete.
 
     1.	The resume_noirq callback methods should perform any actions needed
 	before the driver's interrupt handlers are invoked.  This generally
@@ -375,14 +386,18 @@ When resuming from standby or memory sleep, the phases are:
 	device driver's ->pm.resume_noirq() method to perform device-specific
 	actions.
 
-    2.	The resume methods should bring the the device back to its operating
+    2.	The resume_early methods should prepare devices for the execution of
+	the resume methods.  This generally involves undoing the actions of the
+	preceding suspend_late phase.
+
+    3	The resume methods should bring the the device back to its operating
 	state, so that it can perform normal I/O.  This generally involves
 	undoing the actions of the suspend phase.
 
-    3.	The complete phase uses only a bus callback.  The method should undo the
-	actions of the prepare phase.  Note, however, that new children may be
-	registered below the device as soon as the resume callbacks occur; it's
-	not necessary to wait until the complete phase.
+    4.	The complete phase should undo the actions of the prepare phase.  Note,
+	however, that new children may be registered below the device as soon as
+	the resume callbacks occur; it's not necessary to wait until the
+	complete phase.
 
 At the end of these phases, drivers should be as functional as they were before
 suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
@@ -429,8 +444,8 @@ an image of the system memory while everything is stable, reactivate all
 devices (thaw), write the image to permanent storage, and finally shut down the
 system (poweroff).  The phases used to accomplish this are:
 
-	prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete,
-	prepare, poweroff, poweroff_noirq
+	prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early,
+	thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq
 
     1.	The prepare phase is discussed in the "Entering System Suspend" section
 	above.
@@ -441,7 +456,11 @@ system (poweroff).  The phases used to accomplish this are:
 	save time it's best not to do so.  Also, the device should not be
 	prepared to generate wakeup events.
 
-    3.	The freeze_noirq phase is analogous to the suspend_noirq phase discussed
+    3.	The freeze_late phase is analogous to the suspend_late phase described
+	above, except that the device should not be put in a low-power state and
+	should not be allowed to generate wakeup events by it.
+
+    4.	The freeze_noirq phase is analogous to the suspend_noirq phase discussed
 	above, except again that the device should not be put in a low-power
 	state and should not be allowed to generate wakeup events.
 
@@ -449,15 +468,19 @@ At this point the system image is created.  All devices should be inactive and
 the contents of memory should remain undisturbed while this happens, so that the
 image forms an atomic snapshot of the system state.
 
-    4.	The thaw_noirq phase is analogous to the resume_noirq phase discussed
+    5.	The thaw_noirq phase is analogous to the resume_noirq phase discussed
 	above.  The main difference is that its methods can assume the device is
 	in the same state as at the end of the freeze_noirq phase.
 
-    5.	The thaw phase is analogous to the resume phase discussed above.  Its
+    6.	The thaw_early phase is analogous to the resume_early phase described
+	above.  Its methods should undo the actions of the preceding
+	freeze_late, if necessary.
+
+    7.	The thaw phase is analogous to the resume phase discussed above.  Its
 	methods should bring the device back to an operating state, so that it
 	can be used for saving the image if necessary.
 
-    6.	The complete phase is discussed in the "Leaving System Suspend" section
+    8.	The complete phase is discussed in the "Leaving System Suspend" section
 	above.
 
 At this point the system image is saved, and the devices then need to be
@@ -465,16 +488,19 @@ prepared for the upcoming system shutdown.  This is much like suspending them
 before putting the system into the standby or memory sleep state, and the phases
 are similar.
 
-    7.	The prepare phase is discussed above.
+    9.	The prepare phase is discussed above.
+
+    10.	The poweroff phase is analogous to the suspend phase.
 
-    8.	The poweroff phase is analogous to the suspend phase.
+    11.	The poweroff_late phase is analogous to the suspend_late phase.
 
-    9.	The poweroff_noirq phase is analogous to the suspend_noirq phase.
+    12.	The poweroff_noirq phase is analogous to the suspend_noirq phase.
 
-The poweroff and poweroff_noirq callbacks should do essentially the same things
-as the suspend and suspend_noirq callbacks.  The only notable difference is that
-they need not store the device register values, because the registers should
-already have been stored during the freeze or freeze_noirq phases.
+The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially
+the same things as the suspend, suspend_late and suspend_noirq callbacks,
+respectively.  The only notable difference is that they need not store the
+device register values, because the registers should already have been stored
+during the freeze, freeze_late or freeze_noirq phases.
 
 
 Leaving Hibernation
@@ -518,22 +544,25 @@ To achieve this, the image kernel must restore the devices' pre-hibernation
 functionality.  The operation is much like waking up from the memory sleep
 state, although it involves different phases:
 
-	restore_noirq, restore, complete
+	restore_noirq, restore_early, restore, complete
 
     1.	The restore_noirq phase is analogous to the resume_noirq phase.
 
-    2.	The restore phase is analogous to the resume phase.
+    2.	The restore_early phase is analogous to the resume_early phase.
+
+    3.	The restore phase is analogous to the resume phase.
 
-    3.	The complete phase is discussed above.
+    4.	The complete phase is discussed above.
 
-The main difference from resume[_noirq] is that restore[_noirq] must assume the
-device has been accessed and reconfigured by the boot loader or the boot kernel.
-Consequently the state of the device may be different from the state remembered
-from the freeze and freeze_noirq phases.  The device may even need to be reset
-and completely re-initialized.  In many cases this difference doesn't matter, so
-the resume[_noirq] and restore[_norq] method pointers can be set to the same
-routines.  Nevertheless, different callback pointers are used in case there is a
-situation where it actually matters.
+The main difference from resume[_early|_noirq] is that restore[_early|_noirq]
+must assume the device has been accessed and reconfigured by the boot loader or
+the boot kernel.  Consequently the state of the device may be different from the
+state remembered from the freeze, freeze_late and freeze_noirq phases.  The
+device may even need to be reset and completely re-initialized.  In many cases
+this difference doesn't matter, so the resume[_early|_noirq] and
+restore[_early|_norq] method pointers can be set to the same routines.
+Nevertheless, different callback pointers are used in case there is a situation
+where it actually does matter.
 
 
 Device Power Management Domains
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623cbe263..5d56931a15b3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1234,8 +1234,7 @@ static int suspend(int vetoable)
 	struct apm_user	*as;
 
 	dpm_suspend_start(PMSG_SUSPEND);
-
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1259,9 +1258,9 @@ static int suspend(int vetoable)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
-
+	dpm_resume_start(PMSG_RESUME);
 	dpm_resume_end(PMSG_RESUME);
+
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
 	for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1276,7 @@ static void standby(void)
 {
 	int err;
 
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1291,7 +1290,7 @@ static void standby(void)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
+	dpm_resume_start(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e2cc3d2e0ecc..b462c0e341cb 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -47,6 +47,7 @@ typedef int (*pm_callback_t)(struct device *);
 LIST_HEAD(dpm_list);
 LIST_HEAD(dpm_prepared_list);
 LIST_HEAD(dpm_suspended_list);
+LIST_HEAD(dpm_late_early_list);
 LIST_HEAD(dpm_noirq_list);
 
 struct suspend_stats suspend_stats;
@@ -245,6 +246,40 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state)
 	return NULL;
 }
 
+/**
+ * pm_late_early_op - Return the PM operation appropriate for given PM event.
+ * @ops: PM operations to choose from.
+ * @state: PM transition of the system being carried out.
+ *
+ * Runtime PM is disabled for @dev while this function is being executed.
+ */
+static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops,
+				      pm_message_t state)
+{
+	switch (state.event) {
+#ifdef CONFIG_SUSPEND
+	case PM_EVENT_SUSPEND:
+		return ops->suspend_late;
+	case PM_EVENT_RESUME:
+		return ops->resume_early;
+#endif /* CONFIG_SUSPEND */
+#ifdef CONFIG_HIBERNATE_CALLBACKS
+	case PM_EVENT_FREEZE:
+	case PM_EVENT_QUIESCE:
+		return ops->freeze_late;
+	case PM_EVENT_HIBERNATE:
+		return ops->poweroff_late;
+	case PM_EVENT_THAW:
+	case PM_EVENT_RECOVER:
+		return ops->thaw_early;
+	case PM_EVENT_RESTORE:
+		return ops->restore_early;
+#endif /* CONFIG_HIBERNATE_CALLBACKS */
+	}
+
+	return NULL;
+}
+
 /**
  * pm_noirq_op - Return the PM operation appropriate for given PM event.
  * @ops: PM operations to choose from.
@@ -374,21 +409,21 @@ static int device_resume_noirq(struct device *dev, pm_message_t state)
 	TRACE_RESUME(0);
 
 	if (dev->pm_domain) {
-		info = "EARLY power domain ";
+		info = "noirq power domain ";
 		callback = pm_noirq_op(&dev->pm_domain->ops, state);
 	} else if (dev->type && dev->type->pm) {
-		info = "EARLY type ";
+		info = "noirq type ";
 		callback = pm_noirq_op(dev->type->pm, state);
 	} else if (dev->class && dev->class->pm) {
-		info = "EARLY class ";
+		info = "noirq class ";
 		callback = pm_noirq_op(dev->class->pm, state);
 	} else if (dev->bus && dev->bus->pm) {
-		info = "EARLY bus ";
+		info = "noirq bus ";
 		callback = pm_noirq_op(dev->bus->pm, state);
 	}
 
 	if (!callback && dev->driver && dev->driver->pm) {
-		info = "EARLY driver ";
+		info = "noirq driver ";
 		callback = pm_noirq_op(dev->driver->pm, state);
 	}
 
@@ -399,13 +434,13 @@ static int device_resume_noirq(struct device *dev, pm_message_t state)
 }
 
 /**
- * dpm_resume_noirq - Execute "early resume" callbacks for non-sysdev devices.
+ * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices.
  * @state: PM transition of the system being carried out.
  *
- * Call the "noirq" resume handlers for all devices marked as DPM_OFF_IRQ and
+ * Call the "noirq" resume handlers for all devices in dpm_noirq_list and
  * enable device drivers to receive interrupts.
  */
-void dpm_resume_noirq(pm_message_t state)
+static void dpm_resume_noirq(pm_message_t state)
 {
 	ktime_t starttime = ktime_get();
 
@@ -415,7 +450,7 @@ void dpm_resume_noirq(pm_message_t state)
 		int error;
 
 		get_device(dev);
-		list_move_tail(&dev->power.entry, &dpm_suspended_list);
+		list_move_tail(&dev->power.entry, &dpm_late_early_list);
 		mutex_unlock(&dpm_list_mtx);
 
 		error = device_resume_noirq(dev, state);
@@ -423,6 +458,80 @@ void dpm_resume_noirq(pm_message_t state)
 			suspend_stats.failed_resume_noirq++;
 			dpm_save_failed_step(SUSPEND_RESUME_NOIRQ);
 			dpm_save_failed_dev(dev_name(dev));
+			pm_dev_err(dev, state, " noirq", error);
+		}
+
+		mutex_lock(&dpm_list_mtx);
+		put_device(dev);
+	}
+	mutex_unlock(&dpm_list_mtx);
+	dpm_show_time(starttime, state, "noirq");
+	resume_device_irqs();
+}
+
+/**
+ * device_resume_early - Execute an "early resume" callback for given device.
+ * @dev: Device to handle.
+ * @state: PM transition of the system being carried out.
+ *
+ * Runtime PM is disabled for @dev while this function is being executed.
+ */
+static int device_resume_early(struct device *dev, pm_message_t state)
+{
+	pm_callback_t callback = NULL;
+	char *info = NULL;
+	int error = 0;
+
+	TRACE_DEVICE(dev);
+	TRACE_RESUME(0);
+
+	if (dev->pm_domain) {
+		info = "early power domain ";
+		callback = pm_late_early_op(&dev->pm_domain->ops, state);
+	} else if (dev->type && dev->type->pm) {
+		info = "early type ";
+		callback = pm_late_early_op(dev->type->pm, state);
+	} else if (dev->class && dev->class->pm) {
+		info = "early class ";
+		callback = pm_late_early_op(dev->class->pm, state);
+	} else if (dev->bus && dev->bus->pm) {
+		info = "early bus ";
+		callback = pm_late_early_op(dev->bus->pm, state);
+	}
+
+	if (!callback && dev->driver && dev->driver->pm) {
+		info = "early driver ";
+		callback = pm_late_early_op(dev->driver->pm, state);
+	}
+
+	error = dpm_run_callback(callback, dev, state, info);
+
+	TRACE_RESUME(error);
+	return error;
+}
+
+/**
+ * dpm_resume_early - Execute "early resume" callbacks for all devices.
+ * @state: PM transition of the system being carried out.
+ */
+static void dpm_resume_early(pm_message_t state)
+{
+	ktime_t starttime = ktime_get();
+
+	mutex_lock(&dpm_list_mtx);
+	while (!list_empty(&dpm_late_early_list)) {
+		struct device *dev = to_device(dpm_late_early_list.next);
+		int error;
+
+		get_device(dev);
+		list_move_tail(&dev->power.entry, &dpm_suspended_list);
+		mutex_unlock(&dpm_list_mtx);
+
+		error = device_resume_early(dev, state);
+		if (error) {
+			suspend_stats.failed_resume_early++;
+			dpm_save_failed_step(SUSPEND_RESUME_EARLY);
+			dpm_save_failed_dev(dev_name(dev));
 			pm_dev_err(dev, state, " early", error);
 		}
 
@@ -431,9 +540,18 @@ void dpm_resume_noirq(pm_message_t state)
 	}
 	mutex_unlock(&dpm_list_mtx);
 	dpm_show_time(starttime, state, "early");
-	resume_device_irqs();
 }
-EXPORT_SYMBOL_GPL(dpm_resume_noirq);
+
+/**
+ * dpm_resume_start - Execute "noirq" and "early" device callbacks.
+ * @state: PM transition of the system being carried out.
+ */
+void dpm_resume_start(pm_message_t state)
+{
+	dpm_resume_noirq(state);
+	dpm_resume_early(state);
+}
+EXPORT_SYMBOL_GPL(dpm_resume_start);
 
 /**
  * device_resume - Execute "resume" callbacks for given device.
@@ -716,21 +834,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state)
 	char *info = NULL;
 
 	if (dev->pm_domain) {
-		info = "LATE power domain ";
+		info = "noirq power domain ";
 		callback = pm_noirq_op(&dev->pm_domain->ops, state);
 	} else if (dev->type && dev->type->pm) {
-		info = "LATE type ";
+		info = "noirq type ";
 		callback = pm_noirq_op(dev->type->pm, state);
 	} else if (dev->class && dev->class->pm) {
-		info = "LATE class ";
+		info = "noirq class ";
 		callback = pm_noirq_op(dev->class->pm, state);
 	} else if (dev->bus && dev->bus->pm) {
-		info = "LATE bus ";
+		info = "noirq bus ";
 		callback = pm_noirq_op(dev->bus->pm, state);
 	}
 
 	if (!callback && dev->driver && dev->driver->pm) {
-		info = "LATE driver ";
+		info = "noirq driver ";
 		callback = pm_noirq_op(dev->driver->pm, state);
 	}
 
@@ -738,21 +856,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state)
 }
 
 /**
- * dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices.
+ * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices.
  * @state: PM transition of the system being carried out.
  *
  * Prevent device drivers from receiving interrupts and call the "noirq" suspend
  * handlers for all non-sysdev devices.
  */
-int dpm_suspend_noirq(pm_message_t state)
+static int dpm_suspend_noirq(pm_message_t state)
 {
 	ktime_t starttime = ktime_get();
 	int error = 0;
 
 	suspend_device_irqs();
 	mutex_lock(&dpm_list_mtx);
-	while (!list_empty(&dpm_suspended_list)) {
-		struct device *dev = to_device(dpm_suspended_list.prev);
+	while (!list_empty(&dpm_late_early_list)) {
+		struct device *dev = to_device(dpm_late_early_list.prev);
 
 		get_device(dev);
 		mutex_unlock(&dpm_list_mtx);
@@ -761,7 +879,7 @@ int dpm_suspend_noirq(pm_message_t state)
 
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
-			pm_dev_err(dev, state, " late", error);
+			pm_dev_err(dev, state, " noirq", error);
 			suspend_stats.failed_suspend_noirq++;
 			dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ);
 			dpm_save_failed_dev(dev_name(dev));
@@ -775,11 +893,96 @@ int dpm_suspend_noirq(pm_message_t state)
 	mutex_unlock(&dpm_list_mtx);
 	if (error)
 		dpm_resume_noirq(resume_event(state));
+	else
+		dpm_show_time(starttime, state, "noirq");
+	return error;
+}
+
+/**
+ * device_suspend_late - Execute a "late suspend" callback for given device.
+ * @dev: Device to handle.
+ * @state: PM transition of the system being carried out.
+ *
+ * Runtime PM is disabled for @dev while this function is being executed.
+ */
+static int device_suspend_late(struct device *dev, pm_message_t state)
+{
+	pm_callback_t callback = NULL;
+	char *info = NULL;
+
+	if (dev->pm_domain) {
+		info = "late power domain ";
+		callback = pm_late_early_op(&dev->pm_domain->ops, state);
+	} else if (dev->type && dev->type->pm) {
+		info = "late type ";
+		callback = pm_late_early_op(dev->type->pm, state);
+	} else if (dev->class && dev->class->pm) {
+		info = "late class ";
+		callback = pm_late_early_op(dev->class->pm, state);
+	} else if (dev->bus && dev->bus->pm) {
+		info = "late bus ";
+		callback = pm_late_early_op(dev->bus->pm, state);
+	}
+
+	if (!callback && dev->driver && dev->driver->pm) {
+		info = "late driver ";
+		callback = pm_late_early_op(dev->driver->pm, state);
+	}
+
+	return dpm_run_callback(callback, dev, state, info);
+}
+
+/**
+ * dpm_suspend_late - Execute "late suspend" callbacks for all devices.
+ * @state: PM transition of the system being carried out.
+ */
+static int dpm_suspend_late(pm_message_t state)
+{
+	ktime_t starttime = ktime_get();
+	int error = 0;
+
+	mutex_lock(&dpm_list_mtx);
+	while (!list_empty(&dpm_suspended_list)) {
+		struct device *dev = to_device(dpm_suspended_list.prev);
+
+		get_device(dev);
+		mutex_unlock(&dpm_list_mtx);
+
+		error = device_suspend_late(dev, state);
+
+		mutex_lock(&dpm_list_mtx);
+		if (error) {
+			pm_dev_err(dev, state, " late", error);
+			suspend_stats.failed_suspend_late++;
+			dpm_save_failed_step(SUSPEND_SUSPEND_LATE);
+			dpm_save_failed_dev(dev_name(dev));
+			put_device(dev);
+			break;
+		}
+		if (!list_empty(&dev->power.entry))
+			list_move(&dev->power.entry, &dpm_late_early_list);
+		put_device(dev);
+	}
+	mutex_unlock(&dpm_list_mtx);
+	if (error)
+		dpm_resume_early(resume_event(state));
 	else
 		dpm_show_time(starttime, state, "late");
+
 	return error;
 }
-EXPORT_SYMBOL_GPL(dpm_suspend_noirq);
+
+/**
+ * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks.
+ * @state: PM transition of the system being carried out.
+ */
+int dpm_suspend_end(pm_message_t state)
+{
+	int error = dpm_suspend_late(state);
+
+	return error ? : dpm_suspend_noirq(state);
+}
+EXPORT_SYMBOL_GPL(dpm_suspend_end);
 
 /**
  * legacy_suspend - Execute a legacy (bus or class) suspend callback for device.
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index ce4fa0831860..9e14ae6cd49c 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -129,9 +129,9 @@ static void do_suspend(void)
 	printk(KERN_DEBUG "suspending xenstore...\n");
 	xs_suspend();
 
-	err = dpm_suspend_noirq(PMSG_FREEZE);
+	err = dpm_suspend_end(PMSG_FREEZE);
 	if (err) {
-		printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
+		printk(KERN_ERR "dpm_suspend_end failed: %d\n", err);
 		goto out_resume;
 	}
 
@@ -149,7 +149,7 @@ static void do_suspend(void)
 
 	err = stop_machine(xen_suspend, &si, cpumask_of(0));
 
-	dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
+	dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
 
 	if (err) {
 		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index e4982ac3fbbc..c68e1f22ac95 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -110,6 +110,10 @@ typedef struct pm_message {
  *	Subsystem-level @suspend() is executed for all devices after invoking
  *	subsystem-level @prepare() for all of them.
  *
+ * @suspend_late: Continue operations started by @suspend().  For a number of
+ *	devices @suspend_late() may point to the same callback routine as the
+ *	runtime suspend callback.
+ *
  * @resume: Executed after waking the system up from a sleep state in which the
  *	contents of main memory were preserved.  The exact action to perform
  *	depends on the device's subsystem, but generally the driver is expected
@@ -122,6 +126,10 @@ typedef struct pm_message {
  *	Subsystem-level @resume() is executed for all devices after invoking
  *	subsystem-level @resume_noirq() for all of them.
  *
+ * @resume_early: Prepare to execute @resume().  For a number of devices
+ *	@resume_early() may point to the same callback routine as the runtime
+ *	resume callback.
+ *
  * @freeze: Hibernation-specific, executed before creating a hibernation image.
  *	Analogous to @suspend(), but it should not enable the device to signal
  *	wakeup events or change its power state.  The majority of subsystems
@@ -131,6 +139,10 @@ typedef struct pm_message {
  *	Subsystem-level @freeze() is executed for all devices after invoking
  *	subsystem-level @prepare() for all of them.
  *
+ * @freeze_late: Continue operations started by @freeze().  Analogous to
+ *	@suspend_late(), but it should not enable the device to signal wakeup
+ *	events or change its power state.
+ *
  * @thaw: Hibernation-specific, executed after creating a hibernation image OR
  *	if the creation of an image has failed.  Also executed after a failing
  *	attempt to restore the contents of main memory from such an image.
@@ -140,15 +152,23 @@ typedef struct pm_message {
  *	subsystem-level @thaw_noirq() for all of them.  It also may be executed
  *	directly after @freeze() in case of a transition error.
  *
+ * @thaw_early: Prepare to execute @thaw().  Undo the changes made by the
+ *	preceding @freeze_late().
+ *
  * @poweroff: Hibernation-specific, executed after saving a hibernation image.
  *	Analogous to @suspend(), but it need not save the device's settings in
  *	memory.
  *	Subsystem-level @poweroff() is executed for all devices after invoking
  *	subsystem-level @prepare() for all of them.
  *
+ * @poweroff_late: Continue operations started by @poweroff().  Analogous to
+ *	@suspend_late(), but it need not save the device's settings in memory.
+ *
  * @restore: Hibernation-specific, executed after restoring the contents of main
  *	memory from a hibernation image, analogous to @resume().
  *
+ * @restore_early: Prepare to execute @restore(), analogous to @resume_early().
+ *
  * @suspend_noirq: Complete the actions started by @suspend().  Carry out any
  *	additional operations required for suspending the device that might be
  *	racing with its driver's interrupt handler, which is guaranteed not to
@@ -158,9 +178,10 @@ typedef struct pm_message {
  *	@suspend_noirq() has returned successfully.  If the device can generate
  *	system wakeup signals and is enabled to wake up the system, it should be
  *	configured to do so at that time.  However, depending on the platform
- *	and device's subsystem, @suspend() may be allowed to put the device into
- *	the low-power state and configure it to generate wakeup signals, in
- *	which case it generally is not necessary to define @suspend_noirq().
+ *	and device's subsystem, @suspend() or @suspend_late() may be allowed to
+ *	put the device into the low-power state and configure it to generate
+ *	wakeup signals, in which case it generally is not necessary to define
+ *	@suspend_noirq().
  *
  * @resume_noirq: Prepare for the execution of @resume() by carrying out any
  *	operations required for resuming the device that might be racing with
@@ -171,9 +192,9 @@ typedef struct pm_message {
  *	additional operations required for freezing the device that might be
  *	racing with its driver's interrupt handler, which is guaranteed not to
  *	run while @freeze_noirq() is being executed.
- *	The power state of the device should not be changed by either @freeze()
- *	or @freeze_noirq() and it should not be configured to signal system
- *	wakeup by any of these callbacks.
+ *	The power state of the device should not be changed by either @freeze(),
+ *	or @freeze_late(), or @freeze_noirq() and it should not be configured to
+ *	signal system wakeup by any of these callbacks.
  *
  * @thaw_noirq: Prepare for the execution of @thaw() by carrying out any
  *	operations required for thawing the device that might be racing with its
@@ -249,6 +270,12 @@ struct dev_pm_ops {
 	int (*thaw)(struct device *dev);
 	int (*poweroff)(struct device *dev);
 	int (*restore)(struct device *dev);
+	int (*suspend_late)(struct device *dev);
+	int (*resume_early)(struct device *dev);
+	int (*freeze_late)(struct device *dev);
+	int (*thaw_early)(struct device *dev);
+	int (*poweroff_late)(struct device *dev);
+	int (*restore_early)(struct device *dev);
 	int (*suspend_noirq)(struct device *dev);
 	int (*resume_noirq)(struct device *dev);
 	int (*freeze_noirq)(struct device *dev);
@@ -584,13 +611,13 @@ struct dev_pm_domain {
 
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
-extern void dpm_resume_noirq(pm_message_t state);
+extern void dpm_resume_start(pm_message_t state);
 extern void dpm_resume_end(pm_message_t state);
 extern void dpm_resume(pm_message_t state);
 extern void dpm_complete(pm_message_t state);
 
 extern void device_pm_unlock(void);
-extern int dpm_suspend_noirq(pm_message_t state);
+extern int dpm_suspend_end(pm_message_t state);
 extern int dpm_suspend_start(pm_message_t state);
 extern int dpm_suspend(pm_message_t state);
 extern int dpm_prepare(pm_message_t state);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 91784a4f8608..ac1c114c499d 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -42,8 +42,10 @@ enum suspend_stat_step {
 	SUSPEND_FREEZE = 1,
 	SUSPEND_PREPARE,
 	SUSPEND_SUSPEND,
+	SUSPEND_SUSPEND_LATE,
 	SUSPEND_SUSPEND_NOIRQ,
 	SUSPEND_RESUME_NOIRQ,
+	SUSPEND_RESUME_EARLY,
 	SUSPEND_RESUME
 };
 
@@ -53,8 +55,10 @@ struct suspend_stats {
 	int	failed_freeze;
 	int	failed_prepare;
 	int	failed_suspend;
+	int	failed_suspend_late;
 	int	failed_suspend_noirq;
 	int	failed_resume;
+	int	failed_resume_early;
 	int	failed_resume_noirq;
 #define	REC_FAILED_NUM	2
 	int	last_failed_dev;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..a6a675cb9818 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1546,13 +1546,13 @@ int kernel_kexec(void)
 		if (error)
 			goto Resume_console;
 		/* At this point, dpm_suspend_start() has been called,
-		 * but *not* dpm_suspend_noirq(). We *must* call
-		 * dpm_suspend_noirq() now.  Otherwise, drivers for
+		 * but *not* dpm_suspend_end(). We *must* call
+		 * dpm_suspend_end() now.  Otherwise, drivers for
 		 * some devices (e.g. interrupt controllers) become
 		 * desynchronized with the actual state of the
 		 * hardware at resume time, and evil weirdness ensues.
 		 */
-		error = dpm_suspend_noirq(PMSG_FREEZE);
+		error = dpm_suspend_end(PMSG_FREEZE);
 		if (error)
 			goto Resume_devices;
 		error = disable_nonboot_cpus();
@@ -1579,7 +1579,7 @@ int kernel_kexec(void)
 		local_irq_enable();
  Enable_cpus:
 		enable_nonboot_cpus();
-		dpm_resume_noirq(PMSG_RESTORE);
+		dpm_resume_start(PMSG_RESTORE);
  Resume_devices:
 		dpm_resume_end(PMSG_RESTORE);
  Resume_console:
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..a5d4cf0aa03e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
  * create_image - Create a hibernation image.
  * @platform_mode: Whether or not to use the platform driver.
  *
- * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
- * and execute the drivers' .thaw_noirq() callbacks.
+ * Execute device drivers' "late" and "noirq" freeze callbacks, create a
+ * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
  *
  * Control reappears in this routine after the subsequent restore.
  */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
 {
 	int error;
 
-	error = dpm_suspend_noirq(PMSG_FREEZE);
+	error = dpm_suspend_end(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
  Platform_finish:
 	platform_finish(platform_mode);
 
-	dpm_resume_noirq(in_suspend ?
+	dpm_resume_start(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
 	return error;
@@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode)
  * resume_target_kernel - Restore system state from a hibernation image.
  * @platform_mode: Whether or not to use the platform driver.
  *
- * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
- * highmem that have not been restored yet from the image and run the low-level
- * code that will restore the remaining contents of memory and switch to the
- * just restored target kernel.
+ * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
+ * contents of highmem that have not been restored yet from the image and run
+ * the low-level code that will restore the remaining contents of memory and
+ * switch to the just restored target kernel.
  */
 static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
-	error = dpm_suspend_noirq(PMSG_QUIESCE);
+	error = dpm_suspend_end(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode)
  Cleanup:
 	platform_restore_cleanup(platform_mode);
 
-	dpm_resume_noirq(PMSG_RECOVER);
+	dpm_resume_start(PMSG_RECOVER);
 
 	return error;
 }
@@ -518,7 +518,7 @@ int hibernation_platform_enter(void)
 		goto Resume_devices;
 	}
 
-	error = dpm_suspend_noirq(PMSG_HIBERNATE);
+	error = dpm_suspend_end(PMSG_HIBERNATE);
 	if (error)
 		goto Resume_devices;
 
@@ -549,7 +549,7 @@ int hibernation_platform_enter(void)
  Platform_finish:
 	hibernation_ops->finish();
 
-	dpm_resume_noirq(PMSG_RESTORE);
+	dpm_resume_start(PMSG_RESTORE);
 
  Resume_devices:
 	entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..8c5014a4e052 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
 	last_errno %= REC_FAILED_NUM;
 	last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
 	last_step %= REC_FAILED_NUM;
-	seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
-			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+	seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
 			"success", suspend_stats.success,
 			"fail", suspend_stats.fail,
 			"failed_freeze", suspend_stats.failed_freeze,
 			"failed_prepare", suspend_stats.failed_prepare,
 			"failed_suspend", suspend_stats.failed_suspend,
+			"failed_suspend_late",
+				suspend_stats.failed_suspend_late,
 			"failed_suspend_noirq",
 				suspend_stats.failed_suspend_noirq,
 			"failed_resume", suspend_stats.failed_resume,
+			"failed_resume_early",
+				suspend_stats.failed_resume_early,
 			"failed_resume_noirq",
 				suspend_stats.failed_resume_noirq);
 	seq_printf(s,	"failures:\n  last_failed_dev:\t%-s\n",
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..560a639614a1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 			goto Platform_finish;
 	}
 
-	error = dpm_suspend_noirq(PMSG_SUSPEND);
+	error = dpm_suspend_end(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
 		goto Platform_finish;
@@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	if (suspend_ops->wake)
 		suspend_ops->wake();
 
-	dpm_resume_noirq(PMSG_RESUME);
+	dpm_resume_start(PMSG_RESUME);
 
  Platform_finish:
 	if (suspend_ops->finish)
-- 
cgit v1.2.3


From 5955633e91bfc5cd0a41d8d82259e1d8b32980ef Mon Sep 17 00:00:00 2001
From: Michael D Labriola <michael.d.labriola@gmail.com>
Date: Sun, 29 Jan 2012 14:17:22 -0500
Subject: x86/reboot: Skip DMI checks if reboot set by user

Skip DMI checks for vendor specific reboot quirks if the user
passed in a reboot= arg on the command line - we should never
override user choices.

Signed-off-by: Michael D Labriola <michael.d.labriola@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Michael D Labriola <mlabriol@gdeb.com>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/87wr8ab9od.fsf@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 37a458b521a6..b257f0e28824 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -39,6 +39,14 @@ static int reboot_mode;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
+/* This variable is used privately to keep track of whether or not
+ * reboot_type is still set to its default value (i.e., reboot= hasn't
+ * been set on the command line).  This is needed so that we can
+ * suppress DMI scanning for reboot quirks.  Without it, it's
+ * impossible to override a faulty reboot quirk without recompiling.
+ */
+static int reboot_default = 1;
+
 #if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
 static int reboot_cpu = -1;
 #endif
@@ -67,6 +75,12 @@ bool port_cf9_safe = false;
 static int __init reboot_setup(char *str)
 {
 	for (;;) {
+		/* Having anything passed on the command line via
+		 * reboot= will cause us to disable DMI checking
+		 * below.
+		 */
+		reboot_default = 0;
+
 		switch (*str) {
 		case 'w':
 			reboot_mode = 0x1234;
@@ -316,7 +330,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 
 static int __init reboot_init(void)
 {
-	dmi_check_system(reboot_dmi_table);
+	/* Only do the DMI check if reboot_type hasn't been overridden
+	 * on the command line
+	 */
+	if (reboot_default) {
+		dmi_check_system(reboot_dmi_table);
+	}
 	return 0;
 }
 core_initcall(reboot_init);
@@ -465,7 +484,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 
 static int __init pci_reboot_init(void)
 {
-	dmi_check_system(pci_reboot_dmi_table);
+	/* Only do the DMI check if reboot_type hasn't been overridden
+	 * on the command line
+	 */
+	if (reboot_default) {
+		dmi_check_system(pci_reboot_dmi_table);
+	}
 	return 0;
 }
 core_initcall(pci_reboot_init);
-- 
cgit v1.2.3


From e6d36a653becc7bbc643c399a77882e02bf552cb Mon Sep 17 00:00:00 2001
From: Michael D Labriola <michael.d.labriola@gmail.com>
Date: Sun, 29 Jan 2012 14:21:17 -0500
Subject: x86/reboot: Remove VersaLogic Menlow reboot quirk

This commit removes the reboot quirk originally added by commit
e19e074 ("x86: Fix reboot problem on VersaLogic Menlow boards").

Testing with a VersaLogic Ocelot (VL-EPMs-21a rev 1.00 w/ BIOS
6.5.102) revealed the following regarding the reboot hang
problem:

- v2.6.37 reboot=bios was needed.

- v2.6.38-rc1: behavior changed, reboot=acpi is needed,
  reboot=kbd and reboot=bios results in system hang.

- v2.6.38: VersaLogic patch (e19e074 "x86: Fix reboot problem on
  VersaLogic Menlow boards") was applied prior to v2.6.38-rc7.  This
  patch sets a quirk for VersaLogic Menlow boards that forces the use
  of reboot=bios, which doesn't work anymore.

- v3.2: It seems that commit 660e34c ("x86: Reorder reboot method
  preferences") changed the default reboot method to acpi prior to
  v3.0-rc1, which means the default behavior is appropriate for the
  Ocelot.  No VersaLogic quirk is required.

The Ocelot board used for testing can successfully reboot w/out
having to pass any reboot= arguments for all 3 current versions
of the BIOS.

Signed-off-by: Michael D Labriola <michael.d.labriola@gmail.com>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Michael D Labriola <mlabriol@gdeb.com>
Cc: Kushal Koolwal <kushalkoolwal@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/87vcnub9hu.fsf@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index b257f0e28824..d840e69a853c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -309,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
 		},
 	},
-	{	/* Handle problems with rebooting on VersaLogic Menlow boards */
-		.callback = set_bios_reboot,
-		.ident = "VersaLogic Menlow based board",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
-			DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
-		},
-	},
 	{ /* Handle reboot issue on Acer Aspire one */
 		.callback = set_kbd_reboot,
 		.ident = "Acer Aspire One A110",
-- 
cgit v1.2.3


From 35f1790e6c6a7e4cae57b616cf36444d27fa6b28 Mon Sep 17 00:00:00 2001
From: He Chunhui <hchunhui@mail.ustc.edu.cn>
Date: Wed, 1 Feb 2012 00:48:28 +0800
Subject: x86, boot: Fix port argument to inl() function

"u32 port" in inl() should be "u16 port".

[ hpa: it's a bug, but it doesn't produce incorrect code, so no need
  to put this into urgent or stable. ]

Signed-off-by: He Chunhui <hchunhui@mail.ustc.edu.cn>
Link: http://lkml.kernel.org/r/32892299.2931391328028508117.JavaMail.coremail@mailweb
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/boot.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index c7093bd9f2d3..18997e5a1053 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -67,7 +67,7 @@ static inline void outl(u32 v, u16 port)
 {
 	asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
 }
-static inline u32 inl(u32 port)
+static inline u32 inl(u16 port)
 {
 	u32 v;
 	asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
-- 
cgit v1.2.3


From bdb42f5afebe208eae90406959383856ae2caf2b Mon Sep 17 00:00:00 2001
From: Stephan Bärwolf <stephan.baerwolf@tu-ilmenau.de>
Date: Thu, 12 Jan 2012 16:43:03 +0100
Subject: KVM: x86: extend "struct x86_emulate_ops" with "get_cpuid"

In order to be able to proceed checks on CPU-specific properties
within the emulator, function "get_cpuid" is introduced.
With "get_cpuid" it is possible to virtually call the guests
"cpuid"-opcode without changing the VM's context.

[mtosatti: cleanup/beautify code]

Signed-off-by: Stephan Baerwolf <stephan.baerwolf@tu-ilmenau.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 +++
 arch/x86/kvm/x86.c                 | 23 +++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ab4092e3214e..c8b28689eeeb 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -190,6 +190,9 @@ struct x86_emulate_ops {
 	int (*intercept)(struct x86_emulate_ctxt *ctxt,
 			 struct x86_instruction_info *info,
 			 enum x86_intercept_stage stage);
+
+	bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
+			 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 14d6cadc4ba6..8c890e2fa6b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4180,6 +4180,28 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
 	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 }
 
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+			       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+	struct kvm_cpuid_entry2 *cpuid = NULL;
+
+	if (eax && ecx)
+		cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
+					    *eax, *ecx);
+
+	if (cpuid) {
+		*eax = cpuid->eax;
+		*ecx = cpuid->ecx;
+		if (ebx)
+			*ebx = cpuid->ebx;
+		if (edx)
+			*edx = cpuid->edx;
+		return true;
+	}
+
+	return false;
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
@@ -4211,6 +4233,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.get_fpu             = emulator_get_fpu,
 	.put_fpu             = emulator_put_fpu,
 	.intercept           = emulator_intercept,
+	.get_cpuid           = emulator_get_cpuid,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From c2226fc9e87ba3da060e47333657cd6616652b84 Mon Sep 17 00:00:00 2001
From: Stephan Bärwolf <stephan.baerwolf@tu-ilmenau.de>
Date: Thu, 12 Jan 2012 16:43:04 +0100
Subject: KVM: x86: fix missing checks in syscall emulation

On hosts without this patch, 32bit guests will crash (and 64bit guests
may behave in a wrong way) for example by simply executing following
nasm-demo-application:

    [bits 32]
    global _start
    SECTION .text
    _start: syscall

(I tested it with winxp and linux - both always crashed)

    Disassembly of section .text:

    00000000 <_start>:
       0:   0f 05                   syscall

The reason seems a missing "invalid opcode"-trap (int6) for the
syscall opcode "0f05", which is not available on Intel CPUs
within non-longmodes, as also on some AMD CPUs within legacy-mode.
(depending on CPU vendor, MSR_EFER and cpuid)

Because previous mentioned OSs may not engage corresponding
syscall target-registers (STAR, LSTAR, CSTAR), they remain
NULL and (non trapping) syscalls are leading to multiple
faults and finally crashs.

Depending on the architecture (AMD or Intel) pretended by
guests, various checks according to vendor's documentation
are implemented to overcome the current issue and behave
like the CPUs physical counterparts.

[mtosatti: cleanup/beautify code]

Signed-off-by: Stephan Baerwolf <stephan.baerwolf@tu-ilmenau.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 13 ++++++++++
 arch/x86/kvm/emulate.c             | 51 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c8b28689eeeb..7b9cfc4878af 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -301,6 +301,19 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_PROT     (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
 			       X86EMUL_MODE_PROT64)
 
+/* CPUID vendors */
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
+
 enum x86_intercept_stage {
 	X86_ICTP_NONE = 0,   /* Allow zero-init to not match anything */
 	X86_ICPT_PRE_EXCEPT,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 05a562b85025..0982507b962a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1891,6 +1891,51 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
+static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
+{
+	struct x86_emulate_ops *ops = ctxt->ops;
+	u32 eax, ebx, ecx, edx;
+
+	/*
+	 * syscall should always be enabled in longmode - so only become
+	 * vendor specific (cpuid) if other modes are active...
+	 */
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return true;
+
+	eax = 0x00000000;
+	ecx = 0x00000000;
+	if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) {
+		/*
+		 * Intel ("GenuineIntel")
+		 * remark: Intel CPUs only support "syscall" in 64bit
+		 * longmode. Also an 64bit guest with a
+		 * 32bit compat-app running will #UD !! While this
+		 * behaviour can be fixed (by emulating) into AMD
+		 * response - CPUs of AMD can't behave like Intel.
+		 */
+		if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+		    ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+		    edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+			return false;
+
+		/* AMD ("AuthenticAMD") */
+		if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+		    ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+		    edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+			return true;
+
+		/* AMD ("AMDisbetter!") */
+		if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+		    ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+		    edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+			return true;
+	}
+
+	/* default: (not Intel, not AMD), apply Intel's stricter rules... */
+	return false;
+}
+
 static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -1904,9 +1949,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	    ctxt->mode == X86EMUL_MODE_VM86)
 		return emulate_ud(ctxt);
 
+	if (!(em_syscall_is_enabled(ctxt)))
+		return emulate_ud(ctxt);
+
 	ops->get_msr(ctxt, MSR_EFER, &efer);
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
+	if (!(efer & EFER_SCE))
+		return emulate_ud(ctxt);
+
 	ops->get_msr(ctxt, MSR_STAR, &msr_data);
 	msr_data >>= 32;
 	cs_sel = (u16)(msr_data & 0xfffc);
-- 
cgit v1.2.3


From 5753785fa97742d2723ed8ebb29ae59cac912705 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 15 Jan 2012 14:17:22 +0200
Subject: KVM: do not #GP on perf MSR writes when vPMU is disabled

Return to behaviour perf MSR had before introducing vPMU in case vPMU
is disabled. Some guests access those registers unconditionally and do
not expect it to fail.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8c890e2fa6b6..9cbfc0698118 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1495,6 +1495,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+	bool pr = false;
+
 	switch (msr) {
 	case MSR_EFER:
 		return set_efer(vcpu, data);
@@ -1635,6 +1637,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 			"0x%x data 0x%llx\n", msr, data);
 		break;
+	case MSR_P6_PERFCTR0:
+	case MSR_P6_PERFCTR1:
+		pr = true;
+	case MSR_P6_EVNTSEL0:
+	case MSR_P6_EVNTSEL1:
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_set_msr(vcpu, msr, data);
+
+		if (pr || data != 0)
+			pr_unimpl(vcpu, "disabled perfctr wrmsr: "
+				"0x%x data 0x%llx\n", msr, data);
+		break;
 	case MSR_K7_CLK_CTL:
 		/*
 		 * Ignore all writes to this no longer documented MSR.
@@ -1835,6 +1849,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		data = 0;
 		break;
+	case MSR_P6_PERFCTR0:
+	case MSR_P6_PERFCTR1:
+	case MSR_P6_EVNTSEL0:
+	case MSR_P6_EVNTSEL1:
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_get_msr(vcpu, msr, pdata);
+		data = 0;
+		break;
 	case MSR_IA32_UCODE_REV:
 		data = 0x100000000ULL;
 		break;
-- 
cgit v1.2.3


From 84f2b9b2edc09595569c7397cc3c888764ffd78b Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 2 Feb 2012 12:04:01 +0100
Subject: perf: Remove deprecated WARN_ON_ONCE()

With the new throttling/unthrottling code introduced with
commit:

  e050e3f0a71b ("perf: Fix broken interrupt rate throttling")

we occasionally hit two WARN_ON_ONCE() checks in:

  - intel_pmu_pebs_enable()
  - intel_pmu_lbr_enable()
  - x86_pmu_start()

The assertions are no longer problematic. There is a valid
path where they can trigger but it is harmless.

The assertion can be triggered with:

  $ perf record -e instructions:pp ....

Leading to paths:

  intel_pmu_pebs_enable
  intel_pmu_enable_event
  x86_perf_event_set_period
  x86_pmu_start
  perf_adjust_freq_unthr_context
  perf_event_task_tick
  scheduler_tick

And:

  intel_pmu_lbr_enable
  intel_pmu_enable_event
  x86_perf_event_set_period
  x86_pmu_start
  perf_adjust_freq_unthr_context.
  perf_event_task_tick
  scheduler_tick

cpuc->enabled is always on because when we get to
perf_adjust_freq_unthr_context() the PMU is not totally
disabled. Furthermore when we need to adjust a period,
we only stop the event we need to change and not the
entire PMU. Thus, when we re-enable, cpuc->enabled is
already set. Note that when we stop the event, both
pebs and lbr are stopped if necessary (and possible).

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: peterz@infradead.org
Link: http://lkml.kernel.org/r/20120202110401.GA30911@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           | 3 ---
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 1 -
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 --
 3 files changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..2a30e5ae6acf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -986,9 +986,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx = event->hw.idx;
 
-	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-		return;
-
 	if (WARN_ON_ONCE(idx == -1))
 		return;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 73da6b64f5b7..d6bd49faa40c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
-	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_enable(event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3fab3de3ce96..47a7e63bfe54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	if (!x86_pmu.lbr_nr)
 		return;
 
-	WARN_ON_ONCE(cpuc->enabled);
-
 	/*
 	 * Reset the LBR stack if we changed task context to
 	 * avoid data leaks.
-- 
cgit v1.2.3


From 41bd956de3dfdc3a43708fe2e0c8096c69064a1e Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 1 Feb 2012 15:56:54 -0500
Subject: xen/smp: Fix CPU online/offline bug triggering a BUG: scheduling
 while atomic.

When a user offlines a VCPU and then onlines it, we get:

NMI watchdog disabled (cpu2): hardware events not enabled
BUG: scheduling while atomic: swapper/2/0/0x00000002
Modules linked in: dm_multipath dm_mod xen_evtchn iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi scsi_mod libcrc32c crc32c radeon fbco
 ttm bitblit softcursor drm_kms_helper xen_blkfront xen_netfront xen_fbfront fb_sys_fops sysimgblt sysfillrect syscopyarea xen_kbdfront xenfs [last unloaded:

Pid: 0, comm: swapper/2 Tainted: G           O 3.2.0phase15.1-00003-gd6f7f5b-dirty #4
Call Trace:
 [<ffffffff81070571>] __schedule_bug+0x61/0x70
 [<ffffffff8158eb78>] __schedule+0x798/0x850
 [<ffffffff8158ed6a>] schedule+0x3a/0x50
 [<ffffffff810349be>] cpu_idle+0xbe/0xe0
 [<ffffffff81583599>] cpu_bringup_and_idle+0xe/0x10

The reason for this should be obvious from this call-chain:
cpu_bringup_and_idle:
 \- cpu_bringup
  |   \-[preempt_disable]
  |
  |- cpu_idle
       \- play_dead [assuming the user offlined the VCPU]
       |     \
       |     +- (xen_play_dead)
       |          \- HYPERVISOR_VCPU_off [so VCPU is dead, once user
       |          |                       onlines it starts from here]
       |          \- cpu_bringup [preempt_disable]
       |
       +- preempt_enable_no_reschedule()
       +- schedule()
       \- preempt_enable()

So we have two preempt_disble() and one preempt_enable(). Calling
preempt_enable() after the cpu_bringup() in the xen_play_dead
fixes the imbalance.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/smp.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 041d4fe9dfe4..501d4e0244ba 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -409,6 +409,13 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
 	play_dead_common();
 	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
 	cpu_bringup();
+	/*
+	 * Balance out the preempt calls - as we are running in cpu_idle
+	 * loop which has been called at bootup from cpu_bringup_and_idle.
+	 * The cpucpu_bringup_and_idle called cpu_bringup which made a
+	 * preempt_disable() So this preempt_enable will balance it out.
+	 */
+	preempt_enable();
 }
 
 #else /* !CONFIG_HOTPLUG_CPU */
-- 
cgit v1.2.3


From 207d543f472c1ac9552df79838dc807cbcaa9740 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Mon, 30 Jan 2012 14:31:46 +0000
Subject: xen pvhvm: do not remap pirqs onto evtchns if
 !xen_have_vector_callback

CC: stable@kernel.org #2.6.37 and onwards
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/pci/xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 492ade8c978e..d99346ea8fdb 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -374,7 +374,7 @@ int __init pci_xen_init(void)
 
 int __init pci_xen_hvm_init(void)
 {
-	if (!xen_feature(XENFEAT_hvm_pirqs))
+	if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
 		return 0;
 
 #ifdef CONFIG_ACPI
-- 
cgit v1.2.3


From 0ac2526064e5c7da5d1a47b48d37c1345e487258 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Wed, 1 Feb 2012 10:47:01 -0700
Subject: scx200_32: use PCI_VDEVICE

Replace PCI_DEVICE with PCI_VDEVICE to shorten device table.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/platform/scx200/scx200_32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
index 7e004acbe526..5dfd335ad50c 100644
--- a/arch/x86/platform/scx200/scx200_32.c
+++ b/arch/x86/platform/scx200/scx200_32.c
@@ -29,10 +29,10 @@ unsigned long scx200_gpio_shadow[2];
 unsigned scx200_cb_base = 0;
 
 static struct pci_device_id scx200_tbl[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS)   },
-	{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS)   },
+	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
+	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
+	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_XBUS)   },
+	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_XBUS)   },
 	{ },
 };
 MODULE_DEVICE_TABLE(pci,scx200_tbl);
-- 
cgit v1.2.3


From 8ad95f0958d3ea9a182ee4c5449c4847e0577cdc Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Wed, 1 Feb 2012 10:58:49 -0700
Subject: scx200_32: replace printks with pr_<level>s

update scx200_32.c to use pr_<level>, also 2 whitespaces.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/platform/scx200/scx200_32.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
index 5dfd335ad50c..7a9ad30d6c9f 100644
--- a/arch/x86/platform/scx200/scx200_32.c
+++ b/arch/x86/platform/scx200/scx200_32.c
@@ -17,8 +17,6 @@
 /* Verify that the configuration block really is there */
 #define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
 
-#define NAME "scx200"
-
 MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
 MODULE_DESCRIPTION("NatSemi SCx200 Driver");
 MODULE_LICENSE("GPL");
@@ -63,10 +61,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
 	if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
 	    pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
 		base = pci_resource_start(pdev, 0);
-		printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
+		pr_info("GPIO base 0x%x\n", base);
 
-		if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
-			printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
+		if (!request_region(base, SCx200_GPIO_SIZE,
+				    "NatSemi SCx200 GPIO")) {
+			pr_err("can't allocate I/O for GPIOs\n");
 			return -EBUSY;
 		}
 
@@ -82,11 +81,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
 			if (scx200_cb_probe(base)) {
 				scx200_cb_base = base;
 			} else {
-				printk(KERN_WARNING NAME ": Configuration Block not found\n");
+				pr_warn("Configuration Block not found\n");
 				return -ENODEV;
 			}
 		}
-		printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
+		pr_info("Configuration Block base 0x%x\n", scx200_cb_base);
 	}
 
 	return 0;
@@ -111,8 +110,7 @@ u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
 
 static int __init scx200_init(void)
 {
-	printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
-
+	pr_info("NatSemi SCx200 Driver\n");
 	return pci_register_driver(&scx200_pci_driver);
 }
 
-- 
cgit v1.2.3


From b43ab901d671e3e3cad425ea5e9a3c74e266dcdd Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 27 Jun 2011 09:26:23 +0200
Subject: gpio: Add a driver for Sodaville GPIO controller

Sodaville has GPIO controller behind the PCI bus. To my suprissed it is
not the same as on PXA.

The interrupt & gpio chip can be referenced from the device tree like
from any other driver. Unfortunately the driver which uses the gpio
interrupt has to use irq_of_parse_and_map() instead of
platform_get_irq(). The problem is that the platform device (which is
created from the device tree) is most likely created before the
interrupt chip is registered and therefore irq_of_parse_and_map() fails.

In theory the driver works as module. In reality most of the irq
functions are not exported to modules and it is possible that _this_
module is unloaded while the provided irqs are still in use.

Signed-off-by: Hans J. Koch <hjk@linutronix.de>
[torbenh@linutronix.de: make it work after the irq namespace cleanup,
	                add some device tree entries.]
Signed-off-by: Torben Hohn <torbenh@linutronix.de>
[bigeasy@linutronix.de: convert to generic irq & gpio chip]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[grant.likely@secretlab.ca: depend on x86 to avoid irq_domain breakage]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 .../devicetree/bindings/gpio/sodaville.txt         |  48 ++++
 arch/x86/platform/ce4100/falconfalls.dts           |   7 +-
 drivers/gpio/Kconfig                               |   8 +
 drivers/gpio/Makefile                              |   1 +
 drivers/gpio/gpio-sodaville.c                      | 302 +++++++++++++++++++++
 5 files changed, 364 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/gpio/sodaville.txt
 create mode 100644 drivers/gpio/gpio-sodaville.c

(limited to 'arch/x86')

diff --git a/Documentation/devicetree/bindings/gpio/sodaville.txt b/Documentation/devicetree/bindings/gpio/sodaville.txt
new file mode 100644
index 000000000000..563eff22b975
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/sodaville.txt
@@ -0,0 +1,48 @@
+GPIO controller on CE4100 / Sodaville SoCs
+==========================================
+
+The bindings for CE4100's GPIO controller match the generic description
+which is covered by the gpio.txt file in this folder.
+
+The only additional property is the intel,muxctl property which holds the
+value which is written into the MUXCNTL register.
+
+There is no compatible property for now because the driver is probed via
+PCI id (vendor 0x8086 device 0x2e67).
+
+The interrupt specifier consists of two cells encoded as follows:
+ - <1st cell>: The interrupt-number that identifies the interrupt source.
+ - <2nd cell>: The level-sense information, encoded as follows:
+		4 - active high level-sensitive
+		8 - active low level-sensitive
+
+Example of the GPIO device and one user:
+
+	pcigpio: gpio@b,1 {
+			/* two cells for GPIO and interrupt */
+			#gpio-cells = <2>;
+			#interrupt-cells = <2>;
+			compatible = "pci8086,2e67.2",
+					   "pci8086,2e67",
+					   "pciclassff0000",
+					   "pciclassff00";
+
+			reg = <0x15900 0x0 0x0 0x0 0x0>;
+			/* Interrupt line of the gpio device */
+			interrupts = <15 1>;
+			/* It is an interrupt and GPIO controller itself */
+			interrupt-controller;
+			gpio-controller;
+			intel,muxctl = <0>;
+	};
+
+	testuser@20 {
+			compatible = "example,testuser";
+			/* User the 11th GPIO line as an active high triggered
+			 * level interrupt
+			 */
+			interrupts = <11 8>;
+			interrupt-parent = <&pcigpio>;
+			/* Use this GPIO also with the gpio functions */
+			gpios = <&pcigpio 11 0>;
+	};
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index e70be38ce039..ce874f872cc6 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -208,16 +208,19 @@
 					interrupts = <14 1>;
 				};
 
-				gpio@b,1 {
+				pcigpio: gpio@b,1 {
+					#gpio-cells = <2>;
+					#interrupt-cells = <2>;
 					compatible = "pci8086,2e67.2",
 						   "pci8086,2e67",
 						   "pciclassff0000",
 						   "pciclassff00";
 
-					#gpio-cells = <2>;
 					reg = <0x15900 0x0 0x0 0x0 0x0>;
 					interrupts = <15 1>;
+					interrupt-controller;
 					gpio-controller;
+					intel,muxctl = <0>;
 				};
 
 				i2c-controller@b,2 {
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index eaa7d3828e70..dbb1909ca0a2 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -417,6 +417,14 @@ config GPIO_ML_IOH
 	  Hub) which is for IVI(In-Vehicle Infotainment) use.
 	  This driver can access the IOH's GPIO device.
 
+config GPIO_SODAVILLE
+	bool "Intel Sodaville GPIO support"
+	depends on X86 && PCI && OF
+	select GPIO_GENERIC
+	select GENERIC_IRQ_CHIP
+	help
+	  Say Y here to support Intel Sodaville GPIO.
+
 config GPIO_TIMBERDALE
 	bool "Support for timberdale GPIO IP"
 	depends on MFD_TIMBERDALE && HAS_IOMEM
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 8863a7f2dece..593bdcd1976e 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_GPIO_RDC321X)	+= gpio-rdc321x.o
 obj-$(CONFIG_PLAT_SAMSUNG)	+= gpio-samsung.o
 obj-$(CONFIG_ARCH_SA1100)	+= gpio-sa1100.o
 obj-$(CONFIG_GPIO_SCH)		+= gpio-sch.o
+obj-$(CONFIG_GPIO_SODAVILLE)	+= gpio-sodaville.o
 obj-$(CONFIG_GPIO_STMPE)	+= gpio-stmpe.o
 obj-$(CONFIG_GPIO_SX150X)	+= gpio-sx150x.o
 obj-$(CONFIG_GPIO_TC3589X)	+= gpio-tc3589x.o
diff --git a/drivers/gpio/gpio-sodaville.c b/drivers/gpio/gpio-sodaville.c
new file mode 100644
index 000000000000..9ba15d31d242
--- /dev/null
+++ b/drivers/gpio/gpio-sodaville.c
@@ -0,0 +1,302 @@
+/*
+ *  GPIO interface for Intel Sodaville SoCs.
+ *
+ *  Copyright (c) 2010, 2011 Intel Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License 2 as published
+ *  by the Free Software Foundation.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/of_irq.h>
+#include <linux/basic_mmio_gpio.h>
+
+#define DRV_NAME		"sdv_gpio"
+#define SDV_NUM_PUB_GPIOS	12
+#define PCI_DEVICE_ID_SDV_GPIO	0x2e67
+#define GPIO_BAR		0
+
+#define GPOUTR		0x00
+#define GPOER		0x04
+#define GPINR		0x08
+
+#define GPSTR		0x0c
+#define GPIT1R0		0x10
+#define GPIO_INT	0x14
+#define GPIT1R1		0x18
+
+#define GPMUXCTL	0x1c
+
+struct sdv_gpio_chip_data {
+	int irq_base;
+	void __iomem *gpio_pub_base;
+	struct irq_domain id;
+	struct irq_chip_generic *gc;
+	struct bgpio_chip bgpio;
+};
+
+static int sdv_gpio_pub_set_type(struct irq_data *d, unsigned int type)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	struct sdv_gpio_chip_data *sd = gc->private;
+	void __iomem *type_reg;
+	u32 irq_offs = d->irq - sd->irq_base;
+	u32 reg;
+
+	if (irq_offs < 8)
+		type_reg = sd->gpio_pub_base + GPIT1R0;
+	else
+		type_reg = sd->gpio_pub_base + GPIT1R1;
+
+	reg = readl(type_reg);
+
+	switch (type) {
+	case IRQ_TYPE_LEVEL_HIGH:
+		reg &= ~BIT(4 * (irq_offs % 8));
+		break;
+
+	case IRQ_TYPE_LEVEL_LOW:
+		reg |= BIT(4 * (irq_offs % 8));
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	writel(reg, type_reg);
+	return 0;
+}
+
+static irqreturn_t sdv_gpio_pub_irq_handler(int irq, void *data)
+{
+	struct sdv_gpio_chip_data *sd = data;
+	u32 irq_stat = readl(sd->gpio_pub_base + GPSTR);
+
+	irq_stat &= readl(sd->gpio_pub_base + GPIO_INT);
+	if (!irq_stat)
+		return IRQ_NONE;
+
+	while (irq_stat) {
+		u32 irq_bit = __fls(irq_stat);
+
+		irq_stat &= ~BIT(irq_bit);
+		generic_handle_irq(sd->irq_base + irq_bit);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int sdv_xlate(struct irq_domain *h, struct device_node *node,
+		const u32 *intspec, u32 intsize, irq_hw_number_t *out_hwirq,
+		u32 *out_type)
+{
+	u32 line, type;
+
+	if (node != h->of_node)
+		return -EINVAL;
+
+	if (intsize < 2)
+		return -EINVAL;
+
+	line = *intspec;
+	*out_hwirq = line;
+
+	intspec++;
+	type = *intspec;
+
+	switch (type) {
+	case IRQ_TYPE_LEVEL_LOW:
+	case IRQ_TYPE_LEVEL_HIGH:
+		*out_type = type;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct irq_domain_ops irq_domain_sdv_ops = {
+	.dt_translate	= sdv_xlate,
+};
+
+static __devinit int sdv_register_irqsupport(struct sdv_gpio_chip_data *sd,
+		struct pci_dev *pdev)
+{
+	struct irq_chip_type *ct;
+	int ret;
+
+	sd->irq_base = irq_alloc_descs(-1, 0, SDV_NUM_PUB_GPIOS, -1);
+	if (sd->irq_base < 0)
+		return sd->irq_base;
+
+	/* mask + ACK all interrupt sources */
+	writel(0, sd->gpio_pub_base + GPIO_INT);
+	writel((1 << 11) - 1, sd->gpio_pub_base + GPSTR);
+
+	ret = request_irq(pdev->irq, sdv_gpio_pub_irq_handler, IRQF_SHARED,
+			"sdv_gpio", sd);
+	if (ret)
+		goto out_free_desc;
+
+	sd->id.irq_base = sd->irq_base;
+	sd->id.of_node = of_node_get(pdev->dev.of_node);
+	sd->id.ops = &irq_domain_sdv_ops;
+
+	/*
+	 * This gpio irq controller latches level irqs. Testing shows that if
+	 * we unmask & ACK the IRQ before the source of the interrupt is gone
+	 * then the interrupt is active again.
+	 */
+	sd->gc = irq_alloc_generic_chip("sdv-gpio", 1, sd->irq_base,
+			sd->gpio_pub_base, handle_fasteoi_irq);
+	if (!sd->gc) {
+		ret = -ENOMEM;
+		goto out_free_irq;
+	}
+
+	sd->gc->private = sd;
+	ct = sd->gc->chip_types;
+	ct->type = IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW;
+	ct->regs.eoi = GPSTR;
+	ct->regs.mask = GPIO_INT;
+	ct->chip.irq_mask = irq_gc_mask_clr_bit;
+	ct->chip.irq_unmask = irq_gc_mask_set_bit;
+	ct->chip.irq_eoi = irq_gc_eoi;
+	ct->chip.irq_set_type = sdv_gpio_pub_set_type;
+
+	irq_setup_generic_chip(sd->gc, IRQ_MSK(SDV_NUM_PUB_GPIOS),
+			IRQ_GC_INIT_MASK_CACHE, IRQ_NOREQUEST,
+			IRQ_LEVEL | IRQ_NOPROBE);
+
+	irq_domain_add(&sd->id);
+	return 0;
+out_free_irq:
+	free_irq(pdev->irq, sd);
+out_free_desc:
+	irq_free_descs(sd->irq_base, SDV_NUM_PUB_GPIOS);
+	return ret;
+}
+
+static int __devinit sdv_gpio_probe(struct pci_dev *pdev,
+					const struct pci_device_id *pci_id)
+{
+	struct sdv_gpio_chip_data *sd;
+	unsigned long addr;
+	const void *prop;
+	int len;
+	int ret;
+	u32 mux_val;
+
+	sd = kzalloc(sizeof(struct sdv_gpio_chip_data), GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "can't enable device.\n");
+		goto done;
+	}
+
+	ret = pci_request_region(pdev, GPIO_BAR, DRV_NAME);
+	if (ret) {
+		dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", GPIO_BAR);
+		goto disable_pci;
+	}
+
+	addr = pci_resource_start(pdev, GPIO_BAR);
+	if (!addr)
+		goto release_reg;
+	sd->gpio_pub_base = ioremap(addr, pci_resource_len(pdev, GPIO_BAR));
+
+	prop = of_get_property(pdev->dev.of_node, "intel,muxctl", &len);
+	if (prop && len == 4) {
+		mux_val = of_read_number(prop, 1);
+		writel(mux_val, sd->gpio_pub_base + GPMUXCTL);
+	}
+
+	ret = bgpio_init(&sd->bgpio, &pdev->dev, 4,
+			sd->gpio_pub_base + GPINR, sd->gpio_pub_base + GPOUTR,
+			NULL, sd->gpio_pub_base + GPOER, NULL, false);
+	if (ret)
+		goto unmap;
+	sd->bgpio.gc.ngpio = SDV_NUM_PUB_GPIOS;
+
+	ret = gpiochip_add(&sd->bgpio.gc);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "gpiochip_add() failed.\n");
+		goto unmap;
+	}
+
+	ret = sdv_register_irqsupport(sd, pdev);
+	if (ret)
+		goto unmap;
+
+	pci_set_drvdata(pdev, sd);
+	dev_info(&pdev->dev, "Sodaville GPIO driver registered.\n");
+	return 0;
+
+unmap:
+	iounmap(sd->gpio_pub_base);
+release_reg:
+	pci_release_region(pdev, GPIO_BAR);
+disable_pci:
+	pci_disable_device(pdev);
+done:
+	kfree(sd);
+	return ret;
+}
+
+static void sdv_gpio_remove(struct pci_dev *pdev)
+{
+	struct sdv_gpio_chip_data *sd = pci_get_drvdata(pdev);
+
+	irq_domain_del(&sd->id);
+	free_irq(pdev->irq, sd);
+	irq_free_descs(sd->irq_base, SDV_NUM_PUB_GPIOS);
+
+	if (gpiochip_remove(&sd->bgpio.gc))
+		dev_err(&pdev->dev, "gpiochip_remove() failed.\n");
+
+	pci_release_region(pdev, GPIO_BAR);
+	iounmap(sd->gpio_pub_base);
+	pci_disable_device(pdev);
+	kfree(sd);
+}
+
+static struct pci_device_id sdv_gpio_pci_ids[] __devinitdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SDV_GPIO) },
+	{ 0, },
+};
+
+static struct pci_driver sdv_gpio_driver = {
+	.name = DRV_NAME,
+	.id_table = sdv_gpio_pci_ids,
+	.probe = sdv_gpio_probe,
+	.remove = sdv_gpio_remove,
+};
+
+static int __init sdv_gpio_init(void)
+{
+	return pci_register_driver(&sdv_gpio_driver);
+}
+module_init(sdv_gpio_init);
+
+static void __exit sdv_gpio_exit(void)
+{
+	pci_unregister_driver(&sdv_gpio_driver);
+}
+module_exit(sdv_gpio_exit);
+
+MODULE_AUTHOR("Hans J. Koch <hjk@linutronix.de>");
+MODULE_DESCRIPTION("GPIO interface for Intel Sodaville SoCs");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 7931d493051ea9b09e4fddee2dc40b2eb88d62b9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 3 Feb 2012 15:06:26 +0000
Subject: x86/spinlocks: Eliminate TICKET_MASK

The definition of it being questionable already (unnecessarily
including a cast), and it being used in a single place that can
be written shorter without it, remove this #define.

Along the same lines, simplify __ticket_spin_is_locked()'s main
expression, which was the more convoluted way because of needs
that went away with the recent type changes by Jeremy.

This is pure cleanup, no functional change intended.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/spinlock.h       | 4 ++--
 arch/x86/include/asm/spinlock_types.h | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index a82c2bf504b6..76bfa2cf301d 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
 
-	return !!(tmp.tail ^ tmp.head);
+	return tmp.tail != tmp.head;
 }
 
 static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
 
-	return ((tmp.tail - tmp.head) & TICKET_MASK) > 1;
+	return (__ticket_t)(tmp.tail - tmp.head) > 1;
 }
 
 #ifndef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 8ebd5df7451e..ad0ad07fc006 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -16,7 +16,6 @@ typedef u32 __ticketpair_t;
 #endif
 
 #define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
-#define TICKET_MASK	((__ticket_t)((1 << TICKET_SHIFT) - 1))
 
 typedef struct arch_spinlock {
 	union {
-- 
cgit v1.2.3


From c1d2f1bccf4259384e581b937e694ee8a350fe55 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Mon, 6 Feb 2012 13:28:55 -0500
Subject: x86/microcode: Remove noisy AMD microcode warning

AMD processors will never support /dev/cpu/microcode updating so
just silently fail instead of printing out a warning for every
cpu.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1328552935-965-1-git-send-email-prarit@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ac0417be9131..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -360,7 +360,6 @@ out:
 static enum ucode_state
 request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-	pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
 	return UCODE_ERROR;
 }
 
-- 
cgit v1.2.3


From c98fdeaa92731308ed80386261fa2589addefa47 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Tue, 7 Feb 2012 13:08:52 +0100
Subject: x86/sched/perf/AMD: Set sched_clock_stable

Stephane Eranian reported that doing a scheduler latency
measurements with perf on AMD doesn't work out as expected due
to the fact that the sched_clock() granularity is too coarse,
i.e. done in jiffies due to the sched_clock_stable not set,
which, if set, would mean that we get to use the TSC as sample
source which would give us much higher precision.

However, there's no reason not to set sched_clock_stable on AMD
because all families from F10h and upwards do have an invariant
TSC and have the CPUID flag to prove (CPUID_8000_0007_EDX[8]).

Make it so, #1.

Signed-off-by: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@amd64.org>
Cc: Venki Pallipadi <venki@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Link: http://lkml.kernel.org/r/20120206132546.GA30854@quad
[ Should any non-standard system break the TSC, we should
  mark them so explicitly, in their platform init handler, or
  in a DMI quirk. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 
 #include <linux/io.h>
+#include <linux/sched.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+		if (!check_tsc_unstable())
+			sched_clock_stable = 1;
 	}
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 7 Feb 2012 14:39:57 +0100
Subject: perf: Fix double start/stop in x86_pmu_start()

The following patch fixes a bug introduced by the following
commit:

        e050e3f0a71b ("perf: Fix broken interrupt rate throttling")

The patch caused the following warning to pop up depending on
the sampling frequency adjustments:

  ------------[ cut here ]------------
  WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4()

It was caused by the following call sequence:

perf_adjust_freq_unthr_context.part() {
     stop()
     if (delta > 0) {
          perf_adjust_period() {
              if (period > 8*...) {
                  stop()
                  ...
                  start()
              }
          }
      }
      start()
}

Which caused a double start and a double stop, thus triggering
the assert in x86_pmu_start().

The patch fixes the problem by avoiding the double calls. We
pass a new argument to perf_adjust_period() to indicate whether
or not the event is already stopped. We can't just remove the
start/stop from that function because it's called from
__perf_event_overflow where the event needs to be reloaded via a
stop/start back-toback call.

The patch reintroduces the assertion in x86_pmu_start() which
was removed by commit:

	84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()")

In this second version, we've added calls to disable/enable PMU
during unthrottling or frequency adjustment based on bug report
of spurious NMI interrupts from Eric Dumazet.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: markus@trippelsdorf.de
Cc: paulus@samba.org
Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad
[ Minor edits to the changelog and to the code ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c |  3 +++
 kernel/events/core.c             | 19 ++++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2a30e5ae6acf..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx = event->hw.idx;
 
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
 	if (WARN_ON_ONCE(idx == -1))
 		return;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ba36013cfb21..1b5c081d8b9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2303,7 +2303,7 @@ do {					\
 static DEFINE_PER_CPU(int, perf_throttled_count);
 static DEFINE_PER_CPU(u64, perf_throttled_seq);
 
-static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	s64 period, sample_period;
@@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		event->pmu->stop(event, PERF_EF_UPDATE);
+		if (disable)
+			event->pmu->stop(event, PERF_EF_UPDATE);
+
 		local64_set(&hwc->period_left, 0);
-		event->pmu->start(event, PERF_EF_RELOAD);
+
+		if (disable)
+			event->pmu->start(event, PERF_EF_RELOAD);
 	}
 }
 
@@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		return;
 
 	raw_spin_lock(&ctx->lock);
+	perf_pmu_disable(ctx->pmu);
 
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		/*
 		 * restart the event
 		 * reload only if value has changed
+		 * we have stopped the event so tell that
+		 * to perf_adjust_period() to avoid stopping it
+		 * twice.
 		 */
 		if (delta > 0)
-			perf_adjust_period(event, period, delta);
+			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
 	}
 
+	perf_pmu_enable(ctx->pmu);
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		hwc->freq_time_stamp = now;
 
 		if (delta > 0 && delta < 2*TICK_NSEC)
-			perf_adjust_period(event, delta, hwc->last_period);
+			perf_adjust_period(event, delta, hwc->last_period, true);
 	}
 
 	/*
-- 
cgit v1.2.3


From 32c3233885eb10ac9cb9410f2f8cd64b8df2b2a1 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Wed, 8 Feb 2012 20:52:29 +0100
Subject: x86/amd: Fix L1i and L2 cache sharing information for AMD family 15h
 processors

For L1 instruction cache and L2 cache the shared CPU information
is wrong. On current AMD family 15h CPUs those caches are shared
between both cores of a compute unit.

This fixes https://bugzilla.kernel.org/show_bug.cgi?id=42607

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Petkov Borislav <Borislav.Petkov@amd.com>
Cc: Dave Jones <davej@redhat.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/20120208195229.GA17523@alberich.amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 44 ++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-					int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
 	int node;
 
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-	struct _cpuid4_info	*this_leaf, *sibling_leaf;
-	unsigned long num_threads_sharing;
-	int index_msb, i, sibling;
+	struct _cpuid4_info *this_leaf;
+	int ret, i, sibling;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+	ret = 0;
+	if (index == 3) {
+		ret = 1;
 		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 				set_bit(sibling, this_leaf->shared_cpu_map);
 			}
 		}
-		return;
+	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+		ret = 1;
+		for_each_cpu(i, cpu_sibling_mask(cpu)) {
+			if (!per_cpu(ici_cpuid4_info, i))
+				continue;
+			this_leaf = CPUID4_INFO_IDX(i, index);
+			for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+				if (!cpu_online(sibling))
+					continue;
+				set_bit(sibling, this_leaf->shared_cpu_map);
+			}
+		}
 	}
+
+	return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+	struct _cpuid4_info *this_leaf, *sibling_leaf;
+	unsigned long num_threads_sharing;
+	int index_msb, i;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		if (cache_shared_amd_cpu_map_setup(cpu, index))
+			return;
+	}
+
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
 	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
 
-- 
cgit v1.2.3


From f8d98f1095210da708a59f3a0b6fd267ad8f3f03 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 10 Feb 2012 14:33:40 +0900
Subject: x86: Fix to decode grouped AVX with VEX pp bits

Fix to decode grouped AVX with VEX pp bits which should be
handled as same as last-prefixes. This fixes below warnings
in posttest with CONFIG_CRYPTO_SHA1_SSSE3=y.

 Warning: arch/x86/tools/test_get_len found difference at <sha1_transform_avx>:ffffffff810d5fc0
 Warning: ffffffff810d6069:	c5 f9 73 de 04       	vpsrldq $0x4,%xmm6,%xmm0
 Warning: objdump says 5 bytes, but insn_get_length() says 4
 ...

With this change, test_get_len can decode it correctly.

 $ arch/x86/tools/test_get_len -v -y
 ffffffff810d6069:       c5 f9 73 de 04          vpsrldq $0x4,%xmm6,%xmm0
 Succeed: decoded and checked 1 instructions

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Link: http://lkml.kernel.org/r/20120210053340.30429.73410.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/inat.h |  5 +++--
 arch/x86/include/asm/insn.h | 18 ++++++++++++------
 arch/x86/lib/inat.c         | 36 ++++++++++++++++++------------------
 arch/x86/lib/insn.c         | 13 +++++++------
 4 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 205b063e3e32..74a2e312e8a2 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -97,11 +97,12 @@
 
 /* Attribute search APIs */
 extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
 extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
-					     insn_byte_t last_pfx,
+					     int lpfx_id,
 					     insn_attr_t esc_attr);
 extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
-					    insn_byte_t last_pfx,
+					    int lpfx_id,
 					    insn_attr_t esc_attr);
 extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
 					  insn_byte_t vex_m,
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 74df3f1eddfd..48eb30a86062 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -96,12 +96,6 @@ struct insn {
 #define X86_VEX_P(vex)	((vex) & 0x03)		/* VEX3 Byte2, VEX2 Byte1 */
 #define X86_VEX_M_MAX	0x1f			/* VEX3.M Maximum value */
 
-/* The last prefix is needed for two-byte and three-byte opcodes */
-static inline insn_byte_t insn_last_prefix(struct insn *insn)
-{
-	return insn->prefixes.bytes[3];
-}
-
 extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
 extern void insn_get_prefixes(struct insn *insn);
 extern void insn_get_opcode(struct insn *insn);
@@ -160,6 +154,18 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
 		return X86_VEX_P(insn->vex_prefix.bytes[2]);
 }
 
+/* Get the last prefix id from last prefix or VEX prefix */
+static inline int insn_last_prefix_id(struct insn *insn)
+{
+	if (insn_is_avx(insn))
+		return insn_vex_p_bits(insn);	/* VEX_p is a SIMD prefix id */
+
+	if (insn->prefixes.bytes[3])
+		return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
+
+	return 0;
+}
+
 /* Offset of each field from kaddr */
 static inline int insn_offset_rex_prefix(struct insn *insn)
 {
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 88ad5fbda6e1..c1f01a8e9f65 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -29,46 +29,46 @@ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
 	return inat_primary_table[opcode];
 }
 
-insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+	insn_attr_t lpfx_attr;
+
+	lpfx_attr = inat_get_opcode_attribute(last_pfx);
+	return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
 				      insn_attr_t esc_attr)
 {
 	const insn_attr_t *table;
-	insn_attr_t lpfx_attr;
-	int n, m = 0;
+	int n;
 
 	n = inat_escape_id(esc_attr);
-	if (last_pfx) {
-		lpfx_attr = inat_get_opcode_attribute(last_pfx);
-		m = inat_last_prefix_id(lpfx_attr);
-	}
+
 	table = inat_escape_tables[n][0];
 	if (!table)
 		return 0;
-	if (inat_has_variant(table[opcode]) && m) {
-		table = inat_escape_tables[n][m];
+	if (inat_has_variant(table[opcode]) && lpfx_id) {
+		table = inat_escape_tables[n][lpfx_id];
 		if (!table)
 			return 0;
 	}
 	return table[opcode];
 }
 
-insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
 				     insn_attr_t grp_attr)
 {
 	const insn_attr_t *table;
-	insn_attr_t lpfx_attr;
-	int n, m = 0;
+	int n;
 
 	n = inat_group_id(grp_attr);
-	if (last_pfx) {
-		lpfx_attr = inat_get_opcode_attribute(last_pfx);
-		m = inat_last_prefix_id(lpfx_attr);
-	}
+
 	table = inat_group_tables[n][0];
 	if (!table)
 		return inat_group_common_attribute(grp_attr);
-	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
-		table = inat_group_tables[n][m];
+	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+		table = inat_group_tables[n][lpfx_id];
 		if (!table)
 			return inat_group_common_attribute(grp_attr);
 	}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 5a1f9f3e3fbb..25feb1ae71c5 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -185,7 +185,8 @@ err_out:
 void insn_get_opcode(struct insn *insn)
 {
 	struct insn_field *opcode = &insn->opcode;
-	insn_byte_t op, pfx;
+	insn_byte_t op;
+	int pfx_id;
 	if (opcode->got)
 		return;
 	if (!insn->prefixes.got)
@@ -212,8 +213,8 @@ void insn_get_opcode(struct insn *insn)
 		/* Get escaped opcode */
 		op = get_next(insn_byte_t, insn);
 		opcode->bytes[opcode->nbytes++] = op;
-		pfx = insn_last_prefix(insn);
-		insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+		pfx_id = insn_last_prefix_id(insn);
+		insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
 	}
 	if (inat_must_vex(insn->attr))
 		insn->attr = 0;	/* This instruction is bad */
@@ -235,7 +236,7 @@ err_out:
 void insn_get_modrm(struct insn *insn)
 {
 	struct insn_field *modrm = &insn->modrm;
-	insn_byte_t pfx, mod;
+	insn_byte_t pfx_id, mod;
 	if (modrm->got)
 		return;
 	if (!insn->opcode.got)
@@ -246,8 +247,8 @@ void insn_get_modrm(struct insn *insn)
 		modrm->value = mod;
 		modrm->nbytes = 1;
 		if (inat_is_group(insn->attr)) {
-			pfx = insn_last_prefix(insn);
-			insn->attr = inat_get_group_attribute(mod, pfx,
+			pfx_id = insn_last_prefix_id(insn);
+			insn->attr = inat_get_group_attribute(mod, pfx_id,
 							      insn->attr);
 			if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
 				insn->attr = 0;	/* This is bad */
-- 
cgit v1.2.3


From 21c3fcf3e39353d4f21d50e257cc74f3204b1988 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 12 Feb 2012 09:53:57 -0800
Subject: x86/debug: Fix/improve the show_msr=<cpus> debug print out

Found out that show_msr=<cpus> is broken, when I asked a
user to use it to capture debug info about broken MTRR's
whose MTRR settings are probably different between CPUs.

Only the first CPUs MSRs are printed, but that is not
enough to track down the suspected bug.

For years we called print_cpu_msr from print_cpu_info(),
but this commit:

| commit 2eaad1fddd7450a48ad464229775f97fbfe8af36
| Author: Mike Travis <travis@sgi.com>
| Date:   Thu Dec 10 17:19:36 2009 -0800
|
|    x86: Limit the number of processor bootup messages

removed the print_cpu_info() call from all APs.

Put it back - it will only print MSRs when the user
specifically requests them via show_msr=<cpus>.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Mike Travis <travis@sgi.com>
Link: http://lkml.kernel.org/r/1329069237-11483-1-git-send-email-yinghai@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/common.c     | 14 +++++++-------
 arch/x86/kernel/smpboot.c        |  5 +++--
 3 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index aa9088c26931..8bb062bbcbec 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -162,6 +162,7 @@ extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
+void print_cpu_msr(struct cpuinfo_x86 *);
 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..8b6a3bb57d8e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -933,7 +933,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {
 	{ 0xc0011000, 0xc001103b},
 };
 
-static void __cpuinit print_cpu_msr(void)
+static void __cpuinit __print_cpu_msr(void)
 {
 	unsigned index_min, index_max;
 	unsigned index;
@@ -997,13 +997,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 	else
 		printk(KERN_CONT "\n");
 
-#ifdef CONFIG_SMP
+	__print_cpu_msr();
+}
+
+void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+{
 	if (c->cpu_index < show_msr)
-		print_cpu_msr();
-#else
-	if (show_msr)
-		print_cpu_msr();
-#endif
+		__print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..257049d7c657 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -791,9 +791,10 @@ do_rest:
 			schedule();
 		}
 
-		if (cpumask_test_cpu(cpu, cpu_callin_mask))
+		if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+			print_cpu_msr(&cpu_data(cpu));
 			pr_debug("CPU%d: has booted.\n", cpu);
-		else {
+		} else {
 			boot_error = 1;
 			if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
 			    == 0xA5A5A5A5)
-- 
cgit v1.2.3


From 484546509ce5d49d43ec0a6eb2141c6bf3362bfc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 7 Feb 2012 09:40:30 -0500
Subject: x86/tracing: Denote the power and cpuidle tracepoints as _rcuidle()

The power and cpuidle tracepoints are called within a rcu_idle_exit()
section, and must be denoted with the _rcuidle() version of the tracepoint.

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/process.c    | 24 ++++++++++++------------
 include/trace/events/power.h |  2 ++
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..44eefde92109 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,8 +377,8 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
-		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
-		trace_cpu_idle(1, smp_processor_id());
+		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -391,8 +391,8 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		trace_power_end(smp_processor_id());
-		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+		trace_power_end_rcuidle(smp_processor_id());
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 static void mwait_idle(void)
 {
 	if (!need_resched()) {
-		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
-		trace_cpu_idle(1, smp_processor_id());
+		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
 		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
@@ -461,8 +461,8 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
-		trace_power_end(smp_processor_id());
-		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+		trace_power_end_rcuidle(smp_processor_id());
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	} else
 		local_irq_enable();
 }
@@ -474,13 +474,13 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
-	trace_power_start(POWER_CSTATE, 0, smp_processor_id());
-	trace_cpu_idle(0, smp_processor_id());
+	trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
+	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
-	trace_power_end(smp_processor_id());
-	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+	trace_power_end_rcuidle(smp_processor_id());
+	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 }
 
 /*
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 1bcc2a8c00e2..14b38940062b 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -151,6 +151,8 @@ enum {
    events get removed */
 static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
 static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_start_rcuidle(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end_rcuidle(u64 cpuid) {};
 static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
 #endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
 
-- 
cgit v1.2.3


From be98c2cdb15ba26148cd2bd58a857d4f7759ed38 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Feb 2012 13:47:25 -0800
Subject: i387: math_state_restore() isn't called from asm

It was marked asmlinkage for some really old and stale legacy reasons.
Fix that and the equally stale comment.

Noticed when debugging the irq_fpu_usable() bugs.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 2 +-
 arch/x86/kernel/traps.c     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 6919e936345b..a5c7ae504176 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -29,7 +29,7 @@ extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
-extern asmlinkage void math_state_restore(void);
+extern void math_state_restore(void);
 extern void __math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 482ec3af2067..982433b5da30 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -599,10 +599,10 @@ void __math_state_restore(void)
  * Careful.. There are problems with IBM-designed IRQ13 behaviour.
  * Don't touch unless you *really* know how it works.
  *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
+ * Must be called with kernel preemption disabled (eg with local
+ * local interrupts as in the case of do_device_not_available).
  */
-asmlinkage void math_state_restore(void)
+void math_state_restore(void)
 {
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
-- 
cgit v1.2.3


From 5b1cbac37798805c1fee18c8cebe5c0a13975b17 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 13 Feb 2012 13:56:14 -0800
Subject: i387: make irq_fpu_usable() tests more robust

Some code - especially the crypto layer - wants to use the x86
FP/MMX/AVX register set in what may be interrupt (typically softirq)
context.

That *can* be ok, but the tests for when it was ok were somewhat
suspect.  We cannot touch the thread-specific status bits either, so
we'd better check that we're not going to try to save FP state or
anything like that.

Now, it may be that the TS bit is always cleared *before* we set the
USEDFPU bit (and only set when we had already cleared the USEDFP
before), so the TS bit test may actually have been sufficient, but it
certainly was not obviously so.

So this explicitly verifies that we will not touch the TS_USEDFPU bit,
and adds a few related sanity-checks.  Because it seems that somehow
AES-NI is corrupting user FP state.  The cause is not clear, and this
patch doesn't fix it, but while debugging it I really wanted the code to
be more obviously correct and robust.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 54 ++++++++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/traps.c     |  1 +
 2 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a5c7ae504176..a29571821b99 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -307,9 +307,54 @@ static inline void __clear_fpu(struct task_struct *tsk)
 	}
 }
 
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * We can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: TS_USEDFPU must be clear (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ */
+static inline bool interrupted_kernel_fpu_idle(void)
+{
+	return !(current_thread_info()->status & TS_USEDFPU) &&
+		(read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static inline bool interrupted_user_mode(void)
+{
+	struct pt_regs *regs = get_irq_regs();
+	return regs && user_mode_vm(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+static inline bool irq_fpu_usable(void)
+{
+	return !in_interrupt() ||
+		interrupted_user_mode() ||
+		interrupted_kernel_fpu_idle();
+}
+
 static inline void kernel_fpu_begin(void)
 {
 	struct thread_info *me = current_thread_info();
+
+	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
 	if (me->status & TS_USEDFPU)
 		__save_init_fpu(me->task);
@@ -323,14 +368,6 @@ static inline void kernel_fpu_end(void)
 	preempt_enable();
 }
 
-static inline bool irq_fpu_usable(void)
-{
-	struct pt_regs *regs;
-
-	return !in_interrupt() || !(regs = get_irq_regs()) || \
-		user_mode(regs) || (read_cr0() & X86_CR0_TS);
-}
-
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
  * DNA fault but don't modify SSE registers. And these instructions
@@ -367,6 +404,7 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
+	WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU);
 	preempt_disable();
 	__save_init_fpu(tsk);
 	stts();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 982433b5da30..8ba27dbc107a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -631,6 +631,7 @@ EXPORT_SYMBOL_GPL(math_state_restore);
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
+	WARN_ON_ONCE(!user_mode_vm(regs));
 #ifdef CONFIG_MATH_EMULATION
 	if (read_cr0() & X86_CR0_EM) {
 		struct math_emu_info info = { };
-- 
cgit v1.2.3


From 70142a9dd154f54f7409871ead86f7d77f2c6576 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sat, 11 Feb 2012 22:56:59 +0000
Subject: x86/cpu: Fix overrun check in arch_print_cpu_modalias()

snprintf() does not return a negative value when truncating.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Acked-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/cpu/match.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 940e2d483076..2dfa52bcdfe2 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -67,7 +67,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev,
 	for (i = 0; i < NCAPINTS*32; i++) {
 		if (boot_cpu_has(i)) {
 			n = snprintf(buf, size, ",%04X", i);
-			if (n < 0) {
+			if (n >= size) {
 				WARN(1, "x86 features overflow page\n");
 				break;
 			}
-- 
cgit v1.2.3


From 5467bdda4a326513c2f14b712a22d59115b7ae94 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sat, 11 Feb 2012 22:57:19 +0000
Subject: x86/cpu: Clean up modalias feature matching

We currently include commas on both sides of the feature ID in a
modalias, but this prevents the lowest numbered feature of a CPU from
being matched.  Since all feature IDs have the same length, we do not
need to worry about substring matches, so omit commas from the
modalias entirely.

Avoid generating multiple adjacent wildcards when there is no
feature ID to match.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Acked-by: Thomas Renninger <trenn@suse.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/kernel/cpu/match.c | 3 +--
 scripts/mod/file2alias.c    | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 2dfa52bcdfe2..5502b289341b 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -63,7 +63,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev,
 		boot_cpu_data.x86_model);
 	size -= n;
 	buf += n;
-	size -= 2;
+	size -= 1;
 	for (i = 0; i < NCAPINTS*32; i++) {
 		if (boot_cpu_has(i)) {
 			n = snprintf(buf, size, ",%04X", i);
@@ -75,7 +75,6 @@ ssize_t arch_print_cpu_modalias(struct device *dev,
 			buf += n;
 		}
 	}
-	*buf++ = ',';
 	*buf++ = '\n';
 	return buf - bufptr;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index a468af059834..78fd81fb9732 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1021,8 +1021,9 @@ static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id,
 	ADD(alias, "vendor:",  id->vendor != X86_VENDOR_ANY, id->vendor);
 	ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family);
 	ADD(alias, ":model:",  id->model  != X86_MODEL_ANY,  id->model);
-	ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature);
-	strcat(alias, ",*");
+	strcat(alias, ":feature:*");
+	if (id->feature != X86_FEATURE_ANY)
+		sprintf(alias + strlen(alias), "%04X*", id->feature);
 	return 1;
 }
 ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry);
-- 
cgit v1.2.3


From 8d21190e223a785a351a1078ac6e3700809969b6 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 9 Feb 2012 23:02:16 +0100
Subject: crypto: twofish-x86 - Remove dead code from
 twofish_glue_3way.c::init()

We can never reach the line just after the 'return 0'
statement. Remove it.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 0afd134d8c9c..2c7f14ec7082 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -706,7 +706,6 @@ int __init init(void)
 
 	return 0;
 
-	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-- 
cgit v1.2.3


From 6e77fe8c1100bfb3c6f5b2558d4556519b837b65 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 9 Feb 2012 23:16:04 +0100
Subject: crypto: serpent-sse2 - remove dead code from
 serpent_sse2_glue.c::serpent_sse2_init()

We cannot reach the line after 'return err'. Remove it.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 7955a9b76b91..de81cf4e06a1 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -1025,7 +1025,6 @@ static int __init serpent_sse2_init(void)
 		goto ablk_xts_err;
 	return err;
 
-	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
-- 
cgit v1.2.3


From 925845bd49c6de437dfab3bf8dc654ea3ae21d74 Mon Sep 17 00:00:00 2001
From: Myron Stowe <mstowe@redhat.com>
Date: Mon, 21 Nov 2011 11:54:13 -0700
Subject: x86/PCI: Infrastructure to maintain a list of FW-assigned BIOS BAR
 values

Commit 58c84eda075 introduced functionality to try and reinstate the
original BIOS BAR addresses of a PCI device when normal resource
assignment attempts fail.  To keep track of the BIOS BAR addresses,
struct pci_dev was augmented with an array to hold the BAR addresses
of the PCI device: 'resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]'.

The reinstatement of BAR addresses is an uncommon event leaving the
'fw_addr' array unused under normal circumstances.  This functionality
is also currently architecture specific with an implementation limited
to x86.  As the use of struct pci_dev is so prevalent, having the
'fw_addr' array residing within such seems somewhat wasteful.

This patch introduces a stand alone data structure and interfacing
routines for maintaining a list of FW-assigned BIOS BAR value entries.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/i386.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 80 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 91821a1a0c3a..5a1edf2b5386 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -39,6 +39,85 @@
 #include <asm/io_apic.h>
 
 
+/*
+ * This list of dynamic mappings is for temporarily maintaining
+ * original BIOS BAR addresses for possible reinstatement.
+ */
+struct pcibios_fwaddrmap {
+	struct list_head list;
+	struct pci_dev *dev;
+	resource_size_t fw_addr[DEVICE_COUNT_RESOURCE];
+};
+
+static LIST_HEAD(pcibios_fwaddrmappings);
+static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock);
+
+/* Must be called with 'pcibios_fwaddrmap_lock' lock held. */
+static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
+{
+	struct pcibios_fwaddrmap *map;
+
+	list_for_each_entry(map, &pcibios_fwaddrmappings, list)
+		if (map->dev == dev)
+			return map;
+
+	return NULL;
+}
+
+static void
+pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr)
+{
+	unsigned long flags;
+	struct pcibios_fwaddrmap *map;
+
+	spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+	map = pcibios_fwaddrmap_lookup(dev);
+	if (!map) {
+		spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+		map = kzalloc(sizeof(*map), GFP_KERNEL);
+		if (!map)
+			return;
+
+		map->dev = pci_dev_get(dev);
+		map->fw_addr[idx] = fw_addr;
+		INIT_LIST_HEAD(&map->list);
+
+		spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+		list_add_tail(&map->list, &pcibios_fwaddrmappings);
+	} else
+		map->fw_addr[idx] = fw_addr;
+	spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+}
+
+resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
+{
+	unsigned long flags;
+	struct pcibios_fwaddrmap *map;
+	resource_size_t fw_addr = 0;
+
+	spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+	map = pcibios_fwaddrmap_lookup(dev);
+	if (map)
+		fw_addr = map->fw_addr[idx];
+	spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+
+	return fw_addr;
+}
+
+static void pcibios_fw_addr_list_del(void)
+{
+	unsigned long flags;
+	struct pcibios_fwaddrmap *entry, *next;
+
+	spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+	list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) {
+		list_del(&entry->list);
+		pci_dev_put(entry->dev);
+		kfree(entry);
+	}
+	spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+}
+
 static int
 skip_isa_ioresource_align(struct pci_dev *dev) {
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a16b1df3deff..8e9a307e58b8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -891,6 +891,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void
 int pci_vpd_truncate(struct pci_dev *dev, size_t size);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
+resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
 void pci_bus_assign_resources(const struct pci_bus *bus);
 void pci_bus_size_bridges(struct pci_bus *bus);
 int pci_claim_resource(struct pci_dev *, int);
-- 
cgit v1.2.3


From 6535943fbf25c8e9419a6b20ca992633baa0bf99 Mon Sep 17 00:00:00 2001
From: Myron Stowe <mstowe@redhat.com>
Date: Mon, 21 Nov 2011 11:54:19 -0700
Subject: x86/PCI: Convert maintaining FW-assigned BIOS BAR values to use a
 list

This patch converts the underlying maintenance aspects of FW-assigned
BIOS BAR values from a statically allocated array within struct pci_dev
to a list of temporary, stand alone, entries.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/i386.c     |  4 +++-
 drivers/pci/setup-res.c | 24 +++++++++++++++++++++---
 include/linux/pci.h     |  1 -
 3 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5a1edf2b5386..33e6a0b995fc 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -261,7 +261,8 @@ static void __init pcibios_allocate_resources(int pass)
 					idx, r, disabled, pass);
 				if (pci_claim_resource(dev, idx) < 0) {
 					/* We'll assign a new address later */
-					dev->fw_addr[idx] = r->start;
+					pcibios_save_fw_addr(dev,
+							idx, r->start);
 					r->end -= r->start;
 					r->start = 0;
 				}
@@ -307,6 +308,7 @@ static int __init pcibios_assign_resources(void)
 	}
 
 	pci_assign_unassigned_resources();
+	pcibios_fw_addr_list_del();
 
 	return 0;
 }
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 3cf47d34becf..85c8470c35e2 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -158,16 +158,34 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev,
 	return ret;
 }
 
+/*
+ * Generic function that returns a value indicating that the device's
+ * original BIOS BAR address was not saved and so is not available for
+ * reinstatement.
+ *
+ * Can be over-ridden by architecture specific code that implements
+ * reinstatement functionality rather than leaving it disabled when
+ * normal allocation attempts fail.
+ */
+resource_size_t __weak pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
+{
+	return 0;
+}
+
 static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev, 
 		int resno, resource_size_t size)
 {
 	struct resource *root, *conflict;
-	resource_size_t start, end;
+	resource_size_t fw_addr, start, end;
 	int ret = 0;
 
+	fw_addr = pcibios_retrieve_fw_addr(dev, resno);
+	if (!fw_addr)
+		return 1;
+
 	start = res->start;
 	end = res->end;
-	res->start = dev->fw_addr[resno];
+	res->start = fw_addr;
 	res->end = res->start + size - 1;
 
 	root = pci_find_parent_resource(dev, res);
@@ -271,7 +289,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	 * where firmware left it.  That at least has a chance of
 	 * working, which is better than just leaving it disabled.
 	 */
-	if (ret < 0 && dev->fw_addr[resno])
+	if (ret < 0)
 		ret = pci_revert_fw_address(res, dev, resno, size);
 
 	if (!ret) {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8e9a307e58b8..4afabb1d2d27 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -299,7 +299,6 @@ struct pci_dev {
 	 */
 	unsigned int	irq;
 	struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
-	resource_size_t	fw_addr[DEVICE_COUNT_RESOURCE]; /* FW-assigned addr */
 
 	/* These fields are used by common fixups */
 	unsigned int	transparent:1;	/* Transparent PCI bridge */
-- 
cgit v1.2.3


From 316d86fe8641abfad32702c77d9e62cf19e68b00 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 17 Jan 2012 17:41:21 -0700
Subject: x86/PCI: don't fall back to defaults if _CRS has no apertures

Host bridges that lead to things like the Uncore need not have any
I/O port or MMIO apertures.  For example, in this case:

    ACPI: PCI Root Bridge [UNC1] (domain 0000 [bus ff])
    PCI: root bus ff: using default resources
    PCI host bridge to bus 0000:ff
    pci_bus 0000:ff: root bus resource [io  0x0000-0xffff]
    pci_bus 0000:ff: root bus resource [mem 0x00000000-0x3fffffffffff]

we should not pretend those default resources are available on bus ff.

CC: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a312e76063a7..daa42490c1d9 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -404,7 +404,12 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		kfree(sd);
 	} else {
 		get_current_resources(device, busnum, domain, &resources);
-		if (list_empty(&resources))
+
+		/*
+		 * _CRS with no apertures is normal, so only fall back to
+		 * defaults or native bridge info if we're ignoring _CRS.
+		 */
+		if (!pci_use_crs)
 			x86_pci_root_bus_resources(busnum, &resources);
 		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
 					  &resources);
-- 
cgit v1.2.3


From 07d620212d51d113fad997357a75f5e1f2ffd5a7 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 7 Feb 2012 21:09:03 -0800
Subject: x86: Use generic posix_types.h

Change the x86 architecture to use <asm-generic/posix_types.h>.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1328677745-20121-20-git-send-email-hpa@zytor.com
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/posix_types_32.h |  75 +++---------------------
 arch/x86/include/asm/posix_types_64.h | 106 +---------------------------------
 2 files changed, 12 insertions(+), 169 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
index f7d9adf82e53..99f262e04b91 100644
--- a/arch/x86/include/asm/posix_types_32.h
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -7,79 +7,22 @@
  * assume GCC is being used.
  */
 
-typedef unsigned long	__kernel_ino_t;
 typedef unsigned short	__kernel_mode_t;
+#define __kernel_mode_t __kernel_mode_t
+
 typedef unsigned short	__kernel_nlink_t;
-typedef long		__kernel_off_t;
-typedef int		__kernel_pid_t;
+#define __kernel_nlink_t __kernel_nlink_t
+
 typedef unsigned short	__kernel_ipc_pid_t;
+#define __kernel_ipc_pid_t __kernel_ipc_pid_t
+
 typedef unsigned short	__kernel_uid_t;
 typedef unsigned short	__kernel_gid_t;
-typedef unsigned int	__kernel_size_t;
-typedef int		__kernel_ssize_t;
-typedef int		__kernel_ptrdiff_t;
-typedef long		__kernel_time_t;
-typedef long		__kernel_suseconds_t;
-typedef long		__kernel_clock_t;
-typedef int		__kernel_timer_t;
-typedef int		__kernel_clockid_t;
-typedef int		__kernel_daddr_t;
-typedef char *		__kernel_caddr_t;
-typedef unsigned short	__kernel_uid16_t;
-typedef unsigned short	__kernel_gid16_t;
-typedef unsigned int	__kernel_uid32_t;
-typedef unsigned int	__kernel_gid32_t;
+#define __kernel_uid_t __kernel_uid_t
 
-typedef unsigned short	__kernel_old_uid_t;
-typedef unsigned short	__kernel_old_gid_t;
 typedef unsigned short	__kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
 
-#ifdef __GNUC__
-typedef long long	__kernel_loff_t;
-#endif
-
-typedef struct {
-	int	val[2];
-} __kernel_fsid_t;
-
-#if defined(__KERNEL__)
-
-#undef	__FD_SET
-#define __FD_SET(fd,fdsetp)					\
-	asm volatile("btsl %1,%0":				\
-		     "+m" (*(__kernel_fd_set *)(fdsetp))	\
-		     : "r" ((int)(fd)))
-
-#undef	__FD_CLR
-#define __FD_CLR(fd,fdsetp)					\
-	asm volatile("btrl %1,%0":				\
-		     "+m" (*(__kernel_fd_set *)(fdsetp))	\
-		     : "r" ((int) (fd)))
-
-#undef	__FD_ISSET
-#define __FD_ISSET(fd,fdsetp)					\
-	(__extension__						\
-	 ({							\
-	 unsigned char __result;				\
-	 asm volatile("btl %1,%2 ; setb %0"			\
-		      : "=q" (__result)				\
-		      : "r" ((int)(fd)),			\
-			"m" (*(__kernel_fd_set *)(fdsetp)));	\
-	 __result;						\
-}))
-
-#undef	__FD_ZERO
-#define __FD_ZERO(fdsetp)					\
-do {								\
-	int __d0, __d1;						\
-	asm volatile("cld ; rep ; stosl"			\
-		     : "=m" (*(__kernel_fd_set *)(fdsetp)),	\
-		       "=&c" (__d0), "=&D" (__d1)		\
-		     : "a" (0), "1" (__FDSET_LONGS),		\
-		       "2" ((__kernel_fd_set *)(fdsetp))	\
-		     : "memory");				\
-} while (0)
-
-#endif /* defined(__KERNEL__) */
+#include <asm-generic/posix_types.h>
 
 #endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
index eb8d2d92b63e..cba0c1ead162 100644
--- a/arch/x86/include/asm/posix_types_64.h
+++ b/arch/x86/include/asm/posix_types_64.h
@@ -7,113 +7,13 @@
  * assume GCC is being used.
  */
 
-typedef unsigned long	__kernel_ino_t;
-typedef unsigned int	__kernel_mode_t;
-typedef unsigned long	__kernel_nlink_t;
-typedef long		__kernel_off_t;
-typedef int		__kernel_pid_t;
-typedef int		__kernel_ipc_pid_t;
-typedef unsigned int	__kernel_uid_t;
-typedef unsigned int	__kernel_gid_t;
-typedef unsigned long	__kernel_size_t;
-typedef long		__kernel_ssize_t;
-typedef long		__kernel_ptrdiff_t;
-typedef long		__kernel_time_t;
-typedef long		__kernel_suseconds_t;
-typedef long		__kernel_clock_t;
-typedef int		__kernel_timer_t;
-typedef int		__kernel_clockid_t;
-typedef int		__kernel_daddr_t;
-typedef char *		__kernel_caddr_t;
-typedef unsigned short	__kernel_uid16_t;
-typedef unsigned short	__kernel_gid16_t;
-
-#ifdef __GNUC__
-typedef long long	__kernel_loff_t;
-#endif
-
-typedef struct {
-	int	val[2];
-} __kernel_fsid_t;
-
 typedef unsigned short __kernel_old_uid_t;
 typedef unsigned short __kernel_old_gid_t;
-typedef __kernel_uid_t __kernel_uid32_t;
-typedef __kernel_gid_t __kernel_gid32_t;
+#define __kernel_old_uid_t __kernel_old_uid_t
 
 typedef unsigned long	__kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
 
-#ifdef __KERNEL__
-
-#undef __FD_SET
-static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
-{
-	unsigned long _tmp = fd / __NFDBITS;
-	unsigned long _rem = fd % __NFDBITS;
-	fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
-}
-
-#undef __FD_CLR
-static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
-{
-	unsigned long _tmp = fd / __NFDBITS;
-	unsigned long _rem = fd % __NFDBITS;
-	fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
-}
-
-#undef __FD_ISSET
-static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
-{
-	unsigned long _tmp = fd / __NFDBITS;
-	unsigned long _rem = fd % __NFDBITS;
-	return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
-}
-
-/*
- * This will unroll the loop for the normal constant cases (8 or 32 longs,
- * for 256 and 1024-bit fd_sets respectively)
- */
-#undef __FD_ZERO
-static inline void __FD_ZERO(__kernel_fd_set *p)
-{
-	unsigned long *tmp = p->fds_bits;
-	int i;
-
-	if (__builtin_constant_p(__FDSET_LONGS)) {
-		switch (__FDSET_LONGS) {
-		case 32:
-			tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
-			tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
-			tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
-			tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
-			tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
-			tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
-			tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
-			tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
-			return;
-		case 16:
-			tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
-			tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
-			tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
-			tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
-			return;
-		case 8:
-			tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
-			tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
-			return;
-		case 4:
-			tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
-			return;
-		}
-	}
-	i = __FDSET_LONGS;
-	while (i) {
-		i--;
-		*tmp = 0;
-		tmp++;
-	}
-}
-
-#endif /* defined(__KERNEL__) */
+#include <asm-generic/posix_types.h>
 
 #endif /* _ASM_X86_POSIX_TYPES_64_H */
-- 
cgit v1.2.3


From c38e23456278e967f094b08247ffc3711b1029b2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 15 Feb 2012 08:05:18 -0800
Subject: i387: fix sense of sanity check

The check for save_init_fpu() (introduced in commit 5b1cbac37798: "i387:
make irq_fpu_usable() tests more robust") was the wrong way around, but
I hadn't noticed, because my "tests" were bogus: the FPU exceptions are
disabled by default, so even doing a divide by zero never actually
triggers this code at all unless you do extra work to enable them.

So if anybody did enable them, they'd get one spurious warning.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a29571821b99..727c1dd84899 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -404,7 +404,7 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
-	WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU);
+	WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU));
 	preempt_disable();
 	__save_init_fpu(tsk);
 	stts();
-- 
cgit v1.2.3


From 15d8791cae75dca27bfda8ecfe87dca9379d6bb0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 09:15:04 -0800
Subject: i387: fix x86-64 preemption-unsafe user stack save/restore

Commit 5b1cbac37798 ("i387: make irq_fpu_usable() tests more robust")
added a sanity check to the #NM handler to verify that we never cause
the "Device Not Available" exception in kernel mode.

However, that check actually pinpointed a (fundamental) race where we do
cause that exception as part of the signal stack FPU state save/restore
code.

Because we use the floating point instructions themselves to save and
restore state directly from user mode, we cannot do that atomically with
testing the TS_USEDFPU bit: the user mode access itself may cause a page
fault, which causes a task switch, which saves and restores the FP/MMX
state from the kernel buffers.

This kind of "recursive" FP state save is fine per se, but it means that
when the signal stack save/restore gets restarted, it will now take the
'#NM' exception we originally tried to avoid.  With preemption this can
happen even without the page fault - but because of the user access, we
cannot just disable preemption around the save/restore instruction.

There are various ways to solve this, including using the
"enable/disable_page_fault()" helpers to not allow page faults at all
during the sequence, and fall back to copying things by hand without the
use of the native FP state save/restore instructions.

However, the simplest thing to do is to just allow the #NM from kernel
space, but fix the race in setting and clearing CR0.TS that this all
exposed: the TS bit changes and the TS_USEDFPU bit absolutely have to be
atomic wrt scheduling, so while the actual state save/restore can be
interrupted and restarted, the act of actually clearing/setting CR0.TS
and the TS_USEDFPU bit together must not.

Instead of just adding random "preempt_disable/enable()" calls to what
is already excessively ugly code, this introduces some helper functions
that mostly mirror the "kernel_fpu_begin/end()" functionality, just for
the user state instead.

Those helper functions should probably eventually replace the other
ad-hoc CR0.TS and TS_USEDFPU tests too, but I'll need to think about it
some more: the task switching functionality in particular needs to
expose the difference between the 'prev' and 'next' threads, while the
new helper functions intentionally were written to only work with
'current'.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c     |  1 -
 arch/x86/kernel/xsave.c     | 10 +++-------
 3 files changed, 45 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 727c1dd84899..f704be239883 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -399,6 +399,48 @@ static inline void irq_ts_restore(int TS_state)
 		stts();
 }
 
+/*
+ * The question "does this thread have fpu access?"
+ * is slightly racy, since preemption could come in
+ * and revoke it immediately after the test.
+ *
+ * However, even in that very unlikely scenario,
+ * we can just assume we have FPU access - typically
+ * to save the FP state - we'll just take a #NM
+ * fault and get the FPU access back.
+ *
+ * The actual user_fpu_begin/end() functions
+ * need to be preemption-safe, though.
+ *
+ * NOTE! user_fpu_end() must be used only after you
+ * have saved the FP state, and user_fpu_begin() must
+ * be used only immediately before restoring it.
+ * These functions do not do any save/restore on
+ * their own.
+ */
+static inline int user_has_fpu(void)
+{
+	return current_thread_info()->status & TS_USEDFPU;
+}
+
+static inline void user_fpu_end(void)
+{
+	preempt_disable();
+	current_thread_info()->status &= ~TS_USEDFPU;
+	stts();
+	preempt_enable();
+}
+
+static inline void user_fpu_begin(void)
+{
+	preempt_disable();
+	if (!user_has_fpu()) {
+		clts();
+		current_thread_info()->status |= TS_USEDFPU;
+	}
+	preempt_enable();
+}
+
 /*
  * These disable preemption on their own and are safe
  */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8ba27dbc107a..982433b5da30 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -631,7 +631,6 @@ EXPORT_SYMBOL_GPL(math_state_restore);
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	WARN_ON_ONCE(!user_mode_vm(regs));
 #ifdef CONFIG_MATH_EMULATION
 	if (read_cr0() & X86_CR0_EM) {
 		struct math_emu_info info = { };
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976b..86f1f09a738a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)
 	if (!used_math())
 		return 0;
 
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (user_has_fpu()) {
 		if (use_xsave())
 			err = xsave_user(buf);
 		else
@@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf)
 
 		if (err)
 			return err;
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		user_fpu_end();
 	} else {
 		sanitize_i387_state(tsk);
 		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)
 			return err;
 	}
 
-	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
-		clts();
-		task_thread_info(current)->status |= TS_USEDFPU;
-	}
+	user_fpu_begin();
 	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
-- 
cgit v1.2.3


From b6c66418dcad0fcf83cd1d0a39482db37bf4fc41 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 12:22:48 -0800
Subject: i387: move TS_USEDFPU clearing out of __save_init_fpu and into
 callers

Touching TS_USEDFPU without touching CR0.TS is confusing, so don't do
it.  By moving it into the callers, we always do the TS_USEDFPU next to
the CR0.TS accesses in the source code, and it's much easier to see how
the two go hand in hand.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index f704be239883..1e12c2d087e4 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -259,7 +259,6 @@ static inline void fpu_save_init(struct fpu *fpu)
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
 	fpu_save_init(&tsk->thread.fpu);
-	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
 static inline int fpu_fxrstor_checking(struct fpu *fpu)
@@ -290,6 +289,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
 {
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
 		__save_init_fpu(tsk);
+		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else
 		tsk->fpu_counter = 0;
@@ -356,9 +356,11 @@ static inline void kernel_fpu_begin(void)
 
 	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
-	if (me->status & TS_USEDFPU)
+	if (me->status & TS_USEDFPU) {
 		__save_init_fpu(me->task);
-	else
+		me->status &= ~TS_USEDFPU;
+		/* We do 'stts()' in kernel_fpu_end() */
+	} else
 		clts();
 }
 
@@ -449,6 +451,7 @@ static inline void save_init_fpu(struct task_struct *tsk)
 	WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU));
 	preempt_disable();
 	__save_init_fpu(tsk);
+	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 	stts();
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 6d59d7a9f5b723a7ac1925c136e93ec83c0c3043 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 13:33:12 -0800
Subject: i387: don't ever touch TS_USEDFPU directly, use helper functions

This creates three helper functions that do the TS_USEDFPU accesses, and
makes everybody that used to do it by hand use those helpers instead.

In addition, there's a couple of helper functions for the "change both
CR0.TS and TS_USEDFPU at the same time" case, and the places that do
that together have been changed to use those.  That means that we have
fewer random places that open-code this situation.

The intent is partly to clarify the code without actually changing any
semantics yet (since we clearly still have some hard to reproduce bug in
this area), but also to make it much easier to use another approach
entirely to caching the CR0.TS bit for software accesses.

Right now we use a bit in the thread-info 'status' variable (this patch
does not change that), but we might want to make it a full field of its
own or even make it a per-cpu variable.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 75 +++++++++++++++++++++++++++++++++------------
 arch/x86/kernel/traps.c     |  2 +-
 arch/x86/kernel/xsave.c     |  2 +-
 arch/x86/kvm/vmx.c          |  2 +-
 4 files changed, 58 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 1e12c2d087e4..548b2c07ac9a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -279,6 +279,47 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
+/*
+ * Software FPU state helpers. Careful: these need to
+ * be preemption protection *and* they need to be
+ * properly paired with the CR0.TS changes!
+ */
+static inline int __thread_has_fpu(struct thread_info *ti)
+{
+	return ti->status & TS_USEDFPU;
+}
+
+/* Must be paired with an 'stts' after! */
+static inline void __thread_clear_has_fpu(struct thread_info *ti)
+{
+	ti->status &= ~TS_USEDFPU;
+}
+
+/* Must be paired with a 'clts' before! */
+static inline void __thread_set_has_fpu(struct thread_info *ti)
+{
+	ti->status |= TS_USEDFPU;
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void __thread_fpu_end(struct thread_info *ti)
+{
+	__thread_clear_has_fpu(ti);
+	stts();
+}
+
+static inline void __thread_fpu_begin(struct thread_info *ti)
+{
+	clts();
+	__thread_set_has_fpu(ti);
+}
+
 /*
  * Signal frame handlers...
  */
@@ -287,23 +328,21 @@ extern int restore_i387_xstate(void __user *buf);
 
 static inline void __unlazy_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (__thread_has_fpu(task_thread_info(tsk))) {
 		__save_init_fpu(tsk);
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		__thread_fpu_end(task_thread_info(tsk));
 	} else
 		tsk->fpu_counter = 0;
 }
 
 static inline void __clear_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (__thread_has_fpu(task_thread_info(tsk))) {
 		/* Ignore delayed exceptions from user space */
 		asm volatile("1: fwait\n"
 			     "2:\n"
 			     _ASM_EXTABLE(1b, 2b));
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		__thread_fpu_end(task_thread_info(tsk));
 	}
 }
 
@@ -311,14 +350,14 @@ static inline void __clear_fpu(struct task_struct *tsk)
  * Were we in an interrupt that interrupted kernel mode?
  *
  * We can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: TS_USEDFPU must be clear (so
+ * pair does nothing at all: the thread must not have fpu (so
  * that we don't try to save the FPU state), and TS must
  * be set (so that the clts/stts pair does nothing that is
  * visible in the interrupted kernel thread).
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
-	return !(current_thread_info()->status & TS_USEDFPU) &&
+	return !__thread_has_fpu(current_thread_info()) &&
 		(read_cr0() & X86_CR0_TS);
 }
 
@@ -356,9 +395,9 @@ static inline void kernel_fpu_begin(void)
 
 	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
-	if (me->status & TS_USEDFPU) {
+	if (__thread_has_fpu(me)) {
 		__save_init_fpu(me->task);
-		me->status &= ~TS_USEDFPU;
+		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
 	} else
 		clts();
@@ -422,24 +461,21 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline int user_has_fpu(void)
 {
-	return current_thread_info()->status & TS_USEDFPU;
+	return __thread_has_fpu(current_thread_info());
 }
 
 static inline void user_fpu_end(void)
 {
 	preempt_disable();
-	current_thread_info()->status &= ~TS_USEDFPU;
-	stts();
+	__thread_fpu_end(current_thread_info());
 	preempt_enable();
 }
 
 static inline void user_fpu_begin(void)
 {
 	preempt_disable();
-	if (!user_has_fpu()) {
-		clts();
-		current_thread_info()->status |= TS_USEDFPU;
-	}
+	if (!user_has_fpu())
+		__thread_fpu_begin(current_thread_info());
 	preempt_enable();
 }
 
@@ -448,11 +484,10 @@ static inline void user_fpu_begin(void)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
-	WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU));
+	WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk)));
 	preempt_disable();
 	__save_init_fpu(tsk);
-	task_thread_info(tsk)->status &= ~TS_USEDFPU;
-	stts();
+	__thread_fpu_end(task_thread_info(tsk));
 	preempt_enable();
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 982433b5da30..fc676e44c77f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -588,7 +588,7 @@ void __math_state_restore(void)
 		return;
 	}
 
-	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	__thread_set_has_fpu(thread);	/* clts in caller! */
 	tsk->fpu_counter++;
 }
 
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 86f1f09a738a..a0bcd0dbc951 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+	BUG_ON(__thread_has_fpu(task_thread_info(tsk)));
 
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d29216c462b3..36091dd04b4b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-	if (current_thread_info()->status & TS_USEDFPU)
+	if (__thread_has_fpu(current_thread_info()))
 		clts();
 	load_gdt(&__get_cpu_var(host_gdt));
 }
-- 
cgit v1.2.3


From b3b0870ef3ffed72b92415423da864f440f57ad6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 15:45:23 -0800
Subject: i387: do not preload FPU state at task switch time

Yes, taking the trap to re-load the FPU/MMX state is expensive, but so
is spending several days looking for a bug in the state save/restore
code.  And the preload code has some rather subtle interactions with
both paravirtualization support and segment state restore, so it's not
nearly as simple as it should be.

Also, now that we no longer necessarily depend on a single bit (ie
TS_USEDFPU) for keeping track of the state of the FPU, we migth be able
to do better.  If we are really switching between two processes that
keep touching the FP state, save/restore is inevitable, but in the case
of having one process that does most of the FPU usage, we may actually
be able to do much better than the preloading.

In particular, we may be able to keep track of which CPU the process ran
on last, and also per CPU keep track of which process' FP state that CPU
has.  For modern CPU's that don't destroy the FPU contents on save time,
that would allow us to do a lazy restore by just re-enabling the
existing FPU state - with no restore cost at all!

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h  |  1 -
 arch/x86/kernel/process_32.c | 20 --------------------
 arch/x86/kernel/process_64.c | 23 -----------------------
 arch/x86/kernel/traps.c      | 35 +++++++++++------------------------
 4 files changed, 11 insertions(+), 68 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 548b2c07ac9a..86974c72d0d0 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -30,7 +30,6 @@ extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
 extern void math_state_restore(void);
-extern void __math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
 extern user_regset_active_fn fpregs_active, xfpregs_active;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 485204f58cda..324cd722b447 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -299,23 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
-	bool preload_fpu;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	/*
-	 * If the task has used fpu the last 5 timeslices, just do a full
-	 * restore of the math state immediately to avoid the trap; the
-	 * chances of needing FPU soon are obviously high now
-	 */
-	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
 	__unlazy_fpu(prev_p);
 
-	/* we're going to use this soon, after a few expensive things */
-	if (preload_fpu)
-		prefetch(next->fpu.state);
-
 	/*
 	 * Reload esp0.
 	 */
@@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
 		__switch_to_xtra(prev_p, next_p, tss);
 
-	/* If we're going to preload the fpu context, make sure clts
-	   is run while we're batching the cpu state updates. */
-	if (preload_fpu)
-		clts();
-
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
@@ -368,9 +351,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	arch_end_context_switch(next_p);
 
-	if (preload_fpu)
-		__math_state_restore();
-
 	/*
 	 * Restore %gs if needed (which is common)
 	 */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a85c87..992b4e542bc3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -386,18 +386,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
-	bool preload_fpu;
-
-	/*
-	 * If the task has used fpu the last 5 timeslices, just do a full
-	 * restore of the math state immediately to avoid the trap; the
-	 * chances of needing FPU soon are obviously high now
-	 */
-	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
-	/* we're going to use this soon, after a few expensive things */
-	if (preload_fpu)
-		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
@@ -430,10 +418,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* Must be after DS reload */
 	__unlazy_fpu(prev_p);
 
-	/* Make sure cpu is ready for new context */
-	if (preload_fpu)
-		clts();
-
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
@@ -492,13 +476,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);
 
-	/*
-	 * Preload the FPU context, now that we've determined that the
-	 * task is likely to be using it. 
-	 */
-	if (preload_fpu)
-		__math_state_restore();
-
 	return prev_p;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fc676e44c77f..5afe824c66e5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,28 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
-/*
- * __math_state_restore assumes that cr0.TS is already clear and the
- * fpu state is all ready for use.  Used during context switch.
- */
-void __math_state_restore(void)
-{
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = thread->task;
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		stts();
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
-
-	__thread_set_has_fpu(thread);	/* clts in caller! */
-	tsk->fpu_counter++;
-}
-
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -622,9 +600,18 @@ void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	clts();				/* Allow maths ops (or we recurse) */
+	__thread_fpu_begin(thread);
 
-	__math_state_restore();
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(thread);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+
+	tsk->fpu_counter++;
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
-- 
cgit v1.2.3


From 4903062b5485f0e2c286a23b44c9b59d9b017d53 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 16 Feb 2012 19:11:15 -0800
Subject: i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to
 restore

The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
pending.  In order to not leak FIP state from one process to another, we
need to do a floating point load after the fxsave of the old process,
and before the fxrstor of the new FPU state.  That resets the state to
the (uninteresting) kernel load, rather than some potentially sensitive
user information.

We used to do this directly after the FPU state save, but that is
actually very inconvenient, since it

 (a) corrupts what is potentially perfectly good FPU state that we might
     want to lazy avoid restoring later and

 (b) on x86-64 it resulted in a very annoying ordering constraint, where
     "__unlazy_fpu()" in the task switch needs to be delayed until after
     the DS segment has been reloaded just to get the new DS value.

Coupling it to the fxrstor instead of the fxsave automatically avoids
both of these issues, and also ensures that we only do it when actually
necessary (the FP state after a save may never actually get used).  It's
simply a much more natural place for the leaked state cleanup.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h  | 19 -------------------
 arch/x86/kernel/process_64.c |  5 ++---
 arch/x86/kernel/traps.c      | 14 ++++++++++++++
 3 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 86974c72d0d0..01b115d86770 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -211,15 +211,6 @@ static inline void fpu_fxsave(struct fpu *fpu)
 
 #endif	/* CONFIG_X86_64 */
 
-/* We need a safe address that is cheap to find and that is already
-   in L1 during context switch. The best choices are unfortunately
-   different for UP and SMP */
-#ifdef CONFIG_SMP
-#define safe_address (__per_cpu_offset[0])
-#else
-#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
-#endif
-
 /*
  * These must be called with preempt disabled
  */
@@ -243,16 +234,6 @@ static inline void fpu_save_init(struct fpu *fpu)
 
 	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
 		asm volatile("fnclex");
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
 }
 
 static inline void __save_init_fpu(struct task_struct *tsk)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 992b4e542bc3..753e803f7197 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -387,6 +387,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
 
+	__unlazy_fpu(prev_p);
+
 	/*
 	 * Reload esp0, LDT and the page table pointer:
 	 */
@@ -415,9 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	load_TLS(next, cpu);
 
-	/* Must be after DS reload */
-	__unlazy_fpu(prev_p);
-
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5afe824c66e5..4d42300dcd2c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -585,6 +585,10 @@ void math_state_restore(void)
 	struct thread_info *thread = current_thread_info();
 	struct task_struct *tsk = thread->task;
 
+	/* We need a safe address that is cheap to find and that is already
+	   in L1. We just brought in "thread->task", so use that */
+#define safe_address (thread->task)
+
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
 		/*
@@ -602,6 +606,16 @@ void math_state_restore(void)
 
 	__thread_fpu_begin(thread);
 
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. safe_address is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (safe_address));
+
 	/*
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
-- 
cgit v1.2.3


From f94edacf998516ac9d849f7bc6949a703977a7f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 17 Feb 2012 21:48:54 -0800
Subject: i387: move TS_USEDFPU flag from thread_info to task_struct

This moves the bit that indicates whether a thread has ownership of the
FPU from the TS_USEDFPU bit in thread_info->status to a word of its own
(called 'has_fpu') in task_struct->thread.has_fpu.

This fixes two independent bugs at the same time:

 - changing 'thread_info->status' from the scheduler causes nasty
   problems for the other users of that variable, since it is defined to
   be thread-synchronous (that's what the "TS_" part of the naming was
   supposed to indicate).

   So perfectly valid code could (and did) do

	ti->status |= TS_RESTORE_SIGMASK;

   and the compiler was free to do that as separate load, or and store
   instructions.  Which can cause problems with preemption, since a task
   switch could happen in between, and change the TS_USEDFPU bit. The
   change to TS_USEDFPU would be overwritten by the final store.

   In practice, this seldom happened, though, because the 'status' field
   was seldom used more than once, so gcc would generally tend to
   generate code that used a read-modify-write instruction and thus
   happened to avoid this problem - RMW instructions are naturally low
   fat and preemption-safe.

 - On x86-32, the current_thread_info() pointer would, during interrupts
   and softirqs, point to a *copy* of the real thread_info, because
   x86-32 uses %esp to calculate the thread_info address, and thus the
   separate irq (and softirq) stacks would cause these kinds of odd
   thread_info copy aliases.

   This is normally not a problem, since interrupts aren't supposed to
   look at thread information anyway (what thread is running at
   interrupt time really isn't very well-defined), but it confused the
   heck out of irq_fpu_usable() and the code that tried to squirrel
   away the FPU state.

   (It also caused untold confusion for us poor kernel developers).

It also turns out that using 'task_struct' is actually much more natural
for most of the call sites that care about the FPU state, since they
tend to work with the task struct for other reasons anyway (ie
scheduling).  And the FPU data that we are going to save/restore is
found there too.

Thanks to Arjan Van De Ven <arjan@linux.intel.com> for pointing us to
the %esp issue.

Cc: Arjan van de Ven <arjan@linux.intel.com>
Reported-and-tested-by: Raphael Prevost <raphael@buro.asia>
Acked-and-tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h        | 44 +++++++++++++++++++-------------------
 arch/x86/include/asm/processor.h   |  1 +
 arch/x86/include/asm/thread_info.h |  2 --
 arch/x86/kernel/traps.c            | 11 +++++-----
 arch/x86/kernel/xsave.c            |  2 +-
 arch/x86/kvm/vmx.c                 |  2 +-
 6 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 01b115d86770..f5376676f89c 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -264,21 +264,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
  * be preemption protection *and* they need to be
  * properly paired with the CR0.TS changes!
  */
-static inline int __thread_has_fpu(struct thread_info *ti)
+static inline int __thread_has_fpu(struct task_struct *tsk)
 {
-	return ti->status & TS_USEDFPU;
+	return tsk->thread.has_fpu;
 }
 
 /* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct thread_info *ti)
+static inline void __thread_clear_has_fpu(struct task_struct *tsk)
 {
-	ti->status &= ~TS_USEDFPU;
+	tsk->thread.has_fpu = 0;
 }
 
 /* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct thread_info *ti)
+static inline void __thread_set_has_fpu(struct task_struct *tsk)
 {
-	ti->status |= TS_USEDFPU;
+	tsk->thread.has_fpu = 1;
 }
 
 /*
@@ -288,16 +288,16 @@ static inline void __thread_set_has_fpu(struct thread_info *ti)
  * These generally need preemption protection to work,
  * do try to avoid using these on their own.
  */
-static inline void __thread_fpu_end(struct thread_info *ti)
+static inline void __thread_fpu_end(struct task_struct *tsk)
 {
-	__thread_clear_has_fpu(ti);
+	__thread_clear_has_fpu(tsk);
 	stts();
 }
 
-static inline void __thread_fpu_begin(struct thread_info *ti)
+static inline void __thread_fpu_begin(struct task_struct *tsk)
 {
 	clts();
-	__thread_set_has_fpu(ti);
+	__thread_set_has_fpu(tsk);
 }
 
 /*
@@ -308,21 +308,21 @@ extern int restore_i387_xstate(void __user *buf);
 
 static inline void __unlazy_fpu(struct task_struct *tsk)
 {
-	if (__thread_has_fpu(task_thread_info(tsk))) {
+	if (__thread_has_fpu(tsk)) {
 		__save_init_fpu(tsk);
-		__thread_fpu_end(task_thread_info(tsk));
+		__thread_fpu_end(tsk);
 	} else
 		tsk->fpu_counter = 0;
 }
 
 static inline void __clear_fpu(struct task_struct *tsk)
 {
-	if (__thread_has_fpu(task_thread_info(tsk))) {
+	if (__thread_has_fpu(tsk)) {
 		/* Ignore delayed exceptions from user space */
 		asm volatile("1: fwait\n"
 			     "2:\n"
 			     _ASM_EXTABLE(1b, 2b));
-		__thread_fpu_end(task_thread_info(tsk));
+		__thread_fpu_end(tsk);
 	}
 }
 
@@ -337,7 +337,7 @@ static inline void __clear_fpu(struct task_struct *tsk)
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
-	return !__thread_has_fpu(current_thread_info()) &&
+	return !__thread_has_fpu(current) &&
 		(read_cr0() & X86_CR0_TS);
 }
 
@@ -371,12 +371,12 @@ static inline bool irq_fpu_usable(void)
 
 static inline void kernel_fpu_begin(void)
 {
-	struct thread_info *me = current_thread_info();
+	struct task_struct *me = current;
 
 	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
 	if (__thread_has_fpu(me)) {
-		__save_init_fpu(me->task);
+		__save_init_fpu(me);
 		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
 	} else
@@ -441,13 +441,13 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline int user_has_fpu(void)
 {
-	return __thread_has_fpu(current_thread_info());
+	return __thread_has_fpu(current);
 }
 
 static inline void user_fpu_end(void)
 {
 	preempt_disable();
-	__thread_fpu_end(current_thread_info());
+	__thread_fpu_end(current);
 	preempt_enable();
 }
 
@@ -455,7 +455,7 @@ static inline void user_fpu_begin(void)
 {
 	preempt_disable();
 	if (!user_has_fpu())
-		__thread_fpu_begin(current_thread_info());
+		__thread_fpu_begin(current);
 	preempt_enable();
 }
 
@@ -464,10 +464,10 @@ static inline void user_fpu_begin(void)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
-	WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk)));
+	WARN_ON_ONCE(!__thread_has_fpu(tsk));
 	preempt_disable();
 	__save_init_fpu(tsk);
-	__thread_fpu_end(task_thread_info(tsk));
+	__thread_fpu_end(tsk);
 	preempt_enable();
 }
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index aa9088c26931..f7c89e231c6c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -454,6 +454,7 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
+	unsigned long		has_fpu;
 	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index bc817cd8b443..cfd8144d5527 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -247,8 +247,6 @@ static inline struct thread_info *current_thread_info(void)
  * ever touches our thread-synchronous status, so we don't
  * have to worry about atomic accesses.
  */
-#define TS_USEDFPU		0x0001	/* FPU was used by this task
-					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
 #define TS_POLLING		0x0004	/* idle task polling need_resched,
 					   skip sending interrupt */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4d42300dcd2c..ad25e51f40c4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -582,12 +582,11 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
  */
 void math_state_restore(void)
 {
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = thread->task;
+	struct task_struct *tsk = current;
 
 	/* We need a safe address that is cheap to find and that is already
-	   in L1. We just brought in "thread->task", so use that */
-#define safe_address (thread->task)
+	   in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
 
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
@@ -604,7 +603,7 @@ void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	__thread_fpu_begin(thread);
+	__thread_fpu_begin(tsk);
 
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
@@ -620,7 +619,7 @@ void math_state_restore(void)
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
 	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(thread);
+		__thread_fpu_end(tsk);
 		force_sig(SIGSEGV, tsk);
 		return;
 	}
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a0bcd0dbc951..711091114119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(__thread_has_fpu(task_thread_info(tsk)));
+	BUG_ON(__thread_has_fpu(tsk));
 
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 36091dd04b4b..3b4c8d8ad906 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-	if (__thread_has_fpu(current_thread_info()))
+	if (__thread_has_fpu(current))
 		clts();
 	load_gdt(&__get_cpu_var(host_gdt));
 }
-- 
cgit v1.2.3


From 34ddc81a230b15c0e345b6b253049db731499f7e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 18 Feb 2012 12:56:35 -0800
Subject: i387: re-introduce FPU state preloading at context switch time

After all the FPU state cleanups and finally finding the problem that
caused all our FPU save/restore problems, this re-introduces the
preloading of FPU state that was removed in commit b3b0870ef3ff ("i387:
do not preload FPU state at task switch time").

However, instead of simply reverting the removal, this reimplements
preloading with several fixes, most notably

 - properly abstracted as a true FPU state switch, rather than as
   open-coded save and restore with various hacks.

   In particular, implementing it as a proper FPU state switch allows us
   to optimize the CR0.TS flag accesses: there is no reason to set the
   TS bit only to then almost immediately clear it again.  CR0 accesses
   are quite slow and expensive, don't flip the bit back and forth for
   no good reason.

 - Make sure that the same model works for both x86-32 and x86-64, so
   that there are no gratuitous differences between the two due to the
   way they save and restore segment state differently due to
   architectural differences that really don't matter to the FPU state.

 - Avoid exposing the "preload" state to the context switch routines,
   and in particular allow the concept of lazy state restore: if nothing
   else has used the FPU in the meantime, and the process is still on
   the same CPU, we can avoid restoring state from memory entirely, just
   re-expose the state that is still in the FPU unit.

   That optimized lazy restore isn't actually implemented here, but the
   infrastructure is set up for it.  Of course, older CPU's that use
   'fnsave' to save the state cannot take advantage of this, since the
   state saving also trashes the state.

In other words, there is now an actual _design_ to the FPU state saving,
rather than just random historical baggage.  Hopefully it's easier to
follow as a result.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h  | 110 ++++++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/process_32.c |   5 +-
 arch/x86/kernel/process_64.c |   5 +-
 arch/x86/kernel/traps.c      |  55 +++++++++++++---------
 4 files changed, 133 insertions(+), 42 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index f5376676f89c..a850b4d8d14d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
+extern void __math_state_restore(struct task_struct *);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
@@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #endif	/* CONFIG_X86_64 */
 
 /*
- * These must be called with preempt disabled
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact.
  */
-static inline void fpu_save_init(struct fpu *fpu)
+static inline int fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave()) {
 		fpu_xsave(fpu);
@@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu)
 		 * xsave header may indicate the init state of the FP.
 		 */
 		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
-			return;
+			return 1;
 	} else if (use_fxsr()) {
 		fpu_fxsave(fpu);
 	} else {
 		asm volatile("fnsave %[fx]; fwait"
 			     : [fx] "=m" (fpu->state->fsave));
-		return;
+		return 0;
 	}
 
-	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+	/*
+	 * If exceptions are pending, we need to clear them so
+	 * that we don't randomly get exceptions later.
+	 *
+	 * FIXME! Is this perhaps only true for the old-style
+	 * irq13 case? Maybe we could leave the x87 state
+	 * intact otherwise?
+	 */
+	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
 		asm volatile("fnclex");
+		return 0;
+	}
+	return 1;
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline int __save_init_fpu(struct task_struct *tsk)
 {
-	fpu_save_init(&tsk->thread.fpu);
+	return fpu_save_init(&tsk->thread.fpu);
 }
 
 static inline int fpu_fxrstor_checking(struct fpu *fpu)
@@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
 }
 
 /*
- * Signal frame handlers...
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_fpu_prepare() saves the old state and
+ *    sets the new state of the CR0.TS bit. This is
+ *    done within the context of the old process.
+ *
+ *  - switch_fpu_finish() restores the new state as
+ *    necessary.
  */
-extern int save_i387_xstate(void __user *buf);
-extern int restore_i387_xstate(void __user *buf);
+typedef struct { int preload; } fpu_switch_t;
+
+/*
+ * FIXME! We could do a totally lazy restore, but we need to
+ * add a per-cpu "this was the task that last touched the FPU
+ * on this CPU" variable, and the task needs to have a "I last
+ * touched the FPU on this CPU" and check them.
+ *
+ * We don't do that yet, so "fpu_lazy_restore()" always returns
+ * false, but some day..
+ */
+#define fpu_lazy_restore(tsk) (0)
+#define fpu_lazy_state_intact(tsk) do { } while (0)
+
+static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new)
+{
+	fpu_switch_t fpu;
+
+	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
+	if (__thread_has_fpu(old)) {
+		if (__save_init_fpu(old))
+			fpu_lazy_state_intact(old);
+		__thread_clear_has_fpu(old);
+		old->fpu_counter++;
+
+		/* Don't change CR0.TS if we just switch! */
+		if (fpu.preload) {
+			__thread_set_has_fpu(new);
+			prefetch(new->thread.fpu.state);
+		} else
+			stts();
+	} else {
+		old->fpu_counter = 0;
+		if (fpu.preload) {
+			if (fpu_lazy_restore(new))
+				fpu.preload = 0;
+			else
+				prefetch(new->thread.fpu.state);
+			__thread_fpu_begin(new);
+		}
+	}
+	return fpu;
+}
 
-static inline void __unlazy_fpu(struct task_struct *tsk)
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
-	if (__thread_has_fpu(tsk)) {
-		__save_init_fpu(tsk);
-		__thread_fpu_end(tsk);
-	} else
-		tsk->fpu_counter = 0;
+	if (fpu.preload)
+		__math_state_restore(new);
 }
 
+/*
+ * Signal frame handlers...
+ */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
+
 static inline void __clear_fpu(struct task_struct *tsk)
 {
 	if (__thread_has_fpu(tsk)) {
@@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk)
 static inline void unlazy_fpu(struct task_struct *tsk)
 {
 	preempt_disable();
-	__unlazy_fpu(tsk);
+	if (__thread_has_fpu(tsk)) {
+		__save_init_fpu(tsk);
+		__thread_fpu_end(tsk);
+	} else
+		tsk->fpu_counter = 0;
 	preempt_enable();
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 324cd722b447..80bfe1ab0031 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -299,10 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	fpu_switch_t fpu;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	__unlazy_fpu(prev_p);
+	fpu = switch_fpu_prepare(prev_p, next_p);
 
 	/*
 	 * Reload esp0.
@@ -357,6 +358,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
+	switch_fpu_finish(next_p, fpu);
+
 	percpu_write(current_task, next_p);
 
 	return prev_p;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 753e803f7197..1fd94bc4279d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -386,8 +386,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
+	fpu_switch_t fpu;
 
-	__unlazy_fpu(prev_p);
+	fpu = switch_fpu_prepare(prev_p, next_p);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
@@ -457,6 +458,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 	prev->gsindex = gsindex;
 
+	switch_fpu_finish(next_p, fpu);
+
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ad25e51f40c4..77da5b475ad2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,6 +570,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
+/*
+ * This gets called with the process already owning the
+ * FPU state, and with CR0.TS cleared. It just needs to
+ * restore the FPU register state.
+ */
+void __math_state_restore(struct task_struct *tsk)
+{
+	/* We need a safe address that is cheap to find and that is already
+	   in L1. We've just brought in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
+
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. safe_address is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (safe_address));
+
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(tsk);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+}
+
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -584,10 +615,6 @@ void math_state_restore(void)
 {
 	struct task_struct *tsk = current;
 
-	/* We need a safe address that is cheap to find and that is already
-	   in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
 		/*
@@ -604,25 +631,7 @@ void math_state_restore(void)
 	}
 
 	__thread_fpu_begin(tsk);
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(tsk);
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
+	__math_state_restore(tsk);
 
 	tsk->fpu_counter++;
 }
-- 
cgit v1.2.3


From b0deca2e0270135f797e81bdb0743e50fd1dc58d Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 17 Feb 2012 08:16:41 -0600
Subject: x86/UV: Lower UV rtc clocksource rating

Lower the rating of the UV rtc clocksource to just below that of
the tsc, to improve performance.

Reading the tsc clocksource has lower latency than reading the
rtc, so favor it in situations where it is synchronized and
stable.  When the tsc is unsynchronized, the rtc needs to be the
chosen clocksource.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Jack Steiner <steiner@sgi.com>
Link: http://lkml.kernel.org/r/20120217141641.GA28063@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/uv/uv_time.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9f29a01ee1b3..5032e0d19b86 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -37,7 +37,7 @@ static void uv_rtc_timer_setup(enum clock_event_mode,
 
 static struct clocksource clocksource_uv = {
 	.name		= RTC_NAME,
-	.rating		= 400,
+	.rating		= 299,
 	.read		= uv_read_rtc,
 	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
@@ -379,10 +379,6 @@ static __init int uv_rtc_setup_clock(void)
 	if (!is_uv_system())
 		return -ENODEV;
 
-	/* If single blade, prefer tsc */
-	if (uv_num_possible_blades() == 1)
-		clocksource_uv.rating = 250;
-
 	rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
 	if (rc)
 		printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
-- 
cgit v1.2.3


From 45d5a1683c04be28abdf5c04c27b1417e0374486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sun, 19 Feb 2012 16:43:37 -0500
Subject: x86/nmi: Test saved %cs in NMI to determine nested NMI case

Currently, the NMI handler tests if it is nested by checking the
special variable saved on the stack (set during NMI handling)
and whether the saved stack is the NMI stack as well (to prevent
the race when the variable is set to zero).

But userspace may set their %rsp to any value as long as they do
not derefence it, and it may make it point to the NMI stack,
which will prevent NMIs from triggering while the userspace app
is running. (I tested this, and it is indeed the case)

Add another check to determine nested NMIs by looking at the
saved %cs (code segment register) and making sure that it is the
kernel code segment.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/1329687817.1561.27.camel@acer.local.home
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..debd851de6ff 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1531,6 +1531,13 @@ ENTRY(nmi)
 	/* Use %rdx as out temp variable throughout */
 	pushq_cfi %rdx
 
+	/*
+	 * If %cs was not the kernel segment, then the NMI triggered in user
+	 * space, which means it is definitely not nested.
+	 */
+	cmp $__KERNEL_CS, 16(%rsp)
+	jne first_nmi
+
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.
-- 
cgit v1.2.3


From 986cb48c5a4de0085db94d343b4e7dcf54355ec1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 11:46:36 -0800
Subject: x86-32/irq: Don't switch to irq stack for a user-mode irq

If the irq happens in user mode, our kernel stack is empty
(apart from the pt_regs themselves, of course), so there's no
need or advantage to switch.

And it really doesn't save any stack space, quite the reverse:
it means that a nested interrupt cannot switch irq stacks. So
instead of saving kernel stack space, it actually causes the
potential for *more* stack usage.

Also simplify the preemption count copy when we do switch
stacks: just copy the whole preemption count, rather than just
the softirq parts of it.  There is no advantage to the partial
copy: it is more effort to get a less correct result.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202191139260.10000@i5.linux-foundation.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d92..58b7f27cb3e9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	irqctx->tinfo.task = curctx->tinfo.task;
 	irqctx->tinfo.previous_esp = current_stack_pointer;
 
-	/*
-	 * Copy the softirq bits in preempt_count so that the
-	 * softirq checks work in the hardirq context.
-	 */
-	irqctx->tinfo.preempt_count =
-		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
-		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+	/* Copy the preempt_count so that the [soft]irq checks work. */
+	irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
 
 	if (unlikely(overflow))
 		call_on_stack(print_stack_overflow, isp);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 	if (unlikely(!desc))
 		return false;
 
-	if (!execute_on_irq_stack(overflow, desc, irq)) {
+	if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
 		desc->handle_irq(irq, desc);
-- 
cgit v1.2.3


From 416d7214741daba3acd6d328289858390bef37bc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 10 Feb 2012 09:24:08 -0500
Subject: xen/setup: Remove redundant filtering of PTE masks.

commit 7347b4082e55ac4a673f06a0a0ce25c37273c9ec "xen: Allow
unprivileged Xen domains to create iomap pages" added a redundant
line in the early bootup code to filter out the PTE. That
filtering is already done a bit earlier so this extra processing
is not required.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12eb07bfb267..7c44e1bf981e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1204,10 +1204,6 @@ asmlinkage void __init xen_start_kernel(void)
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
-	if (!xen_initial_domain())
-		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
-	__supported_pte_mask |= _PAGE_IOMAP;
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-- 
cgit v1.2.3


From 8eaffa67b43e99ae581622c5133e20b0f48bcef1 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 10 Feb 2012 09:16:27 -0500
Subject: xen/pat: Disable PAT support for now.

[Pls also look at https://lkml.org/lkml/2012/2/10/228]

Using of PAT to change pages from WB to WC works quite nicely.
Changing it back to WB - not so much. The crux of the matter is
that the code that does this (__page_change_att_set_clr) has only
limited information so when it tries to the change it gets
the "raw" unfiltered information instead of the properly filtered one -
and the "raw" one tell it that PSE bit is on (while infact it
is not).  As a result when the PTE is set to be WB from WC, we get
tons of:

:WARNING: at arch/x86/xen/mmu.c:475 xen_make_pte+0x67/0xa0()
:Hardware name: HP xw4400 Workstation
.. snip..
:Pid: 27, comm: kswapd0 Tainted: G        W    3.2.2-1.fc16.x86_64 #1
:Call Trace:
: [<ffffffff8106dd1f>] warn_slowpath_common+0x7f/0xc0
: [<ffffffff8106dd7a>] warn_slowpath_null+0x1a/0x20
: [<ffffffff81005a17>] xen_make_pte+0x67/0xa0
: [<ffffffff810051bd>] __raw_callee_save_xen_make_pte+0x11/0x1e
: [<ffffffff81040e15>] ? __change_page_attr_set_clr+0x9d5/0xc00
: [<ffffffff8114c2e8>] ? __purge_vmap_area_lazy+0x158/0x1d0
: [<ffffffff8114cca5>] ? vm_unmap_aliases+0x175/0x190
: [<ffffffff81041168>] change_page_attr_set_clr+0x128/0x4c0
: [<ffffffff81041542>] set_pages_array_wb+0x42/0xa0
: [<ffffffff8100a9b2>] ? check_events+0x12/0x20
: [<ffffffffa0074d4c>] ttm_pages_put+0x1c/0x70 [ttm]
: [<ffffffffa0074e98>] ttm_page_pool_free+0xf8/0x180 [ttm]
: [<ffffffffa0074f78>] ttm_pool_mm_shrink+0x58/0x90 [ttm]
: [<ffffffff8112ba04>] shrink_slab+0x154/0x310
: [<ffffffff8112f17a>] balance_pgdat+0x4fa/0x6c0
: [<ffffffff8112f4b8>] kswapd+0x178/0x3d0
: [<ffffffff815df134>] ? __schedule+0x3d4/0x8c0
: [<ffffffff81090410>] ? remove_wait_queue+0x50/0x50
: [<ffffffff8112f340>] ? balance_pgdat+0x6c0/0x6c0
: [<ffffffff8108fb6c>] kthread+0x8c/0xa0

for every page. The proper fix for this is has been posted
and is https://lkml.org/lkml/2012/2/10/228
"x86/cpa: Use pte_attrs instead of pte_flags on CPA/set_p.._wb/wc operations."
along with a detailed description of the problem and solution.

But since that posting has gone nowhere I am proposing
this band-aid solution so that at least users don't get
the page corruption (the pages that are WC don't get changed to WB
and end up being recycled for filesystem or other things causing
mysterious crashes).

The negative impact of this patch is that users of WC flag
(which are InfiniBand, radeon, nouveau drivers) won't be able
to set that flag - so they are going to see performance degradation.
But stability is more important here.

Fixes RH BZ# 742032, 787403, and 745574
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 2 ++
 arch/x86/xen/mmu.c       | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7c44e1bf981e..4172af8ceeb3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1141,7 +1141,9 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Prevent unwanted bits from being set in PTEs. */
 	__supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
 	if (!xen_initial_domain())
+#endif
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
 	__supported_pte_mask |= _PAGE_IOMAP;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 58a0e46c404d..95c1cf60c669 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -415,13 +415,13 @@ static pteval_t iomap_pte(pteval_t val)
 static pteval_t xen_pte_val(pte_t pte)
 {
 	pteval_t pteval = pte.pte;
-
+#if 0
 	/* If this is a WC pte, convert back from Xen WC to Linux WC */
 	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 		WARN_ON(!pat_enabled);
 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 	}
-
+#endif
 	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 		return pteval;
 
@@ -463,7 +463,7 @@ void xen_set_pat(u64 pat)
 static pte_t xen_make_pte(pteval_t pte)
 {
 	phys_addr_t addr = (pte & PTE_PFN_MASK);
-
+#if 0
 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 	 * If _PAGE_PAT is set, then it probably means it is really
 	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
@@ -476,7 +476,7 @@ static pte_t xen_make_pte(pteval_t pte)
 		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 	}
-
+#endif
 	/*
 	 * Unprivileged domains are allowed to do IOMAPpings for
 	 * PCI passthrough, but not map ISA space.  The ISA
-- 
cgit v1.2.3


From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 20 Feb 2012 10:24:09 -0800
Subject: i387: fix up some fpu_counter confusion

This makes sure we clear the FPU usage counter for newly created tasks,
just so that we start off in a known state (for example, don't try to
preload the FPU state on the first task switch etc).

It also fixes a thinko in when we increment the fpu_counter at task
switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU
state preloading at context switch time").  We should increment the
*new* task fpu_counter, not the old task, and only if we decide to use
that state (whether lazily or preloaded).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h  | 3 ++-
 arch/x86/kernel/process_32.c | 1 +
 arch/x86/kernel/process_64.c | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a850b4d8d14d..8df95849721d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 		if (__save_init_fpu(old))
 			fpu_lazy_state_intact(old);
 		__thread_clear_has_fpu(old);
-		old->fpu_counter++;
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
+			new->fpu_counter++;
 			__thread_set_has_fpu(new);
 			prefetch(new->thread.fpu.state);
 		} else
@@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 	} else {
 		old->fpu_counter = 0;
 		if (fpu.preload) {
+			new->fpu_counter++;
 			if (fpu_lazy_restore(new))
 				fpu.preload = 0;
 			else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 80bfe1ab0031..bc32761bc27a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
 	err = -ENOMEM;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1fd94bc4279d..8ad880b3bc1c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	set_tsk_thread_flag(p, TIF_FORK);
 
+	p->fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
-- 
cgit v1.2.3


From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 11:48:44 -0800
Subject: i387: use 'restore_fpu_checking()' directly in task switching code

This inlines what is usually just a couple of instructions, but more
importantly it also fixes the theoretical error case (can that FPU
restore really ever fail? Maybe we should remove the checking).

We can't start sending signals from within the scheduler, we're much too
deep in the kernel and are holding the runqueue lock etc.  So don't
bother even trying.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h | 17 ++++++++++++++---
 arch/x86/kernel/traps.c     | 40 ++++++++--------------------------------
 2 files changed, 22 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 8df95849721d..74c607b37e87 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -29,7 +29,6 @@ extern unsigned int sig_xstate_size;
 extern void fpu_init(void);
 extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
-extern void __math_state_restore(struct task_struct *);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
@@ -269,6 +268,16 @@ static inline int fpu_restore_checking(struct fpu *fpu)
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. "m" is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (tsk->thread.has_fpu));
+
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
@@ -378,8 +387,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
  */
 static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
-	if (fpu.preload)
-		__math_state_restore(new);
+	if (fpu.preload) {
+		if (unlikely(restore_fpu_checking(new)))
+			__thread_fpu_end(new);
+	}
 }
 
 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 77da5b475ad2..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,37 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
 {
 }
 
-/*
- * This gets called with the process already owning the
- * FPU state, and with CR0.TS cleared. It just needs to
- * restore the FPU register state.
- */
-void __math_state_restore(struct task_struct *tsk)
-{
-	/* We need a safe address that is cheap to find and that is already
-	   in L1. We've just brought in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. safe_address is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (safe_address));
-
-	/*
-	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
-	 */
-	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(tsk);
-		force_sig(SIGSEGV, tsk);
-		return;
-	}
-}
-
 /*
  * 'math_state_restore()' saves the current math information in the
  * old math state array, and gets the new ones from the current task
@@ -631,7 +600,14 @@ void math_state_restore(void)
 	}
 
 	__thread_fpu_begin(tsk);
-	__math_state_restore(tsk);
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		__thread_fpu_end(tsk);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
 
 	tsk->fpu_counter++;
 }
-- 
cgit v1.2.3


From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 19 Feb 2012 13:27:00 -0800
Subject: i387: support lazy restore of FPU state

This makes us recognize when we try to restore FPU state that matches
what we already have in the FPU on this CPU, and avoids the restore
entirely if so.

To do this, we add two new data fields:

 - a percpu 'fpu_owner_task' variable that gets written any time we
   update the "has_fpu" field, and thus acts as a kind of back-pointer
   to the task that owns the CPU.  The exception is when we save the FPU
   state as part of a context switch - if the save can keep the FPU
   state around, we leave the 'fpu_owner_task' variable pointing at the
   task whose FP state still remains on the CPU.

 - a per-thread 'last_cpu' field, that indicates which CPU that thread
   used its FPU on last.  We update this on every context switch
   (writing an invalid CPU number if the last context switch didn't
   leave the FPU in a lazily usable state), so we know that *that*
   thread has done nothing else with the FPU since.

These two fields together can be used when next switching back to the
task to see if the CPU still matches: if 'fpu_owner_task' matches the
task we are switching to, we know that no other task (or kernel FPU
usage) touched the FPU on this CPU in the meantime, and if the current
CPU number matches the 'last_cpu' field, we know that this thread did no
other FP work on any other CPU, so the FPU state on the CPU must match
what was saved on last context switch.

In that case, we can avoid the 'f[x]rstor' entirely, and just clear the
CR0.TS bit.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/i387.h      | 35 +++++++++++++++++++++++------------
 arch/x86/include/asm/processor.h |  3 ++-
 arch/x86/kernel/cpu/common.c     |  2 ++
 arch/x86/kernel/process_32.c     |  2 +-
 arch/x86/kernel/process_64.c     |  2 +-
 5 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 74c607b37e87..247904945d3f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child);
 extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
 
+DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
+
 extern user_regset_active_fn fpregs_active, xfpregs_active;
 extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
 				xstateregs_get;
@@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 		"emms\n\t"	  	/* clear stack tags */
 		"fildl %P[addr]",	/* set F?P to defined value */
 		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (tsk->thread.has_fpu));
+		[addr] "m" (tsk->thread.fpu.has_fpu));
 
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
@@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
  */
 static inline int __thread_has_fpu(struct task_struct *tsk)
 {
-	return tsk->thread.has_fpu;
+	return tsk->thread.fpu.has_fpu;
 }
 
 /* Must be paired with an 'stts' after! */
 static inline void __thread_clear_has_fpu(struct task_struct *tsk)
 {
-	tsk->thread.has_fpu = 0;
+	tsk->thread.fpu.has_fpu = 0;
+	percpu_write(fpu_owner_task, NULL);
 }
 
 /* Must be paired with a 'clts' before! */
 static inline void __thread_set_has_fpu(struct task_struct *tsk)
 {
-	tsk->thread.has_fpu = 1;
+	tsk->thread.fpu.has_fpu = 1;
+	percpu_write(fpu_owner_task, tsk);
 }
 
 /*
@@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t;
  * We don't do that yet, so "fpu_lazy_restore()" always returns
  * false, but some day..
  */
-#define fpu_lazy_restore(tsk) (0)
-#define fpu_lazy_state_intact(tsk) do { } while (0)
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+	return new == percpu_read_stable(fpu_owner_task) &&
+		cpu == new->thread.fpu.last_cpu;
+}
 
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new)
+static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
 {
 	fpu_switch_t fpu;
 
 	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
 	if (__thread_has_fpu(old)) {
-		if (__save_init_fpu(old))
-			fpu_lazy_state_intact(old);
-		__thread_clear_has_fpu(old);
+		if (!__save_init_fpu(old))
+			cpu = ~0;
+		old->thread.fpu.last_cpu = cpu;
+		old->thread.fpu.has_fpu = 0;	/* But leave fpu_owner_task! */
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
@@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 			stts();
 	} else {
 		old->fpu_counter = 0;
+		old->thread.fpu.last_cpu = ~0;
 		if (fpu.preload) {
 			new->fpu_counter++;
-			if (fpu_lazy_restore(new))
+			if (fpu_lazy_restore(new, cpu))
 				fpu.preload = 0;
 			else
 				prefetch(new->thread.fpu.state);
@@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void)
 		__save_init_fpu(me);
 		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
-	} else
+	} else {
+		percpu_write(fpu_owner_task, NULL);
 		clts();
+	}
 }
 
 static inline void kernel_fpu_end(void)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f7c89e231c6c..58545c97d071 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -374,6 +374,8 @@ union thread_xstate {
 };
 
 struct fpu {
+	unsigned int last_cpu;
+	unsigned int has_fpu;
 	union thread_xstate *state;
 };
 
@@ -454,7 +456,6 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
-	unsigned long		has_fpu;
 	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..b667148dfad7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bc32761bc27a..c08d1ff12b7c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	fpu = switch_fpu_prepare(prev_p, next_p);
+	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8ad880b3bc1c..cfa5c90c01db 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	unsigned fsindex, gsindex;
 	fpu_switch_t fpu;
 
-	fpu = switch_fpu_prepare(prev_p, next_p);
+	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
-- 
cgit v1.2.3


From 6bd330083e0e97b7ddc053459190bf3d5768ca83 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 6 Feb 2012 13:03:09 -0800
Subject: x86: Factor out TIF_IA32 from 32-bit address space

Factor out IA32 (compatibility instruction set) from 32-bit address
space in the thread_info flags; this is a precondition patch for x32
support.

Originally-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org
---
 arch/x86/include/asm/elf.h         | 4 ++--
 arch/x86/include/asm/processor.h   | 4 ++--
 arch/x86/include/asm/thread_info.h | 4 +++-
 arch/x86/kernel/process_64.c       | 2 ++
 arch/x86/kernel/sys_x86_64.c       | 6 +++---
 arch/x86/oprofile/backtrace.c      | 2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 5f962df30d0f..410fa6a219f6 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -287,7 +287,7 @@ do {									\
 #define VDSO_HIGH_BASE		0xffffe000U /* CONFIG_COMPAT_VDSO address */
 
 /* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
 
 #define ARCH_DLINFO							\
 do {									\
@@ -330,7 +330,7 @@ static inline int mmap_is_ia32(void)
 	return 1;
 #endif
 #ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32))
+	if (test_thread_flag(TIF_ADDR32))
 		return 1;
 #endif
 	return 0;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index aa9088c26931..9f748b5fb701 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -924,9 +924,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
 					0xc0000000 : 0xFFFFe000)
 
-#define TASK_SIZE		(test_thread_flag(TIF_IA32) ? \
+#define TASK_SIZE		(test_thread_flag(TIF_ADDR32) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-#define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_IA32)) ? \
+#define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
 #define STACK_TOP		TASK_SIZE
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index bc817cd8b443..d1803a495b35 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -86,7 +86,7 @@ struct thread_info {
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
-#define TIF_IA32		17	/* 32bit process */
+#define TIF_IA32		17	/* IA32 compatibility process */
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_DEBUG		21	/* uses debug registers */
@@ -95,6 +95,7 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_ADDR32		(1 << TIF_ADDR32)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a85c87..0e900d09e232 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -508,6 +508,7 @@ void set_personality_64bit(void)
 
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
+	clear_thread_flag(TIF_ADDR32);
 
 	/* Ensure the corresponding mm is not marked. */
 	if (current->mm)
@@ -526,6 +527,7 @@ void set_personality_ia32(void)
 
 	/* Make sure to be in 32bit mode */
 	set_thread_flag(TIF_IA32);
+	set_thread_flag(TIF_ADDR32);
 	current->personality |= force_personality32;
 
 	/* Mark the associated mm as containing 32-bit tasks. */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 051489082d59..f921df8c2099 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -98,7 +98,7 @@ out:
 static void find_start_end(unsigned long flags, unsigned long *begin,
 			   unsigned long *end)
 {
-	if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
+	if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
 		unsigned long new_begin;
 		/* This is usually used needed to map code in small
 		   model, so it needs to be in the first 31bit. Limit
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+	if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
 	    && len <= mm->cached_hole_size) {
 		mm->cached_hole_size = 0;
 		mm->free_area_cache = begin;
@@ -205,7 +205,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		return addr;
 
 	/* for MAP_32BIT mappings we force the legact mmap base */
-	if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+	if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
 		goto bottomup;
 
 	/* requesting a specific address */
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index bff89dfe3619..d6aa6e8315d1 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -67,7 +67,7 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
 {
 	struct stack_frame_ia32 *head;
 
-	/* User process is 32-bit */
+	/* User process is IA32 */
 	if (!current || !test_thread_flag(TIF_IA32))
 		return 0;
 
-- 
cgit v1.2.3


From 4f72e331c20ac1c656f300cee246330c1786652b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 6 Feb 2012 13:50:20 -0800
Subject: x86-64: Use explicit sizes in sigcontext.h, prepare for x32

Use explicit sizes (__u64) instead of implicit sizes (unsigned long)
in the definition for sigcontext.h; this will allow this structure to
be shared between the x86-64 native ABI and the x32 ABI.

Originally-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org
---
 arch/x86/include/asm/sigcontext.h | 57 ++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 04459d25e66e..4a085383af27 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -230,34 +230,37 @@ struct sigcontext {
  * User-space might still rely on the old definition:
  */
 struct sigcontext {
-	unsigned long r8;
-	unsigned long r9;
-	unsigned long r10;
-	unsigned long r11;
-	unsigned long r12;
-	unsigned long r13;
-	unsigned long r14;
-	unsigned long r15;
-	unsigned long rdi;
-	unsigned long rsi;
-	unsigned long rbp;
-	unsigned long rbx;
-	unsigned long rdx;
-	unsigned long rax;
-	unsigned long rcx;
-	unsigned long rsp;
-	unsigned long rip;
-	unsigned long eflags;		/* RFLAGS */
-	unsigned short cs;
-	unsigned short gs;
-	unsigned short fs;
-	unsigned short __pad0;
-	unsigned long err;
-	unsigned long trapno;
-	unsigned long oldmask;
-	unsigned long cr2;
+	__u64 r8;
+	__u64 r9;
+	__u64 r10;
+	__u64 r11;
+	__u64 r12;
+	__u64 r13;
+	__u64 r14;
+	__u64 r15;
+	__u64 rdi;
+	__u64 rsi;
+	__u64 rbp;
+	__u64 rbx;
+	__u64 rdx;
+	__u64 rax;
+	__u64 rcx;
+	__u64 rsp;
+	__u64 rip;
+	__u64 eflags;		/* RFLAGS */
+	__u16 cs;
+	__u16 gs;
+	__u16 fs;
+	__u16 __pad0;
+	__u64 err;
+	__u64 trapno;
+	__u64 oldmask;
+	__u64 cr2;
 	struct _fpstate __user *fpstate;	/* zero when no FPU context */
-	unsigned long reserved1[8];
+#ifndef __LP64__
+	__u32 __fpstate_pad;
+#endif
+	__u64 reserved1[8];
 };
 #endif /* !__KERNEL__ */
 
-- 
cgit v1.2.3


From 1f5e27a90add2fe2a1c11508f68d377e3ddcf9ab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 14 Feb 2012 13:13:21 -0800
Subject: x32: Create posix_types_x32.h

This is the same as the 64-bit posix_types.h, except that
__kernel_[u]long_t is defined to be [unsigned] long long and therefore
64 bits.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/Kbuild            |  1 +
 arch/x86/include/asm/posix_types.h     |  4 +++-
 arch/x86/include/asm/posix_types_x32.h | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/posix_types_x32.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index b57e6a43a37a..986954fb9513 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -14,6 +14,7 @@ header-y += msr.h
 header-y += mtrr.h
 header-y += posix_types_32.h
 header-y += posix_types_64.h
+header-y += posix_types_x32.h
 header-y += prctl.h
 header-y += processor-flags.h
 header-y += ptrace-abi.h
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
index bb7133dc155d..3427b7798dbc 100644
--- a/arch/x86/include/asm/posix_types.h
+++ b/arch/x86/include/asm/posix_types.h
@@ -7,7 +7,9 @@
 #else
 # ifdef __i386__
 #  include "posix_types_32.h"
-# else
+# elif defined(__LP64__)
 #  include "posix_types_64.h"
+# else
+#  include "posix_types_x32.h"
 # endif
 #endif
diff --git a/arch/x86/include/asm/posix_types_x32.h b/arch/x86/include/asm/posix_types_x32.h
new file mode 100644
index 000000000000..85f9bdafa93c
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_x32.h
@@ -0,0 +1,19 @@
+#ifndef _ASM_X86_POSIX_TYPES_X32_H
+#define _ASM_X86_POSIX_TYPES_X32_H
+
+/*
+ * This file is only used by user-level software, so you need to
+ * be a little careful about namespace pollution etc.  Also, we cannot
+ * assume GCC is being used.
+ *
+ * These types should generally match the ones used by the 64-bit kernel,
+ *
+ */
+
+typedef long long __kernel_long_t;
+typedef unsigned long long __kernel_ulong_t;
+#define __kernel_long_t __kernel_long_t
+
+#include <asm/posix_types_64.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_X32_H */
-- 
cgit v1.2.3


From d046ff8b30319d9aa38d877a0ba4206771e54346 Mon Sep 17 00:00:00 2001
From: "H. J. Lu" <hjl.tools@gmail.com>
Date: Tue, 14 Feb 2012 13:49:48 -0800
Subject: x86-64: Add prototype for old_rsp to a header file

So far this has only been used in process_64.c, but the x32 code will
need it in additional code.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/processor.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9f748b5fb701..e34f95129f16 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -948,6 +948,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 #define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
+
+/*
+ * User space RSP while inside the SYSCALL fast path
+ */
+DECLARE_PER_CPU(unsigned long, old_rsp);
+
 #endif /* CONFIG_X86_64 */
 
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
-- 
cgit v1.2.3


From bb2127240c5595ae4ef7115494f51e973692f64e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 14 Feb 2012 13:56:49 -0800
Subject: x32: Add a thread flag for x32 processes

An x32 process is *almost* the same thing as a 64-bit process with a
32-bit address limit, but there are a few minor differences -- in
particular core dumps are 32 bits and signal handling is different.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/thread_info.h | 2 ++
 arch/x86/kernel/process_64.c       | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d1803a495b35..912e93511466 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
+#define TIF_X32			30	/* 32-bit native x86-64 binary */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -118,6 +119,7 @@ struct thread_info {
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32		(1 << TIF_ADDR32)
+#define _TIF_X32		(1 << TIF_X32)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0e900d09e232..5fe2fbaa56ba 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -509,6 +509,7 @@ void set_personality_64bit(void)
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
 	clear_thread_flag(TIF_ADDR32);
+	clear_thread_flag(TIF_X32);
 
 	/* Ensure the corresponding mm is not marked. */
 	if (current->mm)
@@ -528,6 +529,7 @@ void set_personality_ia32(void)
 	/* Make sure to be in 32bit mode */
 	set_thread_flag(TIF_IA32);
 	set_thread_flag(TIF_ADDR32);
+	clear_thread_flag(TIF_X32);
 	current->personality |= force_personality32;
 
 	/* Mark the associated mm as containing 32-bit tasks. */
-- 
cgit v1.2.3


From 2c73ce734653f96542a070f3c3b3e3d1cd0fba02 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 09:48:01 -0800
Subject: x86-64, ia32: Drop sys32_rt_sigprocmask

On x86, the only difference between sys_rt_sigprocmask and
sys32_rt_sigprocmask is the alignment of the data structures.
However, x86 allows data accesses with arbitrary alignment, and
therefore there is no reason for this code to be different.

Reported-by: Gregory M. Lueck <gregory.m.lueck@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/sys_ia32.c         | 40 ----------------------------------------
 arch/x86/include/asm/sys_ia32.h  |  2 --
 arch/x86/syscalls/syscall_32.tbl |  2 +-
 3 files changed, 1 insertion(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f6f5c53dc903..aec2202a596c 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -287,46 +287,6 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
 	return ret;
 }
 
-asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
-				     compat_sigset_t __user *oset,
-				     unsigned int sigsetsize)
-{
-	sigset_t s;
-	compat_sigset_t s32;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	if (set) {
-		if (copy_from_user(&s32, set, sizeof(compat_sigset_t)))
-			return -EFAULT;
-		switch (_NSIG_WORDS) {
-		case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
-		case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
-		case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
-		case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
-		}
-	}
-	set_fs(KERNEL_DS);
-	ret = sys_rt_sigprocmask(how,
-				 set ? (sigset_t __user *)&s : NULL,
-				 oset ? (sigset_t __user *)&s : NULL,
-				 sigsetsize);
-	set_fs(old_fs);
-	if (ret)
-		return ret;
-	if (oset) {
-		switch (_NSIG_WORDS) {
-		case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
-		case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
-		case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
-		case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
-		}
-		if (copy_to_user(oset, &s32, sizeof(compat_sigset_t)))
-			return -EFAULT;
-	}
-	return 0;
-}
-
 asmlinkage long sys32_alarm(unsigned int seconds)
 {
 	return alarm_setitimer(seconds);
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index cb238526a9f1..68da87bfb5a3 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -36,8 +36,6 @@ asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
 				   struct sigaction32 __user *, unsigned int);
 asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
 				struct old_sigaction32 __user *);
-asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
-				     compat_sigset_t __user *, unsigned int);
 asmlinkage long sys32_alarm(unsigned int);
 
 asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index ce98e287c066..031cef84fe43 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -181,7 +181,7 @@
 172	i386	prctl			sys_prctl
 173	i386	rt_sigreturn		ptregs_rt_sigreturn		stub32_rt_sigreturn
 174	i386	rt_sigaction		sys_rt_sigaction		sys32_rt_sigaction
-175	i386	rt_sigprocmask		sys_rt_sigprocmask		sys32_rt_sigprocmask
+175	i386	rt_sigprocmask		sys_rt_sigprocmask
 176	i386	rt_sigpending		sys_rt_sigpending		sys32_rt_sigpending
 177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
 178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		sys32_rt_sigqueueinfo
-- 
cgit v1.2.3


From 6630f11ba54414b9870d87dfef2bee467bfa842a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 14 Feb 2012 14:18:50 -0800
Subject: x32: Add x32 system calls to syscall/syscall_64.tbl

Split the 64-bit system calls into "64" (64-bit only) and "common"
(64-bit or x32) and add the x32 system call numbers.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/asm-offsets_64.c |   2 +
 arch/x86/kernel/syscall_64.c     |   3 +
 arch/x86/syscalls/Makefile       |   2 +-
 arch/x86/syscalls/syscall_64.tbl | 579 +++++++++++++++++++++------------------
 arch/x86/um/sys_call_table_64.c  |   3 +
 arch/x86/um/user-offsets.c       |   2 +
 6 files changed, 317 insertions(+), 274 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 834e897b1e25..c3354f7b0a06 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,6 +1,8 @@
 #include <asm/ia32.h>
 
 #define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_X32(nr, sym, compat) /* Not yet */
 static char syscalls_64[] = {
 #include <asm/syscalls_64.h>
 };
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 7ac7943be02c..26c4ca1f20e8 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,6 +5,9 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
+#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#define __SYSCALL_X32(nr, sym, compat) /* Not yet */
+
 #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 564b2476fede..89dd9581c5a2 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -24,7 +24,7 @@ syshdr_pfx_unistd_32_ia32 := ia32_
 $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
 	$(call if_changed,syshdr)
 
-syshdr_abi_unistd_64 := 64
+syshdr_abi_unistd_64 := common,64
 $(out)/unistd_64.h: $(syscall64) $(syshdr)
 	$(call if_changed,syshdr)
 
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index b440a8f7eefa..4aecc7e31166 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -4,317 +4,350 @@
 # The format is:
 # <number> <abi> <name> <entry point>
 #
-# The abi is always "64" for this file (for now.)
+# The abi is "common", "64" or "x32" for this file.
 #
-0	64	read			sys_read
-1	64	write			sys_write
-2	64	open			sys_open
-3	64	close			sys_close
-4	64	stat			sys_newstat
-5	64	fstat			sys_newfstat
-6	64	lstat			sys_newlstat
-7	64	poll			sys_poll
-8	64	lseek			sys_lseek
-9	64	mmap			sys_mmap
-10	64	mprotect		sys_mprotect
-11	64	munmap			sys_munmap
-12	64	brk			sys_brk
+0	common	read			sys_read
+1	common	write			sys_write
+2	common	open			sys_open
+3	common	close			sys_close
+4	common	stat			sys_newstat
+5	common	fstat			sys_newfstat
+6	common	lstat			sys_newlstat
+7	common	poll			sys_poll
+8	common	lseek			sys_lseek
+9	common	mmap			sys_mmap
+10	common	mprotect		sys_mprotect
+11	common	munmap			sys_munmap
+12	common	brk			sys_brk
 13	64	rt_sigaction		sys_rt_sigaction
-14	64	rt_sigprocmask		sys_rt_sigprocmask
+14	common	rt_sigprocmask		sys_rt_sigprocmask
 15	64	rt_sigreturn		stub_rt_sigreturn
 16	64	ioctl			sys_ioctl
-17	64	pread64			sys_pread64
-18	64	pwrite64		sys_pwrite64
+17	common	pread64			sys_pread64
+18	common	pwrite64		sys_pwrite64
 19	64	readv			sys_readv
 20	64	writev			sys_writev
-21	64	access			sys_access
-22	64	pipe			sys_pipe
-23	64	select			sys_select
-24	64	sched_yield		sys_sched_yield
-25	64	mremap			sys_mremap
-26	64	msync			sys_msync
-27	64	mincore			sys_mincore
-28	64	madvise			sys_madvise
-29	64	shmget			sys_shmget
-30	64	shmat			sys_shmat
-31	64	shmctl			sys_shmctl
-32	64	dup			sys_dup
-33	64	dup2			sys_dup2
-34	64	pause			sys_pause
-35	64	nanosleep		sys_nanosleep
-36	64	getitimer		sys_getitimer
-37	64	alarm			sys_alarm
-38	64	setitimer		sys_setitimer
-39	64	getpid			sys_getpid
-40	64	sendfile		sys_sendfile64
-41	64	socket			sys_socket
-42	64	connect			sys_connect
-43	64	accept			sys_accept
-44	64	sendto			sys_sendto
+21	common	access			sys_access
+22	common	pipe			sys_pipe
+23	common	select			sys_select
+24	common	sched_yield		sys_sched_yield
+25	common	mremap			sys_mremap
+26	common	msync			sys_msync
+27	common	mincore			sys_mincore
+28	common	madvise			sys_madvise
+29	common	shmget			sys_shmget
+30	common	shmat			sys_shmat
+31	common	shmctl			sys_shmctl
+32	common	dup			sys_dup
+33	common	dup2			sys_dup2
+34	common	pause			sys_pause
+35	common	nanosleep		sys_nanosleep
+36	common	getitimer		sys_getitimer
+37	common	alarm			sys_alarm
+38	common	setitimer		sys_setitimer
+39	common	getpid			sys_getpid
+40	common	sendfile		sys_sendfile64
+41	common	socket			sys_socket
+42	common	connect			sys_connect
+43	common	accept			sys_accept
+44	common	sendto			sys_sendto
 45	64	recvfrom		sys_recvfrom
 46	64	sendmsg			sys_sendmsg
 47	64	recvmsg			sys_recvmsg
-48	64	shutdown		sys_shutdown
-49	64	bind			sys_bind
-50	64	listen			sys_listen
-51	64	getsockname		sys_getsockname
-52	64	getpeername		sys_getpeername
-53	64	socketpair		sys_socketpair
-54	64	setsockopt		sys_setsockopt
-55	64	getsockopt		sys_getsockopt
-56	64	clone			stub_clone
-57	64	fork			stub_fork
-58	64	vfork			stub_vfork
+48	common	shutdown		sys_shutdown
+49	common	bind			sys_bind
+50	common	listen			sys_listen
+51	common	getsockname		sys_getsockname
+52	common	getpeername		sys_getpeername
+53	common	socketpair		sys_socketpair
+54	common	setsockopt		sys_setsockopt
+55	common	getsockopt		sys_getsockopt
+56	common	clone			stub_clone
+57	common	fork			stub_fork
+58	common	vfork			stub_vfork
 59	64	execve			stub_execve
-60	64	exit			sys_exit
-61	64	wait4			sys_wait4
-62	64	kill			sys_kill
-63	64	uname			sys_newuname
-64	64	semget			sys_semget
-65	64	semop			sys_semop
-66	64	semctl			sys_semctl
-67	64	shmdt			sys_shmdt
-68	64	msgget			sys_msgget
-69	64	msgsnd			sys_msgsnd
-70	64	msgrcv			sys_msgrcv
-71	64	msgctl			sys_msgctl
-72	64	fcntl			sys_fcntl
-73	64	flock			sys_flock
-74	64	fsync			sys_fsync
-75	64	fdatasync		sys_fdatasync
-76	64	truncate		sys_truncate
-77	64	ftruncate		sys_ftruncate
-78	64	getdents		sys_getdents
-79	64	getcwd			sys_getcwd
-80	64	chdir			sys_chdir
-81	64	fchdir			sys_fchdir
-82	64	rename			sys_rename
-83	64	mkdir			sys_mkdir
-84	64	rmdir			sys_rmdir
-85	64	creat			sys_creat
-86	64	link			sys_link
-87	64	unlink			sys_unlink
-88	64	symlink			sys_symlink
-89	64	readlink		sys_readlink
-90	64	chmod			sys_chmod
-91	64	fchmod			sys_fchmod
-92	64	chown			sys_chown
-93	64	fchown			sys_fchown
-94	64	lchown			sys_lchown
-95	64	umask			sys_umask
-96	64	gettimeofday		sys_gettimeofday
-97	64	getrlimit		sys_getrlimit
-98	64	getrusage		sys_getrusage
-99	64	sysinfo			sys_sysinfo
+60	common	exit			sys_exit
+61	common	wait4			sys_wait4
+62	common	kill			sys_kill
+63	common	uname			sys_newuname
+64	common	semget			sys_semget
+65	common	semop			sys_semop
+66	common	semctl			sys_semctl
+67	common	shmdt			sys_shmdt
+68	common	msgget			sys_msgget
+69	common	msgsnd			sys_msgsnd
+70	common	msgrcv			sys_msgrcv
+71	common	msgctl			sys_msgctl
+72	common	fcntl			sys_fcntl
+73	common	flock			sys_flock
+74	common	fsync			sys_fsync
+75	common	fdatasync		sys_fdatasync
+76	common	truncate		sys_truncate
+77	common	ftruncate		sys_ftruncate
+78	common	getdents		sys_getdents
+79	common	getcwd			sys_getcwd
+80	common	chdir			sys_chdir
+81	common	fchdir			sys_fchdir
+82	common	rename			sys_rename
+83	common	mkdir			sys_mkdir
+84	common	rmdir			sys_rmdir
+85	common	creat			sys_creat
+86	common	link			sys_link
+87	common	unlink			sys_unlink
+88	common	symlink			sys_symlink
+89	common	readlink		sys_readlink
+90	common	chmod			sys_chmod
+91	common	fchmod			sys_fchmod
+92	common	chown			sys_chown
+93	common	fchown			sys_fchown
+94	common	lchown			sys_lchown
+95	common	umask			sys_umask
+96	common	gettimeofday		sys_gettimeofday
+97	common	getrlimit		sys_getrlimit
+98	common	getrusage		sys_getrusage
+99	common	sysinfo			sys_sysinfo
 100	64	times			sys_times
-101	64	ptrace			sys_ptrace
-102	64	getuid			sys_getuid
-103	64	syslog			sys_syslog
-104	64	getgid			sys_getgid
-105	64	setuid			sys_setuid
-106	64	setgid			sys_setgid
-107	64	geteuid			sys_geteuid
-108	64	getegid			sys_getegid
-109	64	setpgid			sys_setpgid
-110	64	getppid			sys_getppid
-111	64	getpgrp			sys_getpgrp
-112	64	setsid			sys_setsid
-113	64	setreuid		sys_setreuid
-114	64	setregid		sys_setregid
-115	64	getgroups		sys_getgroups
-116	64	setgroups		sys_setgroups
-117	64	setresuid		sys_setresuid
-118	64	getresuid		sys_getresuid
-119	64	setresgid		sys_setresgid
-120	64	getresgid		sys_getresgid
-121	64	getpgid			sys_getpgid
-122	64	setfsuid		sys_setfsuid
-123	64	setfsgid		sys_setfsgid
-124	64	getsid			sys_getsid
-125	64	capget			sys_capget
-126	64	capset			sys_capset
+101	common	ptrace			sys_ptrace
+102	common	getuid			sys_getuid
+103	common	syslog			sys_syslog
+104	common	getgid			sys_getgid
+105	common	setuid			sys_setuid
+106	common	setgid			sys_setgid
+107	common	geteuid			sys_geteuid
+108	common	getegid			sys_getegid
+109	common	setpgid			sys_setpgid
+110	common	getppid			sys_getppid
+111	common	getpgrp			sys_getpgrp
+112	common	setsid			sys_setsid
+113	common	setreuid		sys_setreuid
+114	common	setregid		sys_setregid
+115	common	getgroups		sys_getgroups
+116	common	setgroups		sys_setgroups
+117	common	setresuid		sys_setresuid
+118	common	getresuid		sys_getresuid
+119	common	setresgid		sys_setresgid
+120	common	getresgid		sys_getresgid
+121	common	getpgid			sys_getpgid
+122	common	setfsuid		sys_setfsuid
+123	common	setfsgid		sys_setfsgid
+124	common	getsid			sys_getsid
+125	common	capget			sys_capget
+126	common	capset			sys_capset
 127	64	rt_sigpending		sys_rt_sigpending
 128	64	rt_sigtimedwait		sys_rt_sigtimedwait
 129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
-130	64	rt_sigsuspend		sys_rt_sigsuspend
+130	common	rt_sigsuspend		sys_rt_sigsuspend
 131	64	sigaltstack		stub_sigaltstack
-132	64	utime			sys_utime
-133	64	mknod			sys_mknod
+132	common	utime			sys_utime
+133	common	mknod			sys_mknod
 134	64	uselib
-135	64	personality		sys_personality
-136	64	ustat			sys_ustat
-137	64	statfs			sys_statfs
-138	64	fstatfs			sys_fstatfs
-139	64	sysfs			sys_sysfs
-140	64	getpriority		sys_getpriority
-141	64	setpriority		sys_setpriority
-142	64	sched_setparam		sys_sched_setparam
-143	64	sched_getparam		sys_sched_getparam
-144	64	sched_setscheduler	sys_sched_setscheduler
-145	64	sched_getscheduler	sys_sched_getscheduler
-146	64	sched_get_priority_max	sys_sched_get_priority_max
-147	64	sched_get_priority_min	sys_sched_get_priority_min
-148	64	sched_rr_get_interval	sys_sched_rr_get_interval
-149	64	mlock			sys_mlock
-150	64	munlock			sys_munlock
-151	64	mlockall		sys_mlockall
-152	64	munlockall		sys_munlockall
-153	64	vhangup			sys_vhangup
-154	64	modify_ldt		sys_modify_ldt
-155	64	pivot_root		sys_pivot_root
+135	common	personality		sys_personality
+136	common	ustat			sys_ustat
+137	common	statfs			sys_statfs
+138	common	fstatfs			sys_fstatfs
+139	common	sysfs			sys_sysfs
+140	common	getpriority		sys_getpriority
+141	common	setpriority		sys_setpriority
+142	common	sched_setparam		sys_sched_setparam
+143	common	sched_getparam		sys_sched_getparam
+144	common	sched_setscheduler	sys_sched_setscheduler
+145	common	sched_getscheduler	sys_sched_getscheduler
+146	common	sched_get_priority_max	sys_sched_get_priority_max
+147	common	sched_get_priority_min	sys_sched_get_priority_min
+148	common	sched_rr_get_interval	sys_sched_rr_get_interval
+149	common	mlock			sys_mlock
+150	common	munlock			sys_munlock
+151	common	mlockall		sys_mlockall
+152	common	munlockall		sys_munlockall
+153	common	vhangup			sys_vhangup
+154	common	modify_ldt		sys_modify_ldt
+155	common	pivot_root		sys_pivot_root
 156	64	_sysctl			sys_sysctl
-157	64	prctl			sys_prctl
-158	64	arch_prctl		sys_arch_prctl
-159	64	adjtimex		sys_adjtimex
-160	64	setrlimit		sys_setrlimit
-161	64	chroot			sys_chroot
-162	64	sync			sys_sync
-163	64	acct			sys_acct
-164	64	settimeofday		sys_settimeofday
-165	64	mount			sys_mount
-166	64	umount2			sys_umount
-167	64	swapon			sys_swapon
-168	64	swapoff			sys_swapoff
-169	64	reboot			sys_reboot
-170	64	sethostname		sys_sethostname
-171	64	setdomainname		sys_setdomainname
-172	64	iopl			stub_iopl
-173	64	ioperm			sys_ioperm
+157	common	prctl			sys_prctl
+158	common	arch_prctl		sys_arch_prctl
+159	common	adjtimex		sys_adjtimex
+160	common	setrlimit		sys_setrlimit
+161	common	chroot			sys_chroot
+162	common	sync			sys_sync
+163	common	acct			sys_acct
+164	common	settimeofday		sys_settimeofday
+165	common	mount			sys_mount
+166	common	umount2			sys_umount
+167	common	swapon			sys_swapon
+168	common	swapoff			sys_swapoff
+169	common	reboot			sys_reboot
+170	common	sethostname		sys_sethostname
+171	common	setdomainname		sys_setdomainname
+172	common	iopl			stub_iopl
+173	common	ioperm			sys_ioperm
 174	64	create_module
-175	64	init_module		sys_init_module
-176	64	delete_module		sys_delete_module
+175	common	init_module		sys_init_module
+176	common	delete_module		sys_delete_module
 177	64	get_kernel_syms
 178	64	query_module
-179	64	quotactl		sys_quotactl
+179	common	quotactl		sys_quotactl
 180	64	nfsservctl
-181	64	getpmsg
-182	64	putpmsg
-183	64	afs_syscall
-184	64	tuxcall
-185	64	security
-186	64	gettid			sys_gettid
-187	64	readahead		sys_readahead
-188	64	setxattr		sys_setxattr
-189	64	lsetxattr		sys_lsetxattr
-190	64	fsetxattr		sys_fsetxattr
-191	64	getxattr		sys_getxattr
-192	64	lgetxattr		sys_lgetxattr
-193	64	fgetxattr		sys_fgetxattr
-194	64	listxattr		sys_listxattr
-195	64	llistxattr		sys_llistxattr
-196	64	flistxattr		sys_flistxattr
-197	64	removexattr		sys_removexattr
-198	64	lremovexattr		sys_lremovexattr
-199	64	fremovexattr		sys_fremovexattr
-200	64	tkill			sys_tkill
-201	64	time			sys_time
-202	64	futex			sys_futex
-203	64	sched_setaffinity	sys_sched_setaffinity
-204	64	sched_getaffinity	sys_sched_getaffinity
+181	common	getpmsg
+182	common	putpmsg
+183	common	afs_syscall
+184	common	tuxcall
+185	common	security
+186	common	gettid			sys_gettid
+187	common	readahead		sys_readahead
+188	common	setxattr		sys_setxattr
+189	common	lsetxattr		sys_lsetxattr
+190	common	fsetxattr		sys_fsetxattr
+191	common	getxattr		sys_getxattr
+192	common	lgetxattr		sys_lgetxattr
+193	common	fgetxattr		sys_fgetxattr
+194	common	listxattr		sys_listxattr
+195	common	llistxattr		sys_llistxattr
+196	common	flistxattr		sys_flistxattr
+197	common	removexattr		sys_removexattr
+198	common	lremovexattr		sys_lremovexattr
+199	common	fremovexattr		sys_fremovexattr
+200	common	tkill			sys_tkill
+201	common	time			sys_time
+202	common	futex			sys_futex
+203	common	sched_setaffinity	sys_sched_setaffinity
+204	common	sched_getaffinity	sys_sched_getaffinity
 205	64	set_thread_area
-206	64	io_setup		sys_io_setup
-207	64	io_destroy		sys_io_destroy
-208	64	io_getevents		sys_io_getevents
-209	64	io_submit		sys_io_submit
-210	64	io_cancel		sys_io_cancel
+206	common	io_setup		sys_io_setup
+207	common	io_destroy		sys_io_destroy
+208	common	io_getevents		sys_io_getevents
+209	common	io_submit		sys_io_submit
+210	common	io_cancel		sys_io_cancel
 211	64	get_thread_area
-212	64	lookup_dcookie		sys_lookup_dcookie
-213	64	epoll_create		sys_epoll_create
+212	common	lookup_dcookie		sys_lookup_dcookie
+213	common	epoll_create		sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
-216	64	remap_file_pages	sys_remap_file_pages
-217	64	getdents64		sys_getdents64
-218	64	set_tid_address		sys_set_tid_address
-219	64	restart_syscall		sys_restart_syscall
-220	64	semtimedop		sys_semtimedop
-221	64	fadvise64		sys_fadvise64
+216	common	remap_file_pages	sys_remap_file_pages
+217	common	getdents64		sys_getdents64
+218	common	set_tid_address		sys_set_tid_address
+219	common	restart_syscall		sys_restart_syscall
+220	common	semtimedop		sys_semtimedop
+221	common	fadvise64		sys_fadvise64
 222	64	timer_create		sys_timer_create
-223	64	timer_settime		sys_timer_settime
-224	64	timer_gettime		sys_timer_gettime
-225	64	timer_getoverrun	sys_timer_getoverrun
-226	64	timer_delete		sys_timer_delete
-227	64	clock_settime		sys_clock_settime
-228	64	clock_gettime		sys_clock_gettime
-229	64	clock_getres		sys_clock_getres
-230	64	clock_nanosleep		sys_clock_nanosleep
-231	64	exit_group		sys_exit_group
-232	64	epoll_wait		sys_epoll_wait
-233	64	epoll_ctl		sys_epoll_ctl
-234	64	tgkill			sys_tgkill
-235	64	utimes			sys_utimes
+223	common	timer_settime		sys_timer_settime
+224	common	timer_gettime		sys_timer_gettime
+225	common	timer_getoverrun	sys_timer_getoverrun
+226	common	timer_delete		sys_timer_delete
+227	common	clock_settime		sys_clock_settime
+228	common	clock_gettime		sys_clock_gettime
+229	common	clock_getres		sys_clock_getres
+230	common	clock_nanosleep		sys_clock_nanosleep
+231	common	exit_group		sys_exit_group
+232	common	epoll_wait		sys_epoll_wait
+233	common	epoll_ctl		sys_epoll_ctl
+234	common	tgkill			sys_tgkill
+235	common	utimes			sys_utimes
 236	64	vserver
-237	64	mbind			sys_mbind
-238	64	set_mempolicy		sys_set_mempolicy
-239	64	get_mempolicy		sys_get_mempolicy
-240	64	mq_open			sys_mq_open
-241	64	mq_unlink		sys_mq_unlink
-242	64	mq_timedsend		sys_mq_timedsend
-243	64	mq_timedreceive		sys_mq_timedreceive
+237	common	mbind			sys_mbind
+238	common	set_mempolicy		sys_set_mempolicy
+239	common	get_mempolicy		sys_get_mempolicy
+240	common	mq_open			sys_mq_open
+241	common	mq_unlink		sys_mq_unlink
+242	common	mq_timedsend		sys_mq_timedsend
+243	common	mq_timedreceive		sys_mq_timedreceive
 244	64	mq_notify		sys_mq_notify
-245	64	mq_getsetattr		sys_mq_getsetattr
+245	common	mq_getsetattr		sys_mq_getsetattr
 246	64	kexec_load		sys_kexec_load
 247	64	waitid			sys_waitid
-248	64	add_key			sys_add_key
-249	64	request_key		sys_request_key
-250	64	keyctl			sys_keyctl
-251	64	ioprio_set		sys_ioprio_set
-252	64	ioprio_get		sys_ioprio_get
-253	64	inotify_init		sys_inotify_init
-254	64	inotify_add_watch	sys_inotify_add_watch
-255	64	inotify_rm_watch	sys_inotify_rm_watch
-256	64	migrate_pages		sys_migrate_pages
-257	64	openat			sys_openat
-258	64	mkdirat			sys_mkdirat
-259	64	mknodat			sys_mknodat
-260	64	fchownat		sys_fchownat
-261	64	futimesat		sys_futimesat
-262	64	newfstatat		sys_newfstatat
-263	64	unlinkat		sys_unlinkat
-264	64	renameat		sys_renameat
-265	64	linkat			sys_linkat
-266	64	symlinkat		sys_symlinkat
-267	64	readlinkat		sys_readlinkat
-268	64	fchmodat		sys_fchmodat
-269	64	faccessat		sys_faccessat
-270	64	pselect6		sys_pselect6
-271	64	ppoll			sys_ppoll
-272	64	unshare			sys_unshare
+248	common	add_key			sys_add_key
+249	common	request_key		sys_request_key
+250	common	keyctl			sys_keyctl
+251	common	ioprio_set		sys_ioprio_set
+252	common	ioprio_get		sys_ioprio_get
+253	common	inotify_init		sys_inotify_init
+254	common	inotify_add_watch	sys_inotify_add_watch
+255	common	inotify_rm_watch	sys_inotify_rm_watch
+256	common	migrate_pages		sys_migrate_pages
+257	common	openat			sys_openat
+258	common	mkdirat			sys_mkdirat
+259	common	mknodat			sys_mknodat
+260	common	fchownat		sys_fchownat
+261	common	futimesat		sys_futimesat
+262	common	newfstatat		sys_newfstatat
+263	common	unlinkat		sys_unlinkat
+264	common	renameat		sys_renameat
+265	common	linkat			sys_linkat
+266	common	symlinkat		sys_symlinkat
+267	common	readlinkat		sys_readlinkat
+268	common	fchmodat		sys_fchmodat
+269	common	faccessat		sys_faccessat
+270	common	pselect6		sys_pselect6
+271	common	ppoll			sys_ppoll
+272	common	unshare			sys_unshare
 273	64	set_robust_list		sys_set_robust_list
 274	64	get_robust_list		sys_get_robust_list
-275	64	splice			sys_splice
-276	64	tee			sys_tee
-277	64	sync_file_range		sys_sync_file_range
+275	common	splice			sys_splice
+276	common	tee			sys_tee
+277	common	sync_file_range		sys_sync_file_range
 278	64	vmsplice		sys_vmsplice
 279	64	move_pages		sys_move_pages
-280	64	utimensat		sys_utimensat
-281	64	epoll_pwait		sys_epoll_pwait
-282	64	signalfd		sys_signalfd
-283	64	timerfd_create		sys_timerfd_create
-284	64	eventfd			sys_eventfd
-285	64	fallocate		sys_fallocate
-286	64	timerfd_settime		sys_timerfd_settime
-287	64	timerfd_gettime		sys_timerfd_gettime
-288	64	accept4			sys_accept4
-289	64	signalfd4		sys_signalfd4
-290	64	eventfd2		sys_eventfd2
-291	64	epoll_create1		sys_epoll_create1
-292	64	dup3			sys_dup3
-293	64	pipe2			sys_pipe2
-294	64	inotify_init1		sys_inotify_init1
+280	common	utimensat		sys_utimensat
+281	common	epoll_pwait		sys_epoll_pwait
+282	common	signalfd		sys_signalfd
+283	common	timerfd_create		sys_timerfd_create
+284	common	eventfd			sys_eventfd
+285	common	fallocate		sys_fallocate
+286	common	timerfd_settime		sys_timerfd_settime
+287	common	timerfd_gettime		sys_timerfd_gettime
+288	common	accept4			sys_accept4
+289	common	signalfd4		sys_signalfd4
+290	common	eventfd2		sys_eventfd2
+291	common	epoll_create1		sys_epoll_create1
+292	common	dup3			sys_dup3
+293	common	pipe2			sys_pipe2
+294	common	inotify_init1		sys_inotify_init1
 295	64	preadv			sys_preadv
 296	64	pwritev			sys_pwritev
 297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
-298	64	perf_event_open		sys_perf_event_open
+298	common	perf_event_open		sys_perf_event_open
 299	64	recvmmsg		sys_recvmmsg
-300	64	fanotify_init		sys_fanotify_init
-301	64	fanotify_mark		sys_fanotify_mark
-302	64	prlimit64		sys_prlimit64
-303	64	name_to_handle_at	sys_name_to_handle_at
-304	64	open_by_handle_at	sys_open_by_handle_at
-305	64	clock_adjtime		sys_clock_adjtime
-306	64	syncfs			sys_syncfs
+300	common	fanotify_init		sys_fanotify_init
+301	common	fanotify_mark		sys_fanotify_mark
+302	common	prlimit64		sys_prlimit64
+303	common	name_to_handle_at	sys_name_to_handle_at
+304	common	open_by_handle_at	sys_open_by_handle_at
+305	common	clock_adjtime		sys_clock_adjtime
+306	common	syncfs			sys_syncfs
 307	64	sendmmsg		sys_sendmmsg
-308	64	setns			sys_setns
-309	64	getcpu			sys_getcpu
+308	common	setns			sys_setns
+309	common	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+#
+# x32-specific system call numbers start at 512 to avoid cache impact
+# for native 64-bit operation.
+#
+512	x32	rt_sigaction		sys32_rt_sigaction
+513	x32	rt_sigreturn		stub_x32_rt_sigreturn
+514	x32	ioctl			compat_sys_ioctl
+515	x32	readv			compat_sys_readv
+516	x32	writev			compat_sys_writev
+517	x32	recvfrom		compat_sys_recvfrom
+518	x32	sendmsg			compat_sys_sendmsg
+519	x32	recvmsg			compat_sys_recvmsg
+520	x32	execve			stub_x32_execve
+521	x32	times			compat_sys_times
+522	x32	rt_sigpending		sys32_rt_sigpending
+523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
+524	x32	rt_sigqueueinfo		sys32_rt_sigqueueinfo
+525	x32	sigaltstack		stub_x32_sigaltstack
+526	x32	timer_create		compat_sys_timer_create
+527	x32	mq_notify		compat_sys_mq_notify
+528	x32	kexec_load		compat_sys_kexec_load
+529	x32	waitid			compat_sys_waitid
+530	x32	set_robust_list		compat_sys_set_robust_list
+531	x32	get_robust_list		compat_sys_get_robust_list
+532	x32	vmsplice		compat_sys_vmsplice
+533	x32	move_pages		compat_sys_move_pages
+534	x32	preadv			compat_sys_preadv64
+535	x32	pwritev			compat_sys_pwritev64
+536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
+537	x32	recvmmsg		compat_sys_recvmmsg
+538	x32	sendmmsg		compat_sys_sendmmsg
+539	x32	process_vm_readv	compat_sys_process_vm_readv
+540	x32	process_vm_writev	compat_sys_process_vm_writev
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index fe626c3ba01b..9924776f4265 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,6 +35,9 @@
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
+#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
+
 #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
 #include <asm/syscalls_64.h>
 
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index 5edf4f4bbf53..ce7e3607a870 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -15,6 +15,8 @@ static char syscalls[] = {
 };
 #else
 #define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
 static char syscalls[] = {
 #include <asm/syscalls_64.h>
 };
-- 
cgit v1.2.3


From 6cbb369f578378cf5b1876766d860ae7c2a94d60 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 14 Feb 2012 14:38:31 -0800
Subject: x32: Generate <asm/unistd_x32.h>

Generate <asm/unistd_x32.h>; this exports x32 system call numbers to
user space.

[ v2: Enclose all arguments to syshdr in '' so empty arguments aren't
      dropped on the floor. ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/Kbuild   |  1 +
 arch/x86/include/asm/unistd.h |  7 ++++++-
 arch/x86/syscalls/Makefile    | 13 ++++++++++---
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 986954fb9513..f9c0d3ba9e84 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -25,3 +25,4 @@ header-y += vsyscall.h
 
 genhdr-y += unistd_32.h
 genhdr-y += unistd_64.h
+genhdr-y += unistd_x32.h
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 21f77b89e47a..dab5349f14fc 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_X86_UNISTD_H
 #define _ASM_X86_UNISTD_H 1
 
+/* x32 syscall flag bit */
+#define __X32_SYSCALL_BIT	0x40000000
+
 #ifdef __KERNEL__
 # ifdef CONFIG_X86_32
 
@@ -52,8 +55,10 @@
 #else
 # ifdef __i386__
 #  include <asm/unistd_32.h>
-# else
+# elif defined(__LP64__)
 #  include <asm/unistd_64.h>
+# else
+#  include <asm/unistd_x32.h>
 # endif
 #endif
 
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 89dd9581c5a2..8051c3134ad1 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -10,8 +10,10 @@ syshdr := $(srctree)/$(src)/syscallhdr.sh
 systbl := $(srctree)/$(src)/syscalltbl.sh
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
-		   $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
+		   '$(syshdr_abi_$(basetarget))' \
+		   '$(syshdr_pfx_$(basetarget))' \
+		   '$(syshdr_offset_$(basetarget))'
 quiet_cmd_systbl = SYSTBL  $@
       cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
 
@@ -24,6 +26,11 @@ syshdr_pfx_unistd_32_ia32 := ia32_
 $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
 	$(call if_changed,syshdr)
 
+syshdr_abi_unistd_x32 := common,x32
+syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
+$(out)/unistd_x32.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
 syshdr_abi_unistd_64 := common,64
 $(out)/unistd_64.h: $(syscall64) $(syshdr)
 	$(call if_changed,syshdr)
@@ -33,7 +40,7 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
 $(out)/syscalls_64.h: $(syscall64) $(systbl)
 	$(call if_changed,systbl)
 
-syshdr-y			+= unistd_32.h unistd_64.h
+syshdr-y			+= unistd_32.h unistd_64.h unistd_x32.h
 syshdr-y			+= syscalls_32.h
 syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
 syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
-- 
cgit v1.2.3


From ea499fec48dd771bd92984337fcb57ed4c787e69 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 14 Feb 2012 14:46:23 -0800
Subject: x32: Generate <asm/unistd_64_x32.h>

Generate macros for the *kernel* code to use to refer to x32 system
calls.  These have an __NR_x32_ prefix and do not include
__X32_SYSCALL_BIT.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/unistd.h | 1 +
 arch/x86/syscalls/Makefile    | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index dab5349f14fc..7a48a5557470 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -17,6 +17,7 @@
 # else
 
 #  include <asm/unistd_64.h>
+#  include <asm/unistd_64_x32.h>
 #  define __ARCH_WANT_COMPAT_SYS_TIME
 
 # endif
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 8051c3134ad1..3236aebc828d 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -35,6 +35,11 @@ syshdr_abi_unistd_64 := common,64
 $(out)/unistd_64.h: $(syscall64) $(syshdr)
 	$(call if_changed,syshdr)
 
+syshdr_abi_unistd_64_x32 := x32
+syshdr_pfx_unistd_64_x32 := x32_
+$(out)/unistd_64_x32.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
 $(out)/syscalls_32.h: $(syscall32) $(systbl)
 	$(call if_changed,systbl)
 $(out)/syscalls_64.h: $(syscall64) $(systbl)
@@ -42,7 +47,7 @@ $(out)/syscalls_64.h: $(syscall64) $(systbl)
 
 syshdr-y			+= unistd_32.h unistd_64.h unistd_x32.h
 syshdr-y			+= syscalls_32.h
-syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
+syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h unistd_64_x32.h
 syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
 
 targets	+= $(syshdr-y)
-- 
cgit v1.2.3


From f28f0c23576662fb293defe9b1884d5a6e1bd85c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 07:38:43 -0800
Subject: x86: Move some signal-handling definitions to a common header

There are some definitions which are duplicated between
kernel/signal.c and ia32/ia32_signal.c; move them to a common header
file.

Rather than adding stuff to existing header files which contain data
structures, create a new header file; hence the slightly odd name
("all the good ones were taken.")

Note: nothing relied on signal_fault() being defined in
<asm/ptrace.h>.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/ia32_signal.c        | 12 ++----------
 arch/x86/include/asm/ptrace.h      |  1 -
 arch/x86/include/asm/sighandling.h | 19 +++++++++++++++++++
 arch/x86/kernel/signal.c           | 10 +---------
 4 files changed, 22 insertions(+), 20 deletions(-)
 create mode 100644 arch/x86/include/asm/sighandling.h

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 65577698cab2..25d80f3faf2e 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -12,10 +12,8 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
-#include <linux/signal.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
-#include <linux/ptrace.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/personality.h>
@@ -31,16 +29,10 @@
 #include <asm/proto.h>
 #include <asm/vdso.h>
 #include <asm/sigframe.h>
+#include <asm/sighandling.h>
 #include <asm/sys_ia32.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
-			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
-			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
-			 X86_EFLAGS_CF)
-
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+#define FIX_EFLAGS	__FIX_EFLAGS
 
 int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 {
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 35664547125b..dcfde52979c3 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -145,7 +145,6 @@ extern unsigned long
 convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
new file mode 100644
index 000000000000..843e299e120e
--- /dev/null
+++ b/arch/x86/include/asm/sighandling.h
@@ -0,0 +1,19 @@
+#ifndef _ASM_X86_SIGHANDLING_H
+#define _ASM_X86_SIGHANDLING_H
+
+#include <linux/compiler.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+
+#include <asm/processor-flags.h>
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+#define __FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
+			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+			 X86_EFLAGS_CF)
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 46a01bdc27e2..c432dc0e65f0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -10,10 +10,8 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
-#include <linux/signal.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
-#include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
@@ -26,6 +24,7 @@
 #include <asm/i387.h>
 #include <asm/vdso.h>
 #include <asm/mce.h>
+#include <asm/sighandling.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
@@ -37,13 +36,6 @@
 
 #include <asm/sigframe.h>
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS	(X86_EFLAGS_AC | X86_EFLAGS_OF | \
-			 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
-			 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
-			 X86_EFLAGS_CF)
-
 #ifdef CONFIG_X86_32
 # define FIX_EFLAGS	(__FIX_EFLAGS | X86_EFLAGS_RF)
 #else
-- 
cgit v1.2.3


From 851394229e79c11b0b5b74c509817848e9a80564 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 07:43:09 -0800
Subject: x32: Export setup/restore_sigcontext from signal.c

Export setup_sigcontext() and restore_sigcontext() from signal.c, so
we can use the 64-bit versions verbatim for x32.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/sighandling.h |  5 +++++
 arch/x86/kernel/signal.c           | 10 ++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 843e299e120e..ada93b3b8c66 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -16,4 +16,9 @@
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+		       unsigned long *pax);
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+		     struct pt_regs *regs, unsigned long mask);
+
 #endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index c432dc0e65f0..450fb255f877 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -60,9 +60,8 @@
 	regs->seg = GET_SEG(seg) | 3;			\
 } while (0)
 
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
-		   unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+		       unsigned long *pax)
 {
 	void __user *buf;
 	unsigned int tmpflags;
@@ -117,9 +116,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	return err;
 }
 
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
-		 struct pt_regs *regs, unsigned long mask)
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+		     struct pt_regs *regs, unsigned long mask)
 {
 	int err = 0;
 
-- 
cgit v1.2.3


From 4048e2a8d4b491a69bf47ceda12cc0c0b924f6b8 Mon Sep 17 00:00:00 2001
From: "H. J. Lu" <hjl.tools@gmail.com>
Date: Sun, 19 Feb 2012 07:46:08 -0800
Subject: x32: Add struct ucontext_x32

Add a definition for struct ucontext_x32; this is inherently a mix of
the 32- and 64-bit versions.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/ia32.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 1f7e62517284..c6435ab1cc13 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -43,6 +43,15 @@ struct ucontext_ia32 {
 	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 
+struct ucontext_x32 {
+	unsigned int	  uc_flags;
+	unsigned int 	  uc_link;
+	stack_ia32_t	  uc_stack;
+	unsigned int	  uc__pad0;     /* needed for alignment */
+	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
+	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
+};
+
 /* This matches struct stat64 in glibc2.2, hence the absolutely
  * insane amounts of padding around dev_t's.
  */
-- 
cgit v1.2.3


From 9d3897630e14b3d33bcb24a3c0fa9d60a01d3058 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 07:50:12 -0800
Subject: x32: Add rt_sigframe_x32

Add rt_sigframe_x32 to <asm/sigframe.h>.  Unfortunately we can't just
define all the data structures unconditionally, due to the #ifdef
CONFIG_COMPAT in <linux/compat.h> and its trickle-down effects, hence
the #ifdef mess.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/sigframe.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 4e0fe26d27d3..7c7c27c97daa 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -59,12 +59,25 @@ struct rt_sigframe_ia32 {
 #endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
 
 #ifdef CONFIG_X86_64
+
 struct rt_sigframe {
 	char __user *pretcode;
 	struct ucontext uc;
 	struct siginfo info;
 	/* fp state follows here */
 };
+
+#ifdef CONFIG_X86_X32_ABI
+
+struct rt_sigframe_x32 {
+	u64 pretcode;
+	struct ucontext_x32 uc;
+	compat_siginfo_t info;
+	/* fp state follows here */
+};
+
+#endif /* CONFIG_X86_X32_ABI */
+
 #endif /* CONFIG_X86_64 */
 
 #endif /* _ASM_X86_SIGFRAME_H */
-- 
cgit v1.2.3


From fca460f95e928bae373daa8295877b6905bc62b8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 07:56:26 -0800
Subject: x32: Handle the x32 system call flag

x32 shares most system calls with x86-64, but unfortunately some
subsystem (the input subsystem is the chief offender) which require
is_compat() when operating with a 32-bit userspace.  The input system
actually has text files in sysfs whose meaning is dependent on
sizeof(long) in userspace!

We could solve this by having two completely disjoint system call
tables; requiring that each system call be duplicated.  This patch
takes a different approach: we add a flag to the system call number;
this flag doesn't affect the system call dispatch but requests compat
treatment from affected subsystems for the duration of the system call.

The change of cmpq to cmpl is safe since it immediately follows the
and.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/compat.h  | 13 +++++++++++--
 arch/x86/include/asm/syscall.h |  5 +++--
 arch/x86/include/asm/unistd.h  |  7 +++++++
 arch/x86/kernel/entry_64.S     | 10 ++++++++++
 4 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 30d737ef2a42..7938b84e4506 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <asm/user32.h>
+#include <asm/unistd.h>
 
 #define COMPAT_USER_HZ		100
 #define COMPAT_UTS_MACHINE	"i686\0\0"
@@ -212,9 +213,17 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	return (void __user *)regs->sp - len;
 }
 
-static inline int is_compat_task(void)
+static inline bool is_compat_task(void)
 {
-	return current_thread_info()->status & TS_COMPAT;
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->status & TS_COMPAT)
+		return true;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+	if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
+		return true;
+#endif
+	return false;
 }
 
 #endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d962e5652a73..386b78686c4d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <asm/asm-offsets.h>	/* For NR_syscalls */
+#include <asm/unistd.h>
 
 extern const unsigned long sys_call_table[];
 
@@ -26,13 +27,13 @@ extern const unsigned long sys_call_table[];
  */
 static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
 {
-	return regs->orig_ax;
+	return regs->orig_ax & __SYSCALL_MASK;
 }
 
 static inline void syscall_rollback(struct task_struct *task,
 				    struct pt_regs *regs)
 {
-	regs->ax = regs->orig_ax;
+	regs->ax = regs->orig_ax & __SYSCALL_MASK;
 }
 
 static inline long syscall_get_error(struct task_struct *task,
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 7a48a5557470..37cdc9d99bb1 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -5,6 +5,13 @@
 #define __X32_SYSCALL_BIT	0x40000000
 
 #ifdef __KERNEL__
+
+# ifdef CONFIG_X86_X32_ABI
+#  define __SYSCALL_MASK (~(__X32_SYSCALL_BIT))
+# else
+#  define __SYSCALL_MASK (~0)
+# endif
+
 # ifdef CONFIG_X86_32
 
 #  include <asm/unistd_32.h>
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..a17b34216971 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -482,7 +482,12 @@ GLOBAL(system_call_after_swapgs)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz tracesys
 system_call_fastpath:
+#if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
+#else
+	andl $__SYSCALL_MASK,%eax
+	cmpl $__NR_syscall_max,%eax
+#endif
 	ja badsys
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
@@ -596,7 +601,12 @@ tracesys:
 	 */
 	LOAD_ARGS ARGOFFSET, 1
 	RESTORE_REST
+#if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
+#else
+	andl $__SYSCALL_MASK,%eax
+	cmpl $__NR_syscall_max,%eax
+#endif
 	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
-- 
cgit v1.2.3


From a96d692e9a559980f269f81c9b0b094220382186 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 14:02:46 -0800
Subject: x86: Add #ifdef CONFIG_COMPAT to <asm/sys_ia32.h>

Unfortunately a lot of the compat types are guarded with CONFIG_COMPAT
or the equivalent, so add a similar guard to <asm/sys_ia32.h> to avoid
compilation failures when CONFIG_COMPAT=n.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/sys_ia32.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 68da87bfb5a3..3fda9db48819 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_X86_SYS_IA32_H
 #define _ASM_X86_SYS_IA32_H
 
+#ifdef CONFIG_COMPAT
+
 #include <linux/compiler.h>
 #include <linux/linkage.h>
 #include <linux/types.h>
@@ -81,4 +83,7 @@ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
 
 asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
 				    const char __user *);
+
+#endif /* CONFIG_COMPAT */
+
 #endif /* _ASM_X86_SYS_IA32_H */
-- 
cgit v1.2.3


From c5a373942bbc41698724fc948c74f959f73407e5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 09:41:09 -0800
Subject: x32: Signal-related system calls

x32 uses the 64-bit signal frame format, obviously, but there are some
structures which mixes that with pointers or sizeof(long) types, as
such we have to create a handful of system calls specific to x32.  By
and large these are a mixture of the 64-bit and the compat system
calls.

Originally-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/entry_64.S |  19 ++++++++
 arch/x86/kernel/signal.c   | 118 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 136 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a17b34216971..53dc821f0a62 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -746,6 +746,25 @@ ENTRY(stub_rt_sigreturn)
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
 
+#ifdef CONFIG_X86_X32_ABI
+	PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
+
+ENTRY(stub_x32_rt_sigreturn)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	movq %rsp,%rdi
+	FIXUP_TOP_OF_STACK %r11
+	call sys32_x32_rt_sigreturn
+	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+#endif
+
 /*
  * Build the entry stubs and pointer table with some assembler magic.
  * We pack 7 stubs into a single 32-byte chunk, which will fit in a
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 450fb255f877..c3846b6fb726 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -29,6 +29,7 @@
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
+#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
@@ -632,6 +633,16 @@ static int signr_convert(int sig)
 #define is_ia32	0
 #endif /* CONFIG_IA32_EMULATION */
 
+#ifdef CONFIG_X86_X32_ABI
+#define is_x32	test_thread_flag(TIF_X32)
+
+static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
+			      siginfo_t *info, compat_sigset_t *set,
+			      struct pt_regs *regs);
+#else /* !CONFIG_X86_X32_ABI */
+#define is_x32	0
+#endif /* CONFIG_X86_X32_ABI */
+
 int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		sigset_t *set, struct pt_regs *regs);
 int ia32_setup_frame(int sig, struct k_sigaction *ka,
@@ -656,8 +667,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
 		else
 			ret = ia32_setup_frame(usig, ka, set, regs);
-	} else
+#ifdef CONFIG_X86_X32_ABI
+	} else if (is_x32) {
+		ret = x32_setup_rt_frame(usig, ka, info,
+					 (compat_sigset_t *)set, regs);
+#endif
+	} else {
 		ret = __setup_rt_frame(sig, ka, info, set, regs);
+	}
 
 	if (ret) {
 		force_sigsegv(sig, current);
@@ -840,3 +857,102 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 
 	force_sig(SIGSEGV, me);
 }
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
+			      siginfo_t *info, compat_sigset_t *set,
+			      struct pt_regs *regs)
+{
+	struct rt_sigframe_x32 __user *frame;
+	void __user *restorer;
+	int err = 0;
+	void __user *fpstate = NULL;
+
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+
+	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+		return -EFAULT;
+
+	if (ka->sa.sa_flags & SA_SIGINFO) {
+		if (copy_siginfo_to_user32(&frame->info, info))
+			return -EFAULT;
+	}
+
+	put_user_try {
+		/* Create the ucontext.  */
+		if (cpu_has_xsave)
+			put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+		else
+			put_user_ex(0, &frame->uc.uc_flags);
+		put_user_ex(0, &frame->uc.uc_link);
+		put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+		put_user_ex(sas_ss_flags(regs->sp),
+			    &frame->uc.uc_stack.ss_flags);
+		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+		put_user_ex(0, &frame->uc.uc__pad0);
+		err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+					regs, set->sig[0]);
+		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+		if (ka->sa.sa_flags & SA_RESTORER) {
+			restorer = ka->sa.sa_restorer;
+		} else {
+			/* could use a vstub here */
+			restorer = NULL;
+			err |= -EFAULT;
+		}
+		put_user_ex(restorer, &frame->pretcode);
+	} put_user_catch(err);
+
+	if (err)
+		return -EFAULT;
+
+	/* Set up registers for signal handler */
+	regs->sp = (unsigned long) frame;
+	regs->ip = (unsigned long) ka->sa.sa_handler;
+
+	/* We use the x32 calling convention here... */
+	regs->di = sig;
+	regs->si = (unsigned long) &frame->info;
+	regs->dx = (unsigned long) &frame->uc;
+
+	loadsegment(ds, __USER_DS);
+	loadsegment(es, __USER_DS);
+
+	regs->cs = __USER_CS;
+	regs->ss = __USER_DS;
+
+	return 0;
+}
+
+asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
+{
+	struct rt_sigframe_x32 __user *frame;
+	sigset_t set;
+	unsigned long ax;
+	struct pt_regs tregs;
+
+	frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+		goto badframe;
+	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+		goto badframe;
+
+	sigdelsetmask(&set, ~_BLOCKABLE);
+	set_current_blocked(&set);
+
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+		goto badframe;
+
+	tregs = *regs;
+	if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
+		goto badframe;
+
+	return ax;
+
+badframe:
+	signal_fault(regs, frame, "x32 rt_sigreturn");
+	return 0;
+}
+#endif
-- 
cgit v1.2.3


From d1a797f388d6d30fa502915d1b9937ed758b7137 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 10:06:34 -0800
Subject: x32: Handle process creation

Allow an x32 process to be started.

Originally-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/compat.h    | 26 ++++++++++++++++++++++++--
 arch/x86/include/asm/elf.h       | 25 +++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event.c |  4 +++-
 arch/x86/kernel/entry_64.S       | 15 +++++++++++++++
 arch/x86/kernel/process_64.c     | 23 ++++++++++++++++-------
 5 files changed, 79 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 7938b84e4506..e7f68b49c01a 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -6,6 +6,7 @@
  */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <asm/processor.h>
 #include <asm/user32.h>
 #include <asm/unistd.h>
 
@@ -187,7 +188,20 @@ struct compat_shmid64_ds {
 /*
  * The type of struct elf_prstatus.pr_reg in compatible core dumps.
  */
+#ifdef CONFIG_X86_X32_ABI
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216)
+#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296)
+#define SET_PR_FPVALID(S,V) \
+  do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \
+  while (0)
+
+#define COMPAT_USE_64BIT_TIME \
+	(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
+#else
 typedef struct user_regs_struct32 compat_elf_gregset_t;
+#endif
 
 /*
  * A pointer passed in from user mode. This should not
@@ -209,8 +223,16 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 
 static inline void __user *arch_compat_alloc_user_space(long len)
 {
-	struct pt_regs *regs = task_pt_regs(current);
-	return (void __user *)regs->sp - len;
+	compat_uptr_t sp;
+
+	if (test_thread_flag(TIF_IA32)) {
+		sp = task_pt_regs(current)->sp;
+	} else {
+		/* -128 for the x32 ABI redzone */
+		sp = percpu_read(old_rsp) - 128;
+	}
+
+	return (void __user *)round_down(sp - len, 16);
 }
 
 static inline bool is_compat_task(void)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 410fa6a219f6..83aabea95dd7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -156,7 +156,12 @@ do {						\
 #define elf_check_arch(x)			\
 	((x)->e_machine == EM_X86_64)
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64)
+
+#if __USER32_DS != __USER_DS
+# error "The following code assumes __USER32_DS == __USER_DS"
+#endif
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
@@ -179,8 +184,9 @@ static inline void elf_common_init(struct thread_struct *t,
 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
 #define compat_start_thread start_thread_ia32
 
-void set_personality_ia32(void);
-#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32()
+void set_personality_ia32(bool);
+#define COMPAT_SET_PERSONALITY(ex)			\
+	set_personality_ia32((ex).e_machine == EM_X86_64)
 
 #define COMPAT_ELF_PLATFORM			("i686")
 
@@ -296,9 +302,20 @@ do {									\
 			    (unsigned long)current->mm->context.vdso);	\
 } while (0)
 
+#define ARCH_DLINFO_X32							\
+do {									\
+	if (vdso_enabled)						\
+		NEW_AUX_ENT(AT_SYSINFO_EHDR,				\
+			    (unsigned long)current->mm->context.vdso);	\
+} while (0)
+
 #define AT_SYSINFO		32
 
-#define COMPAT_ARCH_DLINFO	ARCH_DLINFO_IA32(sysctl_vsyscall32)
+#define COMPAT_ARCH_DLINFO						\
+if (test_thread_flag(TIF_X32))						\
+	ARCH_DLINFO_X32;						\
+else									\
+	ARCH_DLINFO_IA32(sysctl_vsyscall32)
 
 #define COMPAT_ELF_ET_DYN_BASE	(TASK_UNMAPPED_BASE + 0x1000000)
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..63c0e058a405 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -28,7 +28,6 @@
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
-#include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
 
@@ -1595,6 +1594,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_COMPAT
+
+#include <asm/compat.h>
+
 static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 53dc821f0a62..9e036f0ce5e0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -763,6 +763,21 @@ ENTRY(stub_x32_rt_sigreturn)
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
 
+ENTRY(stub_x32_execve)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	movq %rsp, %rcx
+	call sys32_execve
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_x32_execve)
+
 #endif
 
 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5fe2fbaa56ba..a0701da2bd18 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -364,7 +364,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
-			    __USER32_CS, __USER32_DS, __USER32_DS);
+			    test_thread_flag(TIF_X32)
+			    ? __USER_CS : __USER32_CS,
+			    __USER_DS, __USER_DS);
 }
 #endif
 
@@ -508,6 +510,7 @@ void set_personality_64bit(void)
 
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
+	clear_thread_flag(TIF_X32);
 	clear_thread_flag(TIF_ADDR32);
 	clear_thread_flag(TIF_X32);
 
@@ -522,22 +525,28 @@ void set_personality_64bit(void)
 	current->personality &= ~READ_IMPLIES_EXEC;
 }
 
-void set_personality_ia32(void)
+void set_personality_ia32(bool x32)
 {
 	/* inherit personality from parent */
 
 	/* Make sure to be in 32bit mode */
-	set_thread_flag(TIF_IA32);
 	set_thread_flag(TIF_ADDR32);
-	clear_thread_flag(TIF_X32);
-	current->personality |= force_personality32;
 
 	/* Mark the associated mm as containing 32-bit tasks. */
 	if (current->mm)
 		current->mm->context.ia32_compat = 1;
 
-	/* Prepare the first "return" to user space */
-	current_thread_info()->status |= TS_COMPAT;
+	if (x32) {
+		clear_thread_flag(TIF_IA32);
+		set_thread_flag(TIF_X32);
+		current->personality &= ~READ_IMPLIES_EXEC;
+	} else {
+		set_thread_flag(TIF_IA32);
+		clear_thread_flag(TIF_X32);
+		current->personality |= force_personality32;
+		/* Prepare the first "return" to user space */
+		current_thread_info()->status |= TS_COMPAT;
+	}
 }
 
 unsigned long get_wchan(struct task_struct *p)
-- 
cgit v1.2.3


From a06c9bc0647f66df0534fb887ddf6cddd35f426c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 19 Feb 2012 11:08:37 -0800
Subject: x32: If configured, add x32 system calls to system call tables

If CONFIG_X86_X32_ABI is defined, add the x32 system calls to the
system call tables.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/asm-offsets_64.c | 6 +++++-
 arch/x86/kernel/syscall_64.c     | 7 ++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index c3354f7b0a06..1b4754f82ba7 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -2,7 +2,11 @@
 
 #define __SYSCALL_64(nr, sym, compat) [nr] = 1,
 #define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_X32(nr, sym, compat) /* Not yet */
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
 static char syscalls_64[] = {
 #include <asm/syscalls_64.h>
 };
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 26c4ca1f20e8..5c7f8c20da74 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -6,7 +6,12 @@
 #include <asm/asm-offsets.h>
 
 #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#define __SYSCALL_X32(nr, sym, compat) /* Not yet */
+
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
 
 #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
 #include <asm/syscalls_64.h>
-- 
cgit v1.2.3


From 5fd92e65a68b813667bc8739f5fa463e5bfcd66d Mon Sep 17 00:00:00 2001
From: "H. J. Lu" <hjl.tools@gmail.com>
Date: Sun, 19 Feb 2012 10:40:03 -0800
Subject: x32: Allow x32 to be configured

At this point, one should be able to build an x32 kernel.

Note that for now we depend on CONFIG_IA32_EMULATION.  Long term, x32
and IA32 should be detangled.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..c9d6c9ed27e5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2165,9 +2165,9 @@ config IA32_EMULATION
 	depends on X86_64
 	select COMPAT_BINFMT_ELF
 	---help---
-	  Include code to run 32-bit programs under a 64-bit kernel. You should
-	  likely turn this on, unless you're 100% sure that you don't have any
-	  32-bit programs left.
+	  Include code to run legacy 32-bit programs under a
+	  64-bit kernel. You should likely turn this on, unless you're
+	  100% sure that you don't have any 32-bit programs left.
 
 config IA32_AOUT
 	tristate "IA32 a.out support"
@@ -2175,9 +2175,22 @@ config IA32_AOUT
 	---help---
 	  Support old a.out binaries in the 32bit emulation.
 
+config X86_X32_ABI
+	bool "x32 ABI for 64-bit mode (EXPERIMENTAL)"
+	depends on X86_64 && IA32_EMULATION && EXPERIMENTAL
+	---help---
+	  Include code to run binaries for the x32 native 32-bit ABI
+	  for 64-bit processors.  An x32 process gets access to the
+	  full 64-bit register file and wide data path while leaving
+	  pointers at 32 bits for smaller memory footprint.
+
+	  You will need a recent binutils (2.22 or later) with
+	  elf32_x86_64 support enabled to compile a kernel with this
+	  option set.
+
 config COMPAT
 	def_bool y
-	depends on IA32_EMULATION
+	depends on IA32_EMULATION || X86_X32_ABI
 
 config COMPAT_FOR_U64_ALIGNMENT
 	def_bool COMPAT
-- 
cgit v1.2.3


From 1a21d4e095ef720abf81299000afc038206d571b Mon Sep 17 00:00:00 2001
From: "H. J. Lu" <hjl.tools@gmail.com>
Date: Sun, 19 Feb 2012 11:38:06 -0800
Subject: x32: Add x32 VDSO support

Add support for the x32 VDSO.  The x32 VDSO takes advantage of the
similarity between the x86-64 and the x32 ABIs to contain the same
content, only the container is different, as the x32 VDSO obviously is
an x32 shared object.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/vdso/.gitignore     |  2 ++
 arch/x86/vdso/Makefile       | 46 +++++++++++++++++++++++++-
 arch/x86/vdso/vdso32-setup.c |  6 ++++
 arch/x86/vdso/vdsox32.S      | 22 +++++++++++++
 arch/x86/vdso/vdsox32.lds.S  | 32 ++++++++++++++++++
 arch/x86/vdso/vma.c          | 78 +++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 177 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/vdso/vdsox32.S
 create mode 100644 arch/x86/vdso/vdsox32.lds.S

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
index 60274d5746e1..3282874bc61d 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/vdso/.gitignore
@@ -1,5 +1,7 @@
 vdso.lds
 vdso-syms.lds
+vdsox32.lds
+vdsox32-syms.lds
 vdso32-syms.lds
 vdso32-syscall-syms.lds
 vdso32-sysenter-syms.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 5d179502a52c..fd14be1d1472 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -3,21 +3,29 @@
 #
 
 VDSO64-$(CONFIG_X86_64)		:= y
+VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
 VDSO32-$(CONFIG_COMPAT)		:= y
 
 vdso-install-$(VDSO64-y)	+= vdso.so
+vdso-install-$(VDSOX32-y)	+= vdsox32.so
 vdso-install-$(VDSO32-y)	+= $(vdso32-images)
 
 
 # files to link into the vdso
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
 
+vobjs-$(VDSOX32-y) += $(vobjx32s-compat)
+
+# Filter out x32 objects.
+vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y))
+
 # files to link into kernel
 obj-$(VDSO64-y)			+= vma.o vdso.o
+obj-$(VDSOX32-y)		+= vdsox32.o
 obj-$(VDSO32-y)			+= vdso32.o vdso32-setup.o
 
-vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+vobjs := $(foreach F,$(vobj64s),$(obj)/$F)
 
 $(obj)/vdso.o: $(obj)/vdso.so
 
@@ -72,6 +80,42 @@ endef
 $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
 	$(call if_changed,vdsosym)
 
+#
+# X32 processes use x32 vDSO to access 64bit kernel data.
+#
+# Build x32 vDSO image:
+# 1. Compile x32 vDSO as 64bit.
+# 2. Convert object files to x32.
+# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes
+# so that it can reach 64bit address space with 64bit pointers.
+#
+
+targets += vdsox32-syms.lds
+obj-$(VDSOX32-y)		+= vdsox32-syms.lds
+
+CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
+			   -Wl,-soname=linux-vdso.so.1 \
+			   -Wl,-z,max-page-size=4096 \
+			   -Wl,-z,common-page-size=4096
+
+vobjx32s-y := $(vobj64s:.o=-x32.o)
+vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+
+# Convert 64bit object file to x32 for x32 vDSO.
+quiet_cmd_x32 = X32     $@
+      cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
+
+$(obj)/%-x32.o: $(obj)/%.o FORCE
+	$(call if_changed,x32)
+
+targets += vdsox32.so vdsox32.so.dbg vdsox32.lds $(vobjx32s-y)
+
+$(obj)/vdsox32.o: $(src)/vdsox32.S $(obj)/vdsox32.so
+
+$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
+	$(call if_changed,vdso)
+
 #
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 468d591dde31..01b8a0df5e0e 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -317,6 +317,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	int ret = 0;
 	bool compat;
 
+#ifdef CONFIG_X86_X32_ABI
+	extern int x32_setup_additional_pages(struct linux_binprm *, int);
+	if (test_thread_flag(TIF_X32))
+		return x32_setup_additional_pages (bprm, uses_interp);
+#endif
+
 	if (vdso_enabled == VDSO_DISABLED)
 		return 0;
 
diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S
new file mode 100644
index 000000000000..d6b9a7f42a8a
--- /dev/null
+++ b/arch/x86/vdso/vdsox32.S
@@ -0,0 +1,22 @@
+#include <asm/page_types.h>
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+__PAGE_ALIGNED_DATA
+
+	.globl vdsox32_start, vdsox32_end
+	.align PAGE_SIZE
+vdsox32_start:
+	.incbin "arch/x86/vdso/vdsox32.so"
+vdsox32_end:
+	.align PAGE_SIZE /* extra data here leaks to userspace. */
+
+.previous
+
+	.globl vdsox32_pages
+	.bss
+	.align 8
+	.type vdsox32_pages, @object
+vdsox32_pages:
+	.zero (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
+	.size vdsox32_pages, .-vdsox32_pages
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
new file mode 100644
index 000000000000..373ca9a02a53
--- /dev/null
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -0,0 +1,32 @@
+/*
+ * Linker script for x32 vDSO.
+ * We #include the file to define the layout details.
+ * Here we only choose the prelinked virtual address.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.  We can define local symbols here called VDSO* to make their
+ * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ */
+
+#define VDSO_PRELINK 0
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+	LINUX_2.6 {
+	global:
+		clock_gettime;
+		__vdso_clock_gettime;
+		gettimeofday;
+		__vdso_gettimeofday;
+		getcpu;
+		__vdso_getcpu;
+		time;
+		__vdso_time;
+	local: *;
+	};
+}
+
+VDSOX32_PRELINK = VDSO_PRELINK;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c35b75..1bbcc6205ace 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -24,7 +24,44 @@ extern unsigned short vdso_sync_cpuid;
 extern struct page *vdso_pages[];
 static unsigned vdso_size;
 
-static void __init patch_vdso(void *vdso, size_t len)
+#ifdef CONFIG_X86_X32_ABI
+extern char vdsox32_start[], vdsox32_end[];
+extern struct page *vdsox32_pages[];
+static unsigned vdsox32_size;
+
+static void __init patch_vdsox32(void *vdso, size_t len)
+{
+	Elf32_Ehdr *hdr = vdso;
+	Elf32_Shdr *sechdrs, *alt_sec = 0;
+	char *secstrings;
+	void *alt_data;
+	int i;
+
+	BUG_ON(len < sizeof(Elf32_Ehdr));
+	BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
+
+	sechdrs = (void *)hdr + hdr->e_shoff;
+	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		Elf32_Shdr *shdr = &sechdrs[i];
+		if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
+			alt_sec = shdr;
+			goto found;
+		}
+	}
+
+	/* If we get here, it's probably a bug. */
+	pr_warning("patch_vdsox32: .altinstructions not found\n");
+	return;  /* nothing to patch */
+
+found:
+	alt_data = (void *)hdr + alt_sec->sh_offset;
+	apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
+}
+#endif
+
+static void __init patch_vdso64(void *vdso, size_t len)
 {
 	Elf64_Ehdr *hdr = vdso;
 	Elf64_Shdr *sechdrs, *alt_sec = 0;
@@ -47,7 +84,7 @@ static void __init patch_vdso(void *vdso, size_t len)
 	}
 
 	/* If we get here, it's probably a bug. */
-	pr_warning("patch_vdso: .altinstructions not found\n");
+	pr_warning("patch_vdso64: .altinstructions not found\n");
 	return;  /* nothing to patch */
 
 found:
@@ -60,12 +97,20 @@ static int __init init_vdso(void)
 	int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
 	int i;
 
-	patch_vdso(vdso_start, vdso_end - vdso_start);
+	patch_vdso64(vdso_start, vdso_end - vdso_start);
 
 	vdso_size = npages << PAGE_SHIFT;
 	for (i = 0; i < npages; i++)
 		vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
 
+#ifdef CONFIG_X86_X32_ABI
+	patch_vdsox32(vdsox32_start, vdsox32_end - vdsox32_start);
+	npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE;
+	vdsox32_size = npages << PAGE_SHIFT;
+	for (i = 0; i < npages; i++)
+		vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
+#endif
+
 	return 0;
 }
 subsys_initcall(init_vdso);
@@ -103,7 +148,10 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 
 /* Setup a VMA at program startup for the vsyscall page.
    Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+static int setup_additional_pages(struct linux_binprm *bprm,
+				  int uses_interp,
+				  struct page **pages,
+				  unsigned size)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
@@ -113,8 +161,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, vdso_size);
-	addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
+	addr = vdso_addr(mm->start_stack, size);
+	addr = get_unmapped_area(NULL, addr, size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
@@ -122,11 +170,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	current->mm->context.vdso = (void *)addr;
 
-	ret = install_special_mapping(mm, addr, vdso_size,
+	ret = install_special_mapping(mm, addr, size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
-				      vdso_pages);
+				      pages);
 	if (ret) {
 		current->mm->context.vdso = NULL;
 		goto up_fail;
@@ -137,6 +185,20 @@ up_fail:
 	return ret;
 }
 
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	return setup_additional_pages (bprm, uses_interp, vdso_pages,
+				       vdso_size);
+}
+
+#ifdef CONFIG_X86_X32_ABI
+int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	return setup_additional_pages (bprm, uses_interp, vdsox32_pages,
+				       vdsox32_size);
+}
+#endif
+
 static __init int vdso_setup(char *s)
 {
 	vdso_enabled = simple_strtoul(s, NULL, 0);
-- 
cgit v1.2.3


From a38449ef596b345e13a8f9b7d5cd9fedb8fcf921 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Feb 2012 15:29:34 -0500
Subject: x86: Specify a size for the cmp in the NMI handler

Linus noticed that the cmp used to check if the code segment is
__KERNEL_CS or not did not specify a size. Perhaps it does not matter
as H. Peter Anvin noted that user space can not set the bottom two
bits of the %cs register. But it's best not to let the assembly choose
and change things between different versions of gas, but instead just
pick the size.

Four bytes are used to compare the saved code segment against
__KERNEL_CS. Perhaps this might mess up Xen, but we can fix that when
the time comes.

Also I noticed that there was another non-specified cmp that checks
the special stack variable if it is 1 or 0. This too probably doesn't
matter what cmp is used, but this patch uses cmpl just to make it non
ambiguous.

Link: http://lkml.kernel.org/r/CA+55aFxfAn9MWRgS3O5k2tqN5ys1XrhSFVO5_9ZAoZKDVgNfGA@mail.gmail.com

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index debd851de6ff..1333d9851778 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1535,14 +1535,14 @@ ENTRY(nmi)
 	 * If %cs was not the kernel segment, then the NMI triggered in user
 	 * space, which means it is definitely not nested.
 	 */
-	cmp $__KERNEL_CS, 16(%rsp)
+	cmpl $__KERNEL_CS, 16(%rsp)
 	jne first_nmi
 
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.
 	 */
-	cmp $1, -8(%rsp)
+	cmpl $1, -8(%rsp)
 	je nested_nmi
 
 	/*
-- 
cgit v1.2.3


From 27e74da9800289e69ba907777df1e2085231eff7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 20 Feb 2012 19:34:10 -0800
Subject: i387: export 'fpu_owner_task' per-cpu variable

(And define it properly for x86-32, which had its 'current_task'
declaration in separate from x86-64)

Bitten by my dislike for modules on the machines I use, and the fact
that apparently nobody else actually wanted to test the patches I sent
out.

Snif. Nobody else cares.

Anyway, we probably should uninline the 'kernel_fpu_begin()' function
that is what modules actually use and that references this, but this is
the minimal fix for now.

Reported-by: Josh Boyer <jwboyer@gmail.com>
Reported-and-tested-by: Jongman Heo <jongman.heo@samsung.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/common.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b667148dfad7..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1045,6 +1045,7 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 /*
  * Special IST stacks which the CPU switches to when it calls
@@ -1113,6 +1114,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
-- 
cgit v1.2.3


From 8546c008924d5fd1724fa698eaa92b414bafd50d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 21 Feb 2012 10:25:45 -0800
Subject: i387: Uninline the generic FP helpers that we expose to kernel
 modules

Instead of exporting the very low-level internals of the FPU state
save/restore code (ie things like 'fpu_owner_task'), we should export
the higher-level interfaces.

Inlining these things is pointless anyway: sure, sometimes the end
result is small, but while 'stts()' can result in just three x86
instructions, those are not cheap instructions (writing %cr0 is a
serializing instruction and a very slow one at that).

So the overhead of a function call is not noticeable, and we really
don't want random modules mucking about with our internal state save
logic anyway.

So this unexports 'fpu_owner_task', and instead uninlines and exports
the actual functions that modules can use: fpu_kernel_begin/end() and
unlazy_fpu().

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211339590.5354@i5.linux-foundation.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/i387.h  | 78 +++---------------------------------------
 arch/x86/kernel/cpu/common.c |  2 --
 arch/x86/kernel/i387.c       | 80 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 76 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 247904945d3f..0c1031d354f2 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -419,70 +419,9 @@ static inline void __clear_fpu(struct task_struct *tsk)
 	}
 }
 
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * We can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- */
-static inline bool interrupted_kernel_fpu_idle(void)
-{
-	return !__thread_has_fpu(current) &&
-		(read_cr0() & X86_CR0_TS);
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static inline bool interrupted_user_mode(void)
-{
-	struct pt_regs *regs = get_irq_regs();
-	return regs && user_mode_vm(regs);
-}
-
-/*
- * Can we use the FPU in kernel mode with the
- * whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
- */
-static inline bool irq_fpu_usable(void)
-{
-	return !in_interrupt() ||
-		interrupted_user_mode() ||
-		interrupted_kernel_fpu_idle();
-}
-
-static inline void kernel_fpu_begin(void)
-{
-	struct task_struct *me = current;
-
-	WARN_ON_ONCE(!irq_fpu_usable());
-	preempt_disable();
-	if (__thread_has_fpu(me)) {
-		__save_init_fpu(me);
-		__thread_clear_has_fpu(me);
-		/* We do 'stts()' in kernel_fpu_end() */
-	} else {
-		percpu_write(fpu_owner_task, NULL);
-		clts();
-	}
-}
-
-static inline void kernel_fpu_end(void)
-{
-	stts();
-	preempt_enable();
-}
+extern bool irq_fpu_usable(void);
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
 
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
@@ -566,16 +505,7 @@ static inline void save_init_fpu(struct task_struct *tsk)
 	preempt_enable();
 }
 
-static inline void unlazy_fpu(struct task_struct *tsk)
-{
-	preempt_disable();
-	if (__thread_has_fpu(tsk)) {
-		__save_init_fpu(tsk);
-		__thread_fpu_end(tsk);
-	} else
-		tsk->fpu_counter = 0;
-	preempt_enable();
-}
+extern void unlazy_fpu(struct task_struct *tsk);
 
 static inline void clear_fpu(struct task_struct *tsk)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c0f7d68d318f..cb71b01ab66e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1045,7 +1045,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 /*
  * Special IST stacks which the CPU switches to when it calls
@@ -1115,7 +1114,6 @@ void debug_stack_reset(void)
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 739d8598f789..17b7549c4134 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -32,6 +32,86 @@
 # define user32_fxsr_struct	user_fxsr_struct
 #endif
 
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * We can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ */
+static inline bool interrupted_kernel_fpu_idle(void)
+{
+	return !__thread_has_fpu(current) &&
+		(read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static inline bool interrupted_user_mode(void)
+{
+	struct pt_regs *regs = get_irq_regs();
+	return regs && user_mode_vm(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+	return !in_interrupt() ||
+		interrupted_user_mode() ||
+		interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void kernel_fpu_begin(void)
+{
+	struct task_struct *me = current;
+
+	WARN_ON_ONCE(!irq_fpu_usable());
+	preempt_disable();
+	if (__thread_has_fpu(me)) {
+		__save_init_fpu(me);
+		__thread_clear_has_fpu(me);
+		/* We do 'stts()' in kernel_fpu_end() */
+	} else {
+		percpu_write(fpu_owner_task, NULL);
+		clts();
+	}
+}
+EXPORT_SYMBOL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	stts();
+	preempt_enable();
+}
+EXPORT_SYMBOL(kernel_fpu_end);
+
+void unlazy_fpu(struct task_struct *tsk)
+{
+	preempt_disable();
+	if (__thread_has_fpu(tsk)) {
+		__save_init_fpu(tsk);
+		__thread_fpu_end(tsk);
+	} else
+		tsk->fpu_counter = 0;
+	preempt_enable();
+}
+EXPORT_SYMBOL(unlazy_fpu);
+
 #ifdef CONFIG_MATH_EMULATION
 # define HAVE_HWFP		(boot_cpu_data.hard_math)
 #else
-- 
cgit v1.2.3


From 1361b83a13d4d92e53fbb6c877528713e118b821 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 21 Feb 2012 13:19:22 -0800
Subject: i387: Split up <asm/i387.h> into exported and internal interfaces

While various modules include <asm/i387.h> to get access to things we
actually *intend* for them to use, most of that header file was really
pretty low-level internal stuff that we really don't want to expose to
others.

So split the header file into two: the small exported interfaces remain
in <asm/i387.h>, while the internal definitions that are only used by
core architecture code are now in <asm/fpu-internal.h>.

The guiding principle for this was to expose functions that we export to
modules, and leave them in <asm/i387.h>, while stuff that is used by
task switching or was marked GPL-only is in <asm/fpu-internal.h>.

The fpu-internal.h file could be further split up too, especially since
arch/x86/kvm/ uses some of the remaining stuff for its module.  But that
kvm usage should probably be abstracted out a bit, and at least now the
internal FPU accessor functions are much more contained.  Even if it
isn't perhaps as contained as it _could_ be.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211340330.5354@i5.linux-foundation.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/ia32/ia32_signal.c         |   1 +
 arch/x86/include/asm/fpu-internal.h | 520 ++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/i387.h         | 512 +----------------------------------
 arch/x86/kernel/cpu/common.c        |   1 +
 arch/x86/kernel/i387.c              |   3 +-
 arch/x86/kernel/process.c           |   1 +
 arch/x86/kernel/process_32.c        |   1 +
 arch/x86/kernel/process_64.c        |   1 +
 arch/x86/kernel/ptrace.c            |   1 +
 arch/x86/kernel/signal.c            |   1 +
 arch/x86/kernel/traps.c             |   1 +
 arch/x86/kernel/xsave.c             |   1 +
 arch/x86/kvm/vmx.c                  |   2 +-
 arch/x86/kvm/x86.c                  |   1 +
 arch/x86/power/cpu.c                |   1 +
 15 files changed, 540 insertions(+), 508 deletions(-)
 create mode 100644 arch/x86/include/asm/fpu-internal.h

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 65577698cab2..5563ba1cf513 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,6 +24,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/ptrace.h>
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
new file mode 100644
index 000000000000..4fa88154e4de
--- /dev/null
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -0,0 +1,520 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _FPU_INTERNAL_H
+#define _FPU_INTERNAL_H
+
+#include <linux/kernel_stat.h>
+#include <linux/regset.h>
+#include <linux/slab.h>
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/uaccess.h>
+#include <asm/xsave.h>
+
+extern unsigned int sig_xstate_size;
+extern void fpu_init(void);
+
+DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
+
+extern user_regset_active_fn fpregs_active, xfpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+				xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+				 xstateregs_set;
+
+
+/*
+ * xstateregs_active == fpregs_active. Please refer to the comment
+ * at the definition of fpregs_active.
+ */
+#define xstateregs_active	fpregs_active
+
+extern struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+extern unsigned int sig_xstate_ia32_size;
+extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
+struct _fpstate_ia32;
+struct _xstate_ia32;
+extern int save_i387_xstate_ia32(void __user *buf);
+extern int restore_i387_xstate_ia32(void __user *buf);
+#endif
+
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
+#define X87_FSW_ES (1 << 7)	/* Exception Summary */
+
+static __always_inline __pure bool use_xsaveopt(void)
+{
+	return static_cpu_has(X86_FEATURE_XSAVEOPT);
+}
+
+static __always_inline __pure bool use_xsave(void)
+{
+	return static_cpu_has(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+        return static_cpu_has(X86_FEATURE_FXSR);
+}
+
+extern void __sanitize_i387_state(struct task_struct *);
+
+static inline void sanitize_i387_state(struct task_struct *tsk)
+{
+	if (!use_xsaveopt())
+		return;
+	__sanitize_i387_state(tsk);
+}
+
+#ifdef CONFIG_X86_64
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+{
+	int err;
+
+	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxrstorq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : [fx] "m" (*fx), "0" (0));
+#else
+	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err)
+		     : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
+	return err;
+}
+
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
+{
+	int err;
+
+	/*
+	 * Clear the bytes not touched by the fxsave and reserved
+	 * for the SW usage.
+	 */
+	err = __clear_user(&fx->sw_reserved,
+			   sizeof(struct _fpx_sw_bytes));
+	if (unlikely(err))
+		return -EFAULT;
+
+	/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+	asm volatile("1:  fxsaveq %[fx]\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err), [fx] "=m" (*fx)
+		     : "0" (0));
+#else
+	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
+		     "2:\n"
+		     ".section .fixup,\"ax\"\n"
+		     "3:  movl $-1,%[err]\n"
+		     "    jmp  2b\n"
+		     ".previous\n"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [err] "=r" (err), "=m" (*fx)
+		     : [fx] "R" (fx), "0" (0));
+#endif
+	if (unlikely(err) &&
+	    __clear_user(fx, sizeof(struct i387_fxsave_struct)))
+		err = -EFAULT;
+	/* No need to clear here because the caller clears USED_MATH */
+	return err;
+}
+
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+	/* Using "rex64; fxsave %0" is broken because, if the memory operand
+	   uses any extended registers for addressing, a second REX prefix
+	   will be generated (to the assembler, rex64 followed by semicolon
+	   is a separate instruction), and hence the 64-bitness is lost. */
+
+#ifdef CONFIG_AS_FXSAVEQ
+	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
+	   starting with gas 2.16. */
+	__asm__ __volatile__("fxsaveq %0"
+			     : "=m" (fpu->state->fxsave));
+#else
+	/* Using, as a workaround, the properly prefixed form below isn't
+	   accepted by any binutils version so far released, complaining that
+	   the same type of prefix is used twice if an extended register is
+	   needed for addressing (fix submitted to mainline 2005-11-21).
+	asm volatile("rex64/fxsave %0"
+		     : "=m" (fpu->state->fxsave));
+	   This, however, we can work around by forcing the compiler to select
+	   an addressing mode that doesn't require extended registers. */
+	asm volatile("rex64/fxsave (%[fx])"
+		     : "=m" (fpu->state->fxsave)
+		     : [fx] "R" (&fpu->state->fxsave));
+#endif
+}
+
+#else  /* CONFIG_X86_32 */
+
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+{
+	/*
+	 * The "nop" is needed to make the instructions the same
+	 * length.
+	 */
+	alternative_input(
+		"nop ; frstor %1",
+		"fxrstor %1",
+		X86_FEATURE_FXSR,
+		"m" (*fx));
+
+	return 0;
+}
+
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+	asm volatile("fxsave %[fx]"
+		     : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif	/* CONFIG_X86_64 */
+
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact.
+ */
+static inline int fpu_save_init(struct fpu *fpu)
+{
+	if (use_xsave()) {
+		fpu_xsave(fpu);
+
+		/*
+		 * xsave header may indicate the init state of the FP.
+		 */
+		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+			return 1;
+	} else if (use_fxsr()) {
+		fpu_fxsave(fpu);
+	} else {
+		asm volatile("fnsave %[fx]; fwait"
+			     : [fx] "=m" (fpu->state->fsave));
+		return 0;
+	}
+
+	/*
+	 * If exceptions are pending, we need to clear them so
+	 * that we don't randomly get exceptions later.
+	 *
+	 * FIXME! Is this perhaps only true for the old-style
+	 * irq13 case? Maybe we could leave the x87 state
+	 * intact otherwise?
+	 */
+	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
+		asm volatile("fnclex");
+		return 0;
+	}
+	return 1;
+}
+
+static inline int __save_init_fpu(struct task_struct *tsk)
+{
+	return fpu_save_init(&tsk->thread.fpu);
+}
+
+static inline int fpu_fxrstor_checking(struct fpu *fpu)
+{
+	return fxrstor_checking(&fpu->state->fxsave);
+}
+
+static inline int fpu_restore_checking(struct fpu *fpu)
+{
+	if (use_xsave())
+		return fpu_xrstor_checking(fpu);
+	else
+		return fpu_fxrstor_checking(fpu);
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+	   values. "m" is a random variable that should be in L1 */
+	alternative_input(
+		ASM_NOP8 ASM_NOP2,
+		"emms\n\t"		/* clear stack tags */
+		"fildl %P[addr]",	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (tsk->thread.fpu.has_fpu));
+
+	return fpu_restore_checking(&tsk->thread.fpu);
+}
+
+/*
+ * Software FPU state helpers. Careful: these need to
+ * be preemption protection *and* they need to be
+ * properly paired with the CR0.TS changes!
+ */
+static inline int __thread_has_fpu(struct task_struct *tsk)
+{
+	return tsk->thread.fpu.has_fpu;
+}
+
+/* Must be paired with an 'stts' after! */
+static inline void __thread_clear_has_fpu(struct task_struct *tsk)
+{
+	tsk->thread.fpu.has_fpu = 0;
+	percpu_write(fpu_owner_task, NULL);
+}
+
+/* Must be paired with a 'clts' before! */
+static inline void __thread_set_has_fpu(struct task_struct *tsk)
+{
+	tsk->thread.fpu.has_fpu = 1;
+	percpu_write(fpu_owner_task, tsk);
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void __thread_fpu_end(struct task_struct *tsk)
+{
+	__thread_clear_has_fpu(tsk);
+	stts();
+}
+
+static inline void __thread_fpu_begin(struct task_struct *tsk)
+{
+	clts();
+	__thread_set_has_fpu(tsk);
+}
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_fpu_prepare() saves the old state and
+ *    sets the new state of the CR0.TS bit. This is
+ *    done within the context of the old process.
+ *
+ *  - switch_fpu_finish() restores the new state as
+ *    necessary.
+ */
+typedef struct { int preload; } fpu_switch_t;
+
+/*
+ * FIXME! We could do a totally lazy restore, but we need to
+ * add a per-cpu "this was the task that last touched the FPU
+ * on this CPU" variable, and the task needs to have a "I last
+ * touched the FPU on this CPU" and check them.
+ *
+ * We don't do that yet, so "fpu_lazy_restore()" always returns
+ * false, but some day..
+ */
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+	return new == percpu_read_stable(fpu_owner_task) &&
+		cpu == new->thread.fpu.last_cpu;
+}
+
+static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
+{
+	fpu_switch_t fpu;
+
+	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
+	if (__thread_has_fpu(old)) {
+		if (!__save_init_fpu(old))
+			cpu = ~0;
+		old->thread.fpu.last_cpu = cpu;
+		old->thread.fpu.has_fpu = 0;	/* But leave fpu_owner_task! */
+
+		/* Don't change CR0.TS if we just switch! */
+		if (fpu.preload) {
+			new->fpu_counter++;
+			__thread_set_has_fpu(new);
+			prefetch(new->thread.fpu.state);
+		} else
+			stts();
+	} else {
+		old->fpu_counter = 0;
+		old->thread.fpu.last_cpu = ~0;
+		if (fpu.preload) {
+			new->fpu_counter++;
+			if (fpu_lazy_restore(new, cpu))
+				fpu.preload = 0;
+			else
+				prefetch(new->thread.fpu.state);
+			__thread_fpu_begin(new);
+		}
+	}
+	return fpu;
+}
+
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
+{
+	if (fpu.preload) {
+		if (unlikely(restore_fpu_checking(new)))
+			__thread_fpu_end(new);
+	}
+}
+
+/*
+ * Signal frame handlers...
+ */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
+
+static inline void __clear_fpu(struct task_struct *tsk)
+{
+	if (__thread_has_fpu(tsk)) {
+		/* Ignore delayed exceptions from user space */
+		asm volatile("1: fwait\n"
+			     "2:\n"
+			     _ASM_EXTABLE(1b, 2b));
+		__thread_fpu_end(tsk);
+	}
+}
+
+/*
+ * The actual user_fpu_begin/end() functions
+ * need to be preemption-safe.
+ *
+ * NOTE! user_fpu_end() must be used only after you
+ * have saved the FP state, and user_fpu_begin() must
+ * be used only immediately before restoring it.
+ * These functions do not do any save/restore on
+ * their own.
+ */
+static inline void user_fpu_end(void)
+{
+	preempt_disable();
+	__thread_fpu_end(current);
+	preempt_enable();
+}
+
+static inline void user_fpu_begin(void)
+{
+	preempt_disable();
+	if (!user_has_fpu())
+		__thread_fpu_begin(current);
+	preempt_enable();
+}
+
+/*
+ * These disable preemption on their own and are safe
+ */
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+	WARN_ON_ONCE(!__thread_has_fpu(tsk));
+	preempt_disable();
+	__save_init_fpu(tsk);
+	__thread_fpu_end(tsk);
+	preempt_enable();
+}
+
+static inline void clear_fpu(struct task_struct *tsk)
+{
+	preempt_disable();
+	__clear_fpu(tsk);
+	preempt_enable();
+}
+
+/*
+ * i387 state interaction
+ */
+static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
+{
+	if (cpu_has_fxsr) {
+		return tsk->thread.fpu.state->fxsave.cwd;
+	} else {
+		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
+	}
+}
+
+static inline unsigned short get_fpu_swd(struct task_struct *tsk)
+{
+	if (cpu_has_fxsr) {
+		return tsk->thread.fpu.state->fxsave.swd;
+	} else {
+		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
+	}
+}
+
+static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
+{
+	if (cpu_has_xmm) {
+		return tsk->thread.fpu.state->fxsave.mxcsr;
+	} else {
+		return MXCSR_DEFAULT;
+	}
+}
+
+static bool fpu_allocated(struct fpu *fpu)
+{
+	return fpu->state != NULL;
+}
+
+static inline int fpu_alloc(struct fpu *fpu)
+{
+	if (fpu_allocated(fpu))
+		return 0;
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	if (!fpu->state)
+		return -ENOMEM;
+	WARN_ON((unsigned long)fpu->state & 15);
+	return 0;
+}
+
+static inline void fpu_free(struct fpu *fpu)
+{
+	if (fpu->state) {
+		kmem_cache_free(task_xstate_cachep, fpu->state);
+		fpu->state = NULL;
+	}
+}
+
+static inline void fpu_copy(struct fpu *dst, struct fpu *src)
+{
+	memcpy(dst->state, src->state, xstate_size);
+}
+
+extern void fpu_finit(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0c1031d354f2..7ce0798b1b26 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -13,411 +13,15 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/sched.h>
-#include <linux/kernel_stat.h>
-#include <linux/regset.h>
 #include <linux/hardirq.h>
-#include <linux/slab.h>
-#include <asm/asm.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/uaccess.h>
-#include <asm/xsave.h>
+#include <asm/system.h>
+
+struct pt_regs;
+struct user_i387_struct;
 
-extern unsigned int sig_xstate_size;
-extern void fpu_init(void);
-extern void mxcsr_feature_mask_init(void);
 extern int init_fpu(struct task_struct *child);
-extern void math_state_restore(void);
 extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
-
-DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
-
-extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
-				xstateregs_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
-				 xstateregs_set;
-
-/*
- * xstateregs_active == fpregs_active. Please refer to the comment
- * at the definition of fpregs_active.
- */
-#define xstateregs_active	fpregs_active
-
-extern struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-extern unsigned int sig_xstate_ia32_size;
-extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
-struct _fpstate_ia32;
-struct _xstate_ia32;
-extern int save_i387_xstate_ia32(void __user *buf);
-extern int restore_i387_xstate_ia32(void __user *buf);
-#endif
-
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-#define X87_FSW_ES (1 << 7)	/* Exception Summary */
-
-static __always_inline __pure bool use_xsaveopt(void)
-{
-	return static_cpu_has(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
-	return static_cpu_has(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
-        return static_cpu_has(X86_FEATURE_FXSR);
-}
-
-extern void __sanitize_i387_state(struct task_struct *);
-
-static inline void sanitize_i387_state(struct task_struct *tsk)
-{
-	if (!use_xsaveopt())
-		return;
-	__sanitize_i387_state(tsk);
-}
-
-#ifdef CONFIG_X86_64
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
-	int err;
-
-	/* See comment in fxsave() below. */
-#ifdef CONFIG_AS_FXSAVEQ
-	asm volatile("1:  fxrstorq %[fx]\n\t"
-		     "2:\n"
-		     ".section .fixup,\"ax\"\n"
-		     "3:  movl $-1,%[err]\n"
-		     "    jmp  2b\n"
-		     ".previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [err] "=r" (err)
-		     : [fx] "m" (*fx), "0" (0));
-#else
-	asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
-		     "2:\n"
-		     ".section .fixup,\"ax\"\n"
-		     "3:  movl $-1,%[err]\n"
-		     "    jmp  2b\n"
-		     ".previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [err] "=r" (err)
-		     : [fx] "R" (fx), "m" (*fx), "0" (0));
-#endif
-	return err;
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
-	int err;
-
-	/*
-	 * Clear the bytes not touched by the fxsave and reserved
-	 * for the SW usage.
-	 */
-	err = __clear_user(&fx->sw_reserved,
-			   sizeof(struct _fpx_sw_bytes));
-	if (unlikely(err))
-		return -EFAULT;
-
-	/* See comment in fxsave() below. */
-#ifdef CONFIG_AS_FXSAVEQ
-	asm volatile("1:  fxsaveq %[fx]\n\t"
-		     "2:\n"
-		     ".section .fixup,\"ax\"\n"
-		     "3:  movl $-1,%[err]\n"
-		     "    jmp  2b\n"
-		     ".previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [err] "=r" (err), [fx] "=m" (*fx)
-		     : "0" (0));
-#else
-	asm volatile("1:  rex64/fxsave (%[fx])\n\t"
-		     "2:\n"
-		     ".section .fixup,\"ax\"\n"
-		     "3:  movl $-1,%[err]\n"
-		     "    jmp  2b\n"
-		     ".previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [err] "=r" (err), "=m" (*fx)
-		     : [fx] "R" (fx), "0" (0));
-#endif
-	if (unlikely(err) &&
-	    __clear_user(fx, sizeof(struct i387_fxsave_struct)))
-		err = -EFAULT;
-	/* No need to clear here because the caller clears USED_MATH */
-	return err;
-}
-
-static inline void fpu_fxsave(struct fpu *fpu)
-{
-	/* Using "rex64; fxsave %0" is broken because, if the memory operand
-	   uses any extended registers for addressing, a second REX prefix
-	   will be generated (to the assembler, rex64 followed by semicolon
-	   is a separate instruction), and hence the 64-bitness is lost. */
-
-#ifdef CONFIG_AS_FXSAVEQ
-	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
-	   starting with gas 2.16. */
-	__asm__ __volatile__("fxsaveq %0"
-			     : "=m" (fpu->state->fxsave));
-#else
-	/* Using, as a workaround, the properly prefixed form below isn't
-	   accepted by any binutils version so far released, complaining that
-	   the same type of prefix is used twice if an extended register is
-	   needed for addressing (fix submitted to mainline 2005-11-21).
-	asm volatile("rex64/fxsave %0"
-		     : "=m" (fpu->state->fxsave));
-	   This, however, we can work around by forcing the compiler to select
-	   an addressing mode that doesn't require extended registers. */
-	asm volatile("rex64/fxsave (%[fx])"
-		     : "=m" (fpu->state->fxsave)
-		     : [fx] "R" (&fpu->state->fxsave));
-#endif
-}
-
-#else  /* CONFIG_X86_32 */
-
-/* perform fxrstor iff the processor has extended states, otherwise frstor */
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
-	/*
-	 * The "nop" is needed to make the instructions the same
-	 * length.
-	 */
-	alternative_input(
-		"nop ; frstor %1",
-		"fxrstor %1",
-		X86_FEATURE_FXSR,
-		"m" (*fx));
-
-	return 0;
-}
-
-static inline void fpu_fxsave(struct fpu *fpu)
-{
-	asm volatile("fxsave %[fx]"
-		     : [fx] "=m" (fpu->state->fxsave));
-}
-
-#endif	/* CONFIG_X86_64 */
-
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact.
- */
-static inline int fpu_save_init(struct fpu *fpu)
-{
-	if (use_xsave()) {
-		fpu_xsave(fpu);
-
-		/*
-		 * xsave header may indicate the init state of the FP.
-		 */
-		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
-			return 1;
-	} else if (use_fxsr()) {
-		fpu_fxsave(fpu);
-	} else {
-		asm volatile("fnsave %[fx]; fwait"
-			     : [fx] "=m" (fpu->state->fsave));
-		return 0;
-	}
-
-	/*
-	 * If exceptions are pending, we need to clear them so
-	 * that we don't randomly get exceptions later.
-	 *
-	 * FIXME! Is this perhaps only true for the old-style
-	 * irq13 case? Maybe we could leave the x87 state
-	 * intact otherwise?
-	 */
-	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
-		asm volatile("fnclex");
-		return 0;
-	}
-	return 1;
-}
-
-static inline int __save_init_fpu(struct task_struct *tsk)
-{
-	return fpu_save_init(&tsk->thread.fpu);
-}
-
-static inline int fpu_fxrstor_checking(struct fpu *fpu)
-{
-	return fxrstor_checking(&fpu->state->fxsave);
-}
-
-static inline int fpu_restore_checking(struct fpu *fpu)
-{
-	if (use_xsave())
-		return fpu_xrstor_checking(fpu);
-	else
-		return fpu_fxrstor_checking(fpu);
-}
-
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. "m" is a random variable that should be in L1 */
-	alternative_input(
-		ASM_NOP8 ASM_NOP2,
-		"emms\n\t"	  	/* clear stack tags */
-		"fildl %P[addr]",	/* set F?P to defined value */
-		X86_FEATURE_FXSAVE_LEAK,
-		[addr] "m" (tsk->thread.fpu.has_fpu));
-
-	return fpu_restore_checking(&tsk->thread.fpu);
-}
-
-/*
- * Software FPU state helpers. Careful: these need to
- * be preemption protection *and* they need to be
- * properly paired with the CR0.TS changes!
- */
-static inline int __thread_has_fpu(struct task_struct *tsk)
-{
-	return tsk->thread.fpu.has_fpu;
-}
-
-/* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 0;
-	percpu_write(fpu_owner_task, NULL);
-}
-
-/* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 1;
-	percpu_write(fpu_owner_task, tsk);
-}
-
-/*
- * Encapsulate the CR0.TS handling together with the
- * software flag.
- *
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void __thread_fpu_end(struct task_struct *tsk)
-{
-	__thread_clear_has_fpu(tsk);
-	stts();
-}
-
-static inline void __thread_fpu_begin(struct task_struct *tsk)
-{
-	clts();
-	__thread_set_has_fpu(tsk);
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- *  - switch_fpu_prepare() saves the old state and
- *    sets the new state of the CR0.TS bit. This is
- *    done within the context of the old process.
- *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
- */
-typedef struct { int preload; } fpu_switch_t;
-
-/*
- * FIXME! We could do a totally lazy restore, but we need to
- * add a per-cpu "this was the task that last touched the FPU
- * on this CPU" variable, and the task needs to have a "I last
- * touched the FPU on this CPU" and check them.
- *
- * We don't do that yet, so "fpu_lazy_restore()" always returns
- * false, but some day..
- */
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-	return new == percpu_read_stable(fpu_owner_task) &&
-		cpu == new->thread.fpu.last_cpu;
-}
-
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
-{
-	fpu_switch_t fpu;
-
-	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
-	if (__thread_has_fpu(old)) {
-		if (!__save_init_fpu(old))
-			cpu = ~0;
-		old->thread.fpu.last_cpu = cpu;
-		old->thread.fpu.has_fpu = 0;	/* But leave fpu_owner_task! */
-
-		/* Don't change CR0.TS if we just switch! */
-		if (fpu.preload) {
-			new->fpu_counter++;
-			__thread_set_has_fpu(new);
-			prefetch(new->thread.fpu.state);
-		} else
-			stts();
-	} else {
-		old->fpu_counter = 0;
-		old->thread.fpu.last_cpu = ~0;
-		if (fpu.preload) {
-			new->fpu_counter++;
-			if (fpu_lazy_restore(new, cpu))
-				fpu.preload = 0;
-			else
-				prefetch(new->thread.fpu.state);
-			__thread_fpu_begin(new);
-		}
-	}
-	return fpu;
-}
-
-/*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
- */
-static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
-{
-	if (fpu.preload) {
-		if (unlikely(restore_fpu_checking(new)))
-			__thread_fpu_end(new);
-	}
-}
-
-/*
- * Signal frame handlers...
- */
-extern int save_i387_xstate(void __user *buf);
-extern int restore_i387_xstate(void __user *buf);
-
-static inline void __clear_fpu(struct task_struct *tsk)
-{
-	if (__thread_has_fpu(tsk)) {
-		/* Ignore delayed exceptions from user space */
-		asm volatile("1: fwait\n"
-			     "2:\n"
-			     _ASM_EXTABLE(1b, 2b));
-		__thread_fpu_end(tsk);
-	}
-}
+extern void math_state_restore(void);
 
 extern bool irq_fpu_usable(void);
 extern void kernel_fpu_begin(void);
@@ -463,118 +67,14 @@ static inline void irq_ts_restore(int TS_state)
  * we can just assume we have FPU access - typically
  * to save the FP state - we'll just take a #NM
  * fault and get the FPU access back.
- *
- * The actual user_fpu_begin/end() functions
- * need to be preemption-safe, though.
- *
- * NOTE! user_fpu_end() must be used only after you
- * have saved the FP state, and user_fpu_begin() must
- * be used only immediately before restoring it.
- * These functions do not do any save/restore on
- * their own.
  */
 static inline int user_has_fpu(void)
 {
-	return __thread_has_fpu(current);
-}
-
-static inline void user_fpu_end(void)
-{
-	preempt_disable();
-	__thread_fpu_end(current);
-	preempt_enable();
-}
-
-static inline void user_fpu_begin(void)
-{
-	preempt_disable();
-	if (!user_has_fpu())
-		__thread_fpu_begin(current);
-	preempt_enable();
-}
-
-/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-	WARN_ON_ONCE(!__thread_has_fpu(tsk));
-	preempt_disable();
-	__save_init_fpu(tsk);
-	__thread_fpu_end(tsk);
-	preempt_enable();
+	return current->thread.fpu.has_fpu;
 }
 
 extern void unlazy_fpu(struct task_struct *tsk);
 
-static inline void clear_fpu(struct task_struct *tsk)
-{
-	preempt_disable();
-	__clear_fpu(tsk);
-	preempt_enable();
-}
-
-/*
- * i387 state interaction
- */
-static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.cwd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
-	}
-}
-
-static inline unsigned short get_fpu_swd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.swd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
-	}
-}
-
-static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
-{
-	if (cpu_has_xmm) {
-		return tsk->thread.fpu.state->fxsave.mxcsr;
-	} else {
-		return MXCSR_DEFAULT;
-	}
-}
-
-static bool fpu_allocated(struct fpu *fpu)
-{
-	return fpu->state != NULL;
-}
-
-static inline int fpu_alloc(struct fpu *fpu)
-{
-	if (fpu_allocated(fpu))
-		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
-	if (!fpu->state)
-		return -ENOMEM;
-	WARN_ON((unsigned long)fpu->state & 15);
-	return 0;
-}
-
-static inline void fpu_free(struct fpu *fpu)
-{
-	if (fpu->state) {
-		kmem_cache_free(task_xstate_cachep, fpu->state);
-		fpu->state = NULL;
-	}
-}
-
-static inline void fpu_copy(struct fpu *dst, struct fpu *src)
-{
-	memcpy(dst->state, src->state, xstate_size);
-}
-
-extern void fpu_finit(struct fpu *fpu);
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb71b01ab66e..89620b1725d4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -28,6 +28,7 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 17b7549c4134..7734bcbb5a3a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,6 +16,7 @@
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/user.h>
 
 #ifdef CONFIG_X86_64
@@ -124,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size);
 unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
 static struct i387_fxsave_struct fx_scratch __cpuinitdata;
 
-void __cpuinit mxcsr_feature_mask_init(void)
+static void __cpuinit mxcsr_feature_mask_init(void)
 {
 	unsigned long mask = 0;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..c38d84e01022 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -21,6 +21,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
 
 struct kmem_cache *task_xstate_cachep;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c08d1ff12b7c..ee32dee7a0a3 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -45,6 +45,7 @@
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index cfa5c90c01db..5bad3c71e48f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -43,6 +43,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 50267386b766..78f05e438be5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -27,6 +27,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 46a01bdc27e2..25edcfc9ba5b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -24,6 +24,7 @@
 #include <asm/processor.h>
 #include <asm/ucontext.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/vdso.h>
 #include <asm/mce.h>
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bbe04d96744..ec61d4c1b93b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,6 +54,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mce.h>
 
 #include <asm/mach_traps.h>
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 711091114119..e62728e30b01 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -6,6 +6,7 @@
 #include <linux/bootmem.h>
 #include <linux/compat.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #ifdef CONFIG_IA32_EMULATION
 #include <asm/sigcontext32.h>
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3b4c8d8ad906..246490f643b6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-	if (__thread_has_fpu(current))
+	if (user_has_fpu())
 		clts();
 	load_gdt(&__get_cpu_var(host_gdt));
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cbfc0698118..b937b6179d80 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -57,6 +57,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h> /* Ugh! */
 #include <asm/xcr.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index f10c0afa1cb4..4889655ba784 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -20,6 +20,7 @@
 #include <asm/xcr.h>
 #include <asm/suspend.h>
 #include <asm/debugreg.h>
+#include <asm/fpu-internal.h> /* pcntxt_mask */
 
 #ifdef CONFIG_X86_32
 static struct saved_context saved_context;
-- 
cgit v1.2.3


From 22e842d4d90ffec9677cc114487a5cefd39b5643 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 21 Feb 2012 14:32:19 -0800
Subject: x32: Fix coding style violations in the x32 VDSO code

Move the prototype for x32_setup_additional_pages() to a header file,
and adjust the coding style to match standard.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
---
 arch/x86/include/asm/elf.h   | 2 ++
 arch/x86/vdso/vdso32-setup.c | 3 +--
 arch/x86/vdso/vma.c          | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83aabea95dd7..1e40634591a4 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -331,6 +331,8 @@ struct linux_binprm;
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int uses_interp);
+extern int x32_setup_additional_pages(struct linux_binprm *bprm,
+				      int uses_interp);
 
 extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 01b8a0df5e0e..10f9f59477db 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -318,9 +318,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	bool compat;
 
 #ifdef CONFIG_X86_X32_ABI
-	extern int x32_setup_additional_pages(struct linux_binprm *, int);
 	if (test_thread_flag(TIF_X32))
-		return x32_setup_additional_pages (bprm, uses_interp);
+		return x32_setup_additional_pages(bprm, uses_interp);
 #endif
 
 	if (vdso_enabled == VDSO_DISABLED)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 1bbcc6205ace..d7dce1dbf8c9 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -187,15 +187,15 @@ up_fail:
 
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages (bprm, uses_interp, vdso_pages,
-				       vdso_size);
+	return setup_additional_pages(bprm, uses_interp, vdso_pages,
+				      vdso_size);
 }
 
 #ifdef CONFIG_X86_X32_ABI
 int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages (bprm, uses_interp, vdsox32_pages,
-				       vdsox32_size);
+	return setup_additional_pages(bprm, uses_interp, vdsox32_pages,
+				      vdsox32_size);
 }
 #endif
 
-- 
cgit v1.2.3


From 513c4ec6e4759aa33c90af0658b82eb4d2027871 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 21 Feb 2012 17:25:50 -0800
Subject: x86, cpufeature: Add CPU features from Intel document 319433-012A

Add CPU features from the Intel Archicture Instruction Set Extensions
Programming Reference version 012A (Feb 2012), document number 319433-012A.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 8d67d428b0f9..0d3dcc9cbab6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -199,10 +199,13 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
 #define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE		(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_SMEP	(9*32+ 7) /* Supervisor Mode Execution Protection */
 #define X86_FEATURE_BMI2	(9*32+ 8) /* 2nd group bit manipulation extensions */
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID	(9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
-- 
cgit v1.2.3


From b0e5c77903fd717cc5eb02b7b8f5de3c869efc49 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 6 Feb 2012 18:32:20 -0800
Subject: x86/tsc: Reduce the TSC sync check time for core-siblings

For each logical CPU that is coming online, we spend 20msec for
checking the TSC synchronization. And as this is done
sequentially for each logical CPU boot, this time gets added up
depending on the number of logical CPU's supported by the
platform.

Minimize this by using the socket topology information.

If the target CPU coming online doesn't have any of its
core-siblings online, a timeout of 20msec will be used for the
TSC-warp measurement loop. Otherwise a smaller timeout of 2msec
will be used, as we have some information about this socket
already (and this information grows as we have more and more
logical-siblings in that socket).

Ideally we should be able to skip the TSC sync check on the
other core-siblings, if the first logical CPU in a socket passed
the sync test. But as the TSC is per-logical CPU and can
potentially be modified wrongly by the bios before the OS boot,
TSC sync test for smaller duration should be able to catch such
errors. Also this will catch the condition where all the cores
in the socket doesn't get reset at the same time.

For example, with this modification, time spent in TSC sync
checks on a 4 socket 10-core with HT system gets reduced from
1580msec to 212msec.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jack Steiner <steiner@sgi.com>
Cc: venki@google.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1328581940.29790.20.camel@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_sync.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b46cb7..fc25e60a5884 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
 /*
  * TSC-warp measurement loop running on both CPUs:
  */
-static __cpuinit void check_tsc_warp(void)
+static __cpuinit void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void)
 	start = get_cycles();
 	rdtsc_barrier();
 	/*
-	 * The measurement runs for 20 msecs:
+	 * The measurement runs for 'timeout' msecs:
 	 */
-	end = start + tsc_khz * 20ULL;
+	end = start + (cycles_t) tsc_khz * timeout;
 	now = start;
 
 	for (i = 0; ; i++) {
@@ -98,6 +98,25 @@ static __cpuinit void check_tsc_warp(void)
 			now-start, end-start);
 }
 
+/*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket doesn't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+}
+
 /*
  * Source CPU calls into this - it waits for the freshly booted
  * target CPU to arrive and then starts the measurement:
@@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
 	 */
 	atomic_inc(&start_count);
 
-	check_tsc_warp();
+	check_tsc_warp(loop_timeout(cpu));
 
 	while (atomic_read(&stop_count) != cpus-1)
 		cpu_relax();
@@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void)
 	while (atomic_read(&start_count) != cpus)
 		cpu_relax();
 
-	check_tsc_warp();
+	check_tsc_warp(loop_timeout(smp_processor_id()));
 
 	/*
 	 * Ok, we are done:
-- 
cgit v1.2.3


From 3f806e50981825fa56a7f1938f24c0680816be45 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Fri, 3 Feb 2012 20:18:01 +0100
Subject: x86/mce/AMD: Fix UP build error

141168c36cde ("x86: Simplify code by removing a !SMP #ifdefs
from 'struct cpuinfo_x86'") removed a bunch of CONFIG_SMP ifdefs
around code touching struct cpuinfo_x86 members but also caused
the following build error with Randy's randconfigs:

mce_amd.c:(.cpuinit.text+0x4723): undefined reference to `cpu_llc_shared_map'

Restore the #ifdef in threshold_create_bank() which creates
symlinks on the non-BSP CPUs.

There's a better patch series being worked on by Kevin Winchester
which will solve this in a cleaner fashion, but that series is
too ambitious for v3.3 merging - so we first queue up this trivial
fix and then do the rest for v3.4.

Signed-off-by: Borislav Petkov <bp@alien8.de>
Acked-by: Kevin Winchester <kjwinchester@gmail.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Nick Bowler <nbowler@elliptictech.com>
Link: http://lkml.kernel.org/r/20120203191801.GA2846@x1.osrc.amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 	sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
 		i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 		goto out;
 	}
+#endif
 
 	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
 	if (!b) {
-- 
cgit v1.2.3


From 140f190bc3a3b6f200548d204befd998eadd63fd Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Wed, 22 Feb 2012 10:06:44 -0800
Subject: x86: Remove some noise from boot log when starting cpus

Printing the "start_ip" for every secondary cpu is very noisy on a large
system - and doesn't add any value. Drop this message.

Console log before:
Booting Node   0, Processors  #1
smpboot cpu 1: start_ip = 96000
 #2
smpboot cpu 2: start_ip = 96000
 #3
smpboot cpu 3: start_ip = 96000
 #4
smpboot cpu 4: start_ip = 96000
       ...
 #31
smpboot cpu 31: start_ip = 96000
Brought up 32 CPUs

Console log after:
Booting Node   0, Processors  #1 #2 #3 #4 #5 #6 #7 Ok.
Booting Node   1, Processors  #8 #9 #10 #11 #12 #13 #14 #15 Ok.
Booting Node   0, Processors  #16 #17 #18 #19 #20 #21 #22 #23 Ok.
Booting Node   1, Processors  #24 #25 #26 #27 #28 #29 #30 #31
Brought up 32 CPUs

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/4f452eb42507460426@agluck-desktop.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..683575250a65 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -740,8 +740,6 @@ do_rest:
 	 * the targeted processor.
 	 */
 
-	printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
-
 	atomic_set(&init_deasserted, 0);
 
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
-- 
cgit v1.2.3


From d6126ef5f31ca54980cb067af659a360dfcca037 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 26 Jan 2012 15:49:14 -0800
Subject: x86/mce: Convert static array of pointers to per-cpu variables

When I previously fixed up the mce_device code, I used a static array of
the pointers.  It was (rightfully) pointed out to me that I should be
using the per_cpu code instead.

This patch converts the code over to that structure, moving the variable
back into the per_cpu area, like it used to be for 3.2 and earlier.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: https://lkml.org/lkml/2012/1/27/165
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/include/asm/mce.h           | 2 +-
 arch/x86/kernel/cpu/mcheck/mce.c     | 8 ++++----
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6aefb14cbbc5..441520e4174f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-extern struct device *mce_device[CONFIG_NR_CPUS];
+DECLARE_PER_CPU(struct device *, mce_device);
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..4979a5dfeba2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = {
 	.dev_name	= "machinecheck",
 };
 
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = dev;
+	per_cpu(mce_device, cpu) = dev;
 
 	return 0;
 error2:
@@ -2055,7 +2055,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = mce_device[cpu];
+	struct device *dev = per_cpu(mce_device, cpu);
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = NULL;
+	per_cpu(mce_device, cpu) = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..a4bf9d23cdba 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
-	struct device *dev = mce_device[cpu];
+	struct device *dev = per_cpu(mce_device, cpu);
 	char name[32];
 
 	sprintf(name, "threshold_bank%i", bank);
@@ -585,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		dev = mce_device[i];
+		dev = per_cpu(mce_device, i);
 		if (dev)
 			err = sysfs_create_link(&dev->kobj,b->kobj, name);
 		if (err)
@@ -665,7 +665,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&mce_device[cpu]->kobj, name);
+		dev = per_cpu(mce_device, cpu);
+		sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -677,7 +678,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		dev = mce_device[i];
+		dev = per_cpu(mce_device, i);
 		if (dev)
 			sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
-- 
cgit v1.2.3


From fadd85f16a8ec3fee8af599e79a209682dc52348 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Mon, 23 Jan 2012 15:54:52 -0500
Subject: x86/mce: Fix return value of mce_chrdev_read() when erst is disabled

Current kernel MCE code reads ERST at the first reading of /dev/mcelog
(maybe in starting mcelogd,) even if the system does not support ERST,
which results in a fake "no such device" message (as described in [1].)
This problem is not critical, but can confuse system admins.
This patch fixes it by filtering the return value from lower (ACPI) layer.

 [1] http://thread.gmane.org/gmane.linux.kernel/1060250

Reported by: Jon Masters <jonathan@jonmasters.org>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Huang Ying <ying.huang@intel.com>
Link: https://lkml.org/lkml/2012/1/23/299
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4979a5dfeba2..87c56ba8080c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1541,6 +1541,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
 	/* Error or no more MCE record */
 	if (rc <= 0) {
 		mce_apei_read_done = 1;
+		/*
+		 * When ERST is disabled, mce_chrdev_read() should return
+		 * "no record" instead of "no device."
+		 */
+		if (rc == -ENODEV)
+			return 0;
 		return rc;
 	}
 	rc = -EFAULT;
-- 
cgit v1.2.3


From 862ae3132dc393ab6ea750b9ee9e0e1c276b9abb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 22 Feb 2012 20:37:10 -0800
Subject: x32: Drop non-__vdso weak symbols from the x32 VDSO

Drop the legacy weak symbols that don't carry the __vdso prefix from
the x32 VDSO.  This is a new ABI and we don't need to support that
legacy; the actual libc will export the proper symbols.

Suggested-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/4F42E171.9080005@mit.edu
Cc: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/vdso/vdsox32.lds.S | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
index 373ca9a02a53..62272aa2ae0a 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -17,13 +17,9 @@
 VERSION {
 	LINUX_2.6 {
 	global:
-		clock_gettime;
 		__vdso_clock_gettime;
-		gettimeofday;
 		__vdso_gettimeofday;
-		getcpu;
 		__vdso_getcpu;
-		time;
 		__vdso_time;
 	local: *;
 	};
-- 
cgit v1.2.3


From 1cc1c96c1658bfaf85d06d764bd7ac00640ae90f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 20 Feb 2012 17:23:47 -0800
Subject: PCI: fix memleak when ACPI _CRS is not used.

warning:
unreferenced object 0xffff8801f6914200 (size 512):
  comm "swapper/0", pid 1, jiffies 4294893643 (age 2664.644s)
  hex dump (first 32 bytes):
    00 00 c0 fe 00 00 00 00 ff ff ff ff 00 00 00 00  ................
    60 58 2f f6 03 88 ff ff 00 02 00 00 00 00 00 00  `X/.............
  backtrace:
    [<ffffffff81c2408c>] kmemleak_alloc+0x26/0x43
    [<ffffffff8113764f>] __kmalloc+0x121/0x183
    [<ffffffff81ca8d93>] get_current_resources+0x5a/0xc6
    [<ffffffff81c5bedd>] pci_acpi_scan_root+0x13c/0x21c
    [<ffffffff81c2a745>] acpi_pci_root_add+0x1e1/0x421
    [<ffffffff81408f50>] acpi_device_probe+0x50/0x190
    [<ffffffff8149edc7>] really_probe+0x99/0x126
    [<ffffffff8149ef83>] driver_probe_device+0x3b/0x56
    [<ffffffff8149effd>] __driver_attach+0x5f/0x82
    [<ffffffff8149d860>] bus_for_each_dev+0x5c/0x88
    [<ffffffff8149eb87>] driver_attach+0x1e/0x20
    [<ffffffff8149e7cc>] bus_add_driver+0xca/0x21d
    [<ffffffff8149f47b>] driver_register+0x91/0xfe
    [<ffffffff81409d09>] acpi_bus_register_driver+0x43/0x45
    [<ffffffff8278bdc9>] acpi_pci_root_init+0x20/0x28
    [<ffffffff810001e7>] do_one_initcall+0x57/0x134

The system has _CRS for root buses, but they are not used because the machine
date is before the cutoff date for _CRS usage.

Try to free those unused resource arrays and names.

Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a312e76063a7..c33e0970ee9f 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -282,9 +282,6 @@ static void add_resources(struct pci_root_info *info)
 	int i;
 	struct resource *res, *root, *conflict;
 
-	if (!pci_use_crs)
-		return;
-
 	coalesce_windows(info, IORESOURCE_MEM);
 	coalesce_windows(info, IORESOURCE_IO);
 
@@ -336,8 +333,13 @@ get_current_resources(struct acpi_device *device, int busnum,
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
 				&info);
 
-	add_resources(&info);
-	return;
+	if (pci_use_crs) {
+		add_resources(&info);
+
+		return;
+	}
+
+	kfree(info.name);
 
 name_alloc_fail:
 	kfree(info.res);
-- 
cgit v1.2.3


From 990a30c50c2bb3c4570aec7c33bedb969d089b7b Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 13 Feb 2012 12:59:00 +0000
Subject: x86/mrst/pci: assign d3_delay to 0 for Langwell devices

Langwell devices are not true pci devices, they are not subject to the 10 ms
d3 to d0 delay required by pci spec. This patch assigns d3_delay to 0 for all
langwell pci devices.

We can also power off devices that are not really used by the OS

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mrst.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index cb29191cee58..89e55485c787 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -239,6 +239,30 @@ int __init pci_mrst_init(void)
 	return 1;
 }
 
+/* Langwell devices are not true pci devices, they are not subject to 10 ms
+ * d3 to d0 delay required by pci spec.
+ */
+static void __devinit pci_d3delay_fixup(struct pci_dev *dev)
+{
+	/* true pci devices in lincroft should allow type 1 access, the rest
+	 * are langwell fake pci devices.
+	 */
+	if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
+		return;
+	dev->d3_delay = 0;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
+
+static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
+{
+	pci_set_power_state(dev, PCI_D3cold);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev);
+
 /*
  * Langwell devices reside at fixed offsets, don't try to move them.
  */
-- 
cgit v1.2.3


From 8ed3087280ee8c527b7090887e333761a9c75474 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 13 Feb 2012 12:59:20 +0000
Subject: x86/mrst/pci: v4l/atomisp: treat atomisp as real pci device

ATOMISP on Medfield is a real PCI device which should be handled differently
than the fake PCI devices on south complex. PCI type 1 access is used for
accessing config space this also has other impact such as PM D3 delay. There
shouldn't be any need for reading base address from IUNIT via msg bus.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mrst.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 89e55485c787..c5e81a4d7c1e 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -148,7 +148,9 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
 	 */
 	if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
 		return 0;
-	if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
+	if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
+				|| devfn == PCI_DEVFN(0, 0)
+				|| devfn == PCI_DEVFN(3, 0)))
 		return 1;
 	return 0; /* langwell on others */
 }
-- 
cgit v1.2.3


From 823806ff6bd63f92644a5330cf0c3b68fac25ffd Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Mon, 13 Feb 2012 12:59:37 +0000
Subject: x86/mrst/pci: avoid SoC fixups on non-SoC platforms

The PCI fixups get executed based upon whether they are linked in. We need
to avoid executing them if we boot a dual SoC/PC type kernel on a PC class
system.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mrst.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index c5e81a4d7c1e..140942f66b31 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -43,6 +43,8 @@
 #define PCI_FIXED_BAR_4_SIZE	0x14
 #define PCI_FIXED_BAR_5_SIZE	0x1c
 
+static int pci_soc_mode = 0;
+
 /**
  * fixed_bar_cap - return the offset of the fixed BAR cap if found
  * @bus: PCI bus
@@ -233,10 +235,11 @@ struct pci_ops pci_mrst_ops = {
  */
 int __init pci_mrst_init(void)
 {
-	printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
+	printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n");
 	pci_mmcfg_late_init();
 	pcibios_enable_irq = mrst_pci_irq_enable;
 	pci_root_ops = pci_mrst_ops;
+	pci_soc_mode = 1;
 	/* Continue with standard init */
 	return 1;
 }
@@ -246,6 +249,10 @@ int __init pci_mrst_init(void)
  */
 static void __devinit pci_d3delay_fixup(struct pci_dev *dev)
 {
+	/* PCI fixups are effectively decided compile time. If we have a dual
+	   SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */
+        if (!pci_soc_mode)
+            return;
 	/* true pci devices in lincroft should allow type 1 access, the rest
 	 * are langwell fake pci devices.
 	 */
@@ -274,6 +281,9 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
 	u32 size;
 	int i;
 
+	if (!pci_soc_mode)
+		return;
+
 	/* Must have extended configuration space */
 	if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
 		return;
-- 
cgit v1.2.3


From b4e518547da042fdc65bd4bdafd046fed13337d5 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 16 Dec 2011 15:50:17 -0700
Subject: irq_domain/x86: Convert x86 (embedded) to use common irq_domain

This patch removes the x86-specific definition of irq_domain and replaces
it with the common implementation.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Kconfig                      |   2 +
 arch/x86/include/asm/irq_controller.h |  12 ----
 arch/x86/include/asm/prom.h           |  10 ----
 arch/x86/kernel/devicetree.c          | 101 ++++++++++------------------------
 drivers/net/phy/mdio-gpio.c           |   4 +-
 5 files changed, 34 insertions(+), 95 deletions(-)
 delete mode 100644 arch/x86/include/asm/irq_controller.h

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..e0829a6a4660 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -398,6 +398,7 @@ config X86_INTEL_CE
 	select X86_REBOOTFIXUPS
 	select OF
 	select OF_EARLY_FLATTREE
+	select IRQ_DOMAIN
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
@@ -2076,6 +2077,7 @@ config OLPC
 	select GPIOLIB
 	select OF
 	select OF_PROMTREE
+	select IRQ_DOMAIN
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
deleted file mode 100644
index 423bbbddf36d..000000000000
--- a/arch/x86/include/asm/irq_controller.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __IRQ_CONTROLLER__
-#define __IRQ_CONTROLLER__
-
-struct irq_domain {
-	int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
-			u32 *out_hwirq, u32 *out_type);
-	void *priv;
-	struct device_node *controller;
-	struct list_head l;
-};
-
-#endif
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 644dd885f05a..60bef663609a 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -21,7 +21,6 @@
 #include <asm/irq.h>
 #include <linux/atomic.h>
 #include <asm/setup.h>
-#include <asm/irq_controller.h>
 
 #ifdef CONFIG_OF
 extern int of_ioapic;
@@ -43,15 +42,6 @@ extern char cmd_line[COMMAND_LINE_SIZE];
 #define pci_address_to_pio pci_address_to_pio
 unsigned long pci_address_to_pio(phys_addr_t addr);
 
-/**
- * irq_dispose_mapping - Unmap an interrupt
- * @virq: linux virq number of the interrupt to unmap
- *
- * FIXME: We really should implement proper virq handling like power,
- * but that's going to be major surgery.
- */
-static inline void irq_dispose_mapping(unsigned int virq) { }
-
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 52821799a702..3ae2ced4a874 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,6 +4,7 @@
 #include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/io.h>
+#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/of.h>
@@ -17,64 +18,14 @@
 #include <linux/initrd.h>
 
 #include <asm/hpet.h>
-#include <asm/irq_controller.h>
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
 
 __initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
-static LIST_HEAD(irq_domains);
-static DEFINE_RAW_SPINLOCK(big_irq_lock);
 
 int __initdata of_ioapic;
 
-#ifdef CONFIG_X86_IO_APIC
-static void add_interrupt_host(struct irq_domain *ih)
-{
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&big_irq_lock, flags);
-	list_add(&ih->l, &irq_domains);
-	raw_spin_unlock_irqrestore(&big_irq_lock, flags);
-}
-#endif
-
-static struct irq_domain *get_ih_from_node(struct device_node *controller)
-{
-	struct irq_domain *ih, *found = NULL;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&big_irq_lock, flags);
-	list_for_each_entry(ih, &irq_domains, l) {
-		if (ih->controller ==  controller) {
-			found = ih;
-			break;
-		}
-	}
-	raw_spin_unlock_irqrestore(&big_irq_lock, flags);
-	return found;
-}
-
-unsigned int irq_create_of_mapping(struct device_node *controller,
-				   const u32 *intspec, unsigned int intsize)
-{
-	struct irq_domain *ih;
-	u32 virq, type;
-	int ret;
-
-	ih = get_ih_from_node(controller);
-	if (!ih)
-		return 0;
-	ret = ih->xlate(ih, intspec, intsize, &virq, &type);
-	if (ret)
-		return 0;
-	if (type == IRQ_TYPE_NONE)
-		return virq;
-	irq_set_irq_type(virq, type);
-	return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-
 unsigned long pci_address_to_pio(phys_addr_t address)
 {
 	/*
@@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] =
 	},
 };
 
-static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
-			u32 *out_hwirq, u32 *out_type)
+static int ioapic_xlate(struct irq_domain *domain,
+			struct device_node *controller,
+			const u32 *intspec, u32 intsize,
+			irq_hw_number_t *out_hwirq, u32 *out_type)
 {
-	struct mp_ioapic_gsi *gsi_cfg;
 	struct io_apic_irq_attr attr;
 	struct of_ioapic_type *it;
-	u32 line, idx, type;
+	u32 line, idx;
+	int rc;
 
-	if (intsize < 2)
+	if (WARN_ON(intsize < 2))
 		return -EINVAL;
 
-	line = *intspec;
-	idx = (u32) id->priv;
-	gsi_cfg = mp_ioapic_gsi_routing(idx);
-	*out_hwirq = line + gsi_cfg->gsi_base;
-
-	intspec++;
-	type = *intspec;
+	line = intspec[0];
 
-	if (type >= ARRAY_SIZE(of_ioapic_type))
+	if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
 		return -EINVAL;
 
-	it = of_ioapic_type + type;
-	*out_type = it->out_type;
+	it = &of_ioapic_type[intspec[1]];
 
+	idx = (u32) domain->host_data;
 	set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
 
-	return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
+	rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
+					cpu_to_node(0), &attr);
+	if (rc)
+		return rc;
+
+	*out_hwirq = line;
+	*out_type = it->out_type;
+	return 0;
 }
 
+const struct irq_domain_ops ioapic_irq_domain_ops = {
+	.xlate = ioapic_xlate,
+};
+
 static void __init ioapic_add_ofnode(struct device_node *np)
 {
 	struct resource r;
@@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)
 	for (i = 0; i < nr_ioapics; i++) {
 		if (r.start == mpc_ioapic_addr(i)) {
 			struct irq_domain *id;
+			struct mp_ioapic_gsi *gsi_cfg;
+
+			gsi_cfg = mp_ioapic_gsi_routing(i);
 
-			id = kzalloc(sizeof(*id), GFP_KERNEL);
+			id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
+						   &ioapic_irq_domain_ops,
+						   (void*)i);
 			BUG_ON(!id);
-			id->controller = np;
-			id->xlate = ioapic_xlate;
-			id->priv = (void *)i;
-			add_interrupt_host(id);
 			return;
 		}
 	}
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 50e8e5e74465..7189adf54bd1 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -255,13 +255,13 @@ static inline int __init mdio_ofgpio_init(void)
 	return platform_driver_register(&mdio_ofgpio_driver);
 }
 
-static inline void __exit mdio_ofgpio_exit(void)
+static inline void mdio_ofgpio_exit(void)
 {
 	platform_driver_unregister(&mdio_ofgpio_driver);
 }
 #else
 static inline int __init mdio_ofgpio_init(void) { return 0; }
-static inline void __exit mdio_ofgpio_exit(void) { }
+static inline void mdio_ofgpio_exit(void) { }
 #endif /* CONFIG_OF_GPIO */
 
 static struct platform_driver mdio_gpio_driver = {
-- 
cgit v1.2.3


From 83e7ee6657dfcd6b0ee2406d11024b558064252a Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 12 Feb 2012 13:24:25 -0800
Subject: x86, efi: Refactor efi_init() a bit

Break out some of the init steps into helper functions.

Only change to execution flow is the removal of the warning when the
kernel memdesc structure differ in size from what firmware specifies
since it's a bogus warning (it's a valid difference per spec).

v4:
* Removed memdesc warning as per above

Signed-off-by: Olof Johansson <olof@lixom.net>
Link: http://lkml.kernel.org/r/1329081869-20779-2-git-send-email-olof@lixom.net
Acked-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/platform/efi/efi.c | 89 +++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 4cf9bd0a1653..6d88dcac466c 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -429,23 +429,8 @@ static void __init efi_free_boot_services(void)
 	}
 }
 
-void __init efi_init(void)
+static void __init efi_systab_init(void *phys)
 {
-	efi_config_table_t *config_tables;
-	efi_runtime_services_t *runtime;
-	efi_char16_t *c16;
-	char vendor[100] = "unknown";
-	int i = 0;
-	void *tmp;
-
-#ifdef CONFIG_X86_32
-	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-#else
-	efi_phys.systab = (efi_system_table_t *)
-		(boot_params.efi_info.efi_systab |
-		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-#endif
-
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
 	if (efi.systab == NULL)
@@ -464,22 +449,12 @@ void __init efi_init(void)
 		       "%d.%02d, expected 1.00 or greater!\n",
 		       efi.systab->hdr.revision >> 16,
 		       efi.systab->hdr.revision & 0xffff);
+}
 
-	/*
-	 * Show what we know for posterity
-	 */
-	c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
-	if (c16) {
-		for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
-			vendor[i] = *c16++;
-		vendor[i] = '\0';
-	} else
-		printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
-	early_iounmap(tmp, 2);
-
-	printk(KERN_INFO "EFI v%u.%.02u by %s\n",
-	       efi.systab->hdr.revision >> 16,
-	       efi.systab->hdr.revision & 0xffff, vendor);
+static void __init efi_config_init(u64 tables, int nr_tables)
+{
+	efi_config_table_t *config_tables;
+	int i;
 
 	/*
 	 * Let's see what config tables the firmware passed to us.
@@ -526,6 +501,11 @@ void __init efi_init(void)
 	printk("\n");
 	early_iounmap(config_tables,
 			  efi.systab->nr_tables * sizeof(efi_config_table_t));
+}
+
+static void __init efi_runtime_init(void)
+{
+	efi_runtime_services_t *runtime;
 
 	/*
 	 * Check out the runtime services table. We need to map
@@ -554,7 +534,10 @@ void __init efi_init(void)
 		printk(KERN_ERR "Could not map the EFI runtime service "
 		       "table!\n");
 	early_iounmap(runtime, sizeof(efi_runtime_services_t));
+}
 
+static void __init efi_memmap_init(void)
+{
 	/* Map the EFI memory map */
 	memmap.map = early_ioremap((unsigned long)memmap.phys_map,
 				   memmap.nr_map * memmap.desc_size);
@@ -562,12 +545,48 @@ void __init efi_init(void)
 		printk(KERN_ERR "Could not map the EFI memory map!\n");
 	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
 
-	if (memmap.desc_size != sizeof(efi_memory_desc_t))
-		printk(KERN_WARNING
-		  "Kernel-defined memdesc doesn't match the one from EFI!\n");
-
 	if (add_efi_memmap)
 		do_add_efi_memmap();
+}
+
+void __init efi_init(void)
+{
+	efi_char16_t *c16;
+	char vendor[100] = "unknown";
+	int i = 0;
+	void *tmp;
+
+#ifdef CONFIG_X86_32
+	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
+#else
+	efi_phys.systab = (efi_system_table_t *)
+		(boot_params.efi_info.efi_systab |
+		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
+#endif
+
+	efi_systab_init(efi_phys.systab);
+
+	/*
+	 * Show what we know for posterity
+	 */
+	c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
+	if (c16) {
+		for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
+			vendor[i] = *c16++;
+		vendor[i] = '\0';
+	} else
+		printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
+	early_iounmap(tmp, 2);
+
+	printk(KERN_INFO "EFI v%u.%.02u by %s\n",
+	       efi.systab->hdr.revision >> 16,
+	       efi.systab->hdr.revision & 0xffff, vendor);
+
+	efi_config_init(efi.systab->tables, efi.systab->nr_tables);
+
+	efi_runtime_init();
+
+	efi_memmap_init();
 
 #ifdef CONFIG_X86_32
 	x86_platform.get_wallclock = efi_get_time;
-- 
cgit v1.2.3


From e3cb3f5a35997906f9b79bf860029c02a54cfae6 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 12 Feb 2012 13:24:26 -0800
Subject: x86, efi: Convert printk to pr_*()

Alright, I guess I'll go through and convert them, even though
there's no net gain to speak of.

v4:
* Switched to pr_fmt and removed some redundant use of "EFI" in
  messages.

Signed-off-by: Olof Johansson <olof@lixom.net>
Link: http://lkml.kernel.org/r/1329081869-20779-3-git-send-email-olof@lixom.net
Cc: Joe Perches <joe@perches.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/platform/efi/efi.c | 58 ++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 6d88dcac466c..511fb15e2036 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -26,6 +26,8 @@
  *	Skip non-WB memory and ignore empty memory ranges.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/efi.h>
@@ -47,7 +49,6 @@
 #include <asm/x86_init.h>
 
 #define EFI_DEBUG	1
-#define PFX 		"EFI: "
 
 int efi_enabled;
 EXPORT_SYMBOL(efi_enabled);
@@ -254,7 +255,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
 
 	status = efi.get_time(&eft, &cap);
 	if (status != EFI_SUCCESS) {
-		printk(KERN_ERR "Oops: efitime: can't read time!\n");
+		pr_err("Oops: efitime: can't read time!\n");
 		return -1;
 	}
 
@@ -268,7 +269,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
 
 	status = efi.set_time(&eft);
 	if (status != EFI_SUCCESS) {
-		printk(KERN_ERR "Oops: efitime: can't write time!\n");
+		pr_err("Oops: efitime: can't write time!\n");
 		return -1;
 	}
 	return 0;
@@ -282,7 +283,7 @@ unsigned long efi_get_time(void)
 
 	status = efi.get_time(&eft, &cap);
 	if (status != EFI_SUCCESS)
-		printk(KERN_ERR "Oops: efitime: can't read time!\n");
+		pr_err("Oops: efitime: can't read time!\n");
 
 	return mktime(eft.year, eft.month, eft.day, eft.hour,
 		      eft.minute, eft.second);
@@ -367,7 +368,7 @@ static void __init print_efi_memmap(void)
 	     p < memmap.map_end;
 	     p += memmap.desc_size, i++) {
 		md = p;
-		printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
+		pr_info("mem%02u: type=%u, attr=0x%llx, "
 			"range=[0x%016llx-0x%016llx) (%lluMB)\n",
 			i, md->type, md->attribute, md->phys_addr,
 			md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
@@ -400,7 +401,7 @@ void __init efi_reserve_boot_services(void)
 			memblock_is_region_reserved(start, size)) {
 			/* Could not reserve, skip it */
 			md->num_pages = 0;
-			memblock_dbg(PFX "Could not reserve boot range "
+			memblock_dbg("Could not reserve boot range "
 					"[0x%010llx-0x%010llx]\n",
 						start, start+size-1);
 		} else
@@ -434,7 +435,7 @@ static void __init efi_systab_init(void *phys)
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
 	if (efi.systab == NULL)
-		printk(KERN_ERR "Couldn't map the EFI system table!\n");
+		pr_err("Couldn't map the system table!\n");
 	memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
 	early_iounmap(efi.systab, sizeof(efi_system_table_t));
 	efi.systab = &efi_systab;
@@ -443,9 +444,9 @@ static void __init efi_systab_init(void *phys)
 	 * Verify the EFI Table
 	 */
 	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-		printk(KERN_ERR "EFI system table signature incorrect!\n");
+		pr_err("System table signature incorrect!\n");
 	if ((efi.systab->hdr.revision >> 16) == 0)
-		printk(KERN_ERR "Warning: EFI system table version "
+		pr_err("Warning: System table version "
 		       "%d.%02d, expected 1.00 or greater!\n",
 		       efi.systab->hdr.revision >> 16,
 		       efi.systab->hdr.revision & 0xffff);
@@ -463,42 +464,42 @@ static void __init efi_config_init(u64 tables, int nr_tables)
 		efi.systab->tables,
 		efi.systab->nr_tables * sizeof(efi_config_table_t));
 	if (config_tables == NULL)
-		printk(KERN_ERR "Could not map EFI Configuration Table!\n");
+		pr_err("Could not map Configuration table!\n");
 
-	printk(KERN_INFO);
+	pr_info("");
 	for (i = 0; i < efi.systab->nr_tables; i++) {
 		if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
 			efi.mps = config_tables[i].table;
-			printk(" MPS=0x%lx ", config_tables[i].table);
+			pr_cont(" MPS=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					ACPI_20_TABLE_GUID)) {
 			efi.acpi20 = config_tables[i].table;
-			printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
+			pr_cont(" ACPI 2.0=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					ACPI_TABLE_GUID)) {
 			efi.acpi = config_tables[i].table;
-			printk(" ACPI=0x%lx ", config_tables[i].table);
+			pr_cont(" ACPI=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					SMBIOS_TABLE_GUID)) {
 			efi.smbios = config_tables[i].table;
-			printk(" SMBIOS=0x%lx ", config_tables[i].table);
+			pr_cont(" SMBIOS=0x%lx ", config_tables[i].table);
 #ifdef CONFIG_X86_UV
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					UV_SYSTEM_TABLE_GUID)) {
 			efi.uv_systab = config_tables[i].table;
-			printk(" UVsystab=0x%lx ", config_tables[i].table);
+			pr_cont(" UVsystab=0x%lx ", config_tables[i].table);
 #endif
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					HCDP_TABLE_GUID)) {
 			efi.hcdp = config_tables[i].table;
-			printk(" HCDP=0x%lx ", config_tables[i].table);
+			pr_cont(" HCDP=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					UGA_IO_PROTOCOL_GUID)) {
 			efi.uga = config_tables[i].table;
-			printk(" UGA=0x%lx ", config_tables[i].table);
+			pr_cont(" UGA=0x%lx ", config_tables[i].table);
 		}
 	}
-	printk("\n");
+	pr_cont("\n");
 	early_iounmap(config_tables,
 			  efi.systab->nr_tables * sizeof(efi_config_table_t));
 }
@@ -531,8 +532,7 @@ static void __init efi_runtime_init(void)
 		 */
 		efi.get_time = phys_efi_get_time;
 	} else
-		printk(KERN_ERR "Could not map the EFI runtime service "
-		       "table!\n");
+		pr_err("Could not map the runtime service table!\n");
 	early_iounmap(runtime, sizeof(efi_runtime_services_t));
 }
 
@@ -542,7 +542,7 @@ static void __init efi_memmap_init(void)
 	memmap.map = early_ioremap((unsigned long)memmap.phys_map,
 				   memmap.nr_map * memmap.desc_size);
 	if (memmap.map == NULL)
-		printk(KERN_ERR "Could not map the EFI memory map!\n");
+		pr_err("Could not map the memory map!\n");
 	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
 
 	if (add_efi_memmap)
@@ -575,12 +575,12 @@ void __init efi_init(void)
 			vendor[i] = *c16++;
 		vendor[i] = '\0';
 	} else
-		printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
+		pr_err("Could not map the firmware vendor!\n");
 	early_iounmap(tmp, 2);
 
-	printk(KERN_INFO "EFI v%u.%.02u by %s\n",
-	       efi.systab->hdr.revision >> 16,
-	       efi.systab->hdr.revision & 0xffff, vendor);
+	pr_info("EFI v%u.%.02u by %s\n",
+		efi.systab->hdr.revision >> 16,
+		efi.systab->hdr.revision & 0xffff, vendor);
 
 	efi_config_init(efi.systab->tables, efi.systab->nr_tables);
 
@@ -696,7 +696,7 @@ void __init efi_enter_virtual_mode(void)
 		md->virt_addr = (u64) (unsigned long) va;
 
 		if (!va) {
-			printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
+			pr_err("ioremap of 0x%llX failed!\n",
 			       (unsigned long long)md->phys_addr);
 			continue;
 		}
@@ -730,8 +730,8 @@ void __init efi_enter_virtual_mode(void)
 		(efi_memory_desc_t *)__pa(new_memmap));
 
 	if (status != EFI_SUCCESS) {
-		printk(KERN_ALERT "Unable to switch EFI into virtual mode "
-		       "(status=%lx)!\n", status);
+		pr_alert("Unable to switch EFI into virtual mode "
+			 "(status=%lx)!\n", status);
 		panic("EFI call to SetVirtualAddressMap() failed!");
 	}
 
-- 
cgit v1.2.3


From a6a46f415dca828a04a435ca1f67de0bc5b9ae30 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 12 Feb 2012 13:24:27 -0800
Subject: x86, efi: Cleanup config table walking

Trivial cleanup, move guid and table pointers to local copies to
make the code cleaner.

Signed-off-by: Olof Johansson <olof@lixom.net>
Link: http://lkml.kernel.org/r/1329081869-20779-4-git-send-email-olof@lixom.net
Acked-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/platform/efi/efi.c | 61 +++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 511fb15e2036..03259d1df14f 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -455,53 +455,48 @@ static void __init efi_systab_init(void *phys)
 static void __init efi_config_init(u64 tables, int nr_tables)
 {
 	efi_config_table_t *config_tables;
-	int i;
+	int i, sz = sizeof(efi_config_table_t);
 
 	/*
 	 * Let's see what config tables the firmware passed to us.
 	 */
-	config_tables = early_ioremap(
-		efi.systab->tables,
-		efi.systab->nr_tables * sizeof(efi_config_table_t));
+	config_tables = early_ioremap(efi.systab->tables,
+				      efi.systab->nr_tables * sz);
 	if (config_tables == NULL)
 		pr_err("Could not map Configuration table!\n");
 
 	pr_info("");
 	for (i = 0; i < efi.systab->nr_tables; i++) {
-		if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
-			efi.mps = config_tables[i].table;
-			pr_cont(" MPS=0x%lx ", config_tables[i].table);
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					ACPI_20_TABLE_GUID)) {
-			efi.acpi20 = config_tables[i].table;
-			pr_cont(" ACPI 2.0=0x%lx ", config_tables[i].table);
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					ACPI_TABLE_GUID)) {
-			efi.acpi = config_tables[i].table;
-			pr_cont(" ACPI=0x%lx ", config_tables[i].table);
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					SMBIOS_TABLE_GUID)) {
-			efi.smbios = config_tables[i].table;
-			pr_cont(" SMBIOS=0x%lx ", config_tables[i].table);
+		efi_guid_t guid = config_tables[i].guid;
+		unsigned long table = config_tables[i].table;
+
+		if (!efi_guidcmp(guid, MPS_TABLE_GUID)) {
+			efi.mps = table;
+			pr_cont(" MPS=0x%lx ", table);
+		} else if (!efi_guidcmp(guid, ACPI_20_TABLE_GUID)) {
+			efi.acpi20 = table;
+			pr_cont(" ACPI 2.0=0x%lx ", table);
+		} else if (!efi_guidcmp(guid, ACPI_TABLE_GUID)) {
+			efi.acpi = table;
+			pr_cont(" ACPI=0x%lx ", table);
+		} else if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) {
+			efi.smbios = table;
+			pr_cont(" SMBIOS=0x%lx ", table);
 #ifdef CONFIG_X86_UV
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					UV_SYSTEM_TABLE_GUID)) {
-			efi.uv_systab = config_tables[i].table;
-			pr_cont(" UVsystab=0x%lx ", config_tables[i].table);
+		} else if (!efi_guidcmp(guid, UV_SYSTEM_TABLE_GUID)) {
+			efi.uv_systab = table;
+			pr_cont(" UVsystab=0x%lx ", table);
 #endif
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					HCDP_TABLE_GUID)) {
-			efi.hcdp = config_tables[i].table;
-			pr_cont(" HCDP=0x%lx ", config_tables[i].table);
-		} else if (!efi_guidcmp(config_tables[i].guid,
-					UGA_IO_PROTOCOL_GUID)) {
-			efi.uga = config_tables[i].table;
-			pr_cont(" UGA=0x%lx ", config_tables[i].table);
+		} else if (!efi_guidcmp(guid, HCDP_TABLE_GUID)) {
+			efi.hcdp = table;
+			pr_cont(" HCDP=0x%lx ", table);
+		} else if (!efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID)) {
+			efi.uga = table;
+			pr_cont(" UGA=0x%lx ", table);
 		}
 	}
 	pr_cont("\n");
-	early_iounmap(config_tables,
-			  efi.systab->nr_tables * sizeof(efi_config_table_t));
+	early_iounmap(config_tables, efi.systab->nr_tables * sz);
 }
 
 static void __init efi_runtime_init(void)
-- 
cgit v1.2.3


From 140bf275d3e89e9b36851d5cf498dbbbecdf7ca8 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 12 Feb 2012 13:24:28 -0800
Subject: x86, efi: Add basic error handling

It's not perfect, but way better than before. Mark efi_enabled as false in
case of error and at least stop dereferencing pointers that are known to
be invalid.

The only significant missing piece is the lack of undoing the
memblock_reserve of the memory that efi marks as in use. On the other
hand, it's not a large amount of memory, and leaving it unavailable for
system use should be the safer choice anyway.

Signed-off-by: Olof Johansson <olof@lixom.net>
Link: http://lkml.kernel.org/r/1329081869-20779-5-git-send-email-olof@lixom.net
Acked-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/platform/efi/efi.c | 85 ++++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 03259d1df14f..5a053e7737b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -430,12 +430,14 @@ static void __init efi_free_boot_services(void)
 	}
 }
 
-static void __init efi_systab_init(void *phys)
+static int __init efi_systab_init(void *phys)
 {
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
-	if (efi.systab == NULL)
+	if (efi.systab == NULL) {
 		pr_err("Couldn't map the system table!\n");
+		return -ENOMEM;
+	}
 	memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
 	early_iounmap(efi.systab, sizeof(efi_system_table_t));
 	efi.systab = &efi_systab;
@@ -443,16 +445,20 @@ static void __init efi_systab_init(void *phys)
 	/*
 	 * Verify the EFI Table
 	 */
-	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) {
 		pr_err("System table signature incorrect!\n");
+		return -EINVAL;
+	}
 	if ((efi.systab->hdr.revision >> 16) == 0)
 		pr_err("Warning: System table version "
 		       "%d.%02d, expected 1.00 or greater!\n",
 		       efi.systab->hdr.revision >> 16,
 		       efi.systab->hdr.revision & 0xffff);
+
+	return 0;
 }
 
-static void __init efi_config_init(u64 tables, int nr_tables)
+static int __init efi_config_init(u64 tables, int nr_tables)
 {
 	efi_config_table_t *config_tables;
 	int i, sz = sizeof(efi_config_table_t);
@@ -462,8 +468,10 @@ static void __init efi_config_init(u64 tables, int nr_tables)
 	 */
 	config_tables = early_ioremap(efi.systab->tables,
 				      efi.systab->nr_tables * sz);
-	if (config_tables == NULL)
+	if (config_tables == NULL) {
 		pr_err("Could not map Configuration table!\n");
+		return -ENOMEM;
+	}
 
 	pr_info("");
 	for (i = 0; i < efi.systab->nr_tables; i++) {
@@ -497,9 +505,11 @@ static void __init efi_config_init(u64 tables, int nr_tables)
 	}
 	pr_cont("\n");
 	early_iounmap(config_tables, efi.systab->nr_tables * sz);
+
+	return 0;
 }
 
-static void __init efi_runtime_init(void)
+static int __init efi_runtime_init(void)
 {
 	efi_runtime_services_t *runtime;
 
@@ -511,37 +521,44 @@ static void __init efi_runtime_init(void)
 	 */
 	runtime = early_ioremap((unsigned long)efi.systab->runtime,
 				sizeof(efi_runtime_services_t));
-	if (runtime != NULL) {
-		/*
-		 * We will only need *early* access to the following
-		 * two EFI runtime services before set_virtual_address_map
-		 * is invoked.
-		 */
-		efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
-		efi_phys.set_virtual_address_map =
-			(efi_set_virtual_address_map_t *)
-			runtime->set_virtual_address_map;
-		/*
-		 * Make efi_get_time can be called before entering
-		 * virtual mode.
-		 */
-		efi.get_time = phys_efi_get_time;
-	} else
+	if (!runtime) {
 		pr_err("Could not map the runtime service table!\n");
+		return -ENOMEM;
+	}
+	/*
+	 * We will only need *early* access to the following
+	 * two EFI runtime services before set_virtual_address_map
+	 * is invoked.
+	 */
+	efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
+	efi_phys.set_virtual_address_map =
+		(efi_set_virtual_address_map_t *)
+		runtime->set_virtual_address_map;
+	/*
+	 * Make efi_get_time can be called before entering
+	 * virtual mode.
+	 */
+	efi.get_time = phys_efi_get_time;
 	early_iounmap(runtime, sizeof(efi_runtime_services_t));
+
+	return 0;
 }
 
-static void __init efi_memmap_init(void)
+static int __init efi_memmap_init(void)
 {
 	/* Map the EFI memory map */
 	memmap.map = early_ioremap((unsigned long)memmap.phys_map,
 				   memmap.nr_map * memmap.desc_size);
-	if (memmap.map == NULL)
+	if (memmap.map == NULL) {
 		pr_err("Could not map the memory map!\n");
+		return -ENOMEM;
+	}
 	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
 
 	if (add_efi_memmap)
 		do_add_efi_memmap();
+
+	return 0;
 }
 
 void __init efi_init(void)
@@ -559,7 +576,10 @@ void __init efi_init(void)
 		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
 #endif
 
-	efi_systab_init(efi_phys.systab);
+	if (efi_systab_init(efi_phys.systab)) {
+		efi_enabled = 0;
+		return;
+	}
 
 	/*
 	 * Show what we know for posterity
@@ -577,11 +597,20 @@ void __init efi_init(void)
 		efi.systab->hdr.revision >> 16,
 		efi.systab->hdr.revision & 0xffff, vendor);
 
-	efi_config_init(efi.systab->tables, efi.systab->nr_tables);
+	if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) {
+		efi_enabled = 0;
+		return;
+	}
 
-	efi_runtime_init();
+	if (efi_runtime_init()) {
+		efi_enabled = 0;
+		return;
+	}
 
-	efi_memmap_init();
+	if (efi_memmap_init()) {
+		efi_enabled = 0;
+		return;
+	}
 
 #ifdef CONFIG_X86_32
 	x86_platform.get_wallclock = efi_get_time;
-- 
cgit v1.2.3


From 1adbfa3511ee1c1118e16a9a0246870f12fef4e6 Mon Sep 17 00:00:00 2001
From: Olof Johansson <olof@lixom.net>
Date: Sun, 12 Feb 2012 13:24:29 -0800
Subject: x86, efi: Allow basic init with mixed 32/64-bit efi/kernel

Traditionally the kernel has refused to setup EFI at all if there's been
a mismatch in 32/64-bit mode between EFI and the kernel.

On some platforms that boot natively through EFI (Chrome OS being one),
we still need to get at least some of the static data such as memory
configuration out of EFI. Runtime services aren't as critical, and
it's a significant amount of work to implement switching between the
operating modes to call between kernel and firmware for thise cases. So
I'm ignoring it for now.

v5:
* Fixed some printk strings based on feedback
* Renamed 32/64-bit specific types to not have _ prefix
* Fixed bug in printout of efi runtime disablement

v4:
* Some of the earlier cleanup was accidentally reverted by this patch, fixed.
* Reworded some messages to not have to line wrap printk strings

v3:
* Reorganized to a series of patches to make it easier to review, and
  do some of the cleanups I had left out before.

v2:
* Added graceful error handling for 32-bit kernel that gets passed
  EFI data above 4GB.
* Removed some warnings that were missed in first version.

Signed-off-by: Olof Johansson <olof@lixom.net>
Link: http://lkml.kernel.org/r/1329081869-20779-6-git-send-email-olof@lixom.net
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/efi.h  |   2 +-
 arch/x86/kernel/setup.c     |  10 ++-
 arch/x86/platform/efi/efi.c | 164 ++++++++++++++++++++++++++++++++++++++------
 include/linux/efi.h         |  45 ++++++++++++
 4 files changed, 196 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 844f735fd63a..c9dcc181d4d1 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -95,7 +95,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 
 extern int add_efi_memmap;
 extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
-extern void efi_memblock_x86_reserve_range(void);
+extern int efi_memblock_x86_reserve_range(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7d5099fe874..88638883176a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     EFI_LOADER_SIGNATURE, 4)) {
+		     "EL32", 4)) {
 		efi_enabled = 1;
-		efi_memblock_x86_reserve_range();
+		efi_64bit = false;
+	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     "EL64", 4)) {
+		efi_enabled = 1;
+		efi_64bit = true;
 	}
+	if (efi_enabled && efi_memblock_x86_reserve_range())
+		efi_enabled = 0;
 #endif
 
 	x86_init.oem.arch_setup();
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 5a053e7737b7..92660edaa1e7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -68,6 +68,9 @@ EXPORT_SYMBOL(efi);
 
 struct efi_memory_map memmap;
 
+bool efi_64bit;
+static bool efi_native;
+
 static struct efi efi_phys __initdata;
 static efi_system_table_t efi_systab __initdata;
 
@@ -339,11 +342,16 @@ static void __init do_add_efi_memmap(void)
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 }
 
-void __init efi_memblock_x86_reserve_range(void)
+int __init efi_memblock_x86_reserve_range(void)
 {
 	unsigned long pmap;
 
 #ifdef CONFIG_X86_32
+	/* Can't handle data above 4GB at this time */
+	if (boot_params.efi_info.efi_memmap_hi) {
+		pr_err("Memory map is above 4GB, disabling EFI.\n");
+		return -EINVAL;
+	}
 	pmap = boot_params.efi_info.efi_memmap;
 #else
 	pmap = (boot_params.efi_info.efi_memmap |
@@ -355,6 +363,8 @@ void __init efi_memblock_x86_reserve_range(void)
 	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
 	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
 	memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
+
+	return 0;
 }
 
 #if EFI_DEBUG
@@ -432,14 +442,75 @@ static void __init efi_free_boot_services(void)
 
 static int __init efi_systab_init(void *phys)
 {
-	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
-				   sizeof(efi_system_table_t));
-	if (efi.systab == NULL) {
-		pr_err("Couldn't map the system table!\n");
-		return -ENOMEM;
+	if (efi_64bit) {
+		efi_system_table_64_t *systab64;
+		u64 tmp = 0;
+
+		systab64 = early_ioremap((unsigned long)phys,
+					 sizeof(*systab64));
+		if (systab64 == NULL) {
+			pr_err("Couldn't map the system table!\n");
+			return -ENOMEM;
+		}
+
+		efi_systab.hdr = systab64->hdr;
+		efi_systab.fw_vendor = systab64->fw_vendor;
+		tmp |= systab64->fw_vendor;
+		efi_systab.fw_revision = systab64->fw_revision;
+		efi_systab.con_in_handle = systab64->con_in_handle;
+		tmp |= systab64->con_in_handle;
+		efi_systab.con_in = systab64->con_in;
+		tmp |= systab64->con_in;
+		efi_systab.con_out_handle = systab64->con_out_handle;
+		tmp |= systab64->con_out_handle;
+		efi_systab.con_out = systab64->con_out;
+		tmp |= systab64->con_out;
+		efi_systab.stderr_handle = systab64->stderr_handle;
+		tmp |= systab64->stderr_handle;
+		efi_systab.stderr = systab64->stderr;
+		tmp |= systab64->stderr;
+		efi_systab.runtime = (void *)(unsigned long)systab64->runtime;
+		tmp |= systab64->runtime;
+		efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
+		tmp |= systab64->boottime;
+		efi_systab.nr_tables = systab64->nr_tables;
+		efi_systab.tables = systab64->tables;
+		tmp |= systab64->tables;
+
+		early_iounmap(systab64, sizeof(*systab64));
+#ifdef CONFIG_X86_32
+		if (tmp >> 32) {
+			pr_err("EFI data located above 4GB, disabling EFI.\n");
+			return -EINVAL;
+		}
+#endif
+	} else {
+		efi_system_table_32_t *systab32;
+
+		systab32 = early_ioremap((unsigned long)phys,
+					 sizeof(*systab32));
+		if (systab32 == NULL) {
+			pr_err("Couldn't map the system table!\n");
+			return -ENOMEM;
+		}
+
+		efi_systab.hdr = systab32->hdr;
+		efi_systab.fw_vendor = systab32->fw_vendor;
+		efi_systab.fw_revision = systab32->fw_revision;
+		efi_systab.con_in_handle = systab32->con_in_handle;
+		efi_systab.con_in = systab32->con_in;
+		efi_systab.con_out_handle = systab32->con_out_handle;
+		efi_systab.con_out = systab32->con_out;
+		efi_systab.stderr_handle = systab32->stderr_handle;
+		efi_systab.stderr = systab32->stderr;
+		efi_systab.runtime = (void *)(unsigned long)systab32->runtime;
+		efi_systab.boottime = (void *)(unsigned long)systab32->boottime;
+		efi_systab.nr_tables = systab32->nr_tables;
+		efi_systab.tables = systab32->tables;
+
+		early_iounmap(systab32, sizeof(*systab32));
 	}
-	memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
-	early_iounmap(efi.systab, sizeof(efi_system_table_t));
+
 	efi.systab = &efi_systab;
 
 	/*
@@ -460,24 +531,47 @@ static int __init efi_systab_init(void *phys)
 
 static int __init efi_config_init(u64 tables, int nr_tables)
 {
-	efi_config_table_t *config_tables;
-	int i, sz = sizeof(efi_config_table_t);
+	void *config_tables, *tablep;
+	int i, sz;
+
+	if (efi_64bit)
+		sz = sizeof(efi_config_table_64_t);
+	else
+		sz = sizeof(efi_config_table_32_t);
 
 	/*
 	 * Let's see what config tables the firmware passed to us.
 	 */
-	config_tables = early_ioremap(efi.systab->tables,
-				      efi.systab->nr_tables * sz);
+	config_tables = early_ioremap(tables, nr_tables * sz);
 	if (config_tables == NULL) {
 		pr_err("Could not map Configuration table!\n");
 		return -ENOMEM;
 	}
 
+	tablep = config_tables;
 	pr_info("");
 	for (i = 0; i < efi.systab->nr_tables; i++) {
-		efi_guid_t guid = config_tables[i].guid;
-		unsigned long table = config_tables[i].table;
-
+		efi_guid_t guid;
+		unsigned long table;
+
+		if (efi_64bit) {
+			u64 table64;
+			guid = ((efi_config_table_64_t *)tablep)->guid;
+			table64 = ((efi_config_table_64_t *)tablep)->table;
+			table = table64;
+#ifdef CONFIG_X86_32
+			if (table64 >> 32) {
+				pr_cont("\n");
+				pr_err("Table located above 4GB, disabling EFI.\n");
+				early_iounmap(config_tables,
+					      efi.systab->nr_tables * sz);
+				return -EINVAL;
+			}
+#endif
+		} else {
+			guid = ((efi_config_table_32_t *)tablep)->guid;
+			table = ((efi_config_table_32_t *)tablep)->table;
+		}
 		if (!efi_guidcmp(guid, MPS_TABLE_GUID)) {
 			efi.mps = table;
 			pr_cont(" MPS=0x%lx ", table);
@@ -502,10 +596,10 @@ static int __init efi_config_init(u64 tables, int nr_tables)
 			efi.uga = table;
 			pr_cont(" UGA=0x%lx ", table);
 		}
+		tablep += sz;
 	}
 	pr_cont("\n");
 	early_iounmap(config_tables, efi.systab->nr_tables * sz);
-
 	return 0;
 }
 
@@ -569,11 +663,19 @@ void __init efi_init(void)
 	void *tmp;
 
 #ifdef CONFIG_X86_32
+	if (boot_params.efi_info.efi_systab_hi ||
+	    boot_params.efi_info.efi_memmap_hi) {
+		pr_info("Table located above 4GB, disabling EFI.\n");
+		efi_enabled = 0;
+		return;
+	}
 	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
+	efi_native = !efi_64bit;
 #else
 	efi_phys.systab = (efi_system_table_t *)
-		(boot_params.efi_info.efi_systab |
-		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
+			  (boot_params.efi_info.efi_systab |
+			  ((__u64)boot_params.efi_info.efi_systab_hi<<32));
+	efi_native = efi_64bit;
 #endif
 
 	if (efi_systab_init(efi_phys.systab)) {
@@ -602,7 +704,14 @@ void __init efi_init(void)
 		return;
 	}
 
-	if (efi_runtime_init()) {
+	/*
+	 * Note: We currently don't support runtime services on an EFI
+	 * that doesn't match the kernel 32/64-bit mode.
+	 */
+
+	if (!efi_native)
+		pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
+	else if (efi_runtime_init()) {
 		efi_enabled = 0;
 		return;
 	}
@@ -611,10 +720,11 @@ void __init efi_init(void)
 		efi_enabled = 0;
 		return;
 	}
-
 #ifdef CONFIG_X86_32
-	x86_platform.get_wallclock = efi_get_time;
-	x86_platform.set_wallclock = efi_set_rtc_mmss;
+	if (efi_native) {
+		x86_platform.get_wallclock = efi_get_time;
+		x86_platform.set_wallclock = efi_set_rtc_mmss;
+	}
 #endif
 
 #if EFI_DEBUG
@@ -672,6 +782,14 @@ void __init efi_enter_virtual_mode(void)
 
 	efi.systab = NULL;
 
+	/*
+	 * We don't do virtual mode, since we don't do runtime services, on
+	 * non-native EFI
+	 */
+
+	if (!efi_native)
+		goto out;
+
 	/* Merge contiguous regions of the same type and attribute */
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		u64 prev_size;
@@ -787,6 +905,8 @@ void __init efi_enter_virtual_mode(void)
 	efi.query_capsule_caps = virt_efi_query_capsule_caps;
 	if (__supported_pte_mask & _PAGE_NX)
 		runtime_code_page_mkexec();
+
+out:
 	early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
 	memmap.map = NULL;
 	kfree(new_memmap);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 37c300712e02..47fbf6b3dc77 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -313,6 +313,16 @@ typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules,
 #define EFI_FILE_SYSTEM_GUID \
     EFI_GUID(  0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
 
+typedef struct {
+	efi_guid_t guid;
+	u64 table;
+} efi_config_table_64_t;
+
+typedef struct {
+	efi_guid_t guid;
+	u32 table;
+} efi_config_table_32_t;
+
 typedef struct {
 	efi_guid_t guid;
 	unsigned long table;
@@ -327,6 +337,40 @@ typedef struct {
 #define EFI_1_10_SYSTEM_TABLE_REVISION  ((1 << 16) | (10))
 #define EFI_1_02_SYSTEM_TABLE_REVISION  ((1 << 16) | (02))
 
+typedef struct {
+	efi_table_hdr_t hdr;
+	u64 fw_vendor;	/* physical addr of CHAR16 vendor string */
+	u32 fw_revision;
+	u32 __pad1;
+	u64 con_in_handle;
+	u64 con_in;
+	u64 con_out_handle;
+	u64 con_out;
+	u64 stderr_handle;
+	u64 stderr;
+	u64 runtime;
+	u64 boottime;
+	u32 nr_tables;
+	u32 __pad2;
+	u64 tables;
+} efi_system_table_64_t;
+
+typedef struct {
+	efi_table_hdr_t hdr;
+	u32 fw_vendor;	/* physical addr of CHAR16 vendor string */
+	u32 fw_revision;
+	u32 con_in_handle;
+	u32 con_in;
+	u32 con_out_handle;
+	u32 con_out;
+	u32 stderr_handle;
+	u32 stderr;
+	u32 runtime;
+	u32 boottime;
+	u32 nr_tables;
+	u32 tables;
+} efi_system_table_32_t;
+
 typedef struct {
 	efi_table_hdr_t hdr;
 	unsigned long fw_vendor;	/* physical addr of CHAR16 vendor string */
@@ -497,6 +541,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #ifdef CONFIG_EFI
 # ifdef CONFIG_X86
    extern int efi_enabled;
+   extern bool efi_64bit;
 # else
 #  define efi_enabled 1
 # endif
-- 
cgit v1.2.3


From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 24 Feb 2012 08:31:31 +0100
Subject: static keys: Introduce 'struct static_key', static_key_true()/false()
 and static_key_slow_[inc|dec]()

So here's a boot tested patch on top of Jason's series that does
all the cleanups I talked about and turns jump labels into a
more intuitive to use facility. It should also address the
various misconceptions and confusions that surround jump labels.

Typical usage scenarios:

        #include <linux/static_key.h>

        struct static_key key = STATIC_KEY_INIT_TRUE;

        if (static_key_false(&key))
                do unlikely code
        else
                do likely code

Or:

        if (static_key_true(&key))
                do likely code
        else
                do unlikely code

The static key is modified via:

        static_key_slow_inc(&key);
        ...
        static_key_slow_dec(&key);

The 'slow' prefix makes it abundantly clear that this is an
expensive operation.

I've updated all in-kernel code to use this everywhere. Note
that I (intentionally) have not pushed through the rename
blindly through to the lowest levels: the actual jump-label
patching arch facility should be named like that, so we want to
decouple jump labels from the static-key facility a bit.

On non-jump-label enabled architectures static keys default to
likely()/unlikely() branches.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jason Baron <jbaron@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: a.p.zijlstra@chello.nl
Cc: mathieu.desnoyers@efficios.com
Cc: davem@davemloft.net
Cc: ddaney.cavm@gmail.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                          |  29 ++++---
 arch/ia64/include/asm/paravirt.h      |   6 +-
 arch/ia64/kernel/paravirt.c           |   4 +-
 arch/mips/include/asm/jump_label.h    |   2 +-
 arch/powerpc/include/asm/jump_label.h |   2 +-
 arch/s390/include/asm/jump_label.h    |   2 +-
 arch/sparc/include/asm/jump_label.h   |   2 +-
 arch/x86/include/asm/jump_label.h     |   6 +-
 arch/x86/include/asm/paravirt.h       |   6 +-
 arch/x86/kernel/kvm.c                 |   4 +-
 arch/x86/kernel/paravirt.c            |   4 +-
 arch/x86/kvm/mmu_audit.c              |   8 +-
 include/linux/jump_label.h            | 139 ++++++++++++++++++++++++----------
 include/linux/netdevice.h             |   4 +-
 include/linux/netfilter.h             |   6 +-
 include/linux/perf_event.h            |  12 +--
 include/linux/static_key.h            |   1 +
 include/linux/tracepoint.h            |   8 +-
 include/net/sock.h                    |   6 +-
 kernel/events/core.c                  |  16 ++--
 kernel/jump_label.c                   | 128 ++++++++++++++++++-------------
 kernel/sched/core.c                   |  18 ++---
 kernel/sched/fair.c                   |   8 +-
 kernel/sched/sched.h                  |  14 ++--
 kernel/tracepoint.c                   |  20 ++---
 net/core/dev.c                        |  24 +++---
 net/core/net-sysfs.c                  |   4 +-
 net/core/sock.c                       |   4 +-
 net/core/sysctl_net_core.c            |   4 +-
 net/ipv4/tcp_memcontrol.c             |   6 +-
 net/netfilter/core.c                  |   6 +-
 31 files changed, 298 insertions(+), 205 deletions(-)
 create mode 100644 include/linux/static_key.h

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 4f55c736be11..5b448a74d0f7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -47,18 +47,29 @@ config KPROBES
 	  If in doubt, say "N".
 
 config JUMP_LABEL
-       bool "Optimize trace point call sites"
+       bool "Optimize very unlikely/likely branches"
        depends on HAVE_ARCH_JUMP_LABEL
        help
+         This option enables a transparent branch optimization that
+	 makes certain almost-always-true or almost-always-false branch
+	 conditions even cheaper to execute within the kernel.
+
+	 Certain performance-sensitive kernel code, such as trace points,
+	 scheduler functionality, networking code and KVM have such
+	 branches and include support for this optimization technique.
+
          If it is detected that the compiler has support for "asm goto",
-	 the kernel will compile trace point locations with just a
-	 nop instruction. When trace points are enabled, the nop will
-	 be converted to a jump to the trace function. This technique
-	 lowers overhead and stress on the branch prediction of the
-	 processor.
-
-	 On i386, options added to the compiler flags may increase
-	 the size of the kernel slightly.
+	 the kernel will compile such branches with just a nop
+	 instruction. When the condition flag is toggled to true, the
+	 nop will be converted to a jump instruction to execute the
+	 conditional block of instructions.
+
+	 This technique lowers overhead and stress on the branch prediction
+	 of the processor and generally makes the kernel faster. The update
+	 of the condition is slower, but those are always very rare.
+
+	 ( On 32-bit x86, the necessary options added to the compiler
+	   flags may increase the size of the kernel slightly. )
 
 config OPTPROBES
 	def_bool y
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 32551d304cd7..b149b88ea795 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu)
 		pv_time_ops.init_missing_ticks_accounting(cpu);
 }
 
-struct jump_label_key;
-extern struct jump_label_key paravirt_steal_enabled;
-extern struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
 
 static inline int
 paravirt_do_steal_accounting(unsigned long *new_itm)
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 100868216c55..1b22f6de2932 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
  * pv_time_ops
  * time operations
  */
-struct jump_label_key paravirt_steal_enabled;
-struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
 
 static int
 ia64_native_do_steal_accounting(unsigned long *new_itm)
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index 1881b316ca45..4d6d77ed9b9d 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -20,7 +20,7 @@
 #define WORD_INSN ".word"
 #endif
 
-static __always_inline bool arch_static_branch(struct jump_label_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("1:\tnop\n\t"
 		"nop\n\t"
diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h
index 938986e412f1..ae098c438f00 100644
--- a/arch/powerpc/include/asm/jump_label.h
+++ b/arch/powerpc/include/asm/jump_label.h
@@ -17,7 +17,7 @@
 #define JUMP_ENTRY_TYPE		stringify_in_c(FTR_ENTRY_LONG)
 #define JUMP_LABEL_NOP_SIZE	4
 
-static __always_inline bool arch_static_branch(struct jump_label_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("1:\n\t"
 		 "nop\n\t"
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 95a6cf2b5b67..6c32190dc73e 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -13,7 +13,7 @@
 #define ASM_ALIGN ".balign 4"
 #endif
 
-static __always_inline bool arch_static_branch(struct jump_label_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("0:	brcl 0,0\n"
 		".pushsection __jump_table, \"aw\"\n"
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h
index fc73a82366f8..5080d16a832f 100644
--- a/arch/sparc/include/asm/jump_label.h
+++ b/arch/sparc/include/asm/jump_label.h
@@ -7,7 +7,7 @@
 
 #define JUMP_LABEL_NOP_SIZE 4
 
-static __always_inline bool arch_static_branch(struct jump_label_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key)
 {
 		asm goto("1:\n\t"
 			 "nop\n\t"
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a32b18ce6ead..3a16c1483b45 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -9,12 +9,12 @@
 
 #define JUMP_LABEL_NOP_SIZE 5
 
-#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
 
-static __always_inline bool arch_static_branch(struct jump_label_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key)
 {
 	asm goto("1:"
-		JUMP_LABEL_INITIAL_NOP
+		STATIC_KEY_INITIAL_NOP
 		".pushsection __jump_table,  \"aw\" \n\t"
 		_ASM_ALIGN "\n\t"
 		_ASM_PTR "1b, %l[l_yes], %c0 \n\t"
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a7d2db9a74fb..c0180fd372d2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void)
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
 
-struct jump_label_key;
-extern struct jump_label_key paravirt_steal_enabled;
-extern struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
 
 static inline u64 paravirt_steal_clock(int cpu)
 {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176b..694d801bf606 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void)
 static __init int activate_jump_labels(void)
 {
 	if (has_steal_clock) {
-		jump_label_inc(&paravirt_steal_enabled);
+		static_key_slow_inc(&paravirt_steal_enabled);
 		if (steal_acc)
-			jump_label_inc(&paravirt_steal_rq_enabled);
+			static_key_slow_inc(&paravirt_steal_rq_enabled);
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..ada2f99388dd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr)
 	__native_flush_tlb_single(addr);
 }
 
-struct jump_label_key paravirt_steal_enabled;
-struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
 
 static u64 native_steal_clock(int cpu)
 {
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fe15dcc07a6b..ea7b4fd34676 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 }
 
 static bool mmu_audit;
-static struct jump_label_key mmu_audit_key;
+static struct static_key mmu_audit_key;
 
 static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
@@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 
 static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
-	if (static_branch((&mmu_audit_key)))
+	if (static_key_false((&mmu_audit_key)))
 		__kvm_mmu_audit(vcpu, point);
 }
 
@@ -259,7 +259,7 @@ static void mmu_audit_enable(void)
 	if (mmu_audit)
 		return;
 
-	jump_label_inc(&mmu_audit_key);
+	static_key_slow_inc(&mmu_audit_key);
 	mmu_audit = true;
 }
 
@@ -268,7 +268,7 @@ static void mmu_audit_disable(void)
 	if (!mmu_audit)
 		return;
 
-	jump_label_dec(&mmu_audit_key);
+	static_key_slow_dec(&mmu_audit_key);
 	mmu_audit = false;
 }
 
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f7c69580fea7..2172da2d9bb4 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -9,15 +9,15 @@
  *
  * Jump labels provide an interface to generate dynamic branches using
  * self-modifying code. Assuming toolchain and architecture support the result
- * of a "if (static_branch(&key))" statement is a unconditional branch (which
+ * of a "if (static_key_false(&key))" statement is a unconditional branch (which
  * defaults to false - and the true block is placed out of line).
  *
- * However at runtime we can change the 'static' branch target using
- * jump_label_{inc,dec}(). These function as a 'reference' count on the key
+ * However at runtime we can change the branch target using
+ * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key
  * object and for as long as there are references all branches referring to
  * that particular key will point to the (out of line) true block.
  *
- * Since this relies on modifying code the jump_label_{inc,dec}() functions
+ * Since this relies on modifying code the static_key_slow_{inc,dec}() functions
  * must be considered absolute slow paths (machine wide synchronization etc.).
  * OTOH, since the affected branches are unconditional their runtime overhead
  * will be absolutely minimal, esp. in the default (off) case where the total
@@ -26,12 +26,26 @@
  *
  * When the control is directly exposed to userspace it is prudent to delay the
  * decrement to avoid high frequency code modifications which can (and do)
- * cause significant performance degradation. Struct jump_label_key_deferred and
- * jump_label_dec_deferred() provide for this.
+ * cause significant performance degradation. Struct static_key_deferred and
+ * static_key_slow_dec_deferred() provide for this.
  *
  * Lacking toolchain and or architecture support, it falls back to a simple
  * conditional branch.
- */
+ *
+ * struct static_key my_key = STATIC_KEY_INIT_TRUE;
+ *
+ *   if (static_key_true(&my_key)) {
+ *   }
+ *
+ * will result in the true case being in-line and starts the key with a single
+ * reference. Mixing static_key_true() and static_key_false() on the same key is not
+ * allowed.
+ *
+ * Not initializing the key (static data is initialized to 0s anyway) is the
+ * same as using STATIC_KEY_INIT_FALSE and static_key_false() is
+ * equivalent with static_branch().
+ *
+*/
 
 #include <linux/types.h>
 #include <linux/compiler.h>
@@ -39,16 +53,17 @@
 
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
 
-struct jump_label_key {
+struct static_key {
 	atomic_t enabled;
+/* Set lsb bit to 1 if branch is default true, 0 ot */
 	struct jump_entry *entries;
 #ifdef CONFIG_MODULES
-	struct jump_label_mod *next;
+	struct static_key_mod *next;
 #endif
 };
 
-struct jump_label_key_deferred {
-	struct jump_label_key key;
+struct static_key_deferred {
+	struct static_key key;
 	unsigned long timeout;
 	struct delayed_work work;
 };
@@ -66,13 +81,34 @@ struct module;
 
 #ifdef HAVE_JUMP_LABEL
 
-#ifdef CONFIG_MODULES
-#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL}
-#else
-#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL}
-#endif
+#define JUMP_LABEL_TRUE_BRANCH 1UL
+
+static
+inline struct jump_entry *jump_label_get_entries(struct static_key *key)
+{
+	return (struct jump_entry *)((unsigned long)key->entries
+						& ~JUMP_LABEL_TRUE_BRANCH);
+}
+
+static inline bool jump_label_get_branch_default(struct static_key *key)
+{
+	if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH)
+		return true;
+	return false;
+}
+
+static __always_inline bool static_key_false(struct static_key *key)
+{
+	return arch_static_branch(key);
+}
 
-static __always_inline bool static_branch(struct jump_label_key *key)
+static __always_inline bool static_key_true(struct static_key *key)
+{
+	return !static_key_false(key);
+}
+
+/* Deprecated. Please use 'static_key_false() instead. */
+static __always_inline bool static_branch(struct static_key *key)
 {
 	return arch_static_branch(key);
 }
@@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry,
 extern void arch_jump_label_transform_static(struct jump_entry *entry,
 					     enum jump_label_type type);
 extern int jump_label_text_reserved(void *start, void *end);
-extern void jump_label_inc(struct jump_label_key *key);
-extern void jump_label_dec(struct jump_label_key *key);
-extern void jump_label_dec_deferred(struct jump_label_key_deferred *key);
-extern bool jump_label_enabled(struct jump_label_key *key);
+extern void static_key_slow_inc(struct static_key *key);
+extern void static_key_slow_dec(struct static_key *key);
+extern void static_key_slow_dec_deferred(struct static_key_deferred *key);
+extern bool static_key_enabled(struct static_key *key);
 extern void jump_label_apply_nops(struct module *mod);
-extern void jump_label_rate_limit(struct jump_label_key_deferred *key,
-		unsigned long rl);
+extern void
+jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl);
+
+#define STATIC_KEY_INIT_TRUE ((struct static_key) \
+	{ .enabled = ATOMIC_INIT(1), .entries = (void *)1 })
+#define STATIC_KEY_INIT_FALSE ((struct static_key) \
+	{ .enabled = ATOMIC_INIT(0), .entries = (void *)0 })
 
 #else  /* !HAVE_JUMP_LABEL */
 
 #include <linux/atomic.h>
 
-#define JUMP_LABEL_INIT {ATOMIC_INIT(0)}
-
-struct jump_label_key {
+struct static_key {
 	atomic_t enabled;
 };
 
@@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void)
 {
 }
 
-struct jump_label_key_deferred {
-	struct jump_label_key  key;
+struct static_key_deferred {
+	struct static_key  key;
 };
 
-static __always_inline bool static_branch(struct jump_label_key *key)
+static __always_inline bool static_key_false(struct static_key *key)
+{
+	if (unlikely(atomic_read(&key->enabled)) > 0)
+		return true;
+	return false;
+}
+
+static __always_inline bool static_key_true(struct static_key *key)
 {
-	if (unlikely(atomic_read(&key->enabled)))
+	if (likely(atomic_read(&key->enabled)) > 0)
 		return true;
 	return false;
 }
 
-static inline void jump_label_inc(struct jump_label_key *key)
+/* Deprecated. Please use 'static_key_false() instead. */
+static __always_inline bool static_branch(struct static_key *key)
+{
+	if (unlikely(atomic_read(&key->enabled)) > 0)
+		return true;
+	return false;
+}
+
+static inline void static_key_slow_inc(struct static_key *key)
 {
 	atomic_inc(&key->enabled);
 }
 
-static inline void jump_label_dec(struct jump_label_key *key)
+static inline void static_key_slow_dec(struct static_key *key)
 {
 	atomic_dec(&key->enabled);
 }
 
-static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+static inline void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	jump_label_dec(&key->key);
+	static_key_slow_dec(&key->key);
 }
 
 static inline int jump_label_text_reserved(void *start, void *end)
@@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end)
 static inline void jump_label_lock(void) {}
 static inline void jump_label_unlock(void) {}
 
-static inline bool jump_label_enabled(struct jump_label_key *key)
+static inline bool static_key_enabled(struct static_key *key)
 {
-	return !!atomic_read(&key->enabled);
+	return (atomic_read(&key->enabled) > 0);
 }
 
 static inline int jump_label_apply_nops(struct module *mod)
@@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod)
 	return 0;
 }
 
-static inline void jump_label_rate_limit(struct jump_label_key_deferred *key,
+static inline void
+jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
 }
+
+#define STATIC_KEY_INIT_TRUE ((struct static_key) \
+		{ .enabled = ATOMIC_INIT(1) })
+#define STATIC_KEY_INIT_FALSE ((struct static_key) \
+		{ .enabled = ATOMIC_INIT(0) })
+
 #endif	/* HAVE_JUMP_LABEL */
 
-#define jump_label_key_enabled	((struct jump_label_key){ .enabled = ATOMIC_INIT(1), })
-#define jump_label_key_disabled	((struct jump_label_key){ .enabled = ATOMIC_INIT(0), })
+#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
+#define jump_label_enabled static_key_enabled
 
 #endif	/* _LINUX_JUMP_LABEL_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0eac07c95255..7dfaae7846ab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -214,8 +214,8 @@ enum {
 #include <linux/skbuff.h>
 
 #ifdef CONFIG_RPS
-#include <linux/jump_label.h>
-extern struct jump_label_key rps_needed;
+#include <linux/static_key.h>
+extern struct static_key rps_needed;
 #endif
 
 struct neighbour;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index b809265607d0..29734be334c1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[];
 extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
 #if defined(CONFIG_JUMP_LABEL)
-#include <linux/jump_label.h>
-extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+#include <linux/static_key.h>
+extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook))
-		return static_branch(&nf_hooks_needed[pf][hook]);
+		return static_key_false(&nf_hooks_needed[pf][hook]);
 
 	return !list_empty(&nf_hooks[pf][hook]);
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 412b790f5da6..0d21e6f1cf53 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -514,7 +514,7 @@ struct perf_guest_info_callbacks {
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/irq_work.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 #include <linux/atomic.h>
 #include <asm/local.h>
 
@@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event)
 	return event->pmu->task_ctx_nr == perf_sw_context;
 }
 
-extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);
 
@@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
 	struct pt_regs hot_regs;
 
-	if (static_branch(&perf_swevent_enabled[event_id])) {
+	if (static_key_false(&perf_swevent_enabled[event_id])) {
 		if (!regs) {
 			perf_fetch_caller_regs(&hot_regs);
 			regs = &hot_regs;
@@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 	}
 }
 
-extern struct jump_label_key_deferred perf_sched_events;
+extern struct static_key_deferred perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
-	if (static_branch(&perf_sched_events.key))
+	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_in(prev, task);
 }
 
@@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
-	if (static_branch(&perf_sched_events.key))
+	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_out(prev, next);
 }
 
diff --git a/include/linux/static_key.h b/include/linux/static_key.h
new file mode 100644
index 000000000000..27bd3f8a0857
--- /dev/null
+++ b/include/linux/static_key.h
@@ -0,0 +1 @@
+#include <linux/jump_label.h>
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index fc36da97ff7e..bd96ecd0e05c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/rcupdate.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 
 struct module;
 struct tracepoint;
@@ -29,7 +29,7 @@ struct tracepoint_func {
 
 struct tracepoint {
 	const char *name;		/* Tracepoint name */
-	struct jump_label_key key;
+	struct static_key key;
 	void (*regfunc)(void);
 	void (*unregfunc)(void);
 	struct tracepoint_func __rcu *funcs;
@@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void)
 	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	{								\
-		if (static_branch(&__tracepoint_##name.key))		\
+		if (static_key_false(&__tracepoint_##name.key))		\
 			__DO_TRACE(&__tracepoint_##name,		\
 				TP_PROTO(data_proto),			\
 				TP_ARGS(data_args),			\
@@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void)
 	__attribute__((section("__tracepoints_strings"))) = #name;	 \
 	struct tracepoint __tracepoint_##name				 \
 	__attribute__((section("__tracepoints"))) =			 \
-		{ __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\
+		{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
 	static struct tracepoint * const __tracepoint_ptr_##name __used	 \
 	__attribute__((section("__tracepoints_ptrs"))) =		 \
 		&__tracepoint_##name;
diff --git a/include/net/sock.h b/include/net/sock.h
index 91c1c8baf020..dcde2d9268cd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -55,7 +55,7 @@
 #include <linux/uaccess.h>
 #include <linux/memcontrol.h>
 #include <linux/res_counter.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk)
 #endif /* SOCK_REFCNT_DEBUG */
 
 #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET)
-extern struct jump_label_key memcg_socket_limit_enabled;
+extern struct static_key memcg_socket_limit_enabled;
 static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 					       struct cg_proto *cg_proto)
 {
 	return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg));
 }
-#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled)
+#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled)
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline struct cg_proto *parent_cg_proto(struct proto *proto,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c3b9de55f6b..5e0f8bb89b2b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -128,7 +128,7 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec_deferred(&perf_sched_events);
+			static_key_slow_dec_deferred(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event)
 			put_callchain_buffers();
 		if (is_cgroup_event(event)) {
 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-			jump_label_dec_deferred(&perf_sched_events);
+			static_key_slow_dec_deferred(&perf_sched_events);
 		}
 	}
 
@@ -4982,7 +4982,7 @@ fail:
 	return err;
 }
 
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
 {
@@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 	WARN_ON(event->parent);
 
-	jump_label_dec(&perf_swevent_enabled[event_id]);
+	static_key_slow_dec(&perf_swevent_enabled[event_id]);
 	swevent_hlist_put(event);
 }
 
@@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event)
 		if (err)
 			return err;
 
-		jump_label_inc(&perf_swevent_enabled[event_id]);
+		static_key_slow_inc(&perf_swevent_enabled[event_id]);
 		event->destroy = sw_perf_event_destroy;
 	}
 
@@ -5843,7 +5843,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_sched_events.key);
+			static_key_slow_inc(&perf_sched_events.key);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * - that may need work on context switch
 		 */
 		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_inc(&perf_sched_events.key);
+		static_key_slow_inc(&perf_sched_events.key);
 	}
 
 	/*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 543782e7cdd2..bf9dcadbb53a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/err.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -29,10 +29,11 @@ void jump_label_unlock(void)
 	mutex_unlock(&jump_label_mutex);
 }
 
-bool jump_label_enabled(struct jump_label_key *key)
+bool static_key_enabled(struct static_key *key)
 {
-	return !!atomic_read(&key->enabled);
+	return (atomic_read(&key->enabled) > 0);
 }
+EXPORT_SYMBOL_GPL(static_key_enabled);
 
 static int jump_label_cmp(const void *a, const void *b)
 {
@@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
 }
 
-static void jump_label_update(struct jump_label_key *key, int enable);
+static void jump_label_update(struct static_key *key, int enable);
 
-void jump_label_inc(struct jump_label_key *key)
+void static_key_slow_inc(struct static_key *key)
 {
 	if (atomic_inc_not_zero(&key->enabled))
 		return;
 
 	jump_label_lock();
-	if (atomic_read(&key->enabled) == 0)
-		jump_label_update(key, JUMP_LABEL_ENABLE);
+	if (atomic_read(&key->enabled) == 0) {
+		if (!jump_label_get_branch_default(key))
+			jump_label_update(key, JUMP_LABEL_ENABLE);
+		else
+			jump_label_update(key, JUMP_LABEL_DISABLE);
+	}
 	atomic_inc(&key->enabled);
 	jump_label_unlock();
 }
-EXPORT_SYMBOL_GPL(jump_label_inc);
+EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
-static void __jump_label_dec(struct jump_label_key *key,
+static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
@@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key,
 	if (rate_limit) {
 		atomic_inc(&key->enabled);
 		schedule_delayed_work(work, rate_limit);
-	} else
-		jump_label_update(key, JUMP_LABEL_DISABLE);
-
+	} else {
+		if (!jump_label_get_branch_default(key))
+			jump_label_update(key, JUMP_LABEL_DISABLE);
+		else
+			jump_label_update(key, JUMP_LABEL_ENABLE);
+	}
 	jump_label_unlock();
 }
-EXPORT_SYMBOL_GPL(jump_label_dec);
 
 static void jump_label_update_timeout(struct work_struct *work)
 {
-	struct jump_label_key_deferred *key =
-		container_of(work, struct jump_label_key_deferred, work.work);
-	__jump_label_dec(&key->key, 0, NULL);
+	struct static_key_deferred *key =
+		container_of(work, struct static_key_deferred, work.work);
+	__static_key_slow_dec(&key->key, 0, NULL);
 }
 
-void jump_label_dec(struct jump_label_key *key)
+void static_key_slow_dec(struct static_key *key)
 {
-	__jump_label_dec(key, 0, NULL);
+	__static_key_slow_dec(key, 0, NULL);
 }
+EXPORT_SYMBOL_GPL(static_key_slow_dec);
 
-void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	__jump_label_dec(&key->key, key->timeout, &key->work);
+	__static_key_slow_dec(&key->key, key->timeout, &key->work);
 }
+EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
 
-
-void jump_label_rate_limit(struct jump_label_key_deferred *key,
+void jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
 	key->timeout = rl;
@@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
 	arch_jump_label_transform(entry, type);	
 }
 
-static void __jump_label_update(struct jump_label_key *key,
+static void __jump_label_update(struct static_key *key,
 				struct jump_entry *entry,
 				struct jump_entry *stop, int enable)
 {
@@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key,
 	}
 }
 
+static enum jump_label_type jump_label_type(struct static_key *key)
+{
+	bool true_branch = jump_label_get_branch_default(key);
+	bool state = static_key_enabled(key);
+
+	if ((!true_branch && state) || (true_branch && !state))
+		return JUMP_LABEL_ENABLE;
+
+	return JUMP_LABEL_DISABLE;
+}
+
 void __init jump_label_init(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
 	struct jump_entry *iter_stop = __stop___jump_table;
-	struct jump_label_key *key = NULL;
+	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		struct jump_label_key *iterk;
+		struct static_key *iterk;
 
-		iterk = (struct jump_label_key *)(unsigned long)iter->key;
-		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
-						 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+		iterk = (struct static_key *)(unsigned long)iter->key;
+		arch_jump_label_transform_static(iter, jump_label_type(iterk));
 		if (iterk == key)
 			continue;
 
 		key = iterk;
-		key->entries = iter;
+		/*
+		 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+		 */
+		*((unsigned long *)&key->entries) += (unsigned long)iter;
 #ifdef CONFIG_MODULES
 		key->next = NULL;
 #endif
@@ -200,8 +221,8 @@ void __init jump_label_init(void)
 
 #ifdef CONFIG_MODULES
 
-struct jump_label_mod {
-	struct jump_label_mod *next;
+struct static_key_mod {
+	struct static_key_mod *next;
 	struct jump_entry *entries;
 	struct module *mod;
 };
@@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
 				start, end);
 }
 
-static void __jump_label_mod_update(struct jump_label_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key, int enable)
 {
-	struct jump_label_mod *mod = key->next;
+	struct static_key_mod *mod = key->next;
 
 	while (mod) {
 		struct module *m = mod->mod;
@@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		struct jump_label_key *iterk;
-
-		iterk = (struct jump_label_key *)(unsigned long)iter->key;
-		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
-				JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
 	}
 }
 
@@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod)
 	struct jump_entry *iter_start = mod->jump_entries;
 	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
 	struct jump_entry *iter;
-	struct jump_label_key *key = NULL;
-	struct jump_label_mod *jlm;
+	struct static_key *key = NULL;
+	struct static_key_mod *jlm;
 
 	/* if the module doesn't have jump label entries, just return */
 	if (iter_start == iter_stop)
@@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod)
 	jump_label_sort_entries(iter_start, iter_stop);
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (iter->key == (jump_label_t)(unsigned long)key)
-			continue;
+		struct static_key *iterk;
 
-		key = (struct jump_label_key *)(unsigned long)iter->key;
+		iterk = (struct static_key *)(unsigned long)iter->key;
+		if (iterk == key)
+			continue;
 
+		key = iterk;
 		if (__module_address(iter->key) == mod) {
-			atomic_set(&key->enabled, 0);
-			key->entries = iter;
+			/*
+			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+			 */
+			*((unsigned long *)&key->entries) += (unsigned long)iter;
 			key->next = NULL;
 			continue;
 		}
-
-		jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
+		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
 		if (!jlm)
 			return -ENOMEM;
-
 		jlm->mod = mod;
 		jlm->entries = iter;
 		jlm->next = key->next;
 		key->next = jlm;
 
-		if (jump_label_enabled(key))
+		if (jump_label_type(key) == JUMP_LABEL_ENABLE)
 			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
 	}
 
@@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod)
 	struct jump_entry *iter_start = mod->jump_entries;
 	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
 	struct jump_entry *iter;
-	struct jump_label_key *key = NULL;
-	struct jump_label_mod *jlm, **prev;
+	struct static_key *key = NULL;
+	struct static_key_mod *jlm, **prev;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		if (iter->key == (jump_label_t)(unsigned long)key)
 			continue;
 
-		key = (struct jump_label_key *)(unsigned long)iter->key;
+		key = (struct static_key *)(unsigned long)iter->key;
 
 		if (__module_address(iter->key) == mod)
 			continue;
@@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end)
 	return ret;
 }
 
-static void jump_label_update(struct jump_label_key *key, int enable)
+static void jump_label_update(struct static_key *key, int enable)
 {
-	struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
+	struct jump_entry *stop = __stop___jump_table;
+	struct jump_entry *entry = jump_label_get_entries(key);
 
 #ifdef CONFIG_MODULES
 	struct module *mod = __module_address((unsigned long)key);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e053..112c6824476b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
 
 #ifdef HAVE_JUMP_LABEL
 
-#define jump_label_key__true  jump_label_key_enabled
-#define jump_label_key__false jump_label_key_disabled
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
 
 #define SCHED_FEAT(name, enabled)	\
 	jump_label_key__##enabled ,
 
-struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 #include "features.h"
 };
 
@@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-	if (jump_label_enabled(&sched_feat_keys[i]))
-		jump_label_dec(&sched_feat_keys[i]);
+	if (static_key_enabled(&sched_feat_keys[i]))
+		static_key_slow_dec(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-	if (!jump_label_enabled(&sched_feat_keys[i]))
-		jump_label_inc(&sched_feat_keys[i]);
+	if (!static_key_enabled(&sched_feat_keys[i]))
+		static_key_slow_inc(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
@@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	delta -= irq_delta;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	if (static_branch((&paravirt_steal_rq_enabled))) {
+	if (static_key_false((&paravirt_steal_rq_enabled))) {
 		u64 st;
 
 		steal = paravirt_steal_clock(cpu_of(rq));
@@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime)
 static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
-	if (static_branch(&paravirt_steal_enabled)) {
+	if (static_key_false(&paravirt_steal_enabled)) {
 		u64 steal, st = 0;
 
 		steal = paravirt_steal_clock(smp_processor_id());
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..423547ada38a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_CFS_BANDWIDTH
 
 #ifdef HAVE_JUMP_LABEL
-static struct jump_label_key __cfs_bandwidth_used;
+static struct static_key __cfs_bandwidth_used;
 
 static inline bool cfs_bandwidth_used(void)
 {
-	return static_branch(&__cfs_bandwidth_used);
+	return static_key_false(&__cfs_bandwidth_used);
 }
 
 void account_cfs_bandwidth_used(int enabled, int was_enabled)
 {
 	/* only need to count groups transitioning between enabled/!enabled */
 	if (enabled && !was_enabled)
-		jump_label_inc(&__cfs_bandwidth_used);
+		static_key_slow_inc(&__cfs_bandwidth_used);
 	else if (!enabled && was_enabled)
-		jump_label_dec(&__cfs_bandwidth_used);
+		static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db8..b4cd6d8ea150 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
 #ifdef CONFIG_SCHED_DEBUG
-# include <linux/jump_label.h>
+# include <linux/static_key.h>
 # define const_debug __read_mostly
 #else
 # define const_debug const
@@ -630,18 +630,18 @@ enum {
 #undef SCHED_FEAT
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct jump_label_key *key)
+static __always_inline bool static_branch__true(struct static_key *key)
 {
-	return likely(static_branch(key)); /* Not out of line branch. */
+	return static_key_true(key); /* Not out of line branch. */
 }
 
-static __always_inline bool static_branch__false(struct jump_label_key *key)
+static __always_inline bool static_branch__false(struct static_key *key)
 {
-	return unlikely(static_branch(key)); /* Out of line branch. */
+	return static_key_false(key); /* Out of line branch. */
 }
 
 #define SCHED_FEAT(name, enabled)					\
-static __always_inline bool static_branch_##name(struct jump_label_key *key) \
+static __always_inline bool static_branch_##name(struct static_key *key) \
 {									\
 	return static_branch__##enabled(key);				\
 }
@@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
 
 #undef SCHED_FEAT
 
-extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
+extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
 #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99d..d96ba22dabfa 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 
 extern struct tracepoint * const __start___tracepoints_ptrs[];
 extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 {
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
 
-	if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
+	if (elem->regfunc && !static_key_enabled(&elem->key) && active)
 		elem->regfunc();
-	else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
+	else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
 		elem->unregfunc();
 
 	/*
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 	 * is used.
 	 */
 	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
-	if (active && !jump_label_enabled(&elem->key))
-		jump_label_inc(&elem->key);
-	else if (!active && jump_label_enabled(&elem->key))
-		jump_label_dec(&elem->key);
+	if (active && !static_key_enabled(&elem->key))
+		static_key_slow_inc(&elem->key);
+	else if (!active && static_key_enabled(&elem->key))
+		static_key_slow_dec(&elem->key);
 }
 
 /*
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
  */
 static void disable_tracepoint(struct tracepoint *elem)
 {
-	if (elem->unregfunc && jump_label_enabled(&elem->key))
+	if (elem->unregfunc && static_key_enabled(&elem->key))
 		elem->unregfunc();
 
-	if (jump_label_enabled(&elem->key))
-		jump_label_dec(&elem->key);
+	if (static_key_enabled(&elem->key))
+		static_key_slow_dec(&elem->key);
 	rcu_assign_pointer(elem->funcs, NULL);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 115dee1d985d..da7ce7f0e566 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -134,7 +134,7 @@
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
 #include <linux/net_tstamp.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 #include <net/flow_keys.h>
 
 #include "net-sysfs.h"
@@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
-static struct jump_label_key netstamp_needed __read_mostly;
+static struct static_key netstamp_needed __read_mostly;
 #ifdef HAVE_JUMP_LABEL
-/* We are not allowed to call jump_label_dec() from irq context
+/* We are not allowed to call static_key_slow_dec() from irq context
  * If net_disable_timestamp() is called from irq context, defer the
- * jump_label_dec() calls.
+ * static_key_slow_dec() calls.
  */
 static atomic_t netstamp_needed_deferred;
 #endif
@@ -1457,12 +1457,12 @@ void net_enable_timestamp(void)
 
 	if (deferred) {
 		while (--deferred)
-			jump_label_dec(&netstamp_needed);
+			static_key_slow_dec(&netstamp_needed);
 		return;
 	}
 #endif
 	WARN_ON(in_interrupt());
-	jump_label_inc(&netstamp_needed);
+	static_key_slow_inc(&netstamp_needed);
 }
 EXPORT_SYMBOL(net_enable_timestamp);
 
@@ -1474,19 +1474,19 @@ void net_disable_timestamp(void)
 		return;
 	}
 #endif
-	jump_label_dec(&netstamp_needed);
+	static_key_slow_dec(&netstamp_needed);
 }
 EXPORT_SYMBOL(net_disable_timestamp);
 
 static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	skb->tstamp.tv64 = 0;
-	if (static_branch(&netstamp_needed))
+	if (static_key_false(&netstamp_needed))
 		__net_timestamp(skb);
 }
 
 #define net_timestamp_check(COND, SKB)			\
-	if (static_branch(&netstamp_needed)) {		\
+	if (static_key_false(&netstamp_needed)) {		\
 		if ((COND) && !(SKB)->tstamp.tv64)	\
 			__net_timestamp(SKB);		\
 	}						\
@@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash);
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
 
-struct jump_label_key rps_needed __read_mostly;
+struct static_key rps_needed __read_mostly;
 
 static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb)
 
 	trace_netif_rx(skb);
 #ifdef CONFIG_RPS
-	if (static_branch(&rps_needed))	{
+	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
@@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb)
 		return NET_RX_SUCCESS;
 
 #ifdef CONFIG_RPS
-	if (static_branch(&rps_needed)) {
+	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu, ret;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index a1727cda03d7..495586232aa1 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	spin_unlock(&rps_map_lock);
 
 	if (map)
-		jump_label_inc(&rps_needed);
+		static_key_slow_inc(&rps_needed);
 	if (old_map) {
 		kfree_rcu(old_map, rcu);
-		jump_label_dec(&rps_needed);
+		static_key_slow_dec(&rps_needed);
 	}
 	free_cpumask_var(mask);
 	return len;
diff --git a/net/core/sock.c b/net/core/sock.c
index 3e81fd2e3c75..3a4e5817a2a7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,7 +111,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/user_namespace.h>
-#include <linux/jump_label.h>
+#include <linux/static_key.h>
 #include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
@@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
-struct jump_label_key memcg_socket_limit_enabled;
+struct static_key memcg_socket_limit_enabled;
 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 
 /*
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d05559d4d9cd..0c2850874254 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 		if (sock_table != orig_sock_table) {
 			rcu_assign_pointer(rps_sock_flow_table, sock_table);
 			if (sock_table)
-				jump_label_inc(&rps_needed);
+				static_key_slow_inc(&rps_needed);
 			if (orig_sock_table) {
-				jump_label_dec(&rps_needed);
+				static_key_slow_dec(&rps_needed);
 				synchronize_rcu();
 				vfree(orig_sock_table);
 			}
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 49978788a9dc..602fb305365f 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
 
 	if (val != RESOURCE_MAX)
-		jump_label_dec(&memcg_socket_limit_enabled);
+		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 					     net->ipv4.sysctl_tcp_mem[i]);
 
 	if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
-		jump_label_dec(&memcg_socket_limit_enabled);
+		static_key_slow_dec(&memcg_socket_limit_enabled);
 	else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
-		jump_label_inc(&memcg_socket_limit_enabled);
+		static_key_slow_inc(&memcg_socket_limit_enabled);
 
 	return 0;
 }
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b4e8ff05b301..e1b7e051332e 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
 EXPORT_SYMBOL(nf_hooks);
 
 #if defined(CONFIG_JUMP_LABEL)
-struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
 #endif
 
@@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg)
 	list_add_rcu(&reg->list, elem->list.prev);
 	mutex_unlock(&nf_hook_mutex);
 #if defined(CONFIG_JUMP_LABEL)
-	jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
+	static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	return 0;
 }
@@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
 #if defined(CONFIG_JUMP_LABEL)
-	jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
+	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
 }
-- 
cgit v1.2.3


From 626109130267713cac020515504ec341e47c96f9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 24 Feb 2012 14:54:37 +0000
Subject: x86-64: Fix CFI annotations for NMI nesting code

The saving and restoring of %rdx wasn't annotated at all, and the
jumping over sections where state gets partly restored wasn't handled
either.

Further, by folding the pushing of the previous frame in repeat_nmi
into that which so far was immediately preceding restart_nmi (after
moving the restore of %rdx ahead of that, since it doesn't get used
anymore when pushing prior frames), annotations of the replicated
frame creations can be made consistent too.

v2: Fully fold repeat_nmi into the normal code flow (adding a single
    redundant instruction to the "normal" code path), thus retaining
    the special protection of all instructions between repeat_nmi and
    end_repeat_nmi.

Link: http://lkml.kernel.org/r/4F478B630200007800074A31@nat28.tlf.novell.com

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 52 +++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1333d9851778..e0eca007dc0d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1530,6 +1530,7 @@ ENTRY(nmi)
 
 	/* Use %rdx as out temp variable throughout */
 	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 
 	/*
 	 * If %cs was not the kernel segment, then the NMI triggered in user
@@ -1554,6 +1555,7 @@ ENTRY(nmi)
 	 */
 	lea 6*8(%rsp), %rdx
 	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+	CFI_REMEMBER_STATE
 
 nested_nmi:
 	/*
@@ -1585,10 +1587,12 @@ nested_nmi:
 
 nested_nmi_out:
 	popq_cfi %rdx
+	CFI_RESTORE rdx
 
 	/* No need to check faults here */
 	INTERRUPT_RETURN
 
+	CFI_RESTORE_STATE
 first_nmi:
 	/*
 	 * Because nested NMIs will use the pushed location that we
@@ -1624,6 +1628,10 @@ first_nmi:
 	 * NMI may zero out. The original stack frame and the temp storage
 	 * is also used by nested NMIs and can not be trusted on exit.
 	 */
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq (%rsp), %rdx
+	CFI_RESTORE rdx
+
 	/* Set the NMI executing variable on the stack. */
 	pushq_cfi $1
 
@@ -1631,14 +1639,31 @@ first_nmi:
 	.rept 5
 	pushq_cfi 6*8(%rsp)
 	.endr
+	CFI_DEF_CFA_OFFSET SS+8-RIP
+
+	/*
+	 * If there was a nested NMI, the first NMI's iret will return
+	 * here. But NMIs are still enabled and we can take another
+	 * nested NMI. The nested NMI checks the interrupted RIP to see
+	 * if it is between repeat_nmi and end_repeat_nmi, and if so
+	 * it will just return, as we are about to repeat an NMI anyway.
+	 * This makes it safe to copy to the stack frame that a nested
+	 * NMI will update.
+	 */
+repeat_nmi:
+	/*
+	 * Update the stack variable to say we are still in NMI (the update
+	 * is benign for the non-repeat case, where 1 was pushed just above
+	 * to this very stack slot).
+	 */
+	movq $1, 5*8(%rsp)
 
 	/* Make another copy, this one may be modified by nested NMIs */
 	.rept 5
 	pushq_cfi 4*8(%rsp)
 	.endr
-
-	/* Do not pop rdx, nested NMIs will corrupt it */
-	movq 11*8(%rsp), %rdx
+	CFI_DEF_CFA_OFFSET SS+8-RIP
+end_repeat_nmi:
 
 	/*
 	 * Everything below this point can be preempted by a nested
@@ -1646,7 +1671,6 @@ first_nmi:
 	 * caused by an exception and nested NMI will start here, and
 	 * can still be preempted by another NMI.
 	 */
-restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1675,26 +1699,6 @@ nmi_restore:
 	CFI_ENDPROC
 END(nmi)
 
-	/*
-	 * If an NMI hit an iret because of an exception or breakpoint,
-	 * it can lose its NMI context, and a nested NMI may come in.
-	 * In that case, the nested NMI will change the preempted NMI's
-	 * stack to jump to here when it does the final iret.
-	 */
-repeat_nmi:
-	INTR_FRAME
-	/* Update the stack variable to say we are still in NMI */
-	movq $1, 5*8(%rsp)
-
-	/* copy the saved stack back to copy stack */
-	.rept 5
-	pushq_cfi 4*8(%rsp)
-	.endr
-
-	jmp restart_nmi
-	CFI_ENDPROC
-end_repeat_nmi:
-
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
-- 
cgit v1.2.3


From 69466466ce889cd2cbc8cda9ff1c6083f48cc7f9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 24 Feb 2012 11:55:01 +0000
Subject: x86-64: Improve insn scheduling in SAVE_ARGS_IRQ

In one case, use an address register that was computed earlier (and
with a simpler instruction), thus reducing the risk of a stall.

In the second case, eliminate a branch by using a conditional move (as
is already done in call_softirq and xen_do_hypervisor_callback).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F4788A50200007800074A26@nat28.tlf.novell.com
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/entry_64.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc87..211b2e1683f1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -319,7 +319,7 @@ ENDPROC(native_usergs_sysret64)
 	movq %rsp, %rsi
 
 	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS(%rdi)
+	testl $3, CS-RBP(%rsi)
 	je 1f
 	SWAPGS
 	/*
@@ -329,11 +329,10 @@ ENDPROC(native_usergs_sysret64)
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
 1:	incl PER_CPU_VAR(irq_count)
-	jne 2f
-	mov PER_CPU_VAR(irq_stack_ptr),%rsp
+	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	CFI_DEF_CFA_REGISTER	rsi
 
-2:	/* Store previous stack value */
+	/* Store previous stack value */
 	pushq %rsi
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
 			0x77 /* DW_OP_breg7 */, 0, \
-- 
cgit v1.2.3


From 79fb4ad63e8266ffac1f69bbb45a6f86570493e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Feb 2012 15:55:13 -0500
Subject: x86: Fix the NMI nesting comments

Some of the comments for the nesting NMI algorithm were stale and
had some references to some prototypes that were first tried.

I also updated the comments to be a little easier to understand
the flow of the code. It definitely needs the documentation.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e0eca007dc0d..2de3e457bd4b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1624,11 +1624,12 @@ first_nmi:
 	 * | pt_regs                 |
 	 * +-------------------------+
 	 *
-	 * The saved RIP is used to fix up the copied RIP that a nested
-	 * NMI may zero out. The original stack frame and the temp storage
+	 * The saved stack frame is used to fix up the copied stack frame
+	 * that a nested NMI may change to make the interrupted NMI iret jump
+	 * to the repeat_nmi. The original stack frame and the temp storage
 	 * is also used by nested NMIs and can not be trusted on exit.
 	 */
-	/* Do not pop rdx, nested NMIs will corrupt it */
+	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
 	movq (%rsp), %rdx
 	CFI_RESTORE rdx
 
@@ -1641,6 +1642,8 @@ first_nmi:
 	.endr
 	CFI_DEF_CFA_OFFSET SS+8-RIP
 
+	/* Everything up to here is safe from nested NMIs */
+
 	/*
 	 * If there was a nested NMI, the first NMI's iret will return
 	 * here. But NMIs are still enabled and we can take another
@@ -1667,9 +1670,8 @@ end_repeat_nmi:
 
 	/*
 	 * Everything below this point can be preempted by a nested
-	 * NMI if the first NMI took an exception. Repeated NMIs
-	 * caused by an exception and nested NMI will start here, and
-	 * can still be preempted by another NMI.
+	 * NMI if the first NMI took an exception and reset our iret stack
+	 * so that we repeat another NMI.
 	 */
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
-- 
cgit v1.2.3


From c484b2418b0b5bb7b16f01343330650faee60df2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 23 Feb 2012 23:46:50 -0800
Subject: PCI: Use class for quirk for via_no_dac

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/kernel/pci-dma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21ea..28e5e06fcba4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init);
 
 static __devinit void via_no_dac(struct pci_dev *dev)
 {
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+	if (forbid_dac == 0) {
 		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
 		forbid_dac = 1;
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+				PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
 #endif
-- 
cgit v1.2.3


From 4082cf2d7be958bcb5f98ea3b47ef3c9ef8d97e8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 23 Feb 2012 23:46:51 -0800
Subject: PCI: Use class quirk for intel fix_transparent_bridge

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/fixup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6dd89555fbfa..24172ffd795b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -164,11 +164,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_
  */
 static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev)
 {
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
-	    (dev->device & 0xff00) == 0x2400)
+	if ((dev->device & 0xff00) == 0x2400)
 		dev->transparent = 1;
 }
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
+			 PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge);
 
 /*
  * Fixup for C1 Halt Disconnect problem on nForce2 systems.
-- 
cgit v1.2.3


From 73e3b590f38fb7c03ee370430348edf1f401204e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 23 Feb 2012 23:46:52 -0800
Subject: PCI: Use class for quirk for pci_fixup_video

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/fixup.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 24172ffd795b..d0e6e403b4f6 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -322,9 +322,6 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
 	struct pci_bus *bus;
 	u16 config;
 
-	if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
-		return;
-
 	/* Is VGA routed to us? */
 	bus = pdev->bus;
 	while (bus) {
@@ -353,7 +350,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
 		dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
 	}
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
+				PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
 
 
 static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
-- 
cgit v1.2.3


From 35474c3bb712261c285ca20c568e4e508387cad5 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:48:37 +0200
Subject: crypto: serpent-sse2 - use crypto_[un]register_algs

Combine all crypto_alg to be registered and use new crypto_[un]register_algs
functions. Simplifies init/exit code and reduce object size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 406 +++++++++++++++---------------------
 1 file changed, 163 insertions(+), 243 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index de81cf4e06a1..5520c7522200 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -145,28 +145,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ecb_crypt(desc, &walk, false);
 }
 
-static struct crypto_alg blk_ecb_alg = {
-	.cra_name		= "__ecb-serpent-sse2",
-	.cra_driver_name	= "__driver-ecb-serpent-sse2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct serpent_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= SERPENT_MIN_KEY_SIZE,
-			.max_keysize	= SERPENT_MAX_KEY_SIZE,
-			.setkey		= serpent_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-};
-
 static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
 				  struct blkcipher_walk *walk)
 {
@@ -295,28 +273,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_cbc_alg = {
-	.cra_name		= "__cbc-serpent-sse2",
-	.cra_driver_name	= "__driver-cbc-serpent-sse2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct serpent_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= SERPENT_MIN_KEY_SIZE,
-			.max_keysize	= SERPENT_MAX_KEY_SIZE,
-			.setkey		= serpent_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-};
-
 static inline void u128_to_be128(be128 *dst, const u128 *src)
 {
 	dst->a = cpu_to_be64(src->a);
@@ -439,29 +395,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_ctr_alg = {
-	.cra_name		= "__ctr-serpent-sse2",
-	.cra_driver_name	= "__driver-ctr-serpent-sse2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct serpent_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= SERPENT_MIN_KEY_SIZE,
-			.max_keysize	= SERPENT_MAX_KEY_SIZE,
-			.ivsize		= SERPENT_BLOCK_SIZE,
-			.setkey		= serpent_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-};
-
 struct crypt_priv {
 	struct serpent_ctx *ctx;
 	bool fpu_enabled;
@@ -580,32 +513,6 @@ static void lrw_exit_tfm(struct crypto_tfm *tfm)
 	lrw_free_table(&ctx->lrw_table);
 }
 
-static struct crypto_alg blk_lrw_alg = {
-	.cra_name		= "__lrw-serpent-sse2",
-	.cra_driver_name	= "__driver-lrw-serpent-sse2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
-	.cra_exit		= lrw_exit_tfm,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= SERPENT_MIN_KEY_SIZE +
-					  SERPENT_BLOCK_SIZE,
-			.max_keysize	= SERPENT_MAX_KEY_SIZE +
-					  SERPENT_BLOCK_SIZE,
-			.ivsize		= SERPENT_BLOCK_SIZE,
-			.setkey		= lrw_serpent_setkey,
-			.encrypt	= lrw_encrypt,
-			.decrypt	= lrw_decrypt,
-		},
-	},
-};
-
 struct serpent_xts_ctx {
 	struct serpent_ctx tweak_ctx;
 	struct serpent_ctx crypt_ctx;
@@ -689,29 +596,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
-static struct crypto_alg blk_xts_alg = {
-	.cra_name		= "__xts-serpent-sse2",
-	.cra_driver_name	= "__driver-xts-serpent-sse2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
-			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
-			.ivsize		= SERPENT_BLOCK_SIZE,
-			.setkey		= xts_serpent_setkey,
-			.encrypt	= xts_encrypt,
-			.decrypt	= xts_decrypt,
-		},
-	},
-};
-
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -813,7 +697,157 @@ static int ablk_ecb_init(struct crypto_tfm *tfm)
 	return 0;
 }
 
-static struct crypto_alg ablk_ecb_alg = {
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg serpent_algs[10] = { {
+	.cra_name		= "__ecb-serpent-sse2",
+	.cra_driver_name	= "__driver-ecb-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-serpent-sse2",
+	.cra_driver_name	= "__driver-cbc-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-serpent-sse2",
+	.cra_driver_name	= "__driver-ctr-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-serpent-sse2",
+	.cra_driver_name	= "__driver-lrw-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[3].cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-serpent-sse2",
+	.cra_driver_name	= "__driver-xts-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
 	.cra_name		= "ecb(serpent)",
 	.cra_driver_name	= "ecb-serpent-sse2",
 	.cra_priority		= 400,
@@ -823,7 +857,7 @@ static struct crypto_alg ablk_ecb_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[5].cra_list),
 	.cra_init		= ablk_ecb_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
@@ -835,20 +869,7 @@ static struct crypto_alg ablk_ecb_alg = {
 			.decrypt	= ablk_decrypt,
 		},
 	},
-};
-
-static int ablk_cbc_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static struct crypto_alg ablk_cbc_alg = {
+}, {
 	.cra_name		= "cbc(serpent)",
 	.cra_driver_name	= "cbc-serpent-sse2",
 	.cra_priority		= 400,
@@ -858,7 +879,7 @@ static struct crypto_alg ablk_cbc_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[6].cra_list),
 	.cra_init		= ablk_cbc_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
@@ -871,20 +892,7 @@ static struct crypto_alg ablk_cbc_alg = {
 			.decrypt	= ablk_decrypt,
 		},
 	},
-};
-
-static int ablk_ctr_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static struct crypto_alg ablk_ctr_alg = {
+}, {
 	.cra_name		= "ctr(serpent)",
 	.cra_driver_name	= "ctr-serpent-sse2",
 	.cra_priority		= 400,
@@ -894,7 +902,7 @@ static struct crypto_alg ablk_ctr_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[7].cra_list),
 	.cra_init		= ablk_ctr_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
@@ -908,20 +916,7 @@ static struct crypto_alg ablk_ctr_alg = {
 			.geniv		= "chainiv",
 		},
 	},
-};
-
-static int ablk_lrw_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static struct crypto_alg ablk_lrw_alg = {
+}, {
 	.cra_name		= "lrw(serpent)",
 	.cra_driver_name	= "lrw-serpent-sse2",
 	.cra_priority		= 400,
@@ -931,7 +926,7 @@ static struct crypto_alg ablk_lrw_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[8].cra_list),
 	.cra_init		= ablk_lrw_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
@@ -946,20 +941,7 @@ static struct crypto_alg ablk_lrw_alg = {
 			.decrypt	= ablk_decrypt,
 		},
 	},
-};
-
-static int ablk_xts_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static struct crypto_alg ablk_xts_alg = {
+}, {
 	.cra_name		= "xts(serpent)",
 	.cra_driver_name	= "xts-serpent-sse2",
 	.cra_priority		= 400,
@@ -969,7 +951,7 @@ static struct crypto_alg ablk_xts_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[9].cra_list),
 	.cra_init		= ablk_xts_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
@@ -982,83 +964,21 @@ static struct crypto_alg ablk_xts_alg = {
 			.decrypt	= ablk_decrypt,
 		},
 	},
-};
+} };
 
 static int __init serpent_sse2_init(void)
 {
-	int err;
-
 	if (!cpu_has_xmm2) {
 		printk(KERN_INFO "SSE2 instructions are not detected.\n");
 		return -ENODEV;
 	}
 
-	err = crypto_register_alg(&blk_ecb_alg);
-	if (err)
-		goto blk_ecb_err;
-	err = crypto_register_alg(&blk_cbc_alg);
-	if (err)
-		goto blk_cbc_err;
-	err = crypto_register_alg(&blk_ctr_alg);
-	if (err)
-		goto blk_ctr_err;
-	err = crypto_register_alg(&ablk_ecb_alg);
-	if (err)
-		goto ablk_ecb_err;
-	err = crypto_register_alg(&ablk_cbc_alg);
-	if (err)
-		goto ablk_cbc_err;
-	err = crypto_register_alg(&ablk_ctr_alg);
-	if (err)
-		goto ablk_ctr_err;
-	err = crypto_register_alg(&blk_lrw_alg);
-	if (err)
-		goto blk_lrw_err;
-	err = crypto_register_alg(&ablk_lrw_alg);
-	if (err)
-		goto ablk_lrw_err;
-	err = crypto_register_alg(&blk_xts_alg);
-	if (err)
-		goto blk_xts_err;
-	err = crypto_register_alg(&ablk_xts_alg);
-	if (err)
-		goto ablk_xts_err;
-	return err;
-
-ablk_xts_err:
-	crypto_unregister_alg(&blk_xts_alg);
-blk_xts_err:
-	crypto_unregister_alg(&ablk_lrw_alg);
-ablk_lrw_err:
-	crypto_unregister_alg(&blk_lrw_alg);
-blk_lrw_err:
-	crypto_unregister_alg(&ablk_ctr_alg);
-ablk_ctr_err:
-	crypto_unregister_alg(&ablk_cbc_alg);
-ablk_cbc_err:
-	crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
-	crypto_unregister_alg(&blk_ctr_alg);
-blk_ctr_err:
-	crypto_unregister_alg(&blk_cbc_alg);
-blk_cbc_err:
-	crypto_unregister_alg(&blk_ecb_alg);
-blk_ecb_err:
-	return err;
+	return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
 }
 
 static void __exit serpent_sse2_exit(void)
 {
-	crypto_unregister_alg(&ablk_xts_alg);
-	crypto_unregister_alg(&blk_xts_alg);
-	crypto_unregister_alg(&ablk_lrw_alg);
-	crypto_unregister_alg(&blk_lrw_alg);
-	crypto_unregister_alg(&ablk_ctr_alg);
-	crypto_unregister_alg(&ablk_cbc_alg);
-	crypto_unregister_alg(&ablk_ecb_alg);
-	crypto_unregister_alg(&blk_ctr_alg);
-	crypto_unregister_alg(&blk_cbc_alg);
-	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
 }
 
 module_init(serpent_sse2_init);
-- 
cgit v1.2.3


From 53709ddee36cbd19434aa0f0ac8c1e27b92aca33 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:48:43 +0200
Subject: crypto: twofish-x86_64-3way - use crypto_[un]register_algs

Combine all crypto_alg to be registered and use new crypto_[un]register_algs
functions. Simplifies init/exit code and reduce object size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 219 +++++++++++++++---------------------
 1 file changed, 89 insertions(+), 130 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 2c7f14ec7082..408fc0c5814e 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -123,28 +123,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way);
 }
 
-static struct crypto_alg blk_ecb_alg = {
-	.cra_name		= "ecb(twofish)",
-	.cra_driver_name	= "ecb-twofish-3way",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-};
-
 static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
 				  struct blkcipher_walk *walk)
 {
@@ -268,29 +246,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_cbc_alg = {
-	.cra_name		= "cbc(twofish)",
-	.cra_driver_name	= "cbc-twofish-3way",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-};
-
 static inline void u128_to_be128(be128 *dst, const u128 *src)
 {
 	dst->a = cpu_to_be64(src->a);
@@ -412,29 +367,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_ctr_alg = {
-	.cra_name		= "ctr(twofish)",
-	.cra_driver_name	= "ctr-twofish-3way",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-};
-
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -525,30 +457,6 @@ static void lrw_exit_tfm(struct crypto_tfm *tfm)
 	lrw_free_table(&ctx->lrw_table);
 }
 
-static struct crypto_alg blk_lrw_alg = {
-	.cra_name		= "lrw(twofish)",
-	.cra_driver_name	= "lrw-twofish-3way",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
-	.cra_exit		= lrw_exit_tfm,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= lrw_twofish_setkey,
-			.encrypt	= lrw_encrypt,
-			.decrypt	= lrw_decrypt,
-		},
-	},
-};
-
 struct twofish_xts_ctx {
 	struct twofish_ctx tweak_ctx;
 	struct twofish_ctx crypt_ctx;
@@ -615,7 +523,91 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return xts_crypt(desc, dst, src, nbytes, &req);
 }
 
-static struct crypto_alg blk_xts_alg = {
+static struct crypto_alg tf_algs[5] = { {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(tf_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(tf_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(tf_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(tf_algs[3].cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
 	.cra_name		= "xts(twofish)",
 	.cra_driver_name	= "xts-twofish-3way",
 	.cra_priority		= 300,
@@ -625,7 +617,7 @@ static struct crypto_alg blk_xts_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(tf_algs[4].cra_list),
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= TF_MIN_KEY_SIZE * 2,
@@ -636,7 +628,7 @@ static struct crypto_alg blk_xts_alg = {
 			.decrypt	= xts_decrypt,
 		},
 	},
-};
+} };
 
 static bool is_blacklisted_cpu(void)
 {
@@ -678,8 +670,6 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
 
 int __init init(void)
 {
-	int err;
-
 	if (!force && is_blacklisted_cpu()) {
 		printk(KERN_INFO
 			"twofish-x86_64-3way: performance on this CPU "
@@ -688,43 +678,12 @@ int __init init(void)
 		return -ENODEV;
 	}
 
-	err = crypto_register_alg(&blk_ecb_alg);
-	if (err)
-		goto ecb_err;
-	err = crypto_register_alg(&blk_cbc_alg);
-	if (err)
-		goto cbc_err;
-	err = crypto_register_alg(&blk_ctr_alg);
-	if (err)
-		goto ctr_err;
-	err = crypto_register_alg(&blk_lrw_alg);
-	if (err)
-		goto blk_lrw_err;
-	err = crypto_register_alg(&blk_xts_alg);
-	if (err)
-		goto blk_xts_err;
-
-	return 0;
-
-blk_xts_err:
-	crypto_unregister_alg(&blk_lrw_alg);
-blk_lrw_err:
-	crypto_unregister_alg(&blk_ctr_alg);
-ctr_err:
-	crypto_unregister_alg(&blk_cbc_alg);
-cbc_err:
-	crypto_unregister_alg(&blk_ecb_alg);
-ecb_err:
-	return err;
+	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
 }
 
 void __exit fini(void)
 {
-	crypto_unregister_alg(&blk_xts_alg);
-	crypto_unregister_alg(&blk_lrw_alg);
-	crypto_unregister_alg(&blk_ctr_alg);
-	crypto_unregister_alg(&blk_cbc_alg);
-	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
 }
 
 module_init(init);
-- 
cgit v1.2.3


From d433208cfc3db3ae0520da92a15ac1f82d8b61ed Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:48:48 +0200
Subject: crypto: blowfish-x86_64 - use crypto_[un]register_algs

Combine all crypto_alg to be registered and use new crypto_[un]register_algs
functions. Simplifies init/exit code and reduce object size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/blowfish_glue.c | 163 ++++++++++++++++------------------------
 1 file changed, 65 insertions(+), 98 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 2970110d2cea..73bc8a93f0ce 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -77,27 +77,6 @@ static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 	blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
 }
 
-static struct crypto_alg bf_alg = {
-	.cra_name		=	"blowfish",
-	.cra_driver_name	=	"blowfish-asm",
-	.cra_priority		=	200,
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	BF_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct bf_ctx),
-	.cra_alignmask		=	3,
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(bf_alg.cra_list),
-	.cra_u			=	{
-		.cipher = {
-			.cia_min_keysize	=	BF_MIN_KEY_SIZE,
-			.cia_max_keysize	=	BF_MAX_KEY_SIZE,
-			.cia_setkey		=	blowfish_setkey,
-			.cia_encrypt		=	blowfish_encrypt,
-			.cia_decrypt		=	blowfish_decrypt,
-		}
-	}
-};
-
 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 		     void (*fn)(struct bf_ctx *, u8 *, const u8 *),
 		     void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
@@ -161,28 +140,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way);
 }
 
-static struct crypto_alg blk_ecb_alg = {
-	.cra_name		= "ecb(blowfish)",
-	.cra_driver_name	= "ecb-blowfish-asm",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-};
-
 static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
 				  struct blkcipher_walk *walk)
 {
@@ -308,29 +265,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_cbc_alg = {
-	.cra_name		= "cbc(blowfish)",
-	.cra_driver_name	= "cbc-blowfish-asm",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-};
-
 static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
 {
 	u8 *ctrblk = walk->iv;
@@ -424,7 +358,67 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg blk_ctr_alg = {
+static struct crypto_alg bf_algs[4] = { {
+	.cra_name		= "blowfish",
+	.cra_driver_name	= "blowfish-asm",
+	.cra_priority		= 200,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(bf_algs[0].cra_list),
+	.cra_u = {
+		.cipher = {
+			.cia_min_keysize	= BF_MIN_KEY_SIZE,
+			.cia_max_keysize	= BF_MAX_KEY_SIZE,
+			.cia_setkey		= blowfish_setkey,
+			.cia_encrypt		= blowfish_encrypt,
+			.cia_decrypt		= blowfish_decrypt,
+		}
+	}
+}, {
+	.cra_name		= "ecb(blowfish)",
+	.cra_driver_name	= "ecb-blowfish-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(bf_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(blowfish)",
+	.cra_driver_name	= "cbc-blowfish-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(bf_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
 	.cra_name		= "ctr(blowfish)",
 	.cra_driver_name	= "ctr-blowfish-asm",
 	.cra_priority		= 300,
@@ -434,7 +428,7 @@ static struct crypto_alg blk_ctr_alg = {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_list		= LIST_HEAD_INIT(bf_algs[3].cra_list),
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= BF_MIN_KEY_SIZE,
@@ -445,7 +439,7 @@ static struct crypto_alg blk_ctr_alg = {
 			.decrypt	= ctr_crypt,
 		},
 	},
-};
+} };
 
 static bool is_blacklisted_cpu(void)
 {
@@ -470,8 +464,6 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
 
 static int __init init(void)
 {
-	int err;
-
 	if (!force && is_blacklisted_cpu()) {
 		printk(KERN_INFO
 			"blowfish-x86_64: performance on this CPU "
@@ -480,37 +472,12 @@ static int __init init(void)
 		return -ENODEV;
 	}
 
-	err = crypto_register_alg(&bf_alg);
-	if (err)
-		goto bf_err;
-	err = crypto_register_alg(&blk_ecb_alg);
-	if (err)
-		goto ecb_err;
-	err = crypto_register_alg(&blk_cbc_alg);
-	if (err)
-		goto cbc_err;
-	err = crypto_register_alg(&blk_ctr_alg);
-	if (err)
-		goto ctr_err;
-
-	return 0;
-
-ctr_err:
-	crypto_unregister_alg(&blk_cbc_alg);
-cbc_err:
-	crypto_unregister_alg(&blk_ecb_alg);
-ecb_err:
-	crypto_unregister_alg(&bf_alg);
-bf_err:
-	return err;
+	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
 }
 
 static void __exit fini(void)
 {
-	crypto_unregister_alg(&blk_ctr_alg);
-	crypto_unregister_alg(&blk_cbc_alg);
-	crypto_unregister_alg(&blk_ecb_alg);
-	crypto_unregister_alg(&bf_alg);
+	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
 }
 
 module_init(init);
-- 
cgit v1.2.3


From 435d3e51af3de0c1fe9f6ca1a18df3cd4d6b8c17 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:48:53 +0200
Subject: crypto: serpent-sse2 - combine ablk_*_init functions

Driver name in ablk_*_init functions can be constructed runtime. Therefore
use single function ablk_init to reduce object size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 71 +++++++------------------------------
 1 file changed, 13 insertions(+), 58 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 5520c7522200..4b21be85e0a1 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -676,68 +676,23 @@ static void ablk_exit(struct crypto_tfm *tfm)
 	cryptd_free_ablkcipher(ctx->cryptd_tfm);
 }
 
-static void ablk_init_common(struct crypto_tfm *tfm,
-			     struct cryptd_ablkcipher *cryptd_tfm)
+static int ablk_init(struct crypto_tfm *tfm)
 {
 	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
-		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-}
-
-static int ablk_ecb_init(struct crypto_tfm *tfm)
-{
 	struct cryptd_ablkcipher *cryptd_tfm;
+	char drv_name[CRYPTO_MAX_ALG_NAME];
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static int ablk_cbc_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
+	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
+					crypto_tfm_alg_driver_name(tfm));
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
 
-static int ablk_ctr_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static int ablk_lrw_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
-	return 0;
-}
-
-static int ablk_xts_init(struct crypto_tfm *tfm)
-{
-	struct cryptd_ablkcipher *cryptd_tfm;
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ablk_init_common(tfm, cryptd_tfm);
 	return 0;
 }
 
@@ -858,7 +813,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(serpent_algs[5].cra_list),
-	.cra_init		= ablk_ecb_init,
+	.cra_init		= ablk_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
 		.ablkcipher = {
@@ -880,7 +835,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(serpent_algs[6].cra_list),
-	.cra_init		= ablk_cbc_init,
+	.cra_init		= ablk_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
 		.ablkcipher = {
@@ -903,7 +858,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(serpent_algs[7].cra_list),
-	.cra_init		= ablk_ctr_init,
+	.cra_init		= ablk_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
 		.ablkcipher = {
@@ -927,7 +882,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(serpent_algs[8].cra_list),
-	.cra_init		= ablk_lrw_init,
+	.cra_init		= ablk_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
 		.ablkcipher = {
@@ -952,7 +907,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(serpent_algs[9].cra_list),
-	.cra_init		= ablk_xts_init,
+	.cra_init		= ablk_init,
 	.cra_exit		= ablk_exit,
 	.cra_u = {
 		.ablkcipher = {
-- 
cgit v1.2.3


From 919e2c32496aec4170bd67e64efd526dd0a9bbdc Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:48:58 +0200
Subject: crypto: blowfish-x86_64 - set alignmask to zero

x86 has fast unaligned accesses, so blowfish-x86_64 does not need to enforce
alignment.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/blowfish_glue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 73bc8a93f0ce..7967474de8f7 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -365,7 +365,7 @@ static struct crypto_alg bf_algs[4] = { {
 	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		= BF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 3,
+	.cra_alignmask		= 0,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(bf_algs[0].cra_list),
 	.cra_u = {
-- 
cgit v1.2.3


From 894042648902d11d579af2a936a5a9a43cd5f1e4 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 17 Feb 2012 22:49:03 +0200
Subject: crypto: twofish-x86_64/i586 - set alignmask to zero

x86 has fast unaligned accesses, so twofish-x86_64/i586 does not need to enforce
alignment.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index dc6b3fb817fc..359ae084275c 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -68,7 +68,7 @@ static struct crypto_alg alg = {
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	TF_BLOCK_SIZE,
 	.cra_ctxsize		=	sizeof(struct twofish_ctx),
-	.cra_alignmask		=	3,
+	.cra_alignmask		=	0,
 	.cra_module		=	THIS_MODULE,
 	.cra_list		=	LIST_HEAD_INIT(alg.cra_list),
 	.cra_u			=	{
-- 
cgit v1.2.3


From ce5f7a99df87918b5be4618a9386213a8e9a7146 Mon Sep 17 00:00:00 2001
From: Bobby Powers <bobbypowers@gmail.com>
Date: Sat, 25 Feb 2012 23:25:38 -0500
Subject: x32: Make sure TS_COMPAT is cleared for x32 tasks

If a process has a non-x32 ia32 personality and changes to x32, the
process would keep its TS_COMPAT flag. x32 uses the presence of the
x32 flag on a syscall to determine compat status, so make sure
TS_COMPAT is cleared.

Signed-off-by: Bobby Powers <bobbypowers@gmail.com>
Link: http://lkml.kernel.org/r/1330230338-25077-1-git-send-email-bobbypowers@gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/process_64.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a0701da2bd18..32e04120b2cd 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -540,6 +540,9 @@ void set_personality_ia32(bool x32)
 		clear_thread_flag(TIF_IA32);
 		set_thread_flag(TIF_X32);
 		current->personality &= ~READ_IMPLIES_EXEC;
+		/* is_compat_task() uses the presence of the x32
+		   syscall bit flag to determine compat status */
+		current_thread_info()->status &= ~TS_COMPAT;
 	} else {
 		set_thread_flag(TIF_IA32);
 		clear_thread_flag(TIF_X32);
-- 
cgit v1.2.3


From 00194b2e845da29395ad00c13a884d9acb9306b5 Mon Sep 17 00:00:00 2001
From: Bobby Powers <bobbypowers@gmail.com>
Date: Sat, 25 Feb 2012 22:59:34 -0500
Subject: x32: Only clear TIF_X32 flag once

Commits bb212724 and d1a797f3 both added a call to
clear_thread_flag(TIF_X32) under set_personality_64bit() - only one is
needed.

Signed-off-by: Bobby Powers <bobbypowers@gmail.com>
Link: http://lkml.kernel.org/r/1330228774-24223-1-git-send-email-bobbypowers@gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/process_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 32e04120b2cd..a4659739e202 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -510,7 +510,6 @@ void set_personality_64bit(void)
 
 	/* Make sure to be in 64bit mode */
 	clear_thread_flag(TIF_IA32);
-	clear_thread_flag(TIF_X32);
 	clear_thread_flag(TIF_ADDR32);
 	clear_thread_flag(TIF_X32);
 
-- 
cgit v1.2.3


From 42dfc43ee5999ac64284476ea0ac6c937587cf2b Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Date: Sun, 26 Feb 2012 21:47:55 +0530
Subject: x86_64: Record stack pointer before task execution begins

task->thread.usersp is unusable immediately after a binary is exec()'d
until it undergoes a context switch cycle. The start_thread() function
called during execve() saves the stack pointer into pt_regs and into
old_rsp, but fails to record it into task->thread.usersp.

Because of this, KSTK_ESP(task) returns an incorrect value for a
64-bit program until the task is switched out and back in since
switch_to swaps %rsp values in and out into task->thread.usersp.

Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Link: http://lkml.kernel.org/r/1330273075-2949-1-git-send-email-siddhesh.poyarekar@gmail.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/process_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1fd94bc4279d..eb54dd0fbed6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
+	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	percpu_write(old_rsp, new_sp);
-- 
cgit v1.2.3


From f0ba662a6e06f2fb58201800eff33dcad9246f97 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 24 Feb 2012 12:09:21 +0000
Subject: x86: Properly _init-annotate NMI selftest code

After all, this code is being run once at boot only (if
configured in at all).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/4F478C010200007800074A3D@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 0d01a8ea4e11..2c39dcd510fa 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -12,6 +12,7 @@
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 
 #include <asm/apic.h>
 #include <asm/nmi.h>
@@ -20,35 +21,35 @@
 #define FAILURE		1
 #define TIMEOUT		2
 
-static int nmi_fail;
+static int __initdata nmi_fail;
 
 /* check to see if NMI IPIs work on this machine */
-static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
 
-static int testcase_total;
-static int testcase_successes;
-static int expected_testcase_failures;
-static int unexpected_testcase_failures;
-static int unexpected_testcase_unknowns;
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata expected_testcase_failures;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
 
-static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
 {
 	unexpected_testcase_unknowns++;
 	return NMI_HANDLED;
 }
 
-static void init_nmi_testsuite(void)
+static void __init init_nmi_testsuite(void)
 {
 	/* trap all the unknown NMIs we may generate */
 	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
 }
 
-static void cleanup_nmi_testsuite(void)
+static void __init cleanup_nmi_testsuite(void)
 {
 	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
 }
 
-static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
 {
         int cpu = raw_smp_processor_id();
 
@@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
         return NMI_DONE;
 }
 
-static void test_nmi_ipi(struct cpumask *mask)
+static void __init test_nmi_ipi(struct cpumask *mask)
 {
 	unsigned long timeout;
 
@@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask)
 	return;
 }
 
-static void remote_ipi(void)
+static void __init remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
@@ -94,19 +95,19 @@ static void remote_ipi(void)
 		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
-static void local_ipi(void)
+static void __init local_ipi(void)
 {
 	cpumask_clear(to_cpumask(nmi_ipi_mask));
 	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
 	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
-static void reset_nmi(void)
+static void __init reset_nmi(void)
 {
 	nmi_fail = 0;
 }
 
-static void dotest(void (*testcase_fn)(void), int expected)
+static void __init dotest(void (*testcase_fn)(void), int expected)
 {
 	testcase_fn();
 	/*
@@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected)
 	reset_nmi();
 }
 
-static inline void print_testname(const char *testname)
+static inline void __init print_testname(const char *testname)
 {
 	printk("%12s:", testname);
 }
 
-void nmi_selftest(void)
+void __init nmi_selftest(void)
 {
 	init_nmi_testsuite();
 
-- 
cgit v1.2.3


From d93c4071b78f4676ef70ec8f2d4bae59b6cc5523 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Fri, 24 Feb 2012 11:50:27 +0000
Subject: x86/time: Eliminate unused irq0_irqs counter

As of v2.6.38 this counter is being maintained without ever being
read.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F4787930200007800074A10@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/hardirq.h | 1 -
 arch/x86/kernel/time.c         | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index da0b3ca815b7..382f75d735f3 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -7,7 +7,6 @@
 typedef struct {
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* arch dependent */
-	unsigned int irq0_irqs;
 #ifdef CONFIG_X86_LOCAL_APIC
 	unsigned int apic_timer_irqs;	/* arch dependent */
 	unsigned int irq_spurious_count;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dd5fbf4101fc..c6eba2b42673 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc);
  */
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
-	/* Keep nmi watchdog up to date */
-	inc_irq_stat(irq0_irqs);
-
 	global_clock_event->event_handler(global_clock_event);
 
 	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
-- 
cgit v1.2.3


From 928282e432ee584129a39da831ffa72c38e189b7 Mon Sep 17 00:00:00 2001
From: Mark Wielaard <mjw@redhat.com>
Date: Fri, 24 Feb 2012 11:32:05 +0100
Subject: x86-64: Fix CFI data for common_interrupt()

Commit eab9e6137f23 ("x86-64: Fix CFI data for interrupt frames")
introduced a DW_CFA_def_cfa_expression in the SAVE_ARGS_IRQ
macro. To later define the CFA using a simple register+offset
rule both register and offset need to be supplied. Just using
CFI_DEF_CFA_REGISTER leaves the offset undefined. So use
CFI_DEF_CFA with reg+off explicitly at the end of
common_interrupt.

Signed-off-by: Mark Wielaard <mjw@redhat.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/1330079527-30711-1-git-send-email-mjw@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..54be36bf2620 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -813,7 +813,7 @@ ret_from_intr:
 
 	/* Restore saved previous stack */
 	popq %rsi
-	CFI_DEF_CFA_REGISTER	rsi
+	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
 	leaq ARGOFFSET-RBP(%rsi), %rsp
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
-- 
cgit v1.2.3


From 0bf6276392e990dd0da0ccd8e10f42597d503f29 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 27 Feb 2012 14:09:10 -0800
Subject: x32: Warn and disable rather than error if binutils too old

If X32 is enabled in .config, but the binutils can't build it, issue a
warning and disable the feature rather than erroring out.

In order to support this, have CONFIG_X86_X32 be the option set in
Kconfig, and CONFIG_X86_X32_ABI be the option set by the Makefile when
it is enabled and binutils has been found to be functional.

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/Kconfig  |  4 ++--
 arch/x86/Makefile | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9d6c9ed27e5..e2b38b4bffdc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2175,7 +2175,7 @@ config IA32_AOUT
 	---help---
 	  Support old a.out binaries in the 32bit emulation.
 
-config X86_X32_ABI
+config X86_X32
 	bool "x32 ABI for 64-bit mode (EXPERIMENTAL)"
 	depends on X86_64 && IA32_EMULATION && EXPERIMENTAL
 	---help---
@@ -2190,7 +2190,7 @@ config X86_X32_ABI
 
 config COMPAT
 	def_bool y
-	depends on IA32_EMULATION || X86_X32_ABI
+	depends on IA32_EMULATION || X86_X32
 
 config COMPAT_FOR_U64_ALIGNMENT
 	def_bool COMPAT
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 209ba1294592..31bb1eb1216a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -82,6 +82,22 @@ ifdef CONFIG_CC_STACKPROTECTOR
         endif
 endif
 
+ifdef CONFIG_X86_X32
+	x32_ld_ok := $(call try-run,\
+			/bin/echo -e '1: .quad 1b' | \
+			$(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \
+			$(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
+			$(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
+	ifeq ($(x32_ld_ok),y)
+		CONFIG_X86_X32_ABI := y
+		KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
+		KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
+	else
+		$(warning CONFIG_X86_X32 enabled but no binutils support)
+	endif
+endif
+export CONFIG_X86_X32_ABI
+
 # Don't unroll struct assignments with kmemcheck enabled
 ifeq ($(CONFIG_KMEMCHECK),y)
 	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-- 
cgit v1.2.3


From 8bd69c2d5f9c0b5237c632d1b21dbfe4fd16ba6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 28 Feb 2012 10:35:06 +0100
Subject: x86/x32: Fix the binutils auto-detect

Fix:

 arch/x86/Makefile:96: *** recipe commences before first target.  Stop.

Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 31bb1eb1216a..968dbe24a255 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -88,13 +88,13 @@ ifdef CONFIG_X86_X32
 			$(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \
 			$(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
 			$(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
-	ifeq ($(x32_ld_ok),y)
-		CONFIG_X86_X32_ABI := y
-		KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
-		KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
-	else
-		$(warning CONFIG_X86_X32 enabled but no binutils support)
-	endif
+        ifeq ($(x32_ld_ok),y)
+                CONFIG_X86_X32_ABI := y
+                KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
+                KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
+        else
+                $(warning CONFIG_X86_X32 enabled but no binutils support)
+        endif
 endif
 export CONFIG_X86_X32_ABI
 
-- 
cgit v1.2.3


From 55f9709cd07c9d33e30b575ee1b3bfd0aeaa3760 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 28 Feb 2012 13:37:21 +0000
Subject: x86, relocs: Don't open code put_unaligned_le32()

Use the new headers in tools/include instead of rolling our own
put_unaligned_le32() implementation.

Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1330436245-24875-3-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/relocs.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 89bbf4e4d05d..d3c0b0277666 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -10,6 +10,7 @@
 #define USE_BSD
 #include <endian.h>
 #include <regex.h>
+#include <tools/le_byteshift.h>
 
 static void die(char *fmt, ...);
 
@@ -605,10 +606,7 @@ static void emit_relocs(int as_text)
 		fwrite("\0\0\0\0", 4, 1, stdout);
 		/* Now print each relocation */
 		for (i = 0; i < reloc_count; i++) {
-			buf[0] = (relocs[i] >>  0) & 0xff;
-			buf[1] = (relocs[i] >>  8) & 0xff;
-			buf[2] = (relocs[i] >> 16) & 0xff;
-			buf[3] = (relocs[i] >> 24) & 0xff;
+			put_unaligned_le32(relocs[i], buf);
 			fwrite(buf, 4, 1, stdout);
 		}
 	}
-- 
cgit v1.2.3


From 12871c568305a0b20f116315479a18cd46882e9b Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 28 Feb 2012 13:37:22 +0000
Subject: x86, mkpiggy: Don't open code put_unaligned_le32()

Use the new headers in tools/include instead of rolling our own
put_unaligned_le32() implementation.

Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1330436245-24875-4-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/Makefile  |  1 +
 arch/x86/boot/compressed/mkpiggy.c | 11 ++---------
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index b123b9a8f5b3..fd55a2ff3ad8 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -22,6 +22,7 @@ LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
 hostprogs-y	:= mkpiggy
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
 VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 46a823882437..958a641483dd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -29,14 +29,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <inttypes.h>
-
-static uint32_t getle32(const void *p)
-{
-	const uint8_t *cp = p;
-
-	return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
-		((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
-}
+#include <tools/le_byteshift.h>
 
 int main(int argc, char *argv[])
 {
@@ -69,7 +62,7 @@ int main(int argc, char *argv[])
 	}
 
 	ilen = ftell(f);
-	olen = getle32(&olen);
+	olen = get_unaligned_le32(&olen);
 	fclose(f);
 
 	/*
-- 
cgit v1.2.3


From d40f833630a1299fd377408dc8d8fac370d621b0 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 28 Feb 2012 13:37:23 +0000
Subject: x86, boot: Restrict CFLAGS for hostprogs

Currently tools/build has access to all the kernel headers in
$(srctree). This is unnecessary and could potentially allow
tools/build to erroneously include kernel headers when it should only
be including userspace-exported headers.

Unfortunately, mkcpustr still needs access to some of the asm kernel
headers, so explicitly special case that hostprog.

Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1330436245-24875-5-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 95365a82b6a0..3e02148bb774 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -37,8 +37,9 @@ setup-y		+= video-bios.o
 targets		+= $(setup-y)
 hostprogs-y	:= mkcpustr tools/build
 
-HOST_EXTRACFLAGS += $(LINUXINCLUDE)
-
+HOSTCFLAGS_mkcpustr.o := -I$(srctree)/arch/$(SRCARCH)/include
+HOST_EXTRACFLAGS += -I$(objtree)/include -I$(srctree)/tools/include \
+                   -include $(srctree)/include/linux/kconfig.h
 $(obj)/cpu.o: $(obj)/cpustr.h
 
 quiet_cmd_cpustr = CPUSTR  $@
-- 
cgit v1.2.3


From 92f42c50f227ad228f815a8f4eec872524dae3a5 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 28 Feb 2012 13:37:24 +0000
Subject: x86, efi: Fix endian issues and unaligned accesses

We may need to convert the endianness of the data we read from/write
to 'buf', so let's use {get,put}_unaligned_le32() to do that. Failure
to do so can result in accessing invalid memory, leading to a
segfault.  Stephen Rothwell noticed this bug while cross-building an
x86_64 allmodconfig kernel on PowerPC.

We need to read from and write to 'buf' a byte at a time otherwise
it's possible we'll perform an unaligned access, which can lead to bus
errors when cross-building an x86 kernel on risc architectures.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Nick Bowler <nbowler@elliptictech.com>
Tested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1330436245-24875-6-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/tools/build.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 4e9bd6bcafa6..f2ac95ece0cc 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -34,6 +34,7 @@
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <asm/boot.h>
+#include <tools/le_byteshift.h>
 
 typedef unsigned char  u8;
 typedef unsigned short u16;
@@ -41,6 +42,7 @@ typedef unsigned long  u32;
 
 #define DEFAULT_MAJOR_ROOT 0
 #define DEFAULT_MINOR_ROOT 0
+#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT)
 
 /* Minimal number of setup sectors */
 #define SETUP_SECT_MIN 5
@@ -159,7 +161,7 @@ int main(int argc, char ** argv)
 		die("read-error on `setup'");
 	if (c < 1024)
 		die("The setup must be at least 1024 bytes");
-	if (buf[510] != 0x55 || buf[511] != 0xaa)
+	if (get_unaligned_le16(&buf[510]) != 0xAA55)
 		die("Boot block hasn't got boot flag (0xAA55)");
 	fclose(file);
 
@@ -171,8 +173,7 @@ int main(int argc, char ** argv)
 	memset(buf+c, 0, i-c);
 
 	/* Set the default root device */
-	buf[508] = DEFAULT_MINOR_ROOT;
-	buf[509] = DEFAULT_MAJOR_ROOT;
+	put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
 
 	fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
 
@@ -192,44 +193,42 @@ int main(int argc, char ** argv)
 
 	/* Patch the setup code with the appropriate size parameters */
 	buf[0x1f1] = setup_sectors-1;
-	buf[0x1f4] = sys_size;
-	buf[0x1f5] = sys_size >> 8;
-	buf[0x1f6] = sys_size >> 16;
-	buf[0x1f7] = sys_size >> 24;
+	put_unaligned_le32(sys_size, &buf[0x1f4]);
 
 #ifdef CONFIG_EFI_STUB
 	file_sz = sz + i + ((sys_size * 16) - sz);
 
-	pe_header = *(unsigned int *)&buf[0x3c];
+	pe_header = get_unaligned_le32(&buf[0x3c]);
 
 	/* Size of code */
-	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]);
 
 	/* Size of image */
-	*(unsigned int *)&buf[pe_header + 0x50] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
 
 #ifdef CONFIG_X86_32
 	/* Address of entry point */
-	*(unsigned int *)&buf[pe_header + 0x28] = i;
+	put_unaligned_le32(i, &buf[pe_header + 0x28]);
 
 	/* .text size */
-	*(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]);
 
 	/* .text size of initialised data */
-	*(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]);
 #else
 	/*
 	 * Address of entry point. startup_32 is at the beginning and
 	 * the 64-bit entry point (startup_64) is always 512 bytes
 	 * after.
 	 */
-	*(unsigned int *)&buf[pe_header + 0x28] = i + 512;
+	put_unaligned_le32(i + 512, &buf[pe_header + 0x28]);
 
 	/* .text size */
-	*(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]);
 
 	/* .text size of initialised data */
-	*(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+	put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]);
+
 #endif /* CONFIG_X86_32 */
 #endif /* CONFIG_EFI_STUB */
 
-- 
cgit v1.2.3


From 8411371709610c826bf65684f886bfdfb5780ca1 Mon Sep 17 00:00:00 2001
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Tue, 28 Feb 2012 11:51:10 -0700
Subject: x86/PCI: use host bridge _CRS info on MSI MS-7253

In the spirit of commit 29cf7a30f8a0 ("x86/PCI: use host bridge _CRS
info on ASUS M2V-MX SE"), this DMI quirk turns on "pci_use_crs" by
default on a board that needs it.

This fixes boot failures and oopses introduced in 3e3da00c01d0
("x86/pci: AMD one chain system to use pci read out res").  The quirk
is quite targetted (to a specific board and BIOS version) for two
reasons:

 (1) to emphasize that this method of tackling the problem one quirk
     at a time is a little insane

 (2) to give BIOS vendors an opportunity to use simpler tables and
     allow us to return to generic behavior (whatever that happens to
     be) with a later BIOS update

In other words, I am not at all happy with having quirks like this.
But it is even worse for the kernel not to work out of the box on
these machines, so...

Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42619
Reported-by: Svante Signell <svante.signell@telia.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index c33e0970ee9f..7034c081b226 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -60,6 +60,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
 		},
 	},
+	/* https://bugzilla.kernel.org/show_bug.cgi?id=42619 */
+	{
+		.callback = set_use_crs,
+		.ident = "MSI MS-7253",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"),
+			DMI_MATCH(DMI_BOARD_NAME, "MS-7253"),
+			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+			DMI_MATCH(DMI_BIOS_VERSION, "V1.6"),
+		},
+	},
 
 	/* Now for the blacklist.. */
 
-- 
cgit v1.2.3


From f649e9388cd46ad1634164e56f96ae092ca59e4a Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 20 Jan 2012 16:24:09 -0500
Subject: x86: relocate get/set debugreg fcns to include/asm/debugreg.

Since we already have a debugreg.h header file, move the
assoc. get/set functions to it.  In addition to it being the
logical home for them, it has a secondary advantage.  The
functions that are moved use BUG().  So we really need to
have linux/bug.h in scope.  But asm/processor.h is used about
600 times, vs. only about 15 for debugreg.h -- so adding bug.h
to the latter reduces the amount of time we'll be processing
it during a compile.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/include/asm/debugreg.h  | 67 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h | 63 -------------------------------------
 arch/x86/kernel/cpu/common.c     |  1 +
 3 files changed, 68 insertions(+), 63 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b903d5ea3941..2d91580bf228 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -78,8 +78,75 @@
  */
 #ifdef __KERNEL__
 
+#include <linux/bug.h>
+
 DECLARE_PER_CPU(unsigned long, cpu_dr7);
 
+#ifndef CONFIG_PARAVIRT
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+	(var) = native_get_debugreg(register)
+#define set_debugreg(value, register)				\
+	native_set_debugreg(register, value)
+#endif
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+	unsigned long val = 0;	/* Damn you, gcc! */
+
+	switch (regno) {
+	case 0:
+		asm("mov %%db0, %0" :"=r" (val));
+		break;
+	case 1:
+		asm("mov %%db1, %0" :"=r" (val));
+		break;
+	case 2:
+		asm("mov %%db2, %0" :"=r" (val));
+		break;
+	case 3:
+		asm("mov %%db3, %0" :"=r" (val));
+		break;
+	case 6:
+		asm("mov %%db6, %0" :"=r" (val));
+		break;
+	case 7:
+		asm("mov %%db7, %0" :"=r" (val));
+		break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+	switch (regno) {
+	case 0:
+		asm("mov %0, %%db0"	::"r" (value));
+		break;
+	case 1:
+		asm("mov %0, %%db1"	::"r" (value));
+		break;
+	case 2:
+		asm("mov %0, %%db2"	::"r" (value));
+		break;
+	case 3:
+		asm("mov %0, %%db3"	::"r" (value));
+		break;
+	case 6:
+		asm("mov %0, %%db6"	::"r" (value));
+		break;
+	case 7:
+		asm("mov %0, %%db7"	::"r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
 static inline void hw_breakpoint_disable(void)
 {
 	/* Zero the control register for HW Breakpoint */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 58545c97d071..30aa6e95f814 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -474,61 +474,6 @@ struct thread_struct {
 	unsigned		io_bitmap_max;
 };
 
-static inline unsigned long native_get_debugreg(int regno)
-{
-	unsigned long val = 0;	/* Damn you, gcc! */
-
-	switch (regno) {
-	case 0:
-		asm("mov %%db0, %0" :"=r" (val));
-		break;
-	case 1:
-		asm("mov %%db1, %0" :"=r" (val));
-		break;
-	case 2:
-		asm("mov %%db2, %0" :"=r" (val));
-		break;
-	case 3:
-		asm("mov %%db3, %0" :"=r" (val));
-		break;
-	case 6:
-		asm("mov %%db6, %0" :"=r" (val));
-		break;
-	case 7:
-		asm("mov %%db7, %0" :"=r" (val));
-		break;
-	default:
-		BUG();
-	}
-	return val;
-}
-
-static inline void native_set_debugreg(int regno, unsigned long value)
-{
-	switch (regno) {
-	case 0:
-		asm("mov %0, %%db0"	::"r" (value));
-		break;
-	case 1:
-		asm("mov %0, %%db1"	::"r" (value));
-		break;
-	case 2:
-		asm("mov %0, %%db2"	::"r" (value));
-		break;
-	case 3:
-		asm("mov %0, %%db3"	::"r" (value));
-		break;
-	case 6:
-		asm("mov %0, %%db6"	::"r" (value));
-		break;
-	case 7:
-		asm("mov %0, %%db7"	::"r" (value));
-		break;
-	default:
-		BUG();
-	}
-}
-
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
@@ -574,14 +519,6 @@ static inline void native_swapgs(void)
 #define __cpuid			native_cpuid
 #define paravirt_enabled()	0
 
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-	(var) = native_get_debugreg(register)
-#define set_debugreg(value, register)				\
-	native_set_debugreg(register, value)
-
 static inline void load_sp0(struct tss_struct *tss,
 			    struct thread_struct *thread)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c0f7d68d318f..0d676dd923ac 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <linux/topology.h>
 #include <linux/cpumask.h>
-- 
cgit v1.2.3


From b8d43cb504a94f1070159a37c8cb23008276eff3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 28 Feb 2012 23:30:58 -0800
Subject: x86, tools: Remove unneeded header files from tools/build.c

We include <sys/sysmacros.h> and <asm/boot.h>, but none of those
header files actually provide anything this file needs.  Furthermore,
it breaks cross-compilation, so just remove them.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Matt Fleming <matt@console-pimps.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Bowler <nbowler@elliptictech.com>
Link: http://lkml.kernel.org/r/20120229111322.9eb4b23ff1672e8853ad3b3b@canb.auug.org.au
---
 arch/x86/boot/tools/build.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index f2ac95ece0cc..f3bd2e676d2a 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -29,11 +29,9 @@
 #include <stdarg.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <sys/sysmacros.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <asm/boot.h>
 #include <tools/le_byteshift.h>
 
 typedef unsigned char  u8;
-- 
cgit v1.2.3


From a51f4047758d2bcd099ea113b833ed380f4024ba Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 28 Feb 2012 23:36:21 -0800
Subject: x86, build: Fix portability issues when cross-building

It would appear that we never actually generated a correct CRC when
building on a bigendian machine.  Depending on the word size, we would
either generate an all-zero CRC (64-bit machine) or a byte-swapped
CRC (32-bit machine.)  Fix the types used so we don't arbitrarily use
a 64-bit word to hold 32-bit numbers, and pass the CRC through
put_unaligned_le32() like all the other numbers.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matt Fleming <matt@console-pimps.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Bowler <nbowler@elliptictech.com>
Link: http://lkml.kernel.org/r/20120229111322.9eb4b23ff1672e8853ad3b3b@canb.auug.org.au
---
 arch/x86/boot/tools/build.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index f3bd2e676d2a..ed549767a231 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -36,7 +36,7 @@
 
 typedef unsigned char  u8;
 typedef unsigned short u16;
-typedef unsigned long  u32;
+typedef unsigned int   u32;
 
 #define DEFAULT_MAJOR_ROOT 0
 #define DEFAULT_MINOR_ROOT 0
@@ -247,8 +247,9 @@ int main(int argc, char ** argv)
 	}
 
 	/* Write the CRC */
-	fprintf(stderr, "CRC %lx\n", crc);
-	if (fwrite(&crc, 1, 4, stdout) != 4)
+	fprintf(stderr, "CRC %x\n", crc);
+	put_unaligned_le32(crc, buf);
+	if (fwrite(buf, 1, 4, stdout) != 4)
 		die("Writing CRC failed");
 
 	close(fd);
-- 
cgit v1.2.3


From 50af5ead3b44ccf8bd2b4d2a50c1b610f557c480 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 20 Jan 2012 18:35:53 -0500
Subject: bug.h: add include of it to various implicit C users

With bug.h currently living right in linux/kernel.h there
are files that use BUG_ON and friends but are not including
the header explicitly.  Fix them up so we can remove the
presence in kernel.h file.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 arch/arm/mach-imx/cpu_op-mx51.c                 | 1 +
 arch/arm/mach-ux500/board-mop500-pins.c         | 1 +
 arch/mips/fw/arc/cmdline.c                      | 1 +
 arch/mips/fw/arc/identify.c                     | 1 +
 arch/parisc/math-emu/fpudispatch.c              | 1 +
 arch/powerpc/kernel/pmc.c                       | 1 +
 arch/powerpc/xmon/ppc-opc.c                     | 1 +
 arch/powerpc/xmon/spu-opc.c                     | 1 +
 arch/x86/kernel/paravirt.c                      | 1 +
 arch/x86/mm/kmemcheck/selftest.c                | 1 +
 drivers/gpu/drm/radeon/cayman_blit_shaders.c    | 1 +
 drivers/gpu/drm/radeon/evergreen_blit_shaders.c | 1 +
 drivers/gpu/drm/radeon/r600_blit_shaders.c      | 1 +
 drivers/staging/wlags49_h2/hcf.c                | 1 +
 lib/atomic64_test.c                             | 1 +
 lib/bitmap.c                                    | 1 +
 lib/iommu-helper.c                              | 1 +
 lib/list_debug.c                                | 1 +
 lib/plist.c                                     | 1 +
 lib/string.c                                    | 1 +
 lib/timerqueue.c                                | 1 +
 21 files changed, 21 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/arm/mach-imx/cpu_op-mx51.c b/arch/arm/mach-imx/cpu_op-mx51.c
index 9d34c3d4c024..7b92cd6da6d3 100644
--- a/arch/arm/mach-imx/cpu_op-mx51.c
+++ b/arch/arm/mach-imx/cpu_op-mx51.c
@@ -11,6 +11,7 @@
  * http://www.gnu.org/copyleft/gpl.html
  */
 
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <mach/hardware.h>
 #include <linux/kernel.h>
diff --git a/arch/arm/mach-ux500/board-mop500-pins.c b/arch/arm/mach-ux500/board-mop500-pins.c
index 74bfcff2bdf3..f5413dca532c 100644
--- a/arch/arm/mach-ux500/board-mop500-pins.c
+++ b/arch/arm/mach-ux500/board-mop500-pins.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/bug.h>
 
 #include <asm/mach-types.h>
 #include <plat/pincfg.h>
diff --git a/arch/mips/fw/arc/cmdline.c b/arch/mips/fw/arc/cmdline.c
index 9fdf07e50f1b..c0122a1dc587 100644
--- a/arch/mips/fw/arc/cmdline.c
+++ b/arch/mips/fw/arc/cmdline.c
@@ -7,6 +7,7 @@
  *
  * Copyright (C) 1996 David S. Miller (davem@davemloft.net)
  */
+#include <linux/bug.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
diff --git a/arch/mips/fw/arc/identify.c b/arch/mips/fw/arc/identify.c
index 788060a53dce..54a33c756f61 100644
--- a/arch/mips/fw/arc/identify.c
+++ b/arch/mips/fw/arc/identify.c
@@ -11,6 +11,7 @@
  *
  * Copyright (C) 1996 David S. Miller (davem@davemloft.net)
  */
+#include <linux/bug.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
diff --git a/arch/parisc/math-emu/fpudispatch.c b/arch/parisc/math-emu/fpudispatch.c
index 6e28f9f4c620..673b73e8420d 100644
--- a/arch/parisc/math-emu/fpudispatch.c
+++ b/arch/parisc/math-emu/fpudispatch.c
@@ -50,6 +50,7 @@
 #define FPUDEBUG 0
 
 #include "float.h"
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <asm/processor.h>
 /* #include <sys/debug.h> */
diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c
index a841a9d136a2..58eaa3ddf7b9 100644
--- a/arch/powerpc/kernel/pmc.c
+++ b/arch/powerpc/kernel/pmc.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/bug.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
 
diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
index af3780e52e76..6845e91ba04a 100644
--- a/arch/powerpc/xmon/ppc-opc.c
+++ b/arch/powerpc/xmon/ppc-opc.c
@@ -22,6 +22,7 @@
 
 #include <linux/stddef.h>
 #include <linux/kernel.h>
+#include <linux/bug.h>
 #include "nonstdio.h"
 #include "ppc.h"
 
diff --git a/arch/powerpc/xmon/spu-opc.c b/arch/powerpc/xmon/spu-opc.c
index 530df3d6d7b2..7d37597c4bcd 100644
--- a/arch/powerpc/xmon/spu-opc.c
+++ b/arch/powerpc/xmon/spu-opc.c
@@ -19,6 +19,7 @@
    51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
 
 #include <linux/kernel.h>
+#include <linux/bug.h>
 #include "spu.h"
 
 /* This file holds the Spu opcode table */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..83e7b81d2135 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
+#include <asm/debugreg.h>
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 036efbea8b28..aef7140c0063 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+#include <linux/bug.h>
 #include <linux/kernel.h>
 
 #include "opcode.h"
diff --git a/drivers/gpu/drm/radeon/cayman_blit_shaders.c b/drivers/gpu/drm/radeon/cayman_blit_shaders.c
index 7b4eeb7b4a8c..19a0114d2e3b 100644
--- a/drivers/gpu/drm/radeon/cayman_blit_shaders.c
+++ b/drivers/gpu/drm/radeon/cayman_blit_shaders.c
@@ -24,6 +24,7 @@
  *     Alex Deucher <alexander.deucher@amd.com>
  */
 
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 
diff --git a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c
index 3a10399e0066..f85c0af115b5 100644
--- a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c
+++ b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c
@@ -24,6 +24,7 @@
  *     Alex Deucher <alexander.deucher@amd.com>
  */
 
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 
diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.c b/drivers/gpu/drm/radeon/r600_blit_shaders.c
index 2d1f6c5ee2a7..3af3c6426a6e 100644
--- a/drivers/gpu/drm/radeon/r600_blit_shaders.c
+++ b/drivers/gpu/drm/radeon/r600_blit_shaders.c
@@ -24,6 +24,7 @@
  *     Alex Deucher <alexander.deucher@amd.com>
  */
 
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 
diff --git a/drivers/staging/wlags49_h2/hcf.c b/drivers/staging/wlags49_h2/hcf.c
index b008773323b3..5957c3a439ac 100644
--- a/drivers/staging/wlags49_h2/hcf.c
+++ b/drivers/staging/wlags49_h2/hcf.c
@@ -91,6 +91,7 @@
 #include "hcf.h"                // HCF and MSF common include file
 #include "hcfdef.h"             // HCF specific include file
 #include "mmd.h"                // MoreModularDriver common include file
+#include <linux/bug.h>
 #include <linux/kernel.h>
 
 #if ! defined offsetof
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 0c33cde2a1e6..cb99b91c3a1d 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -9,6 +9,7 @@
  * (at your option) any later version.
  */
 #include <linux/init.h>
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/atomic.h>
 
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 0d4a127dd9b3..90a683b34075 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
+#include <linux/bug.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index da053313ee5c..8b1ab6222562 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -4,6 +4,7 @@
 
 #include <linux/module.h>
 #include <linux/bitmap.h>
+#include <linux/bug.h>
 
 int iommu_is_span_boundary(unsigned int index, unsigned int nr,
 			   unsigned long shift,
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 7204e619a4c1..1bf2fe36f813 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <linux/kernel.h>
 
 /*
diff --git a/lib/plist.c b/lib/plist.c
index a0a4da489c22..6ab0e521c48b 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -23,6 +23,7 @@
  * information.
  */
 
+#include <linux/bug.h>
 #include <linux/plist.h>
 #include <linux/spinlock.h>
 
diff --git a/lib/string.c b/lib/string.c
index dc4a86341f91..0573a20df9a6 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
+#include <linux/bug.h>
 #include <linux/module.h>
 
 #ifndef __HAVE_ARCH_STRNICMP
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 191176a43e9a..14c640355eb1 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -22,6 +22,7 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/bug.h>
 #include <linux/timerqueue.h>
 #include <linux/rbtree.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Mar 2011 12:33:18 +0100
Subject: sched/rt: Use schedule_preempt_disabled()

Coccinelle based conversion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/process.c              |  4 +---
 arch/avr32/kernel/process.c            |  4 +---
 arch/blackfin/kernel/process.c         |  4 +---
 arch/cris/kernel/process.c             |  4 +---
 arch/frv/kernel/process.c              |  4 +---
 arch/h8300/kernel/process.c            |  4 +---
 arch/ia64/kernel/process.c             |  4 +---
 arch/m32r/kernel/process.c             |  4 +---
 arch/m68k/kernel/process_mm.c          |  4 +---
 arch/m68k/kernel/process_no.c          |  4 +---
 arch/microblaze/kernel/process.c       |  4 +---
 arch/mips/kernel/process.c             |  4 +---
 arch/mn10300/kernel/process.c          |  4 +---
 arch/parisc/kernel/process.c           |  4 +---
 arch/powerpc/kernel/idle.c             |  8 ++++----
 arch/powerpc/platforms/iseries/setup.c |  8 ++------
 arch/s390/kernel/process.c             |  4 +---
 arch/score/kernel/process.c            |  4 +---
 arch/sh/kernel/idle.c                  |  4 +---
 arch/sparc/kernel/process_32.c         |  8 ++------
 arch/sparc/kernel/process_64.c         | 10 ++++------
 arch/tile/kernel/process.c             |  4 +---
 arch/x86/kernel/process_32.c           |  4 +---
 arch/x86/kernel/process_64.c           |  4 +---
 arch/xtensa/kernel/process.c           |  4 +---
 init/main.c                            |  5 +----
 kernel/mutex.c                         |  4 +---
 kernel/softirq.c                       |  4 +---
 28 files changed, 36 insertions(+), 95 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 971d65c253a9..c2ae3cd331fe 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -239,9 +239,7 @@ void cpu_idle(void)
 		leds_event(led_idle_end);
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index ea3395750324..92c5af98a6f7 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -40,9 +40,7 @@ void cpu_idle(void)
 			cpu_idle_sleep();
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 8dd0416673cb..a80a643f3691 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -94,9 +94,7 @@ void cpu_idle(void)
 			idle();
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index aa585e4e979e..d8f50ff6fadd 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -115,9 +115,7 @@ void cpu_idle (void)
 				idle = default_idle;
 			idle();
 		}
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 3901df1213c0..29cc49783787 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -92,9 +92,7 @@ void cpu_idle(void)
 				idle();
 		}
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 933bd388efb2..1a173b35f475 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -81,9 +81,7 @@ void cpu_idle(void)
 	while (1) {
 		while (!need_resched())
 			idle();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 6d33c5cc94f0..9dc52b63fc87 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -330,9 +330,7 @@ cpu_idle (void)
 			normal_xtp();
 #endif
 		}
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		check_pgt_cache();
 		if (cpu_is_offline(cpu))
 			play_dead();
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 422bea9f1dbc..3a4a32b27208 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -90,9 +90,7 @@ void cpu_idle (void)
 
 			idle();
 		}
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/m68k/kernel/process_mm.c b/arch/m68k/kernel/process_mm.c
index 099283ee1a8f..fe4186b5fc32 100644
--- a/arch/m68k/kernel/process_mm.c
+++ b/arch/m68k/kernel/process_mm.c
@@ -78,9 +78,7 @@ void cpu_idle(void)
 	while (1) {
 		while (!need_resched())
 			idle();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/m68k/kernel/process_no.c b/arch/m68k/kernel/process_no.c
index 5e1078cabe0e..f7fe6c348595 100644
--- a/arch/m68k/kernel/process_no.c
+++ b/arch/m68k/kernel/process_no.c
@@ -73,9 +73,7 @@ void cpu_idle(void)
 	/* endless idle loop with no priority at all */
 	while (1) {
 		idle();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 7dcb5bfffb75..9155f7d92669 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -110,9 +110,7 @@ void cpu_idle(void)
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		check_pgt_cache();
 	}
 }
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 7955409051c4..61f1cb45a1d5 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -80,9 +80,7 @@ void __noreturn cpu_idle(void)
 #endif
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 28eec3102535..cac401d37f75 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -123,9 +123,7 @@ void cpu_idle(void)
 			idle();
 		}
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 62c60b87d039..d4b94b395c16 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -71,9 +71,7 @@ void cpu_idle(void)
 	while (1) {
 		while (!need_resched())
 			barrier();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		check_pgt_cache();
 	}
 }
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 0a48bf5db6c8..65035141552b 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -101,11 +101,11 @@ void cpu_idle(void)
 		ppc64_runlatch_on();
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		if (cpu_should_die())
+		if (cpu_should_die()) {
+			preempt_enable_no_resched();
 			cpu_die();
-		schedule();
-		preempt_disable();
+		}
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index 8fc62586a973..a5fbf4cb6329 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -584,9 +584,7 @@ static void iseries_shared_idle(void)
 		if (hvlpevent_is_pending())
 			process_iSeries_events();
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
@@ -615,9 +613,7 @@ static void iseries_dedicated_idle(void)
 		ppc64_runlatch_on();
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index e795933eb2cb..7618085b4164 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -97,9 +97,7 @@ void cpu_idle(void)
 		tick_nohz_idle_exit();
 		if (test_thread_flag(TIF_MCCK_PENDING))
 			s390_handle_mcck();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index 25d08030a883..2707023c7563 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -53,9 +53,7 @@ void __noreturn cpu_idle(void)
 		while (!need_resched())
 			barrier();
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 406508d4ce74..7e4892826563 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -114,9 +114,7 @@ void cpu_idle(void)
 
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index f793742eec2b..935fdbcd88c2 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -113,9 +113,7 @@ void cpu_idle(void)
 			while (!need_resched())
 				cpu_relax();
 		}
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		check_pgt_cache();
 	}
 }
@@ -138,9 +136,7 @@ void cpu_idle(void)
 			while (!need_resched())
 				cpu_relax();
 		}
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		check_pgt_cache();
 	}
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 39d8b05201a2..ab9a29268213 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -104,15 +104,13 @@ void cpu_idle(void)
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
 
-		preempt_enable_no_resched();
-
 #ifdef CONFIG_HOTPLUG_CPU
-		if (cpu_is_offline(cpu))
+		if (cpu_is_offline(cpu)) {
+			preempt_enable_no_resched();
 			cpu_play_dead();
+		}
 #endif
-
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 4c1ac6e5347a..6ae495ef2b99 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -108,9 +108,7 @@ void cpu_idle(void)
 		}
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c08d1ff12b7c..49888fefe794 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -119,9 +119,7 @@ void cpu_idle(void)
 		}
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index cfa5c90c01db..e34257c70c28 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -156,9 +156,7 @@ void cpu_idle(void)
 		}
 
 		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 47041e7c088c..2c9004770c4e 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -113,9 +113,7 @@ void cpu_idle(void)
 	while (1) {
 		while (!need_resched())
 			platform_idle();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/init/main.c b/init/main.c
index ff49a6dacfbb..4990f7ec776a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -374,11 +374,8 @@ static noinline void __init_refok rest_init(void)
 	 * at least once to get things moving:
 	 */
 	init_idle_bootup_task(current);
-	preempt_enable_no_resched();
-	schedule();
-
+	schedule_preempt_disabled();
 	/* Call into cpu_idle with preempt disabled */
-	preempt_disable();
 	cpu_idle();
 }
 
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		schedule_preempt_disabled();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351e..79b524767a24 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
-			preempt_enable_no_resched();
-			schedule();
-			preempt_disable();
+			schedule_preempt_disabled();
 		}
 
 		__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3


From a97f4f5e524bcd09a85ef0b8821a14d35e69335f Mon Sep 17 00:00:00 2001
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Tue, 28 Feb 2012 15:31:35 -0600
Subject: x86/PCI: do not tie MSI MS-7253 use_crs quirk to BIOS version

Carlos was getting

	WARNING: at drivers/pci/pci.c:118 pci_ioremap_bar+0x24/0x52()

when probing his sound card, and sound did not work.  After adding
pci=use_crs to the kernel command line, no more trouble.

Ok, we can add a quirk.  dmidecode output reveals that this is an MSI
MS-7253, for which we already have a quirk, but the short-sighted
author tied the quirk to a single BIOS version, making it not kick in
on Carlos's machine with BIOS V1.2.  If a later BIOS update makes it
no longer necessary to look at the _CRS info it will still be
harmless, so let's stop trying to guess which versions have and don't
have accurate _CRS tables.

Addresses https://bugtrack.alsa-project.org/alsa-bug/view.php?id=5533
Also see <https://bugzilla.kernel.org/show_bug.cgi?id=42619>.

Reported-by: Carlos Luna <caralu74@gmail.com>
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 7034c081b226..49a5cb55429b 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -68,7 +68,6 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"),
 			DMI_MATCH(DMI_BOARD_NAME, "MS-7253"),
 			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
-			DMI_MATCH(DMI_BIOS_VERSION, "V1.6"),
 		},
 	},
 
-- 
cgit v1.2.3


From b263b31e8ad65cdbfa5a7f739460f350554a2dc1 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 27 Feb 2012 15:15:25 -0800
Subject: x86, mtrr: Use explicit sizing and padding for the 64-bit ioctls

Specify the data structures for the 64-bit ioctls with explicit sizing
and padding so that the x32 kernel will correctly use the 64-bit forms
of these ioctls.  Note that these ioctls are bogus in both forms on
both 32 and 64 bits; even on 64 bits the maximum MTRR size is only 44
bits long.

Note that nothing really is supposed to use these ioctls and that the
preferred interface is text strings on /proc/mtrr, or better yet,
nothing at all (use /sys/bus/pci/devices/*/resource*_wc for write
combining; that uses PAT not MTRRs.)

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
Tested-by: Nitin A. Kamble <nitin.a.kamble@intel.com>
Link: http://lkml.kernel.org/n/tip-vwvnlu3hjmtkwvij4qxtm90l@git.kernel.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mtrr.h   | 28 ++++++++++++++++++----------
 arch/x86/kernel/cpu/mtrr/if.c | 10 ++++++----
 2 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 4365ffdb461f..7e3f17f92c66 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -29,18 +29,18 @@
 
 #define	MTRR_IOCTL_BASE	'M'
 
-struct mtrr_sentry {
-    unsigned long base;    /*  Base address     */
-    unsigned int size;    /*  Size of region   */
-    unsigned int type;     /*  Type of region   */
-};
-
 /* Warning: this structure has a different order from i386
    on x86-64. The 32bit emulation code takes care of that.
    But you need to use this for 64bit, otherwise your X server
    will break. */
 
 #ifdef __i386__
+struct mtrr_sentry {
+    unsigned long base;    /*  Base address     */
+    unsigned int size;    /*  Size of region   */
+    unsigned int type;     /*  Type of region   */
+};
+
 struct mtrr_gentry {
     unsigned int regnum;   /*  Register number  */
     unsigned long base;    /*  Base address     */
@@ -50,12 +50,20 @@ struct mtrr_gentry {
 
 #else /* __i386__ */
 
+struct mtrr_sentry {
+	__u64 base;		/*  Base address     */
+	__u32 size;		/*  Size of region   */
+	__u32 type;		/*  Type of region   */
+};
+
 struct mtrr_gentry {
-    unsigned long base;    /*  Base address     */
-    unsigned int size;    /*  Size of region   */
-    unsigned int regnum;   /*  Register number  */
-    unsigned int type;     /*  Type of region   */
+	__u64 base;		/*  Base address     */
+	__u32 size;		/*  Size of region   */
+	__u32 regnum;		/*  Register number  */
+	__u32 type;		/*  Type of region   */
+	__u32 _pad;		/*  Unused	     */
 };
+
 #endif /* !__i386__ */
 
 struct mtrr_var_range {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 79289632cb27..a041e094b8b9 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 {
 	int err = 0;
 	mtrr_type type;
+	unsigned long base;
 	unsigned long size;
 	struct mtrr_sentry sentry;
 	struct mtrr_gentry gentry;
@@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 #endif
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
-		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+		mtrr_if->get(gentry.regnum, &base, &size, &type);
 
 		/* Hide entries that go above 4GB */
-		if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+		if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
 		    || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
 			gentry.base = gentry.size = gentry.type = 0;
 		else {
-			gentry.base <<= PAGE_SHIFT;
+			gentry.base = base << PAGE_SHIFT;
 			gentry.size = size << PAGE_SHIFT;
 			gentry.type = type;
 		}
@@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 #endif
 		if (gentry.regnum >= num_var_ranges)
 			return -EINVAL;
-		mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+		mtrr_if->get(gentry.regnum, &base, &size, &type);
 		/* Hide entries that would overflow */
 		if (size != (__typeof__(gentry.size))size)
 			gentry.base = gentry.size = gentry.type = 0;
 		else {
+			gentry.base = base;
 			gentry.size = size;
 			gentry.type = type;
 		}
-- 
cgit v1.2.3


From 1018faa6cf23b256bf25919ef203cd7c129f06f2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 29 Feb 2012 14:57:32 +0100
Subject: perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled

It turned out that a performance counter on AMD does not
count at all when the GO or HO bit is set in the control
register and SVM is disabled in EFER.

This patch works around this issue by masking out the HO bit
in the performance counter control register when SVM is not
enabled.

The GO bit is not touched because it is only set when the
user wants to count in guest-mode only. So when SVM is
disabled the counter should not run at all and the
not-counting is the intended behaviour.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Avi Kivity <avi@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: stable@vger.kernel.org # v3.2
Link: http://lkml.kernel.org/r/1330523852-19566-1-git-send-email-joerg.roedel@amd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h    |  8 ++++++++
 arch/x86/kernel/cpu/perf_event.h     |  8 ++++++--
 arch/x86/kernel/cpu/perf_event_amd.c | 37 ++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/svm.c                   |  5 +++++
 4 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 096c975e099f..461ce432b1c2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -242,4 +242,12 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 static inline void perf_events_lapic_init(void)	{ }
 #endif
 
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+ extern void amd_pmu_enable_virt(void);
+ extern void amd_pmu_disable_virt(void);
+#else
+ static inline void amd_pmu_enable_virt(void) { }
+ static inline void amd_pmu_disable_virt(void) { }
+#endif
+
 #endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..c30c807ddc72 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -147,7 +147,9 @@ struct cpu_hw_events {
 	/*
 	 * AMD specific bits
 	 */
-	struct amd_nb		*amd_nb;
+	struct amd_nb			*amd_nb;
+	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+	u64				perf_ctr_virt_mask;
 
 	void				*kfree_on_online;
 };
@@ -417,9 +419,11 @@ void x86_pmu_disable_all(void);
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
+	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
 	if (hwc->extra_reg.reg)
 		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-	wrmsrl(hwc->config_base, hwc->config | enable_mask);
+	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
 }
 
 void x86_pmu_enable_all(int added);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..67250a52430b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu)
 	struct amd_nb *nb;
 	int i, nb_id;
 
-	if (boot_cpu_data.x86_max_cores < 2)
+	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
 		return;
 
 	nb_id = amd_get_nb_id(cpu);
@@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
 	.put_event_constraints	= amd_put_event_constraints,
 
 	.cpu_prepare		= amd_pmu_cpu_prepare,
-	.cpu_starting		= amd_pmu_cpu_starting,
 	.cpu_dead		= amd_pmu_cpu_dead,
 #endif
+	.cpu_starting		= amd_pmu_cpu_starting,
 };
 
 __init int amd_pmu_init(void)
@@ -621,3 +624,33 @@ __init int amd_pmu_init(void)
 
 	return 0;
 }
+
+void amd_pmu_enable_virt(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->perf_ctr_virt_mask = 0;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	/*
+	 * We only mask out the Host-only bit so that host-only counting works
+	 * when SVM is disabled. If someone sets up a guest-only counter when
+	 * SVM is disabled the Guest-only bits still gets set and the counter
+	 * will not count anything.
+	 */
+	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5fa553babe56..e385214711cb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -29,6 +29,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 
+#include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/kvm_para.h>
@@ -575,6 +576,8 @@ static void svm_hardware_disable(void *garbage)
 		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 
 	cpu_svm_disable();
+
+	amd_pmu_disable_virt();
 }
 
 static int svm_hardware_enable(void *garbage)
@@ -622,6 +625,8 @@ static int svm_hardware_enable(void *garbage)
 
 	svm_init_erratum_383();
 
+	amd_pmu_enable_virt();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 63ab387ca0d1576edef35ef68e4b8ea5e0757b7a Mon Sep 17 00:00:00 2001
From: Myron Stowe <myron.stowe@redhat.com>
Date: Fri, 2 Mar 2012 12:45:01 -0700
Subject: x86/PCI: add spinlock held check to 'pcibios_fwaddrmap_lookup()'

'pcibios_fwaddrmap_lookup()' is used to maintain FW-assigned BIOS BAR
values for reinstatement when normal resource assignment attempts
fail and must be called with the 'pcibios_fwaddrmap_lock' spinlock
held.

This patch adds a WARN_ON notification if the spinlock is not currently
held by the caller.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/i386.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 33e6a0b995fc..831971e731f7 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -57,6 +57,8 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
 {
 	struct pcibios_fwaddrmap *map;
 
+	WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock));
+
 	list_for_each_entry(map, &pcibios_fwaddrmappings, list)
 		if (map->dev == dev)
 			return map;
-- 
cgit v1.2.3


From e37aade31601cdb9f078f6663cbf887f391bb110 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Tue, 28 Feb 2012 16:16:33 +0100
Subject: x86, memblock: Move mem_hole_size() to .init

mem_hole_size() is being called only from __init-marked functions, and as
such should be moved to .init section as well. Fixes this warning:

WARNING: vmlinux.o(.text+0x35511): Section mismatch in reference from the function mem_hole_size() to the function .init.text:absent_pages_in_range()

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1202281614450.31150@pobox.suse.cz
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/numa_emulation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 46db56845f18..2fff6518e302 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -28,7 +28,7 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
 	return -ENOENT;
 }
 
-static u64 mem_hole_size(u64 start, u64 end)
+static u64 __init mem_hole_size(u64 start, u64 end)
 {
 	unsigned long start_pfn = PFN_UP(start);
 	unsigned long end_pfn = PFN_DOWN(end);
-- 
cgit v1.2.3


From 187f1882b5b0748b3c4c22274663fdb372ac0452 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 23 Nov 2011 20:12:59 -0500
Subject: BUG: headers with BUG/BUG_ON etc. need linux/bug.h

If a header file is making use of BUG, BUG_ON, BUILD_BUG_ON, or any
other BUG variant in a static inline (i.e. not in a #define) then
that header really should be including <linux/bug.h> and not just
expecting it to be implicitly present.

We can make this change risk-free, since if the files using these
headers didn't have exposure to linux/bug.h already, they would have
been causing compile failures/warnings.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 arch/avr32/include/asm/io.h              | 1 +
 arch/m68k/include/asm/system.h           | 1 +
 arch/sparc/include/asm/vga.h             | 1 +
 arch/x86/include/asm/paravirt.h          | 1 +
 include/asm-generic/dma-mapping-common.h | 1 +
 include/asm-generic/pgtable.h            | 1 +
 include/asm-generic/tlbflush.h           | 2 ++
 include/drm/ttm/ttm_memory.h             | 1 +
 include/linux/atmdev.h                   | 1 +
 include/linux/bio.h                      | 1 +
 include/linux/bit_spinlock.h             | 1 +
 include/linux/ceph/decode.h              | 3 ++-
 include/linux/ceph/libceph.h             | 1 +
 include/linux/ceph/mdsmap.h              | 1 +
 include/linux/cpumask.h                  | 1 +
 include/linux/crypto.h                   | 1 +
 include/linux/debug_locks.h              | 1 +
 include/linux/dmaengine.h                | 1 +
 include/linux/elfcore.h                  | 1 +
 include/linux/ext3_fs.h                  | 1 +
 include/linux/fs.h                       | 1 +
 include/linux/fsnotify.h                 | 1 +
 include/linux/gpio.h                     | 1 +
 include/linux/highmem.h                  | 1 +
 include/linux/i2o.h                      | 1 +
 include/linux/if_vlan.h                  | 1 +
 include/linux/io-mapping.h               | 1 +
 include/linux/kprobes.h                  | 1 +
 include/linux/kvm_host.h                 | 1 +
 include/linux/memory_hotplug.h           | 1 +
 include/linux/mm.h                       | 1 +
 include/linux/mtd/cfi.h                  | 1 +
 include/linux/netdevice.h                | 1 +
 include/linux/nilfs2_fs.h                | 1 +
 include/linux/page-flags.h               | 1 +
 include/linux/pid_namespace.h            | 1 +
 include/linux/posix_acl.h                | 1 +
 include/linux/ptrace.h                   | 1 +
 include/linux/radix-tree.h               | 1 +
 include/linux/rcupdate.h                 | 1 +
 include/linux/regset.h                   | 1 +
 include/linux/reiserfs_fs.h              | 1 +
 include/linux/relay.h                    | 1 +
 include/linux/scatterlist.h              | 6 ++++--
 include/linux/seq_file.h                 | 1 +
 include/linux/skbuff.h                   | 1 +
 include/linux/slub_def.h                 | 1 +
 include/linux/ssb/ssb_driver_gige.h      | 1 +
 include/linux/swapops.h                  | 1 +
 include/linux/syscalls.h                 | 1 +
 include/linux/transport_class.h          | 1 +
 include/linux/virtio_config.h            | 1 +
 include/net/cfg80211.h                   | 1 +
 include/net/dst.h                        | 1 +
 include/net/ip_vs.h                      | 1 +
 include/net/mac80211.h                   | 1 +
 include/net/netns/generic.h              | 1 +
 include/net/red.h                        | 1 +
 include/net/tcp.h                        | 1 +
 include/net/timewait_sock.h              | 1 +
 include/net/udp.h                        | 1 +
 include/net/wpan-phy.h                   | 1 +
 include/scsi/osd_ore.h                   | 1 +
 include/scsi/scsi_transport.h            | 1 +
 64 files changed, 69 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h
index 22c97ef92201..cf60d0a9f176 100644
--- a/arch/avr32/include/asm/io.h
+++ b/arch/avr32/include/asm/io.h
@@ -1,6 +1,7 @@
 #ifndef __ASM_AVR32_IO_H
 #define __ASM_AVR32_IO_H
 
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/types.h>
diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h
index 47b01f4726bc..8dc68178716c 100644
--- a/arch/m68k/include/asm/system.h
+++ b/arch/m68k/include/asm/system.h
@@ -3,6 +3,7 @@
 
 #include <linux/linkage.h>
 #include <linux/kernel.h>
+#include <linux/bug.h>
 #include <linux/irqflags.h>
 #include <asm/segment.h>
 #include <asm/entry.h>
diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h
index c69d5b2ba19a..ec0e9967d93d 100644
--- a/arch/sparc/include/asm/vga.h
+++ b/arch/sparc/include/asm/vga.h
@@ -7,6 +7,7 @@
 #ifndef _LINUX_ASM_VGA_H_
 #define _LINUX_ASM_VGA_H_
 
+#include <linux/bug.h>
 #include <asm/types.h>
 
 #define VT_BUF_HAVE_RW
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a7d2db9a74fb..923b07024a03 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -10,6 +10,7 @@
 #include <asm/paravirt_types.h>
 
 #ifndef __ASSEMBLY__
+#include <linux/bug.h>
 #include <linux/types.h>
 #include <linux/cpumask.h>
 
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 9fa3f96e38cf..2e248d8924dc 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -2,6 +2,7 @@
 #define _ASM_GENERIC_DMA_MAPPING_H
 
 #include <linux/kmemcheck.h>
+#include <linux/bug.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 76bff2bff15e..236b1056839f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -5,6 +5,7 @@
 #ifdef CONFIG_MMU
 
 #include <linux/mm_types.h>
+#include <linux/bug.h>
 
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 extern int ptep_set_access_flags(struct vm_area_struct *vma,
diff --git a/include/asm-generic/tlbflush.h b/include/asm-generic/tlbflush.h
index c7af037024c7..d6d0a88430fe 100644
--- a/include/asm-generic/tlbflush.h
+++ b/include/asm-generic/tlbflush.h
@@ -9,6 +9,8 @@
 #error need to implement an architecture specific asm/tlbflush.h
 #endif
 
+#include <linux/bug.h>
+
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
 	BUG();
diff --git a/include/drm/ttm/ttm_memory.h b/include/drm/ttm/ttm_memory.h
index 26c1f78d136f..d6d1da468c97 100644
--- a/include/drm/ttm/ttm_memory.h
+++ b/include/drm/ttm/ttm_memory.h
@@ -30,6 +30,7 @@
 
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#include <linux/bug.h>
 #include <linux/wait.h>
 #include <linux/errno.h>
 #include <linux/kobject.h>
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index f4ff882cb2da..42c471afc52a 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -217,6 +217,7 @@ struct atm_cirange {
 #include <linux/wait.h> /* wait_queue_head_t */
 #include <linux/time.h> /* struct timeval */
 #include <linux/net.h>
+#include <linux/bug.h>
 #include <linux/skbuff.h> /* struct sk_buff */
 #include <linux/uio.h>
 #include <net/sock.h>
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 129a9c097958..f54db088f335 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/mempool.h>
 #include <linux/ioprio.h>
+#include <linux/bug.h>
 
 #ifdef CONFIG_BLOCK
 
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index ac4d9f8b52e9..3b5bafce4337 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/preempt.h>
 #include <linux/atomic.h>
+#include <linux/bug.h>
 
 /*
  *  bit-based spin_lock()
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index c5b6939fb32a..220ae21e819b 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -1,8 +1,9 @@
 #ifndef __CEPH_DECODE_H
 #define __CEPH_DECODE_H
 
-#include <asm/unaligned.h>
+#include <linux/bug.h>
 #include <linux/time.h>
+#include <asm/unaligned.h>
 
 #include "types.h"
 
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 95bd8502e715..e8cf0ccd1a8d 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -7,6 +7,7 @@
 #include <linux/backing-dev.h>
 #include <linux/completion.h>
 #include <linux/exportfs.h>
+#include <linux/bug.h>
 #include <linux/fs.h>
 #include <linux/mempool.h>
 #include <linux/pagemap.h>
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c5cb0880bba..9935fac8c107 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_MDSMAP_H
 #define _FS_CEPH_MDSMAP_H
 
+#include <linux/bug.h>
 #include "types.h"
 
 /*
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 4f7a63237471..7b9b75a529be 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
+#include <linux/bug.h>
 
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 8a94217b298e..d870bae81df1 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -20,6 +20,7 @@
 #include <linux/atomic.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 5033fb88c107..94f20c1488a1 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -3,6 +3,7 @@
 
 #include <linux/kernel.h>
 #include <linux/atomic.h>
+#include <linux/bug.h>
 #include <asm/system.h>
 
 struct task_struct;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 679b349d9b66..a5966f691ef8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -23,6 +23,7 @@
 
 #include <linux/device.h>
 #include <linux/uio.h>
+#include <linux/bug.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
 #include <asm/page.h>
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 394a3e0e4a6b..0698c79fbcb2 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -6,6 +6,7 @@
 #include <linux/time.h>
 #ifdef __KERNEL__
 #include <linux/user.h>
+#include <linux/bug.h>
 #endif
 #include <linux/ptrace.h>
 #include <linux/elf.h>
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index f957085d40ed..f5a84eef6ed2 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -18,6 +18,7 @@
 
 #include <linux/types.h>
 #include <linux/magic.h>
+#include <linux/bug.h>
 
 /*
  * The second extended filesystem constants/structures
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 69cd5bb640f5..abc92db51e54 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -389,6 +389,7 @@ struct inodes_stat_t {
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/pid.h>
+#include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 2a53f10712b3..a6dfe6944564 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -14,6 +14,7 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/audit.h>
 #include <linux/slab.h>
+#include <linux/bug.h>
 
 /*
  * fsnotify_d_instantiate - instantiate a dentry for inode
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 38ac48b7d3a8..ed5a46707ad0 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -34,6 +34,7 @@ struct gpio {
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/bug.h>
 
 struct device;
 struct gpio_chip;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3a93f73a8acc..6ede661e5b8e 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
+#include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index a6deef4f4f67..d23c3c20b201 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -24,6 +24,7 @@
 #define I2O_MAX_DRIVERS		8
 
 #include <linux/pci.h>
+#include <linux/bug.h>
 #include <linux/dma-mapping.h>
 #include <linux/string.h>
 #include <linux/slab.h>
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 13aff1e2183b..82097f39df10 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -17,6 +17,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
+#include <linux/bug.h>
 
 #define VLAN_HLEN	4		/* The additional bytes (on top of the Ethernet header)
 					 * that VLAN requires.
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index e44e84f0156c..657fab4efab3 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -20,6 +20,7 @@
 
 #include <linux/types.h>
 #include <linux/slab.h>
+#include <linux/bug.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index dce6e4dbeda7..b6e1f8c00577 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -33,6 +33,7 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/bug.h>
 #include <linux/percpu.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 900c76337e8f..ca1b153585d3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/preempt.h>
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 0b8e2a742600..910550f3b70e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -4,6 +4,7 @@
 #include <linux/mmzone.h>
 #include <linux/spinlock.h>
 #include <linux/notifier.h>
+#include <linux/bug.h>
 
 struct page;
 struct zone;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17b27cd269c4..b7fac5b6acb6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -6,6 +6,7 @@
 #ifdef __KERNEL__
 
 #include <linux/gfp.h>
+#include <linux/bug.h>
 #include <linux/list.h>
 #include <linux/mmzone.h>
 #include <linux/rbtree.h>
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d5d2ec6494bb..37ef6b194089 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -22,6 +22,7 @@
 
 #include <linux/delay.h>
 #include <linux/types.h>
+#include <linux/bug.h>
 #include <linux/interrupt.h>
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/map.h>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0eac07c95255..5820638193f5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -33,6 +33,7 @@
 #ifdef __KERNEL__
 #include <linux/pm_qos.h>
 #include <linux/timer.h>
+#include <linux/bug.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <asm/cache.h>
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 7454ad7451b4..89bd4a4dcfb4 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -41,6 +41,7 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 #include <linux/magic.h>
+#include <linux/bug.h>
 
 
 #define NILFS_INODE_BMAP_SIZE	7
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e90a673be67e..3cfa3ad94b1f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -6,6 +6,7 @@
 #define PAGE_FLAGS_H
 
 #include <linux/types.h>
+#include <linux/bug.h>
 #ifndef __GENERATING_BOUNDS_H
 #include <linux/mm_types.h>
 #include <generated/bounds.h>
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index e7cf6669ac34..f5bd679be46b 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -2,6 +2,7 @@
 #define _LINUX_PID_NS_H
 
 #include <linux/sched.h>
+#include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/threads.h>
 #include <linux/nsproxy.h>
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index b7681102a4b9..11bad91c4433 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -8,6 +8,7 @@
 #ifndef __LINUX_POSIX_ACL_H
 #define __LINUX_POSIX_ACL_H
 
+#include <linux/bug.h>
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
 
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index c2f1f6a5fcb8..753ee8b62335 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -113,6 +113,7 @@
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
 #include <linux/err.h>			/* for IS_ERR_VALUE */
+#include <linux/bug.h>			/* For BUG_ON.  */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 07e360b1b282..e9a48234e693 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -22,6 +22,7 @@
 
 #include <linux/preempt.h>
 #include <linux/types.h>
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/rcupdate.h>
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 81c04f4348ec..3b657f2bed4a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -42,6 +42,7 @@
 #include <linux/lockdep.h>
 #include <linux/completion.h>
 #include <linux/debugobjects.h>
+#include <linux/bug.h>
 #include <linux/compiler.h>
 
 #ifdef CONFIG_RCU_TORTURE_TEST
diff --git a/include/linux/regset.h b/include/linux/regset.h
index 8abee6556223..6325e099105a 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -15,6 +15,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/bug.h>
 #include <linux/uaccess.h>
 struct task_struct;
 struct user_regset;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 2213ddcce20c..6643fb031293 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/bug.h>
 #include <linux/workqueue.h>
 #include <asm/unaligned.h>
 #include <linux/bitops.h>
diff --git a/include/linux/relay.h b/include/linux/relay.h
index a822fd71fd64..91cacc34c159 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -15,6 +15,7 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/kref.h>
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 9aaf5bfdad1a..ac9586dadfa5 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -1,10 +1,12 @@
 #ifndef _LINUX_SCATTERLIST_H
 #define _LINUX_SCATTERLIST_H
 
+#include <linux/string.h>
+#include <linux/bug.h>
+#include <linux/mm.h>
+
 #include <asm/types.h>
 #include <asm/scatterlist.h>
-#include <linux/mm.h>
-#include <linux/string.h>
 #include <asm/io.h>
 
 struct sg_table {
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 44f1514b00ba..5ff2df6c8217 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/bug.h>
 #include <linux/mutex.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 50db9b04a552..773ae985ec76 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -18,6 +18,7 @@
 #include <linux/kmemcheck.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
+#include <linux/bug.h>
 #include <linux/cache.h>
 
 #include <linux/atomic.h>
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index a32bcfdc7834..ca122b36aec1 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -8,6 +8,7 @@
  */
 #include <linux/types.h>
 #include <linux/gfp.h>
+#include <linux/bug.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
 
diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h
index eba52a100533..6b05dcd927ff 100644
--- a/include/linux/ssb/ssb_driver_gige.h
+++ b/include/linux/ssb/ssb_driver_gige.h
@@ -2,6 +2,7 @@
 #define LINUX_SSB_DRIVER_GIGE_H_
 
 #include <linux/ssb/ssb.h>
+#include <linux/bug.h>
 #include <linux/pci.h>
 #include <linux/spinlock.h>
 
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 2189d3ffc85d..792d16d9cbc7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -2,6 +2,7 @@
 #define _LINUX_SWAPOPS_H
 
 #include <linux/radix-tree.h>
+#include <linux/bug.h>
 
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8ec1153ff57b..3de3acb84a95 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -68,6 +68,7 @@ struct file_handle;
 #include <linux/aio_abi.h>
 #include <linux/capability.h>
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <linux/sem.h>
 #include <asm/siginfo.h>
 #include <asm/signal.h>
diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h
index 9ae8da3e6407..11087cdd4ad3 100644
--- a/include/linux/transport_class.h
+++ b/include/linux/transport_class.h
@@ -10,6 +10,7 @@
 #define _TRANSPORT_CLASS_H_
 
 #include <linux/device.h>
+#include <linux/bug.h>
 #include <linux/attribute_container.h>
 
 struct transport_container;
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 5206d6541da5..7323a3390206 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -53,6 +53,7 @@
 
 #ifdef __KERNEL__
 #include <linux/err.h>
+#include <linux/bug.h>
 #include <linux/virtio.h>
 
 /**
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a067d30ce73e..85b44ca54ac6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/debugfs.h>
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <linux/netlink.h>
 #include <linux/skbuff.h>
 #include <linux/nl80211.h>
diff --git a/include/net/dst.h b/include/net/dst.h
index 344c8dd02874..59c5d18cc385 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
+#include <linux/bug.h>
 #include <linux/jiffies.h>
 #include <net/neighbour.h>
 #include <asm/processor.h>
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ebe517f2da9f..2bdee51ba30d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -16,6 +16,7 @@
 #include <linux/atomic.h>                 /* for struct atomic_t */
 #include <linux/compiler.h>
 #include <linux/timer.h>
+#include <linux/bug.h>
 
 #include <net/checksum.h>
 #include <linux/netfilter.h>		/* for union nf_inet_addr */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d49928ba5d09..8294f44c425a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -13,6 +13,7 @@
 #ifndef MAC80211_H
 #define MAC80211_H
 
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/if_ether.h>
 #include <linux/skbuff.h>
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index d55f43443335..0931618c0f7f 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -5,6 +5,7 @@
 #ifndef __NET_GENERIC_H__
 #define __NET_GENERIC_H__
 
+#include <linux/bug.h>
 #include <linux/rcupdate.h>
 
 /*
diff --git a/include/net/red.h b/include/net/red.h
index 28068ec614b2..77d4c3745cb5 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -2,6 +2,7 @@
 #define __NET_SCHED_RED_H
 
 #include <linux/types.h>
+#include <linux/bug.h>
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
 #include <net/dsfield.h>
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 42c29bfbcee3..ad8d0a865551 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -22,6 +22,7 @@
 
 #include <linux/list.h>
 #include <linux/tcp.h>
+#include <linux/bug.h>
 #include <linux/slab.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 053b3cf2c66a..8d6689cb2c66 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -12,6 +12,7 @@
 #define _TIMEWAIT_SOCK_H
 
 #include <linux/slab.h>
+#include <linux/bug.h>
 #include <net/sock.h>
 
 struct timewait_sock_ops {
diff --git a/include/net/udp.h b/include/net/udp.h
index e39592f682c3..5d606d9da9e5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -23,6 +23,7 @@
 #define _UDP_H
 
 #include <linux/list.h>
+#include <linux/bug.h>
 #include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/snmp.h>
diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h
index d86fffd3c03c..ff27f1b078d1 100644
--- a/include/net/wpan-phy.h
+++ b/include/net/wpan-phy.h
@@ -23,6 +23,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/mutex.h>
+#include <linux/bug.h>
 
 struct wpan_phy {
 	struct mutex pib_lock;
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index f05fa826f89e..a5f9b960dfc8 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -26,6 +26,7 @@
 #include <scsi/osd_attributes.h>
 #include <scsi/osd_sec.h>
 #include <linux/pnfs_osd_xdr.h>
+#include <linux/bug.h>
 
 struct ore_comp {
 	struct osd_obj_id	obj;
diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h
index 0de32cd4e8a7..af244f4bba53 100644
--- a/include/scsi/scsi_transport.h
+++ b/include/scsi/scsi_transport.h
@@ -22,6 +22,7 @@
 
 #include <linux/transport_class.h>
 #include <linux/blkdev.h>
+#include <linux/bug.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 
-- 
cgit v1.2.3


From 901b04450a0ff44d579158b8b0492ce7e66cd442 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Sat, 3 Mar 2012 19:27:27 +0800
Subject: x86/numa: Improve internode cache alignment

Currently cache alignment among nodes in the kernel is still 128
bytes on x86 NUMA machines - we got that X86_INTERNODE_CACHE_SHIFT
default from old P4 processors.

But now most modern x86 CPUs use the same size: 64 bytes from L1 to
last level L3. so let's remove the incorrect setting, and directly
use the L1 cache size to do SMP cache line alignment.

This patch saves some memory space on kernel data, and it also
improves the cache locality of kernel data.

The System.map is quite different with/without this change:

	before patch			after patch
  ...
  000000000000b000 d tlb_vector_|  000000000000b000 d tlb_vector
  000000000000b080 d cpu_loops_p|  000000000000b040 d cpu_loops_
  ...

Signed-off-by: Alex Shi <alex.shi@intel.com>
Cc: asit.k.mallick@intel.com
Link: http://lkml.kernel.org/r/1330774047-18597-1-git-send-email-alex.shi@intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 3c57033e2211..6443c6f038e8 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -303,7 +303,6 @@ config X86_GENERIC
 config X86_INTERNODE_CACHE_SHIFT
 	int
 	default "12" if X86_VSMP
-	default "7" if NUMA
 	default X86_L1_CACHE_SHIFT
 
 config X86_CMPXCHG
-- 
cgit v1.2.3


From 37178b8bf00137dbf28a9b291af4fbc1b8f91dcc Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 29 Nov 2011 14:02:45 +0900
Subject: KVM: MMU: Remove for_each_unsync_children() macro

There is only one user of it and for_each_set_bit() does the same.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda9..8a9b27cb4449 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1391,11 +1391,6 @@ struct kvm_mmu_pages {
 	unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx)		\
-	for (idx = find_first_bit(bitmap, 512);		\
-	     idx < 512;					\
-	     idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 			 int idx)
 {
@@ -1417,7 +1412,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 	int i, ret, nr_unsync_leaf = 0;
 
-	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 
-- 
cgit v1.2.3


From 6addd1aa2ca28c054820ef2966ad372f118c3f31 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 29 Nov 2011 14:03:36 +0900
Subject: KVM: MMU: Add missing large page accounting to drop_large_spte()

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8a9b27cb4449..9270e0d93c31 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1798,6 +1798,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
 		drop_spte(vcpu->kvm, sptep);
+		--vcpu->kvm->stat.lpages;
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
-- 
cgit v1.2.3


From a138fe7535c0ec778465c7b54b1aaaf4cfd885b7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 16 Dec 2011 18:18:10 +0800
Subject: KVM: MMU: remove the redundant get_written_sptes

get_written_sptes is called twice in kvm_mmu_pte_write, one of them can be
removed

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9270e0d93c31..34da43086952 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3551,7 +3551,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * or we may be forking, in which case it is better to unmap the page.
  */
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
 {
 	/*
 	 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3660,10 +3660,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		spte = get_written_sptes(sp, gpa, &npte);
-
 		if (detect_write_misaligned(sp, gpa, bytes) ||
-		      detect_write_flooding(sp, spte)) {
+		      detect_write_flooding(sp)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
-- 
cgit v1.2.3


From e08b96371625aaa84cb03f51acc4c8e0be27403a Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Wed, 4 Jan 2012 10:25:20 +0100
Subject: KVM: s390: add parameter for KVM_CREATE_VM

This patch introduces a new config option for user controlled kernel
virtual machines. It introduces a parameter to KVM_CREATE_VM that
allows to set bits that alter the capabilities of the newly created
virtual machine.
The parameter is passed to kvm_arch_init_vm for all architectures.
The only valid modifier bit for now is KVM_VM_S390_UCONTROL.
This requires CAP_SYS_ADMIN privileges and creates a user controlled
virtual machine on s390 architectures.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  7 ++++++-
 arch/ia64/kvm/kvm-ia64.c          |  5 ++++-
 arch/powerpc/kvm/powerpc.c        |  5 ++++-
 arch/s390/kvm/Kconfig             |  9 +++++++++
 arch/s390/kvm/kvm-s390.c          | 24 +++++++++++++++++++-----
 arch/s390/kvm/kvm-s390.h          | 10 ++++++++++
 arch/x86/kvm/x86.c                |  5 ++++-
 include/linux/kvm.h               |  3 +++
 include/linux/kvm_host.h          |  2 +-
 virt/kvm/kvm_main.c               | 13 +++++--------
 10 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e1d94bf4056e..579d40b26a5a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -95,7 +95,7 @@ described as 'basic' will be available.
 Capability: basic
 Architectures: all
 Type: system ioctl
-Parameters: none
+Parameters: machine type identifier (KVM_VM_*)
 Returns: a VM fd that can be used to control the new virtual machine.
 
 The new VM has no virtual cpus and no memory.  An mmap() of a VM fd
@@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero
 corresponds to guest physical address zero.  Use of mmap() on a VM fd
 is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
 available.
+You most certainly want to use 0 as machine type.
+
+In order to create user controlled virtual machines on S390, check
+KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
+privileged user (CAP_SYS_ADMIN).
 
 4.3 KVM_GET_MSR_INDEX_LIST
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 405052002493..df6b14194051 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -809,10 +809,13 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 #define GUEST_PHYSICAL_RR4	0x2739
 #define VMM_INIT_RR		0x1660
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 	BUG_ON(!kvm);
 
+	if (type)
+		return -EINVAL;
+
 	kvm->arch.is_sn2 = ia64_platform_is("sn2");
 
 	kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 607fbdf24b84..83f244569874 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -171,8 +171,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	*(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	if (type)
+		return -EINVAL;
+
 	return kvmppc_core_init_vm(kvm);
 }
 
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index a21634173a66..78eb9847008f 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -34,6 +34,15 @@ config KVM
 
 	  If unsure, say N.
 
+config KVM_S390_UCONTROL
+	bool "Userspace controlled virtual machines"
+	depends on KVM
+	---help---
+	  Allow CAP_SYS_ADMIN users to create KVM virtual machines that are
+	  controlled by userspace.
+
+	  If unsure, say N.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d1c445732451..f0937552175b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -171,11 +171,22 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	return r;
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 	int rc;
 	char debug_name[16];
 
+	rc = -EINVAL;
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if (type & ~KVM_VM_S390_UCONTROL)
+		goto out_err;
+	if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN)))
+		goto out_err;
+#else
+	if (type)
+		goto out_err;
+#endif
+
 	rc = s390_enable_sie();
 	if (rc)
 		goto out_err;
@@ -198,10 +209,13 @@ int kvm_arch_init_vm(struct kvm *kvm)
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	VM_EVENT(kvm, 3, "%s", "vm created");
 
-	kvm->arch.gmap = gmap_alloc(current->mm);
-	if (!kvm->arch.gmap)
-		goto out_nogmap;
-
+	if (type & KVM_VM_S390_UCONTROL) {
+		kvm->arch.gmap = NULL;
+	} else {
+		kvm->arch.gmap = gmap_alloc(current->mm);
+		if (!kvm->arch.gmap)
+			goto out_nogmap;
+	}
 	return 0;
 out_nogmap:
 	debug_unregister(kvm->arch.dbf);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 99b0b7597115..45b236a7c730 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -47,6 +47,16 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT;
 }
 
+static inline int kvm_is_ucontrol(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if (kvm->arch.gmap)
+		return 0;
+	return 1;
+#else
+	return 0;
+#endif
+}
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cbfc0698118..06925b4bcc27 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6031,8 +6031,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	if (type)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 68e67e50d028..bba393a6760f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -431,6 +431,9 @@ struct kvm_ppc_pvinfo {
 
 #define KVMIO 0xAE
 
+/* machine type bits, to be used as argument to KVM_CREATE_VM */
+#define KVM_VM_S390_UCONTROL	1
+
 /*
  * ioctls for /dev/kvm fds:
  */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 900c76337e8f..82375e145e64 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -520,7 +520,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
-int kvm_arch_init_vm(struct kvm *kvm);
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a91f980077d8..32e3b048a6cf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -449,7 +449,7 @@ static void kvm_init_memslots_id(struct kvm *kvm)
 		slots->id_to_index[i] = slots->memslots[i].id = i;
 }
 
-static struct kvm *kvm_create_vm(void)
+static struct kvm *kvm_create_vm(unsigned long type)
 {
 	int r, i;
 	struct kvm *kvm = kvm_arch_alloc_vm();
@@ -457,7 +457,7 @@ static struct kvm *kvm_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	r = kvm_arch_init_vm(kvm);
+	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_nodisable;
 
@@ -2198,12 +2198,12 @@ static struct file_operations kvm_vm_fops = {
 	.llseek		= noop_llseek,
 };
 
-static int kvm_dev_ioctl_create_vm(void)
+static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
 	int r;
 	struct kvm *kvm;
 
-	kvm = kvm_create_vm();
+	kvm = kvm_create_vm(type);
 	if (IS_ERR(kvm))
 		return PTR_ERR(kvm);
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2254,10 +2254,7 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = KVM_API_VERSION;
 		break;
 	case KVM_CREATE_VM:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = kvm_dev_ioctl_create_vm();
+		r = kvm_dev_ioctl_create_vm(arg);
 		break;
 	case KVM_CHECK_EXTENSION:
 		r = kvm_dev_ioctl_check_extension_generic(arg);
-- 
cgit v1.2.3


From 5b1c1493afe8d69909f9df3221bb2fffdf479f4a Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Wed, 4 Jan 2012 10:25:23 +0100
Subject: KVM: s390: ucontrol: export SIE control block to user

This patch exports the s390 SIE hardware control block to userspace
via the mapping of the vcpu file descriptor. In order to do so,
a new arch callback named kvm_arch_vcpu_fault  is introduced for all
architectures. It allows to map architecture specific pages.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  5 +++++
 arch/ia64/kvm/kvm-ia64.c          |  5 +++++
 arch/powerpc/kvm/powerpc.c        |  5 +++++
 arch/s390/kvm/kvm-s390.c          | 13 +++++++++++++
 arch/x86/kvm/x86.c                |  5 +++++
 include/linux/kvm.h               |  2 ++
 include/linux/kvm_host.h          |  1 +
 virt/kvm/kvm_main.c               |  2 +-
 8 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6e53ff51422f..5ebf47d99e56 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -218,6 +218,11 @@ allocation of vcpu ids.  For example, if userspace wants
 single-threaded guest vcpus, it should make all vcpu ids be a multiple
 of the number of vcpus per vcore.
 
+For virtual cpus that have been created with S390 user controlled virtual
+machines, the resulting vcpu fd can be memory mapped at page offset
+KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
+cpu's hardware control block.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index df6b14194051..8ca7261e7b3d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1566,6 +1566,11 @@ out:
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		struct kvm_memory_slot old,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 83f244569874..a5671616af86 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -659,6 +659,11 @@ out:
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 {
 	u32 inst_lis = 0x3c000000;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index af05328aca25..d6bc65aeb950 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -763,6 +763,19 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+#ifdef CONFIG_KVM_S390_UCONTROL
+	if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET)
+		 && (kvm_is_ucontrol(vcpu->kvm))) {
+		vmf->page = virt_to_page(vcpu->arch.sie_block);
+		get_page(vmf->page);
+		return 0;
+	}
+#endif
+	return VM_FAULT_SIGBUS;
+}
+
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 06925b4bcc27..a3ce196d21fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2814,6 +2814,11 @@ out:
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 {
 	int ret;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 7f686f6708b0..8f888df206a2 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -440,6 +440,8 @@ struct kvm_ppc_pvinfo {
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
 #define KVM_VM_S390_UCONTROL	1
 
+#define KVM_S390_SIE_PAGE_OFFSET 1
+
 /*
  * ioctls for /dev/kvm fds:
  */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 82375e145e64..d4d4d7092110 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -450,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg);
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
 
 int kvm_dev_ioctl_check_extension(long ext);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 32e3b048a6cf..64be836f3348 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1657,7 +1657,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
 	else
-		return VM_FAULT_SIGBUS;
+		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
 	vmf->page = page;
 	return 0;
-- 
cgit v1.2.3


From 4a58ae614a28b1ae3bea1c74a307cdfb7c77dab8 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Fri, 6 Jan 2012 15:06:18 +0100
Subject: KVM: MMU: unnecessary NX state assignment

We can remove the first ->nx state assignment since it is assigned afterwards anyways.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 34da43086952..0a11468d853f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3321,7 +3321,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->get_cr3 = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->nx = is_nx(vcpu);
 
 	if (!is_paging(vcpu)) {
 		context->nx = false;
-- 
cgit v1.2.3


From 2b036c6b861dc5da295c6fe19a3edcff7093fdeb Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@amd.com>
Date: Mon, 9 Jan 2012 14:00:35 -0500
Subject: KVM: SVM: Add support for AMD's OSVW feature in guests

In some cases guests should not provide workarounds for errata even when the
physical processor is affected. For example, because of erratum 400 on family
10h processors a Linux guest will read an MSR (resulting in VMEXIT) before
going to idle in order to avoid getting stuck in a non-C0 state. This is not
necessary: HLT and IO instructions are intercepted and therefore there is no
reason for erratum 400 workaround in the guest.

This patch allows us to present a guest with certain errata as fixed,
regardless of the state of actual hardware.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++++
 arch/x86/kvm/cpuid.c            |  2 +-
 arch/x86/kvm/cpuid.h            |  8 ++++++
 arch/x86/kvm/svm.c              | 59 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 20 ++++++++++++++
 5 files changed, 94 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52d6640a5ca1..bd69c93da8fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -478,6 +478,12 @@ struct kvm_vcpu_arch {
 		u32 id;
 		bool send_user_only;
 	} apf;
+
+	/* OSVW MSRs (AMD only) */
+	struct {
+		u64 length;
+		u64 status;
+	} osvw;
 };
 
 struct kvm_arch {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca5..9fed5bedaad6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 	/* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6d..26d1fb437eb5 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5fa553babe56..fce3ba0f2079 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -110,6 +110,12 @@ struct nested_state {
 #define MSRPM_OFFSETS	16
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
@@ -556,6 +562,27 @@ static void svm_init_erratum_383(void)
 	erratum_383_found = true;
 }
 
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Guests should see errata 400 and 415 as fixed (assuming that
+	 * HLT and IO instructions are intercepted).
+	 */
+	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+	/*
+	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
+	 * all osvw.status bits inside that length, including bit 0 (which is
+	 * reserved for erratum 298), are valid. However, if host processor's
+	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
+	 * be conservative here and therefore we tell the guest that erratum 298
+	 * is present (because we really don't know).
+	 */
+	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+		vcpu->arch.osvw.status |= 1;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
@@ -620,6 +647,36 @@ static int svm_hardware_enable(void *garbage)
 		__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 	}
 
+
+	/*
+	 * Get OSVW bits.
+	 *
+	 * Note that it is possible to have a system with mixed processor
+	 * revisions and therefore different OSVW bits. If bits are not the same
+	 * on different processors then choose the worst case (i.e. if erratum
+	 * is present on one processor and not on another then assume that the
+	 * erratum is present everywhere).
+	 */
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+		uint64_t len, status = 0;
+		int err;
+
+		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+		if (!err)
+			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+						      &err);
+
+		if (err)
+			osvw_status = osvw_len = 0;
+		else {
+			if (len < osvw_len)
+				osvw_len = len;
+			osvw_status |= status;
+			osvw_status &= (1ULL << osvw_len) - 1;
+		}
+	} else
+		osvw_status = osvw_len = 0;
+
 	svm_init_erratum_383();
 
 	return 0;
@@ -1186,6 +1243,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
+	svm_init_osvw(&svm->vcpu);
+
 	return &svm->vcpu;
 
 free_page4:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a3ce196d21fe..2bd77a3a41ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1675,6 +1675,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		 */
 		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.length = data;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.status = data;
+		break;
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
@@ -1959,6 +1969,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 */
 		data = 0xbe702111;
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.length;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.status;
+		break;
 	default:
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
-- 
cgit v1.2.3


From b9e5dc8d4511e6a00862a795319569e7fe7f60f4 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 11 Jan 2012 11:20:30 +0100
Subject: KVM: provide synchronous registers in kvm_run

On some cpus the overhead for virtualization instructions is in the same
range as a system call. Having to call multiple ioctls to get set registers
will make certain userspace handled exits more expensive than necessary.
Lets provide a section in kvm_run that works as a shared save area
for guest registers.
We also provide two 64bit flags fields (architecture specific), that will
specify
1. which parts of these fields are valid.
2. which registers were modified by userspace

Each bit for these flag fields will define a group of registers (like
general purpose) or a single register.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 23 +++++++++++++++++++++++
 arch/ia64/include/asm/kvm.h       |  4 ++++
 arch/powerpc/include/asm/kvm.h    |  4 ++++
 arch/s390/include/asm/kvm.h       |  3 +++
 arch/x86/include/asm/kvm.h        |  4 ++++
 include/linux/kvm.h               | 15 +++++++++++++++
 6 files changed, 53 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a67fb35993fa..7ca696227d3a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1771,6 +1771,29 @@ developer registration required to access it).
 		/* Fix the size of the union. */
 		char padding[256];
 	};
+
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specified the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[1024];
+	} s;
+
+If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access
+certain guest registers without having to call SET/GET_*REGS. Thus we can
+avoid some system call overhead if userspace has to handle the exit.
+Userspace can query the validity of the structure by checking
+kvm_valid_regs for specific bits. These bits are architecture specific
+and usually define the validity of a groups of registers. (e.g. one bit
+ for general purpose registers)
+
 };
 
 6. Capabilities that can be enabled
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index bc90c75adf67..b9f82c84f093 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -261,4 +261,8 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #endif
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index f7727d91ac6b..7d9d4de057ef 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -265,6 +265,10 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #define KVM_REG_MASK		0x001f
 #define KVM_REG_EXT_MASK	0xffe0
 #define KVM_REG_GPR		0x0000
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 82b32a100c7d..325560afb77e 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -41,4 +41,7 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
 #endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4d8dcbdfc120..e7d1c194d272 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -321,4 +321,8 @@ struct kvm_xcrs {
 	__u64 padding[16];
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6cf048d9604b..245bcb3a0fcd 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -279,6 +279,20 @@ struct kvm_run {
 		/* Fix the size of the union. */
 		char padding[256];
 	};
+
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specified the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[1024];
+	} s;
 };
 
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -570,6 +584,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_S390_GMAP 71
 #define KVM_CAP_TSC_DEADLINE_TIMER 72
 #define KVM_CAP_S390_UCONTROL 73
+#define KVM_CAP_SYNC_REGS 74
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 28867cee754c07b3fa0a679ed2ea394843130217 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 16 Jan 2012 15:08:44 +0200
Subject: KVM: x86 emulator: add 8-bit memory operands

Useful for MOVSX/MOVZX.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962a..5da6b3619201 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
 #define OpDS              23ull  /* DS */
 #define OpFS              24ull  /* FS */
 #define OpGS              25ull  /* GS */
+#define OpMem8            26ull  /* 8-bit zero extended memory operand */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc      (OpAcc << SrcShift)
 #define SrcImmU16   (OpImmU16 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
+#define SrcMem8     (OpMem8 << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
@@ -3656,6 +3658,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 	case OpImm:
 		rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 		break;
+	case OpMem8:
+		ctxt->memop.bytes = 1;
+		goto mem_common;
 	case OpMem16:
 		ctxt->memop.bytes = 2;
 		goto mem_common;
-- 
cgit v1.2.3


From 2adb5ad9fe1b44d0ae8b00d2bd6568e6163215b3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 16 Jan 2012 15:08:45 +0200
Subject: KVM: x86 emulator: Remove byte-sized MOVSX/MOVZX hack

Currently we treat MOVSX/MOVZX with a byte source as a byte instruction,
and change the destination operand size with a hack.  Change it to be
a word instruction, so the destination receives its natural size, and
change the source to be SrcMem8.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5da6b3619201..6eaedac7cf6a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -860,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-				    struct operand *op,
-				    int inhibit_bytereg)
+				    struct operand *op)
 {
 	unsigned reg = ctxt->modrm_reg;
 	int highbyte_regs = ctxt->rex_prefix == 0;
@@ -878,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+	if (ctxt->d & ByteOp) {
 		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
@@ -3516,13 +3515,13 @@ static struct opcode twobyte_table[256] = {
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3604,9 +3603,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 	switch (d) {
 	case OpReg:
-		decode_register_operand(ctxt, op,
-			 op == &ctxt->dst &&
-			 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+		decode_register_operand(ctxt, op);
 		break;
 	case OpImmUByte:
 		rc = decode_imm(ctxt, op, 1, false);
-- 
cgit v1.2.3


From 3ea8b75e47ac70bdd0a2c0492102682d43bfa3c4 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 17 Jan 2012 19:50:08 +0900
Subject: KVM: MMU: Remove unused kvm_pte_chain

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bd69c93da8fa..461016614324 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache {
 	void *objects[KVM_NR_MEM_OBJS];
 };
 
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
-	u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
-	struct hlist_node link;
-};
-
 /*
  * kvm_mmu_page_role, below, is defined as:
  *
-- 
cgit v1.2.3


From 9373e2c0576ee15b13e93bc5c5b3ef31d0612992 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 17 Jan 2012 19:51:20 +0900
Subject: KVM: MMU: Remove unused kvm parameter from __gfn_to_rmap()

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0a11468d853f..75b8f579b2a6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -946,7 +946,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
 	struct kvm_lpage_info *linfo;
@@ -966,7 +966,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	struct kvm_memory_slot *slot;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return __gfn_to_rmap(kvm, gfn, level, slot);
+	return __gfn_to_rmap(gfn, level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1018,7 +1018,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
+	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
@@ -1033,7 +1033,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
+		rmapp = __gfn_to_rmap(gfn, i, slot);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-- 
cgit v1.2.3


From e4b35cc960bf216548516d8e39f5e364cfbbc86b Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 17 Jan 2012 19:52:15 +0900
Subject: KVM: MMU: Remove unused kvm parameter from rmap_next()

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 26 +++++++++++++-------------
 arch/x86/kvm/mmu_audit.c |  4 ++--
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 75b8f579b2a6..ae76cc3392e1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -988,7 +988,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 	return pte_list_next(rmapp, spte);
 }
@@ -1019,7 +1019,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	int i, write_protected = 0;
 
 	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1027,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		spte = rmap_next(kvm, rmapp, NULL);
+		spte = rmap_next(rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
 			BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1045,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 				spte = NULL;
 				write_protected = 1;
 			}
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 
@@ -1066,7 +1066,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	u64 *spte;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+	while ((spte = rmap_next(rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		drop_spte(kvm, spte);
@@ -1085,14 +1085,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!is_shadow_present_pte(*spte));
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, spte);
-			spte = rmap_next(kvm, rmapp, NULL);
+			spte = rmap_next(rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1102,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~shadow_accessed_mask;
 			mmu_spte_clear_track_bits(spte);
 			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 	if (need_flush)
@@ -1176,7 +1176,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		int _young;
 		u64 _spte = *spte;
@@ -1186,7 +1186,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	return young;
 }
@@ -1205,7 +1205,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		u64 _spte = *spte;
 		BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1214,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			break;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fe15dcc07a6b..6eabae3d77ff 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		if (is_writable_pte(*spte))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 }
 
-- 
cgit v1.2.3


From e2358851efbcdc34583ee11971a6e4d587ea8bf9 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Tue, 17 Jan 2012 14:09:50 +0100
Subject: KVM: SVM: comment nested paging and virtualization module parameters

Also use true instead of 1 for enabling by default.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fce3ba0f2079..7bbd17cc3488 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -182,11 +182,13 @@ static bool npt_enabled = true;
 #else
 static bool npt_enabled;
 #endif
-static int npt = 1;
 
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From a52315e1d549dad80ff443151927226c11fd8c2b Mon Sep 17 00:00:00 2001
From: Julian Stecklina <js@alien8.de>
Date: Mon, 16 Jan 2012 14:02:20 +0100
Subject: KVM: Don't mistreat edge-triggered INIT IPI as INIT de-assert.
 (LAPIC)

If the guest programs an IPI with level=0 (de-assert) and trig_mode=0 (edge),
it is erroneously treated as INIT de-assert and ignored, but to quote the
spec: "For this delivery mode [INIT de-assert], the level flag must be set to
0 and trigger mode flag to 1."

Signed-off-by: Julian Stecklina <js@alien8.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cfdc6e0ef002..3ee1d83c695d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_INIT:
-		if (level) {
+		if (!trig_mode || level) {
 			result = 1;
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
-- 
cgit v1.2.3


From 1a18a69b762374c423305772500f36eb8984ca52 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 1 Feb 2012 12:23:21 +0200
Subject: KVM: x86 emulator: reject SYSENTER in compatibility mode on AMD
 guests

If the guest thinks it's an AMD, it will not have prepared the SYSENTER MSRs,
and if the guest executes SYSENTER in compatibility mode, it will fails.

Detect this condition and #UD instead, like the spec says.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6eaedac7cf6a..71450aca3b86 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1892,6 +1892,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = ecx = 0;
+	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2008,6 +2019,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
 
+	/*
+	 * Not recognized on AMD in compat mode (but is recognized in legacy
+	 * mode).
+	 */
+	if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+	    && !vendor_intel(ctxt))
+		return emulate_ud(ctxt);
+
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	*/
-- 
cgit v1.2.3


From 242ec97c358256ad6e62dab869f63a03cd244122 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 24 Jan 2012 15:06:05 +0200
Subject: KVM: x86: reset edge sense circuit of i8259 on init

The spec says that during initialization "The edge sense circuit is
reset which means that following initialization an interrupt request
(IR) input must make a low-to-high transition to generate an interrupt",
but currently if edge triggered interrupt is in IRR it is delivered
after i8259 initialization.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1ef..81cf4fa4a2be 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		if (val & 0x10) {
 			s->init4 = val & 1;
 			s->last_irr = 0;
+			s->irr &= s->elcr;
 			s->imr = 0;
 			s->priority_add = 0;
 			s->special_mask = 0;
-- 
cgit v1.2.3


From df156f90a0f90649dd38b7667901ef85478f3d2b Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Tue, 7 Feb 2012 15:52:44 +0100
Subject: x86: Introduce x86_cpuinit.early_percpu_clock_init hook

When kvm guest uses kvmclock, it may hang on vcpu hot-plug.
This is caused by an overflow in pvclock_get_nsec_offset,

    u64 delta = tsc - shadow->tsc_timestamp;

which in turn is caused by an undefined values from percpu
hv_clock that hasn't been initialized yet.
Uninitialized clock on being booted cpu is accessed from
   start_secondary
    -> smp_callin
      ->  smp_store_cpu_info
        -> identify_secondary_cpu
          -> mtrr_ap_init
            -> mtrr_restore
              -> stop_machine_from_inactive_cpu
                -> queue_stop_cpus_work
                  ...
                    -> sched_clock
                      -> kvm_clock_read
which is well before x86_cpuinit.setup_percpu_clockev call in
start_secondary, where percpu clock is initialized.

This patch introduces a hook that allows to setup/initialize
per_cpu clock early and avoid overflow due to reading
  - undefined values
  - old values if cpu was offlined and then onlined again

Another possible early user of this clock source is ftrace that
accesses it to get timestamps for ring buffer entries. So if
mtrr_ap_init is moved from identify_secondary_cpu to past
x86_cpuinit.setup_percpu_clockev in start_secondary, ftrace
may cause the same overflow/hang on cpu hot-plug anyway.

More complete description of the problem:
  https://lkml.org/lkml/2012/2/2/101

Credits to Marcelo Tosatti <mtosatti@redhat.com> for hook idea.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/x86_init.h | 2 ++
 arch/x86/kernel/kvmclock.c      | 4 +---
 arch/x86/kernel/smpboot.c       | 1 +
 arch/x86/kernel/x86_init.c      | 1 +
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 517d4767ffdd..5d0afac2962c 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -145,9 +145,11 @@ struct x86_init_ops {
 /**
  * struct x86_cpuinit_ops - platform specific cpu hotplug setups
  * @setup_percpu_clockev:	set up the per cpu clock event device
+ * @early_percpu_clock_init:	early init of the per cpu clock event device
  */
 struct x86_cpuinit_ops {
 	void (*setup_percpu_clockev)(void);
+	void (*early_percpu_clock_init)(void);
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b29..ca4e735adc54 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -144,8 +144,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 	 * we shouldn't fail.
 	 */
 	WARN_ON(kvm_register_clock("secondary cpu clock"));
-	/* ok, done with our trickery, call native */
-	setup_secondary_APIC_clock();
 }
 #endif
 
@@ -194,7 +192,7 @@ void __init kvmclock_init(void)
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-	x86_cpuinit.setup_percpu_clockev =
+	x86_cpuinit.early_percpu_clock_init =
 		kvm_setup_secondary_clock;
 #endif
 	machine_ops.shutdown  = kvm_shutdown;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..a05d6fd5e06d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
 	 * most necessary things.
 	 */
 	cpu_init();
+	x86_cpuinit.early_percpu_clock_init();
 	preempt_disable();
 	smp_callin();
 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc673..6f2ec53deed0 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
 };
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+	.early_percpu_clock_init	= x86_init_noop,
 	.setup_percpu_clockev		= setup_secondary_APIC_clock,
 	.fixup_cpu_id			= x86_default_fixup_cpu_id,
 };
-- 
cgit v1.2.3


From a59cb29e4d81e025192550c2703f305637f016f6 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 3 Feb 2012 12:28:31 -0200
Subject: KVM: x86: increase recommended max vcpus to 160

Increase recommended max vcpus from 64 to 160 (tested internally
at Red Hat).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 461016614324..782d973b0719 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -29,7 +29,7 @@
 #include <asm/msr-index.h>
 
 #define KVM_MAX_VCPUS 254
-#define KVM_SOFT_MAX_VCPUS 64
+#define KVM_SOFT_MAX_VCPUS 160
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
-- 
cgit v1.2.3


From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:51 +0100
Subject: perf: Add generic taken branch sampling support

This patch adds the ability to sample taken branches to the
perf_event interface.

The ability to capture taken branches is very useful for all
sorts of analysis. For instance, basic block profiling, call
counts, statistical call graph.

This new capability requires hardware assist and as such may
not be available on all HW platforms. On Intel x86 it is
implemented on top of the Last Branch Record (LBR) facility.

To enable taken branches sampling, the PERF_SAMPLE_BRANCH_STACK
bit must be set in attr->sample_type.

Sampled taken branches may be filtered by type and/or priv
levels.

The patch adds a new field, called branch_sample_type, to the
perf_event_attr structure. It contains a bitmask of filters
to apply to the sampled taken branches.

Filters may be implemented in HW. If the HW filter does not exist
or is not good enough, some arch may also implement a SW filter.

The following generic filters are currently defined:
- PERF_SAMPLE_USER
  only branches whose targets are at the user level

- PERF_SAMPLE_KERNEL
  only branches whose targets are at the kernel level

- PERF_SAMPLE_HV
  only branches whose targets are at the hypervisor level

- PERF_SAMPLE_ANY
  any type of branches (subject to priv levels filters)

- PERF_SAMPLE_ANY_CALL
  any call branches (may incl. syscall on some arch)

- PERF_SAMPLE_ANY_RET
  any return branches (may incl. syscall returns on some arch)

- PERF_SAMPLE_IND_CALL
  indirect call branches

Obviously filter may be combined. The priv level bits are optional.
If not provided, the priv level of the associated event are used. It
is possible to collect branches at a priv level different from the
associated event. Use of kernel, hv priv levels is subject to permissions
and availability (hv).

The number of taken branch records present in each sample may vary based
on HW, the type of sampled branches, the executed code. Therefore
each sample contains the number of taken branches it contains.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 +++++----
 include/linux/perf_event.h                 | 71 ++++++++++++++++++++++++++++--
 kernel/events/core.c                       | 68 ++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe54..309d0cc69163 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -142,9 +142,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
-		cpuc->lbr_entries[i].from  = msr_lastbranch.from;
-		cpuc->lbr_entries[i].to    = msr_lastbranch.to;
-		cpuc->lbr_entries[i].flags = 0;
+		cpuc->lbr_entries[i].from	= msr_lastbranch.from;
+		cpuc->lbr_entries[i].to		= msr_lastbranch.to;
+		cpuc->lbr_entries[i].mispred	= 0;
+		cpuc->lbr_entries[i].predicted	= 0;
+		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
 }
@@ -165,19 +167,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
-		u64 from, to, flags = 0;
+		u64 from, to, mis = 0, pred = 0;
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
 		if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
-			flags = !!(from & LBR_FROM_FLAG_MISPRED);
+			mis = !!(from & LBR_FROM_FLAG_MISPRED);
+			pred = !mis;
 			from = (u64)((((s64)from) << 1) >> 1);
 		}
 
-		cpuc->lbr_entries[i].from  = from;
-		cpuc->lbr_entries[i].to    = to;
-		cpuc->lbr_entries[i].flags = flags;
+		cpuc->lbr_entries[i].from	= from;
+		cpuc->lbr_entries[i].to		= to;
+		cpuc->lbr_entries[i].mispred	= mis;
+		cpuc->lbr_entries[i].predicted	= pred;
+		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 64426b71381f..5fc494f4a094 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,10 +129,39 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
 	PERF_SAMPLE_RAW				= 1U << 10,
+	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
 
-	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 12,		/* non-ABI */
 };
 
+/*
+ * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set
+ *
+ * If the user does not pass priv level information via branch_sample_type,
+ * the kernel uses the event's priv level. Branch and event priv levels do
+ * not have to match. Branch priv level is checked for permissions.
+ *
+ * The branch types can be combined, however BRANCH_ANY covers all types
+ * of branches and therefore it supersedes all the other types.
+ */
+enum perf_branch_sample_type {
+	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
+	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
+	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */
+
+	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
+	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
+	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
+	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */
+
+	PERF_SAMPLE_BRANCH_MAX		= 1U << 7, /* non-ABI */
+};
+
+#define PERF_SAMPLE_BRANCH_PLM_ALL \
+	(PERF_SAMPLE_BRANCH_USER|\
+	 PERF_SAMPLE_BRANCH_KERNEL|\
+	 PERF_SAMPLE_BRANCH_HV)
+
 /*
  * The format of the data returned by read() on a perf event fd,
  * as specified by attr.read_format:
@@ -240,6 +269,7 @@ struct perf_event_attr {
 		__u64		bp_len;
 		__u64		config2; /* extension of config1 */
 	};
+	__u64	branch_sample_type; /* enum branch_sample_type */
 };
 
 /*
@@ -458,6 +488,8 @@ enum perf_event_type {
 	 *
 	 *	{ u32			size;
 	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
+	 *
+	 *	{ u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
@@ -530,12 +562,34 @@ struct perf_raw_record {
 	void				*data;
 };
 
+/*
+ * single taken branch record layout:
+ *
+ *      from: source instruction (may not always be a branch insn)
+ *        to: branch target
+ *   mispred: branch target was mispredicted
+ * predicted: branch target was predicted
+ *
+ * support for mispred, predicted is optional. In case it
+ * is not supported mispred = predicted = 0.
+ */
 struct perf_branch_entry {
-	__u64				from;
-	__u64				to;
-	__u64				flags;
+	__u64	from;
+	__u64	to;
+	__u64	mispred:1,  /* target mispredicted */
+		predicted:1,/* target predicted */
+		reserved:62;
 };
 
+/*
+ * branch stack layout:
+ *  nr: number of taken branches stored in entries[]
+ *
+ * Note that nr can vary from sample to sample
+ * branches (to, from) are stored from most recent
+ * to least recent, i.e., entries[0] contains the most
+ * recent branch.
+ */
 struct perf_branch_stack {
 	__u64				nr;
 	struct perf_branch_entry	entries[0];
@@ -566,7 +620,9 @@ struct hw_perf_event {
 			unsigned long	event_base;
 			int		idx;
 			int		last_cpu;
+
 			struct hw_perf_event_extra extra_reg;
+			struct hw_perf_event_extra branch_reg;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
@@ -1007,12 +1063,14 @@ struct perf_sample_data {
 	u64				period;
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
+	struct perf_branch_stack	*br_stack;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
 {
 	data->addr = addr;
 	data->raw  = NULL;
+	data->br_stack = NULL;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
@@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data);
 # define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
 
+static inline bool has_branch_stack(struct perf_event *event)
+{
+	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8b32ac75ce3..5820efdf47cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP)
 
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+	(PERF_SAMPLE_BRANCH_KERNEL |\
+	 PERF_SAMPLE_BRANCH_HV)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
@@ -3907,6 +3914,24 @@ void perf_output_sample(struct perf_output_handle *handle,
 			}
 		}
 	}
+
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		if (data->br_stack) {
+			size_t size;
+
+			size = data->br_stack->nr
+			     * sizeof(struct perf_branch_entry);
+
+			perf_output_put(handle, data->br_stack->nr);
+			perf_output_copy(handle, data->br_stack->entries, size);
+		} else {
+			/*
+			 * we always store at least the value of nr
+			 */
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
+	}
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -3949,6 +3974,15 @@ void perf_prepare_sample(struct perf_event_header *header,
 		WARN_ON_ONCE(size & (sizeof(u64)-1));
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		int size = sizeof(u64); /* nr */
+		if (data->br_stack) {
+			size += data->br_stack->nr
+			      * sizeof(struct perf_branch_entry);
+		}
+		header->size += size;
+	}
 }
 
 static void perf_event_output(struct perf_event *event,
@@ -5935,6 +5969,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
 		return -EINVAL;
 
+	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		u64 mask = attr->branch_sample_type;
+
+		/* only using defined bits */
+		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+			return -EINVAL;
+
+		/* at least one branch bit must be set */
+		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+			return -EINVAL;
+
+		/* kernel level capture: check permissions */
+		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		/* propagate priv level, when not set for branch */
+		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+			/* exclude_kernel checked on syscall entry */
+			if (!attr->exclude_kernel)
+				mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+			if (!attr->exclude_user)
+				mask |= PERF_SAMPLE_BRANCH_USER;
+
+			if (!attr->exclude_hv)
+				mask |= PERF_SAMPLE_BRANCH_HV;
+			/*
+			 * adjust user setting (for HW filter setup)
+			 */
+			attr->branch_sample_type = mask;
+		}
+	}
 out:
 	return ret;
 
-- 
cgit v1.2.3


From 225ce53910edc3c2322b1e4f2ed049a9196cd0b3 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:52 +0100
Subject: perf/x86: Add Intel LBR MSR definitions

This patch adds the LBR definitions for NHM/WSM/SNB and Core.
It also adds the definitions for the architected LBR MSR:
LBR_SELECT, LBRT_TOS.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h           |  7 +++++++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 +++++++++---------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a6962d9161a0..ccb805966f68 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -56,6 +56,13 @@
 #define MSR_OFFCORE_RSP_0		0x000001a6
 #define MSR_OFFCORE_RSP_1		0x000001a7
 
+#define MSR_LBR_SELECT			0x000001c8
+#define MSR_LBR_TOS			0x000001c9
+#define MSR_LBR_NHM_FROM		0x00000680
+#define MSR_LBR_NHM_TO			0x000006c0
+#define MSR_LBR_CORE_FROM		0x00000040
+#define MSR_LBR_CORE_TO			0x00000060
+
 #define MSR_IA32_PEBS_ENABLE		0x000003f1
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 309d0cc69163..6710a5116ebd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -203,23 +203,23 @@ void intel_pmu_lbr_read(void)
 void intel_pmu_lbr_init_core(void)
 {
 	x86_pmu.lbr_nr     = 4;
-	x86_pmu.lbr_tos    = 0x01c9;
-	x86_pmu.lbr_from   = 0x40;
-	x86_pmu.lbr_to     = 0x60;
+	x86_pmu.lbr_tos    = MSR_LBR_TOS;
+	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
 }
 
 void intel_pmu_lbr_init_nhm(void)
 {
 	x86_pmu.lbr_nr     = 16;
-	x86_pmu.lbr_tos    = 0x01c9;
-	x86_pmu.lbr_from   = 0x680;
-	x86_pmu.lbr_to     = 0x6c0;
+	x86_pmu.lbr_tos    = MSR_LBR_TOS;
+	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
 }
 
 void intel_pmu_lbr_init_atom(void)
 {
 	x86_pmu.lbr_nr	   = 8;
-	x86_pmu.lbr_tos    = 0x01c9;
-	x86_pmu.lbr_from   = 0x40;
-	x86_pmu.lbr_to     = 0x60;
+	x86_pmu.lbr_tos    = MSR_LBR_TOS;
+	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
 }
-- 
cgit v1.2.3


From b36817e8863090f1f24e538106ca50fa1d9e4003 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:53 +0100
Subject: perf/x86: Add Intel LBR sharing logic

The Intel LBR on some recent processor is capable
of filtering branches by type. The filter is configurable
via the LBR_SELECT MSR register.

There are limitation on how this register can be used.

On Nehalem/Westmere, the LBR_SELECT is shared by the two HT threads
when HT is on. It is private to each core when HT is off.

On SandyBridge, the LBR_SELECT register is private to each thread
when HT is on. It is private to each core when HT is off.

The kernel must manage the sharing of LBR_SELECT. It allows
multiple users on the same logical CPU to use LBR_SELECT as
long as they program it with the same value. Across sibling
CPUs (HT threads), the same restriction applies on NHM/WSM.

This patch implements this sharing logic by leveraging the
mechanism put in place for managing the offcore_response
shared MSR.

We modify __intel_shared_reg_get_constraints() to cause
x86_get_event_constraint() to be called because LBR may
be associated with events that may be counter constrained.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-4-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++
 arch/x86/kernel/cpu/perf_event.h       |  4 ++
 arch/x86/kernel/cpu/perf_event_intel.c | 70 +++++++++++++++++++++-------------
 3 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f8bddb5b0600..377931354ac7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,6 +426,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	/* mark unused */
 	event->hw.extra_reg.idx = EXTRA_REG_NONE;
 
+	/* mark not used */
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
 	return x86_pmu.hw_config(event);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 82db83b5c3bc..9b9c580a7ab8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
 
 	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
 	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
+	EXTRA_REG_LBR   = 2,	/* lbr_select */
 
 	EXTRA_REG_MAX		/* number of entries needed */
 };
@@ -130,6 +131,7 @@ struct cpu_hw_events {
 	void				*lbr_context;
 	struct perf_branch_stack	lbr_stack;
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+	struct er_account		*lbr_sel;
 
 	/*
 	 * Intel host/guest exclude bits
@@ -342,6 +344,8 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
+	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */
+	const int	*lbr_sel_map;		   /* lbr_select mappings */
 
 	/*
 	 * Extra registers for events
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..97f7bb587519 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1123,17 +1123,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
  */
 static struct event_constraint *
 __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-				   struct perf_event *event)
+				   struct perf_event *event,
+				   struct hw_perf_event_extra *reg)
 {
 	struct event_constraint *c = &emptyconstraint;
-	struct hw_perf_event_extra *reg = &event->hw.extra_reg;
 	struct er_account *era;
 	unsigned long flags;
 	int orig_idx = reg->idx;
 
 	/* already allocated shared msr */
 	if (reg->alloc)
-		return &unconstrained;
+		return NULL; /* call x86_get_event_constraint() */
 
 again:
 	era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1156,10 @@ again:
 		reg->alloc = 1;
 
 		/*
-		 * All events using extra_reg are unconstrained.
-		 * Avoids calling x86_get_event_constraints()
-		 *
-		 * Must revisit if extra_reg controlling events
-		 * ever have constraints. Worst case we go through
-		 * the regular event constraint table.
+		 * need to call x86_get_event_constraint()
+		 * to check if associated event has constraints
 		 */
-		c = &unconstrained;
+		c = NULL;
 	} else if (intel_try_alt_er(event, orig_idx)) {
 		raw_spin_unlock_irqrestore(&era->lock, flags);
 		goto again;
@@ -1200,11 +1196,23 @@ static struct event_constraint *
 intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
 			      struct perf_event *event)
 {
-	struct event_constraint *c = NULL;
-
-	if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
-		c = __intel_shared_reg_get_constraints(cpuc, event);
-
+	struct event_constraint *c = NULL, *d;
+	struct hw_perf_event_extra *xreg, *breg;
+
+	xreg = &event->hw.extra_reg;
+	if (xreg->idx != EXTRA_REG_NONE) {
+		c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+		if (c == &emptyconstraint)
+			return c;
+	}
+	breg = &event->hw.branch_reg;
+	if (breg->idx != EXTRA_REG_NONE) {
+		d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+		if (d == &emptyconstraint) {
+			__intel_shared_reg_put_constraints(cpuc, xreg);
+			c = d;
+		}
+	}
 	return c;
 }
 
@@ -1252,6 +1260,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 	reg = &event->hw.extra_reg;
 	if (reg->idx != EXTRA_REG_NONE)
 		__intel_shared_reg_put_constraints(cpuc, reg);
+
+	reg = &event->hw.branch_reg;
+	if (reg->idx != EXTRA_REG_NONE)
+		__intel_shared_reg_put_constraints(cpuc, reg);
 }
 
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1431,7 +1443,7 @@ static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!x86_pmu.extra_regs)
+	if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
 		return NOTIFY_OK;
 
 	cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1465,28 @@ static void intel_pmu_cpu_starting(int cpu)
 	 */
 	intel_pmu_lbr_reset();
 
-	if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+	cpuc->lbr_sel = NULL;
+
+	if (!cpuc->shared_regs)
 		return;
 
-	for_each_cpu(i, topology_thread_cpumask(cpu)) {
-		struct intel_shared_regs *pc;
+	if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+			struct intel_shared_regs *pc;
 
-		pc = per_cpu(cpu_hw_events, i).shared_regs;
-		if (pc && pc->core_id == core_id) {
-			cpuc->kfree_on_online = cpuc->shared_regs;
-			cpuc->shared_regs = pc;
-			break;
+			pc = per_cpu(cpu_hw_events, i).shared_regs;
+			if (pc && pc->core_id == core_id) {
+				cpuc->kfree_on_online = cpuc->shared_regs;
+				cpuc->shared_regs = pc;
+				break;
+			}
 		}
+		cpuc->shared_regs->core_id = core_id;
+		cpuc->shared_regs->refcnt++;
 	}
 
-	cpuc->shared_regs->core_id = core_id;
-	cpuc->shared_regs->refcnt++;
+	if (x86_pmu.lbr_sel_map)
+		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 }
 
 static void intel_pmu_cpu_dying(int cpu)
-- 
cgit v1.2.3


From ff3fb511ba377e8a0a7f553cc352237f70d08121 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:54 +0100
Subject: perf/x86: Sync branch stack sampling with precise_sampling

If precise sampling is enabled on Intel x86 then perf_event uses PEBS.
To correct for the off-by-one error of PEBS, perf_event uses LBR when
precise_sample > 1.

On Intel x86 PERF_SAMPLE_BRANCH_STACK is implemented using LBR,
therefore both features must be coordinated as they may not
configure LBR the same way.

For PEBS, LBR needs to capture all branches at the priv level of
the associated event.

This patch checks that the branch type and priv level of BRANCH_STACK
is compatible with that of the PEBS LBR requirement, thereby allowing:

   $ perf record -b any,u -e instructions:upp ....

But:

   $ perf record -b any_call,u -e instructions:upp

Is not possible.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-5-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 60 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 377931354ac7..cea567483274 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -353,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)
 	return 0;
 }
 
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+	u64 m = event->attr.branch_sample_type;
+	u64 b = 0;
+
+	/* must capture all branches */
+	if (!(m & PERF_SAMPLE_BRANCH_ANY))
+		return 0;
+
+	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+	if (!event->attr.exclude_user)
+		b |= PERF_SAMPLE_BRANCH_USER;
+
+	if (!event->attr.exclude_kernel)
+		b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+	/*
+	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+	 */
+
+	return m == b;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
 	if (event->attr.precise_ip) {
@@ -369,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
+		/*
+		 * check that PEBS LBR correction does not conflict with
+		 * whatever the user is asking with attr->branch_sample_type
+		 */
+		if (event->attr.precise_ip > 1) {
+			u64 *br_type = &event->attr.branch_sample_type;
+
+			if (has_branch_stack(event)) {
+				if (!precise_br_compat(event))
+					return -EOPNOTSUPP;
+
+				/* branch_sample_type is compatible */
+
+			} else {
+				/*
+				 * user did not specify  branch_sample_type
+				 *
+				 * For PEBS fixups, we capture all
+				 * the branches at the priv level of the
+				 * event.
+				 */
+				*br_type = PERF_SAMPLE_BRANCH_ANY;
+
+				if (!event->attr.exclude_user)
+					*br_type |= PERF_SAMPLE_BRANCH_USER;
+
+				if (!event->attr.exclude_kernel)
+					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+			}
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From c5cc2cd906ea9fe73e3c93f9ad824996faa278cc Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:55 +0100
Subject: perf/x86: Add Intel LBR mappings for PERF_SAMPLE_BRANCH filters

This patch adds the mappings from the generic PERF_SAMPLE_BRANCH_*
filters to the actual Intel x86LBR filters, whenever they exist.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-6-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.h           |   2 +
 arch/x86/kernel/cpu/perf_event_intel.c     |   2 +-
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 103 ++++++++++++++++++++++++++++-
 3 files changed, 104 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 9b9c580a7ab8..4e948976aefb 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -539,6 +539,8 @@ void intel_pmu_lbr_init_nhm(void);
 
 void intel_pmu_lbr_init_atom(void);
 
+void intel_pmu_lbr_init_snb(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 97f7bb587519..b0db01692441 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1757,7 +1757,7 @@ __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
-		intel_pmu_lbr_init_nhm();
+		intel_pmu_lbr_init_snb();
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 6710a5116ebd..e54a063b2863 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,6 +13,49 @@ enum {
 	LBR_FORMAT_EIP_FLAGS	= 0x03,
 };
 
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT		0 /* do not capture at ring0 */
+#define LBR_USER_BIT		1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT		2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT	3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT	4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT		5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
+#define LBR_FAR_BIT		8 /* do not capture far branches */
+
+#define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
+#define LBR_USER	(1 << LBR_USER_BIT)
+#define LBR_JCC		(1 << LBR_JCC_BIT)
+#define LBR_REL_CALL	(1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL	(1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN	(1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
+#define LBR_FAR		(1 << LBR_FAR_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP	-1	/* LBR filter not supported */
+#define LBR_IGN		0	/* ignored */
+
+#define LBR_ANY		 \
+	(LBR_JCC	|\
+	 LBR_REL_CALL	|\
+	 LBR_IND_CALL	|\
+	 LBR_RETURN	|\
+	 LBR_REL_JMP	|\
+	 LBR_IND_JMP	|\
+	 LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
 /*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
@@ -151,8 +194,6 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 	cpuc->lbr_stack.nr = i;
 }
 
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-
 /*
  * Due to lack of segmentation in Linux the effective address (offset)
  * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -200,26 +241,84 @@ void intel_pmu_lbr_read(void)
 		intel_pmu_lbr_read_64(cpuc);
 }
 
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_REL_JMP
+					| LBR_IND_JMP | LBR_FAR,
+	/*
+	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+	 */
+	[PERF_SAMPLE_BRANCH_ANY_CALL] =
+	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+	/*
+	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+	 */
+	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL
+					| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL,
+};
+
+/* core */
 void intel_pmu_lbr_init_core(void)
 {
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
 	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
 	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+	pr_cont("4-deep LBR, ");
 }
 
+/* nehalem/westmere */
 void intel_pmu_lbr_init_nhm(void)
 {
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
 	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
 	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
+
+	pr_cont("16-deep LBR, ");
 }
 
+/* sandy bridge */
+void intel_pmu_lbr_init_snb(void)
+{
+	x86_pmu.lbr_nr	 = 16;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+	pr_cont("16-deep LBR, ");
+}
+
+/* atom */
 void intel_pmu_lbr_init_atom(void)
 {
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
 	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
 	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+	pr_cont("8-deep LBR, ");
 }
-- 
cgit v1.2.3


From 88c9a65e13f393fd60d8b9e9c659a34f9e39967d Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:56 +0100
Subject: perf/x86: Disable LBR support for older Intel Atom processors

The patch adds a restriction for Intel Atom LBR support. Only
steppings 10 (PineView) and more recent are supported. Older models
do not have a functional LBR. Their LBR does not freeze on PMU
interrupt which makes LBR unusable in the context of perf_events.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index e54a063b2863..07f0ff88e443 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -315,6 +315,16 @@ void intel_pmu_lbr_init_snb(void)
 /* atom */
 void intel_pmu_lbr_init_atom(void)
 {
+	/*
+	 * only models starting at stepping 10 seems
+	 * to have an operational LBR which can freeze
+	 * on PMU interrupt
+	 */
+	if (boot_cpu_data.x86_mask < 10) {
+		pr_cont("LBR disabled due to erratum");
+		return;
+	}
+
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
 	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-- 
cgit v1.2.3


From 60ce0fbd072695866cb27b729690ab59dce705a5 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:57 +0100
Subject: perf/x86: Implement PERF_SAMPLE_BRANCH for Intel CPUs

This patch implements PERF_SAMPLE_BRANCH support for Intel
x86processors. It connects PERF_SAMPLE_BRANCH to the actual LBR.

The patch adds the hooks in the PMU irq handler to save the LBR
on counter overflow for both regular and PEBS modes.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-8-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.h           |  2 +
 arch/x86/kernel/cpu/perf_event_intel.c     | 35 ++++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 10 ++--
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 86 +++++++++++++++++++++++++++++-
 4 files changed, 125 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e948976aefb..ef7419cbd13d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -541,6 +541,8 @@ void intel_pmu_lbr_init_atom(void);
 
 void intel_pmu_lbr_init_snb(void);
 
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b0db01692441..7cc1e2dcc4dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -727,6 +727,19 @@ static __initconst const u64 atom_hw_cache_event_ids
  },
 };
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+	/* user explicitly requested branch sampling */
+	if (has_branch_stack(event))
+		return true;
+
+	/* implicit branch sampling to correct PEBS skid */
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+		return true;
+
+	return false;
+}
+
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +894,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
 	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
 
+	/*
+	 * must disable before any actual event
+	 * because any event may be combined with LBR
+	 */
+	if (intel_pmu_needs_lbr_smpl(event))
+		intel_pmu_lbr_disable(event);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_disable_fixed(hwc);
 		return;
@@ -935,6 +955,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		intel_pmu_enable_bts(hwc->config);
 		return;
 	}
+	/*
+	 * must enabled before any actual event
+	 * because any event may be combined with LBR
+	 */
+	if (intel_pmu_needs_lbr_smpl(event))
+		intel_pmu_lbr_enable(event);
 
 	if (event->attr.exclude_host)
 		cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1083,9 @@ again:
 
 		data.period = event->hw.last_period;
 
+		if (has_branch_stack(event))
+			data.br_stack = &cpuc->lbr_stack;
+
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
@@ -1305,6 +1334,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		event->hw.config = alt_config;
 	}
 
+	if (intel_pmu_needs_lbr_smpl(event)) {
+		ret = intel_pmu_setup_lbr_filter(event);
+		if (ret)
+			return ret;
+	}
+
 	if (event->attr.type != PERF_TYPE_RAW)
 		return 0;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40c..ee7e3c8d9d6a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -439,9 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-		intel_pmu_lbr_enable(event);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +451,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-		intel_pmu_lbr_disable(event);
 }
 
 void intel_pmu_pebs_enable_all(void)
@@ -572,6 +566,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	 * both formats and we don't use the other fields in this
 	 * routine.
 	 */
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct pebs_record_core *pebs = __pebs;
 	struct perf_sample_data data;
 	struct pt_regs regs;
@@ -602,6 +597,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
+	if (has_branch_stack(event))
+		data.br_stack = &cpuc->lbr_stack;
+
 	if (perf_event_overflow(event, &data, &regs))
 		x86_pmu_stop(event, 0);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 07f0ff88e443..d0fb864ff2b0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -56,6 +56,10 @@ enum {
 
 #define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
 
+#define for_each_branch_sample_type(x) \
+	for ((x) = PERF_SAMPLE_BRANCH_USER; \
+	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
 /*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
@@ -64,6 +68,10 @@ enum {
 static void __intel_pmu_lbr_enable(void)
 {
 	u64 debugctl;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_sel)
+		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -119,7 +127,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	 * Reset the LBR stack if we changed task context to
 	 * avoid data leaks.
 	 */
-
 	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
 		intel_pmu_lbr_reset();
 		cpuc->lbr_context = event->ctx;
@@ -138,8 +145,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 
-	if (cpuc->enabled && !cpuc->lbr_users)
+	if (cpuc->enabled && !cpuc->lbr_users) {
 		__intel_pmu_lbr_disable();
+		/* avoid stale pointer */
+		cpuc->lbr_context = NULL;
+	}
 }
 
 void intel_pmu_lbr_enable_all(void)
@@ -158,6 +168,9 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
 static inline u64 intel_pmu_lbr_tos(void)
 {
 	u64 tos;
@@ -241,6 +254,75 @@ void intel_pmu_lbr_read(void)
 		intel_pmu_lbr_read_64(cpuc);
 }
 
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg;
+	u64 br_type = event->attr.branch_sample_type;
+	u64 mask = 0, m;
+	u64 v;
+
+	for_each_branch_sample_type(m) {
+		if (!(br_type & m))
+			continue;
+
+		v = x86_pmu.lbr_sel_map[m];
+		if (v == LBR_NOT_SUPP)
+			return -EOPNOTSUPP;
+		mask |= v;
+
+		if (m == PERF_SAMPLE_BRANCH_ANY)
+			break;
+	}
+	reg = &event->hw.branch_reg;
+	reg->idx = EXTRA_REG_LBR;
+
+	/* LBR_SELECT operates in suppress mode so invert mask */
+	reg->config = ~mask & x86_pmu.lbr_sel_mask;
+
+	return 0;
+}
+
+/*
+ * all the bits supported on some flavor of x86LBR
+ * we ignore BRANCH_HV because it is not supported
+ */
+#define PERF_SAMPLE_BRANCH_X86_ALL	\
+	(PERF_SAMPLE_BRANCH_ANY		|\
+	 PERF_SAMPLE_BRANCH_USER	|\
+	 PERF_SAMPLE_BRANCH_KERNEL)
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+	u64 br_type = event->attr.branch_sample_type;
+
+	/*
+	 * no LBR on this PMU
+	 */
+	if (!x86_pmu.lbr_nr)
+		return -EOPNOTSUPP;
+
+	/*
+	 * if no LBR HW filter, users can only
+	 * capture all branches
+	 */
+	if (!x86_pmu.lbr_sel_map) {
+		if (br_type != PERF_SAMPLE_BRANCH_X86_ALL)
+			return -EOPNOTSUPP;
+		return 0;
+	}
+	/*
+	 * we ignore branch priv levels we do not
+	 * know about: BRANCH_HV
+	 */
+
+	return intel_pmu_setup_hw_lbr_filter(event);
+}
+
 /*
  * Map interface branch filters onto LBR filters
  */
-- 
cgit v1.2.3


From 3e702ff6d1ea12dcf1c798ecb61e7f3a1579df42 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:58 +0100
Subject: perf/x86: Add LBR software filter support for Intel CPUs

This patch adds an internal sofware filter to complement
the (optional) LBR hardware filter.

The software filter is necessary:

 - as a substitute when there is no HW LBR filter (e.g., Atom, Core)
 - to complement HW LBR filter in case of errata (e.g., Nehalem/Westmere)
 - to provide finer grain filtering (e.g., all processors)

Sometimes the LBR HW filter cannot distinguish between two types
of branches. For instance, to capture syscall as CALLS, it is necessary
to enable the LBR_FAR filter which will also capture JMP instructions.
Thus, a second pass is necessary to filter those out, this is what the
SW filter can do.

The SW filter is built on top of the internal x86 disassembler. It
is a best effort filter especially for user level code. It is subject
to the availability of the text page of the program.

The SW filter is enabled on all Intel processors. It is bypassed
when the user is capturing all branches at all priv levels.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-9-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.h           |  10 +
 arch/x86/kernel/cpu/perf_event_intel_ds.c  |  12 +-
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 332 +++++++++++++++++++++++++++--
 3 files changed, 321 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index ef7419cbd13d..f104c054dc5c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -132,6 +132,7 @@ struct cpu_hw_events {
 	struct perf_branch_stack	lbr_stack;
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 	struct er_account		*lbr_sel;
+	u64				br_sel;
 
 	/*
 	 * Intel host/guest exclude bits
@@ -459,6 +460,15 @@ extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
 
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+	return ip > PAGE_OFFSET;
+#else
+	return (long)ip < 0;
+#endif
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ee7e3c8d9d6a..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
 #include <linux/slab.h>
 
 #include <asm/perf_event.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -469,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-	return ip > PAGE_OFFSET;
-#else
-	return (long)ip < 0;
-#endif
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d0fb864ff2b0..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
 
 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -60,6 +61,53 @@ enum {
 	for ((x) = PERF_SAMPLE_BRANCH_USER; \
 	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
 
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+	X86_BR_NONE     = 0,      /* unknown */
+
+	X86_BR_USER     = 1 << 0, /* branch target is user */
+	X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
+
+	X86_BR_CALL     = 1 << 2, /* call */
+	X86_BR_RET      = 1 << 3, /* return */
+	X86_BR_SYSCALL  = 1 << 4, /* syscall */
+	X86_BR_SYSRET   = 1 << 5, /* syscall return */
+	X86_BR_INT      = 1 << 6, /* sw interrupt */
+	X86_BR_IRET     = 1 << 7, /* return from interrupt */
+	X86_BR_JCC      = 1 << 8, /* conditional */
+	X86_BR_JMP      = 1 << 9, /* jump */
+	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
+	X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+
+#define X86_BR_ANY       \
+	(X86_BR_CALL    |\
+	 X86_BR_RET     |\
+	 X86_BR_SYSCALL |\
+	 X86_BR_SYSRET  |\
+	 X86_BR_INT     |\
+	 X86_BR_IRET    |\
+	 X86_BR_JCC     |\
+	 X86_BR_JMP	 |\
+	 X86_BR_IRQ	 |\
+	 X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL		 \
+	(X86_BR_CALL		|\
+	 X86_BR_IND_CALL	|\
+	 X86_BR_SYSCALL		|\
+	 X86_BR_IRQ		|\
+	 X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
 /*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
@@ -131,6 +179,7 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 		intel_pmu_lbr_reset();
 		cpuc->lbr_context = event->ctx;
 	}
+	cpuc->br_sel = event->hw.branch_reg.reg;
 
 	cpuc->lbr_users++;
 }
@@ -252,6 +301,44 @@ void intel_pmu_lbr_read(void)
 		intel_pmu_lbr_read_32(cpuc);
 	else
 		intel_pmu_lbr_read_64(cpuc);
+
+	intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+	u64 br_type = event->attr.branch_sample_type;
+	int mask = 0;
+
+	if (br_type & PERF_SAMPLE_BRANCH_USER)
+		mask |= X86_BR_USER;
+
+	if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+		mask |= X86_BR_KERNEL;
+
+	/* we ignore BRANCH_HV here */
+
+	if (br_type & PERF_SAMPLE_BRANCH_ANY)
+		mask |= X86_BR_ANY;
+
+	if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+		mask |= X86_BR_ANY_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+		mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+	if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+		mask |= X86_BR_IND_CALL;
+	/*
+	 * stash actual user request into reg, it may
+	 * be used by fixup code for some CPU
+	 */
+	event->hw.branch_reg.reg = mask;
 }
 
 /*
@@ -273,10 +360,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 		v = x86_pmu.lbr_sel_map[m];
 		if (v == LBR_NOT_SUPP)
 			return -EOPNOTSUPP;
-		mask |= v;
 
-		if (m == PERF_SAMPLE_BRANCH_ANY)
-			break;
+		if (v != LBR_IGN)
+			mask |= v;
 	}
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
@@ -287,18 +373,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	return 0;
 }
 
-/*
- * all the bits supported on some flavor of x86LBR
- * we ignore BRANCH_HV because it is not supported
- */
-#define PERF_SAMPLE_BRANCH_X86_ALL	\
-	(PERF_SAMPLE_BRANCH_ANY		|\
-	 PERF_SAMPLE_BRANCH_USER	|\
-	 PERF_SAMPLE_BRANCH_KERNEL)
-
 int intel_pmu_setup_lbr_filter(struct perf_event *event)
 {
-	u64 br_type = event->attr.branch_sample_type;
+	int ret = 0;
 
 	/*
 	 * no LBR on this PMU
@@ -307,20 +384,210 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
 		return -EOPNOTSUPP;
 
 	/*
-	 * if no LBR HW filter, users can only
-	 * capture all branches
+	 * setup SW LBR filter
 	 */
-	if (!x86_pmu.lbr_sel_map) {
-		if (br_type != PERF_SAMPLE_BRANCH_X86_ALL)
-			return -EOPNOTSUPP;
-		return 0;
+	intel_pmu_setup_sw_lbr_filter(event);
+
+	/*
+	 * setup HW LBR filter, if any
+	 */
+	if (x86_pmu.lbr_sel_map)
+		ret = intel_pmu_setup_hw_lbr_filter(event);
+
+	return ret;
+}
+
+/*
+ * return the type of control flow change at address "from"
+ * intruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to)
+{
+	struct insn insn;
+	void *addr;
+	int bytes, size = MAX_INSN_SIZE;
+	int ret = X86_BR_NONE;
+	int ext, to_plm, from_plm;
+	u8 buf[MAX_INSN_SIZE];
+	int is64 = 0;
+
+	to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+	from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+	/*
+	 * maybe zero if lbr did not fill up after a reset by the time
+	 * we get a PMU interrupt
+	 */
+	if (from == 0 || to == 0)
+		return X86_BR_NONE;
+
+	if (from_plm == X86_BR_USER) {
+		/*
+		 * can happen if measuring at the user level only
+		 * and we interrupt in a kernel thread, e.g., idle.
+		 */
+		if (!current->mm)
+			return X86_BR_NONE;
+
+		/* may fail if text not present */
+		bytes = copy_from_user_nmi(buf, (void __user *)from, size);
+		if (bytes != size)
+			return X86_BR_NONE;
+
+		addr = buf;
+	} else
+		addr = (void *)from;
+
+	/*
+	 * decoder needs to know the ABI especially
+	 * on 64-bit systems running 32-bit apps
+	 */
+#ifdef CONFIG_X86_64
+	is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+	insn_init(&insn, addr, is64);
+	insn_get_opcode(&insn);
+
+	switch (insn.opcode.bytes[0]) {
+	case 0xf:
+		switch (insn.opcode.bytes[1]) {
+		case 0x05: /* syscall */
+		case 0x34: /* sysenter */
+			ret = X86_BR_SYSCALL;
+			break;
+		case 0x07: /* sysret */
+		case 0x35: /* sysexit */
+			ret = X86_BR_SYSRET;
+			break;
+		case 0x80 ... 0x8f: /* conditional */
+			ret = X86_BR_JCC;
+			break;
+		default:
+			ret = X86_BR_NONE;
+		}
+		break;
+	case 0x70 ... 0x7f: /* conditional */
+		ret = X86_BR_JCC;
+		break;
+	case 0xc2: /* near ret */
+	case 0xc3: /* near ret */
+	case 0xca: /* far ret */
+	case 0xcb: /* far ret */
+		ret = X86_BR_RET;
+		break;
+	case 0xcf: /* iret */
+		ret = X86_BR_IRET;
+		break;
+	case 0xcc ... 0xce: /* int */
+		ret = X86_BR_INT;
+		break;
+	case 0xe8: /* call near rel */
+	case 0x9a: /* call far absolute */
+		ret = X86_BR_CALL;
+		break;
+	case 0xe0 ... 0xe3: /* loop jmp */
+		ret = X86_BR_JCC;
+		break;
+	case 0xe9 ... 0xeb: /* jmp */
+		ret = X86_BR_JMP;
+		break;
+	case 0xff: /* call near absolute, call far absolute ind */
+		insn_get_modrm(&insn);
+		ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+		switch (ext) {
+		case 2: /* near ind call */
+		case 3: /* far ind call */
+			ret = X86_BR_IND_CALL;
+			break;
+		case 4:
+		case 5:
+			ret = X86_BR_JMP;
+			break;
+		}
+		break;
+	default:
+		ret = X86_BR_NONE;
 	}
 	/*
-	 * we ignore branch priv levels we do not
-	 * know about: BRANCH_HV
+	 * interrupts, traps, faults (and thus ring transition) may
+	 * occur on any instructions. Thus, to classify them correctly,
+	 * we need to first look at the from and to priv levels. If they
+	 * are different and to is in the kernel, then it indicates
+	 * a ring transition. If the from instruction is not a ring
+	 * transition instr (syscall, systenter, int), then it means
+	 * it was a irq, trap or fault.
+	 *
+	 * we have no way of detecting kernel to kernel faults.
+	 */
+	if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+	    && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+		ret = X86_BR_IRQ;
+
+	/*
+	 * branch priv level determined by target as
+	 * is done by HW when LBR_SELECT is implemented
 	 */
+	if (ret != X86_BR_NONE)
+		ret |= to_plm;
 
-	return intel_pmu_setup_hw_lbr_filter(event);
+	return ret;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+	u64 from, to;
+	int br_sel = cpuc->br_sel;
+	int i, j, type;
+	bool compress = false;
+
+	/* if sampling all branches, then nothing to filter */
+	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+		return;
+
+	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+		from = cpuc->lbr_entries[i].from;
+		to = cpuc->lbr_entries[i].to;
+
+		type = branch_type(from, to);
+
+		/* if type does not correspond, then discard */
+		if (type == X86_BR_NONE || (br_sel & type) != type) {
+			cpuc->lbr_entries[i].from = 0;
+			compress = true;
+		}
+	}
+
+	if (!compress)
+		return;
+
+	/* remove all entries with from=0 */
+	for (i = 0; i < cpuc->lbr_stack.nr; ) {
+		if (!cpuc->lbr_entries[i].from) {
+			j = i;
+			while (++j < cpuc->lbr_stack.nr)
+				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+			cpuc->lbr_stack.nr--;
+			if (!cpuc->lbr_entries[i].from)
+				continue;
+		}
+		i++;
+	}
 }
 
 /*
@@ -363,6 +630,10 @@ void intel_pmu_lbr_init_core(void)
 	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
 	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
 
+	/*
+	 * SW branch filter usage:
+	 * - compensate for lack of HW filter
+	 */
 	pr_cont("4-deep LBR, ");
 }
 
@@ -377,6 +648,13 @@ void intel_pmu_lbr_init_nhm(void)
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
 
+	/*
+	 * SW branch filter usage:
+	 * - workaround LBR_SEL errata (see above)
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR but that means far
+	 *   jmp need to be filtered out
+	 */
 	pr_cont("16-deep LBR, ");
 }
 
@@ -391,6 +669,12 @@ void intel_pmu_lbr_init_snb(void)
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
 
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR but that means far
+	 *   jmp need to be filtered out
+	 */
 	pr_cont("16-deep LBR, ");
 }
 
@@ -412,5 +696,9 @@ void intel_pmu_lbr_init_atom(void)
 	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
 	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
 
+	/*
+	 * SW branch filter usage:
+	 * - compensate for lack of HW filter
+	 */
 	pr_cont("8-deep LBR, ");
 }
-- 
cgit v1.2.3


From 2481c5fa6db0237e4f0168f88913178b2b495b7c Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:20:59 +0100
Subject: perf: Disable PERF_SAMPLE_BRANCH_* when not supported

PERF_SAMPLE_BRANCH_* is disabled for:

 - SW events (sw counters, tracepoints)
 - HW breakpoints
 - ALL but Intel x86 architecture
 - AMD64 processors

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-10-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c       |  4 ++++
 arch/arm/kernel/perf_event.c         |  4 ++++
 arch/mips/kernel/perf_event_mipsxx.c |  4 ++++
 arch/powerpc/kernel/perf_event.c     |  4 ++++
 arch/sh/kernel/perf_event.c          |  4 ++++
 arch/sparc/kernel/perf_event.c       |  4 ++++
 arch/x86/kernel/cpu/perf_event_amd.c |  3 +++
 kernel/events/core.c                 | 24 ++++++++++++++++++++++++
 kernel/events/hw_breakpoint.c        |  6 ++++++
 9 files changed, 57 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 8143cd7cdbfb..0dae252f7a33 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -685,6 +685,10 @@ static int alpha_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_RAW:
 	case PERF_TYPE_HARDWARE:
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 5bb91bf3d47f..a23c42abc694 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -539,6 +539,10 @@ static int armpmu_event_init(struct perf_event *event)
 	int err = 0;
 	atomic_t *active_events = &armpmu->active_events;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	if (armpmu->map_event(event) == -ENOENT)
 		return -ENOENT;
 
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index e3b897acfbc0..811084f4e422 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -606,6 +606,10 @@ static int mipspmu_event_init(struct perf_event *event)
 {
 	int err = 0;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_RAW:
 	case PERF_TYPE_HARDWARE:
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index f04c2301725e..c2e27ede07ec 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -1084,6 +1084,10 @@ static int power_pmu_event_init(struct perf_event *event)
 	if (!ppmu)
 		return -ENOENT;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_HARDWARE:
 		ev = event->attr.config;
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 10b14e3a7eb8..068b8a2759b5 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -310,6 +310,10 @@ static int sh_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_RAW:
 	case PERF_TYPE_HW_CACHE:
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 614da624330c..8e16a4a21582 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1105,6 +1105,10 @@ static int sparc_pmu_event_init(struct perf_event *event)
 	if (atomic_read(&nmi_active) < 0)
 		return -ENODEV;
 
+	/* does not support taken branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (attr->type) {
 	case PERF_TYPE_HARDWARE:
 		if (attr->config >= sparc_pmu->max_events)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67250a52430b..dd002faff7a6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
 	if (ret)
 		return ret;
 
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	if (event->attr.exclude_host && event->attr.exclude_guest)
 		/*
 		 * When HO == GO == 1 the hardware treats that as GO == HO == 0
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5820efdf47cd..242bb51c67f2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5044,6 +5044,12 @@ static int perf_swevent_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_SOFTWARE)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event_id) {
 	case PERF_COUNT_SW_CPU_CLOCK:
 	case PERF_COUNT_SW_TASK_CLOCK:
@@ -5154,6 +5160,12 @@ static int perf_tp_event_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for tracepoint events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	err = perf_trace_init(event);
 	if (err)
 		return err;
@@ -5379,6 +5391,12 @@ static int cpu_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	perf_swevent_init_hrtimer(event);
 
 	return 0;
@@ -5453,6 +5471,12 @@ static int task_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	perf_swevent_init_hrtimer(event);
 
 	return 0;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3330022a7ac1..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
 	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for breakpoint events
+	 */
+	if (has_branch_stack(bp))
+		return -EOPNOTSUPP;
+
 	err = register_perf_hw_breakpoint(bp);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 9 Feb 2012 23:21:00 +0100
Subject: perf: Add callback to flush branch_stack on context switch

With branch stack sampling, it is possible to filter by priv levels.

In system-wide mode, that means it is possible to capture only user
level branches. The builtin SW LBR filter needs to disassemble code
based on LBR captured addresses. For that, it needs to know the task
the addresses are associated with. Because of context switches, the
content of the branch stack buffer may contain addresses from
different tasks.

We need a callback on context switch to either flush the branch stack
or save it. This patch adds a new callback in struct pmu which is called
during context switches. The callback is called only when necessary.
That is when a system-wide context has, at least, one event which
uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for
per-thread context.

In this version, the Intel x86 code simply flushes (resets) the LBR
on context switches (fills it with zeroes). Those zeroed branches are
then filtered out by the SW filter.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 21 ++++++---
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++
 include/linux/perf_event.h             |  9 +++-
 kernel/events/core.c                   | 85 ++++++++++++++++++++++++++++++++++
 5 files changed, 121 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cea567483274..0a18d16cb58d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
+static void x86_pmu_flush_branch_stack(void)
+{
+	if (x86_pmu.flush_branch_stack)
+		x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
-	.pmu_enable	= x86_pmu_enable,
-	.pmu_disable	= x86_pmu_disable,
+	.pmu_enable		= x86_pmu_enable,
+	.pmu_disable		= x86_pmu_disable,
 
 	.attr_groups	= x86_pmu_attr_groups,
 
 	.event_init	= x86_pmu_event_init,
 
-	.add		= x86_pmu_add,
-	.del		= x86_pmu_del,
-	.start		= x86_pmu_start,
-	.stop		= x86_pmu_stop,
-	.read		= x86_pmu_read,
+	.add			= x86_pmu_add,
+	.del			= x86_pmu_del,
+	.start			= x86_pmu_start,
+	.stop			= x86_pmu_stop,
+	.read			= x86_pmu_read,
 
 	.start_txn	= x86_pmu_start_txn,
 	.cancel_txn	= x86_pmu_cancel_txn,
 	.commit_txn	= x86_pmu_commit_txn,
 
 	.event_idx	= x86_pmu_event_idx,
+	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
 void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f104c054dc5c..74387c12dc72 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -324,6 +324,7 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+	void		(*flush_branch_stack)(void);
 
 	/*
 	 * Intel Arch Perfmon v2+
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7cc1e2dcc4dd..6627089232a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_flush_branch_stack(void)
+{
+	/*
+	 * Intel LBR does not tag entries with the
+	 * PID of the current task, then we need to
+	 * flush it on ctxsw
+	 * For now, we simply reset it
+	 */
+	if (x86_pmu.lbr_nr)
+		intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
+	.flush_branch_stack	= intel_pmu_flush_branch_stack,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5fc494f4a094..fbbf5e598368 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -746,6 +746,11 @@ struct pmu {
 	 * if no implementation is provided it will default to: event->hw.idx + 1.
 	 */
 	int (*event_idx)		(struct perf_event *event); /*optional */
+
+	/*
+	 * flush branch stack on context-switches (needed in cpu-wide mode)
+	 */
+	void (*flush_branch_stack)	(void);
 };
 
 /**
@@ -979,7 +984,8 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
-	int				nr_cgroups; /* cgroup events present */
+	int				nr_cgroups;	 /* cgroup evts */
+	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 };
 
@@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
+
 struct perf_sample_data {
 	u64				type;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 242bb51c67f2..c61234b1a988 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -137,6 +137,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	perf_pmu_rotate_start(ctx->pmu);
 }
 
+/*
+ * When sampling the branck stack in system-wide, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branch from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/* no need to flush branch stack if not changing task */
+	if (prev == task)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		/*
+		 * check if the context has at least one
+		 * event using PERF_SAMPLE_BRANCH_STACK
+		 */
+		if (cpuctx->ctx.nr_branch_stack > 0
+		    && pmu->flush_branch_stack) {
+
+			pmu = cpuctx->ctx.pmu;
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->flush_branch_stack();
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
+
+	/* check for system-wide branch_stack events */
+	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+		perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
 			static_key_slow_dec_deferred(&perf_sched_events);
 		}
+
+		if (has_branch_stack(event)) {
+			static_key_slow_dec_deferred(&perf_sched_events);
+			/* is system-wide event */
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_dec(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	if (event->rb) {
@@ -5924,6 +6003,12 @@ done:
 				return ERR_PTR(err);
 			}
 		}
+		if (has_branch_stack(event)) {
+			static_key_slow_inc(&perf_sched_events.key);
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_inc(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	return event;
-- 
cgit v1.2.3


From 6414fa6a150111750011f477899d370244da4171 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 5 Mar 2012 06:38:42 +0000
Subject: aout: move setup_arg_pages() prior to reading/mapping the binary

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32_aout.c | 14 +++++++-------
 fs/binfmt_aout.c          | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index fd843877e841..39e49091f648 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -315,6 +315,13 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
+	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0) {
+		/* Someone check-me: is this error path enough? */
+		send_sig(SIGKILL, current, 0);
+		return retval;
+	}
+
 	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
 
@@ -410,13 +417,6 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0) {
-		/* Someone check-me: is this error path enough? */
-		send_sig(SIGKILL, current, 0);
-		return retval;
-	}
-
 	current->mm->start_stack =
 		(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
 	/* start thread */
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a6395bdb26ae..1ff94054d35a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
+	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+	if (retval < 0) {
+		/* Someone check-me: is this error path enough? */
+		send_sig(SIGKILL, current, 0);
+		return retval;
+	}
+
 	install_exec_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 
@@ -352,13 +359,6 @@ beyond_if:
 		return retval;
 	}
 
-	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0) { 
-		/* Someone check-me: is this error path enough? */ 
-		send_sig(SIGKILL, current, 0); 
-		return retval;
-	}
-
 	current->mm->start_stack =
 		(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
 #ifdef __alpha__
-- 
cgit v1.2.3


From a628b684d27d22631d1819890f13047ae9075241 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 5 Mar 2012 13:39:29 -0800
Subject: x32: Provide separate is_ia32_task() and is_x32_task() predicates

The is_compat_task() test is composed of two predicates already, so
make each of them available separately.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/include/asm/compat.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index e7f68b49c01a..355edc091604 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -235,12 +235,17 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	return (void __user *)round_down(sp - len, 16);
 }
 
-static inline bool is_compat_task(void)
+static inline bool is_ia32_task(void)
 {
 #ifdef CONFIG_IA32_EMULATION
 	if (current_thread_info()->status & TS_COMPAT)
 		return true;
 #endif
+	return false;
+}
+
+static inline bool is_x32_task(void)
+{
 #ifdef CONFIG_X86_X32_ABI
 	if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
 		return true;
@@ -248,4 +253,9 @@ static inline bool is_compat_task(void)
 	return false;
 }
 
+static inline bool is_compat_task(void)
+{
+	return is_ia32_task() || is_x32_task();
+}
+
 #endif /* _ASM_X86_COMPAT_H */
-- 
cgit v1.2.3


From e7084fd52ed71249ab2ce7a7d89d601c9d1f904c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 5 Mar 2012 13:40:24 -0800
Subject: x32: Switch to a 64-bit clock_t

clock_t is used mainly to give the number of jiffies a certain process
has burned.  It is entirely feasible for a long-running process to
consume more than 2^32 jiffies especially in a multiprocess system.
As such, switch to a 64-bit clock_t for x32, just as we already
switched to a 64-bit time_t.

clock_t is only used in a handful of places, and as such it is really
not a very significant change.  The one that has the biggest impact is
in struct siginfo, but since the *size* of struct siginfo doesn't
change (it is padded to the hilt) it is fairly easy to make this a
localized change.

This also gets rid of sys_x32_times, however since this is a pretty
late change don't compactify the system call numbers; we can reuse
system call slot 521 next time we need an x32 system call.

Reported-by: Gregory M. Lueck <gregory.m.lueck@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/ia32/ia32_signal.c      | 10 ++++++++--
 arch/x86/include/asm/ia32.h      |  9 +++++++++
 arch/x86/syscalls/syscall_64.tbl |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 25d80f3faf2e..bc09ed2a8b97 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -37,6 +37,7 @@
 int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 {
 	int err = 0;
+	bool ia32 = !is_ia32_task();
 
 	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
 		return -EFAULT;
@@ -66,8 +67,13 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 			case __SI_FAULT >> 16:
 				break;
 			case __SI_CHLD >> 16:
-				put_user_ex(from->si_utime, &to->si_utime);
-				put_user_ex(from->si_stime, &to->si_stime);
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
 				put_user_ex(from->si_status, &to->si_status);
 				/* FALL THROUGH */
 			default:
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index c6435ab1cc13..7d0c18587709 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -125,6 +125,15 @@ typedef struct compat_siginfo {
 			compat_clock_t _stime;
 		} _sigchld;
 
+		/* SIGCHLD (x32 version) */
+		struct {
+			unsigned int _pid;	/* which child */
+			unsigned int _uid;	/* sender's uid */
+			int _status;		/* exit code */
+			s64 _utime;
+			s64 _stime;
+		} _sigchld_x32;
+
 		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
 		struct {
 			unsigned int _addr;	/* faulting insn/memory ref. */
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 4aecc7e31166..0d778b800884 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -106,7 +106,7 @@
 97	common	getrlimit		sys_getrlimit
 98	common	getrusage		sys_getrusage
 99	common	sysinfo			sys_sysinfo
-100	64	times			sys_times
+100	common	times			sys_times
 101	common	ptrace			sys_ptrace
 102	common	getuid			sys_getuid
 103	common	syslog			sys_syslog
@@ -331,7 +331,7 @@
 518	x32	sendmsg			compat_sys_sendmsg
 519	x32	recvmsg			compat_sys_recvmsg
 520	x32	execve			stub_x32_execve
-521	x32	times			compat_sys_times
+# 521 available
 522	x32	rt_sigpending		sys32_rt_sigpending
 523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
 524	x32	rt_sigqueueinfo		sys32_rt_sigqueueinfo
-- 
cgit v1.2.3


From 55283e2537714f9370c4ab847d170acf223daf90 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 5 Mar 2012 15:32:11 -0800
Subject: x32: Add ptrace for x32

X32 ptrace is a hybrid of 64bit ptrace and compat ptrace with 32bit
address and longs.  It use 64bit ptrace to access the full 64bit
registers.  PTRACE_PEEKUSR and PTRACE_POKEUSR are only allowed to access
segment and debug registers.  PTRACE_PEEKUSR returns the lower 32bits
and PTRACE_POKEUSR zero-extends 32bit value to 64bit.   It works since
the upper 32bits of segment and debug registers of x32 process are always
zero.  GDB only uses PTRACE_PEEKUSR and PTRACE_POKEUSR to access
segment and debug registers.

[ hpa: changed TIF_X32 test to use !is_ia32_task() instead, and moved
  the system call number to the now-unused 521 slot. ]

Signed-off-by: "H.J. Lu" <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/kernel/ptrace.c         | 99 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_64.tbl |  4 +-
 2 files changed, 101 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 50267386b766..93e7877a19c4 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1130,6 +1130,100 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+			    compat_long_t request, compat_ulong_t caddr,
+			    compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+
+	switch (request) {
+	/* Read 32bits at location addr in the USER area.  Only allow
+	   to return the lower 32bits of segment and debug registers.  */
+	case PTRACE_PEEKUSR: {
+		u32 tmp;
+
+		ret = -EIO;
+		if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+		    addr < offsetof(struct user_regs_struct, cs))
+			break;
+
+		tmp = 0;  /* Default return condition */
+		if (addr < sizeof(struct user_regs_struct))
+			tmp = getreg(child, addr);
+		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+			 addr <= offsetof(struct user, u_debugreg[7])) {
+			addr -= offsetof(struct user, u_debugreg[0]);
+			tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+		}
+		ret = put_user(tmp, (__u32 __user *)datap);
+		break;
+	}
+
+	/* Write the word at location addr in the USER area.  Only allow
+	   to update segment and debug registers with the upper 32bits
+	   zero-extended. */
+	case PTRACE_POKEUSR:
+		ret = -EIO;
+		if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+		    addr < offsetof(struct user_regs_struct, cs))
+			break;
+
+		if (addr < sizeof(struct user_regs_struct))
+			ret = putreg(child, addr, data);
+		else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+			 addr <= offsetof(struct user, u_debugreg[7])) {
+			addr -= offsetof(struct user, u_debugreg[0]);
+			ret = ptrace_set_debugreg(child,
+						  addr / sizeof(data), data);
+		}
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child,
+					   task_user_regset_view(current),
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child,
+					     task_user_regset_view(current),
+					     REGSET_GENERAL,
+					     0, sizeof(struct user_regs_struct),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child,
+					   task_user_regset_view(current),
+					   REGSET_FP,
+					   0, sizeof(struct user_i387_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(child,
+					     task_user_regset_view(current),
+					     REGSET_FP,
+					     0, sizeof(struct user_i387_struct),
+					     datap);
+
+		/* normal 64bit interface to access TLS data.
+		   Works just like arch_prctl, except that the arguments
+		   are reversed. */
+	case PTRACE_ARCH_PRCTL:
+		return do_arch_prctl(child, data, addr);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif
+
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
@@ -1139,6 +1233,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 	int ret;
 	__u32 val;
 
+#ifdef CONFIG_X86_X32_ABI
+	if (!is_ia32_task())
+		return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+
 	switch (request) {
 	case PTRACE_PEEKUSR:
 		ret = getreg32(child, addr, &val);
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 0d778b800884..dd29a9ea27c5 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -107,7 +107,7 @@
 98	common	getrusage		sys_getrusage
 99	common	sysinfo			sys_sysinfo
 100	common	times			sys_times
-101	common	ptrace			sys_ptrace
+101	64	ptrace			sys_ptrace
 102	common	getuid			sys_getuid
 103	common	syslog			sys_syslog
 104	common	getgid			sys_getgid
@@ -331,7 +331,7 @@
 518	x32	sendmsg			compat_sys_sendmsg
 519	x32	recvmsg			compat_sys_recvmsg
 520	x32	execve			stub_x32_execve
-# 521 available
+521	x32	ptrace			compat_sys_ptrace
 522	x32	rt_sigpending		sys32_rt_sigpending
 523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
 524	x32	rt_sigqueueinfo		sys32_rt_sigqueueinfo
-- 
cgit v1.2.3


From 373913b568cbfbefcee3263b98bd5a1a8b491f1b Mon Sep 17 00:00:00 2001
From: Philip Prindeville <philipp@redfish-solutions.com>
Date: Mon, 5 Mar 2012 15:05:14 -0800
Subject: x86/geode/alix2: Supplement driver to include GPIO button support

GPIO 24 is used in reference designs as a soft-reset button, and
the alix2 is no exception.  Add it as a gpio-button.

Use symbolic values to describe BIOS addresses.

Record the model number.

Signed-off-by: Philip A. Prindeville <philipp@redfish-solutions.com>
Acked-by: Ed Wildgoose <kernel@wildgooses.com>
Acked-by: Andres Salomon <dilinger@queued.net>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-sjp6k1rjksitx1pej0c0qxd1@git.kernel.org
[ tidied up the code a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/geode/alix.c | 76 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 67 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index dc5f1d32aced..90e23e7679a5 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -6,6 +6,7 @@
  *
  * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
  * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ *                and Philip Prindeville <philipp@redfish-solutions.com>
  *
  * TODO: There are large similarities with leds-net5501.c
  * by Alessandro Zummo <a.zummo@towertech.it>
@@ -24,14 +25,47 @@
 #include <linux/leds.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/dmi.h>
 
 #include <asm/geode.h>
 
+#define BIOS_SIGNATURE_TINYBIOS		0xf0000
+#define BIOS_SIGNATURE_COREBOOT		0x500
+#define BIOS_REGION_SIZE		0x10000
+
 static bool force = 0;
 module_param(force, bool, 0444);
 /* FIXME: Award bios is not automatically detected as Alix platform */
 MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
 
+static struct gpio_keys_button alix_gpio_buttons[] = {
+	{
+		.code			= KEY_RESTART,
+		.gpio			= 24,
+		.active_low		= 1,
+		.desc			= "Reset button",
+		.type			= EV_KEY,
+		.wakeup			= 0,
+		.debounce_interval	= 100,
+		.can_disable		= 0,
+	}
+};
+static struct gpio_keys_platform_data alix_buttons_data = {
+	.buttons			= alix_gpio_buttons,
+	.nbuttons			= ARRAY_SIZE(alix_gpio_buttons),
+	.poll_interval			= 20,
+};
+
+static struct platform_device alix_buttons_dev = {
+	.name				= "gpio-keys-polled",
+	.id				= 1,
+	.dev = {
+		.platform_data		= &alix_buttons_data,
+	}
+};
+
 static struct gpio_led alix_leds[] = {
 	{
 		.name = "alix:1",
@@ -64,17 +98,22 @@ static struct platform_device alix_leds_dev = {
 	.dev.platform_data = &alix_leds_data,
 };
 
+static struct __initdata platform_device *alix_devs[] = {
+	&alix_buttons_dev,
+	&alix_leds_dev,
+};
+
 static void __init register_alix(void)
 {
 	/* Setup LED control through leds-gpio driver */
-	platform_device_register(&alix_leds_dev);
+	platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
 }
 
-static int __init alix_present(unsigned long bios_phys,
+static bool __init alix_present(unsigned long bios_phys,
 				const char *alix_sig,
 				size_t alix_sig_len)
 {
-	const size_t bios_len = 0x00010000;
+	const size_t bios_len = BIOS_REGION_SIZE;
 	const char *bios_virt;
 	const char *scan_end;
 	const char *p;
@@ -84,7 +123,7 @@ static int __init alix_present(unsigned long bios_phys,
 		printk(KERN_NOTICE "%s: forced to skip BIOS test, "
 		       "assume system is ALIX.2/ALIX.3\n",
 		       KBUILD_MODNAME);
-		return 1;
+		return true;
 	}
 
 	bios_virt = phys_to_virt(bios_phys);
@@ -109,15 +148,33 @@ static int __init alix_present(unsigned long bios_phys,
 			*a = '\0';
 
 		tail = p + alix_sig_len;
-		if ((tail[0] == '2' || tail[0] == '3')) {
+		if ((tail[0] == '2' || tail[0] == '3' || tail[0] == '6')) {
 			printk(KERN_INFO
 			       "%s: system is recognized as \"%s\"\n",
 			       KBUILD_MODNAME, name);
-			return 1;
+			return true;
 		}
 	}
 
-	return 0;
+	return false;
+}
+
+static bool __init alix_present_dmi(void)
+{
+	const char *vendor, *product;
+
+	vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+	if (!vendor || strcmp(vendor, "PC Engines"))
+		return false;
+
+	product = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (!product || (strcmp(product, "ALIX.2D") && strcmp(product, "ALIX.6")))
+		return false;
+
+	printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
+	       KBUILD_MODNAME, vendor, product);
+
+	return true;
 }
 
 static int __init alix_init(void)
@@ -128,8 +185,9 @@ static int __init alix_init(void)
 	if (!is_geode())
 		return 0;
 
-	if (alix_present(0xf0000, tinybios_sig, sizeof(tinybios_sig) - 1) ||
-	    alix_present(0x500, coreboot_sig, sizeof(coreboot_sig) - 1))
+	if (alix_present(BIOS_SIGNATURE_TINYBIOS, tinybios_sig, sizeof(tinybios_sig) - 1) ||
+	    alix_present(BIOS_SIGNATURE_COREBOOT, coreboot_sig, sizeof(coreboot_sig) - 1) ||
+	    alix_present_dmi())
 		register_alix();
 
 	return 0;
-- 
cgit v1.2.3


From da4e3302949f4a702f1ddfefe067762232d363d5 Mon Sep 17 00:00:00 2001
From: Philip Prindeville <philipp@redfish-solutions.com>
Date: Mon, 5 Mar 2012 15:05:15 -0800
Subject: x86/geode/net5501: Add platform driver for Soekris Engineering
 net5501

Add platform driver for the Soekris Engineering net5501 single-board
computer.  Probes well-known locations in ROM for BIOS signature
to confirm correct platform.  Registers 1 LED and 1 GPIO-based
button (typically used for soft reset).

Signed-off-by: Philip Prindeville <philipp@redfish-solutions.com>
Acked-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Andres Salomon <dilinger@queued.net>
Cc: Matthew Garrett <mjg@redhat.com>
[ Removed Kconfig and Makefile detritus from drivers/leds/]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-jv5uf34996juqh5syes8mn4h@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                  |   6 ++
 arch/x86/platform/geode/Makefile  |   1 +
 arch/x86/platform/geode/net5501.c | 154 ++++++++++++++++++++++++++++++++++++++
 drivers/leds/Kconfig              |  10 ---
 drivers/leds/Makefile             |   1 -
 drivers/leds/leds-net5501.c       |  97 ------------------------
 6 files changed, 161 insertions(+), 108 deletions(-)
 create mode 100644 arch/x86/platform/geode/net5501.c
 delete mode 100644 drivers/leds/leds-net5501.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c0d49316a63d..e76f1dbadcd6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2114,6 +2114,12 @@ config ALIX
 
 	  Note: You have to set alix.force=1 for boards with Award BIOS.
 
+config NET5501
+	bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)"
+	select GPIOLIB
+	---help---
+	  This option enables system support for the Soekris Engineering net5501.
+
 endif # X86_32
 
 config AMD_NB
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
index 07c9cd05021a..246b788847ff 100644
--- a/arch/x86/platform/geode/Makefile
+++ b/arch/x86/platform/geode/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_ALIX)		+= alix.o
+obj-$(CONFIG_NET5501)		+= net5501.o
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
new file mode 100644
index 000000000000..66d377e334f7
--- /dev/null
+++ b/arch/x86/platform/geode/net5501.c
@@ -0,0 +1,154 @@
+/*
+ * System Specific setup for Soekris net5501
+ * At the moment this means setup of GPIO control of LEDs and buttons
+ * on net5501 boards.
+ *
+ *
+ * Copyright (C) 2008-2009 Tower Technologies
+ * Written by Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ *                and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+
+#include <asm/geode.h>
+
+#define BIOS_REGION_BASE		0xffff0000
+#define BIOS_REGION_SIZE		0x00010000
+
+static struct gpio_keys_button net5501_gpio_buttons[] = {
+	{
+		.code = KEY_RESTART,
+		.gpio = 24,
+		.active_low = 1,
+		.desc = "Reset button",
+		.type = EV_KEY,
+		.wakeup = 0,
+		.debounce_interval = 100,
+		.can_disable = 0,
+	}
+};
+static struct gpio_keys_platform_data net5501_buttons_data = {
+	.buttons = net5501_gpio_buttons,
+	.nbuttons = ARRAY_SIZE(net5501_gpio_buttons),
+	.poll_interval = 20,
+};
+
+static struct platform_device net5501_buttons_dev = {
+	.name = "gpio-keys-polled",
+	.id = 1,
+	.dev = {
+		.platform_data = &net5501_buttons_data,
+	}
+};
+
+static struct gpio_led net5501_leds[] = {
+	{
+		.name = "net5501:1",
+		.gpio = 6,
+		.default_trigger = "default-on",
+		.active_low = 1,
+	},
+};
+
+static struct gpio_led_platform_data net5501_leds_data = {
+	.num_leds = ARRAY_SIZE(net5501_leds),
+	.leds = net5501_leds,
+};
+
+static struct platform_device net5501_leds_dev = {
+	.name = "leds-gpio",
+	.id = -1,
+	.dev.platform_data = &net5501_leds_data,
+};
+
+static struct __initdata platform_device *net5501_devs[] = {
+	&net5501_buttons_dev,
+	&net5501_leds_dev,
+};
+
+static void __init register_net5501(void)
+{
+	/* Setup LED control through leds-gpio driver */
+	platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
+}
+
+struct net5501_board {
+	u16	offset;
+	u16	len;
+	char	*sig;
+};
+
+static struct net5501_board __initdata boards[] = {
+	{ 0xb7b, 7, "net5501" },	/* net5501 v1.33/1.33c */
+	{ 0xb1f, 7, "net5501" },	/* net5501 v1.32i */
+};
+
+static bool __init net5501_present(void)
+{
+	int i;
+	unsigned char *rombase, *bios;
+	bool found = false;
+
+	rombase = ioremap(BIOS_REGION_BASE, BIOS_REGION_SIZE - 1);
+	if (!rombase) {
+		printk(KERN_ERR "%s: failed to get rombase\n", KBUILD_MODNAME);
+		return found;
+	}
+
+	bios = rombase + 0x20;	/* null terminated */
+
+	if (memcmp(bios, "comBIOS", 7))
+		goto unmap;
+
+	for (i = 0; i < ARRAY_SIZE(boards); i++) {
+		unsigned char *model = rombase + boards[i].offset;
+
+		if (!memcmp(model, boards[i].sig, boards[i].len)) {
+			printk(KERN_INFO "%s: system is recognized as \"%s\"\n",
+			       KBUILD_MODNAME, model);
+
+			found = true;
+			break;
+		}
+	}
+
+unmap:
+	iounmap(rombase);
+	return found;
+}
+
+static int __init net5501_init(void)
+{
+	if (!is_geode())
+		return 0;
+
+	if (!net5501_present())
+		return 0;
+
+	register_net5501();
+
+	return 0;
+}
+
+module_init(net5501_init);
+
+MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
+MODULE_DESCRIPTION("Soekris net5501 System Setup");
+MODULE_LICENSE("GPL");
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 9ca28fced2b9..8c7a75d53101 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -89,16 +89,6 @@ config LEDS_NET48XX
 	  This option enables support for the Soekris net4801 and net4826 error
 	  LED.
 
-config LEDS_NET5501
-	tristate "LED Support for Soekris net5501 series Error LED"
-	depends on LEDS_TRIGGERS
-	depends on X86 && GPIO_CS5535
-	select LEDS_TRIGGER_DEFAULT_ON
-	default n
-	help
-	  Add support for the Soekris net5501 board (detection, error led
-	  and GPIO).
-
 config LEDS_FSG
 	tristate "LED Support for the Freecom FSG-3"
 	depends on LEDS_CLASS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 1fc6875a8b20..6bcf4f695515 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -14,7 +14,6 @@ obj-$(CONFIG_LEDS_MIKROTIK_RB532)	+= leds-rb532.o
 obj-$(CONFIG_LEDS_S3C24XX)		+= leds-s3c24xx.o
 obj-$(CONFIG_LEDS_AMS_DELTA)		+= leds-ams-delta.o
 obj-$(CONFIG_LEDS_NET48XX)		+= leds-net48xx.o
-obj-$(CONFIG_LEDS_NET5501)		+= leds-net5501.o
 obj-$(CONFIG_LEDS_WRAP)			+= leds-wrap.o
 obj-$(CONFIG_LEDS_COBALT_QUBE)		+= leds-cobalt-qube.o
 obj-$(CONFIG_LEDS_COBALT_RAQ)		+= leds-cobalt-raq.o
diff --git a/drivers/leds/leds-net5501.c b/drivers/leds/leds-net5501.c
deleted file mode 100644
index 0555d4709a7c..000000000000
--- a/drivers/leds/leds-net5501.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Soekris board support code
- *
- * Copyright (C) 2008-2009 Tower Technologies
- * Written by Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/string.h>
-#include <linux/leds.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/module.h>
-
-#include <asm/geode.h>
-
-static const struct gpio_led net5501_leds[] = {
-	{
-		.name = "error",
-		.gpio = 6,
-		.default_trigger = "default-on",
-	},
-};
-
-static struct gpio_led_platform_data net5501_leds_data = {
-	.num_leds = ARRAY_SIZE(net5501_leds),
-	.leds = net5501_leds,
-};
-
-static struct platform_device net5501_leds_dev = {
-	.name = "leds-gpio",
-	.id = -1,
-	.dev.platform_data = &net5501_leds_data,
-};
-
-static void __init init_net5501(void)
-{
-	platform_device_register(&net5501_leds_dev);
-}
-
-struct soekris_board {
-	u16	offset;
-	char	*sig;
-	u8	len;
-	void	(*init)(void);
-};
-
-static struct soekris_board __initdata boards[] = {
-	{ 0xb7b, "net5501", 7, init_net5501 },	/* net5501 v1.33/1.33c */
-	{ 0xb1f, "net5501", 7, init_net5501 },	/* net5501 v1.32i */
-};
-
-static int __init soekris_init(void)
-{
-	int i;
-	unsigned char *rombase, *bios;
-
-	if (!is_geode())
-		return 0;
-
-	rombase = ioremap(0xffff0000, 0xffff);
-	if (!rombase) {
-		printk(KERN_INFO "Soekris net5501 LED driver failed to get rombase");
-		return 0;
-	}
-
-	bios = rombase + 0x20;	/* null terminated */
-
-	if (strncmp(bios, "comBIOS", 7))
-		goto unmap;
-
-	for (i = 0; i < ARRAY_SIZE(boards); i++) {
-		unsigned char *model = rombase + boards[i].offset;
-
-		if (strncmp(model, boards[i].sig, boards[i].len) == 0) {
-			printk(KERN_INFO "Soekris %s: %s\n", model, bios);
-
-			if (boards[i].init)
-				boards[i].init();
-			break;
-		}
-	}
-
-unmap:
-	iounmap(rombase);
-	return 0;
-}
-
-arch_initcall(soekris_init);
-
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 86b4ce3156c0dc140907ad03639564000cde694f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 5 Mar 2012 22:32:09 +0900
Subject: x86/kprobes: Fix instruction recovery on optimized path

Current probed-instruction recovery expects that only breakpoint
instruction modifies instruction. However, since kprobes jump
optimization can replace original instructions with a jump,
that expectation is not enough. And it may cause instruction
decoding failure on the function where an optimized probe
already exists.

This bug can reproduce easily as below:

1) find a target function address (any kprobe-able function is OK)

 $ grep __secure_computing /proc/kallsyms
   ffffffff810c19d0 T __secure_computing

2) decode the function
   $ objdump -d vmlinux --start-address=0xffffffff810c19d0 --stop-address=0xffffffff810c19eb

  vmlinux:     file format elf64-x86-64

Disassembly of section .text:

ffffffff810c19d0 <__secure_computing>:
ffffffff810c19d0:       55                      push   %rbp
ffffffff810c19d1:       48 89 e5                mov    %rsp,%rbp
ffffffff810c19d4:       e8 67 8f 72 00          callq
ffffffff817ea940 <mcount>
ffffffff810c19d9:       65 48 8b 04 25 40 b8    mov    %gs:0xb840,%rax
ffffffff810c19e0:       00 00
ffffffff810c19e2:       83 b8 88 05 00 00 01    cmpl $0x1,0x588(%rax)
ffffffff810c19e9:       74 05                   je     ffffffff810c19f0 <__secure_computing+0x20>

3) put a kprobe-event at an optimize-able place, where no
 call/jump places within the 5 bytes.
 $ su -
 # cd /sys/kernel/debug/tracing
 # echo p __secure_computing+0x9 > kprobe_events

4) enable it and check it is optimized.
 # echo 1 > events/kprobes/p___secure_computing_9/enable
 # cat ../kprobes/list
 ffffffff810c19d9  k  __secure_computing+0x9    [OPTIMIZED]

5) put another kprobe on an instruction after previous probe in
  the same function.
 # echo p __secure_computing+0x12 >> kprobe_events
 bash: echo: write error: Invalid argument
 # dmesg | tail -n 1
 [ 1666.500016] Probing address(0xffffffff810c19e2) is not an instruction boundary.

6) however, if the kprobes optimization is disabled, it works.
 # echo 0 > /proc/sys/debug/kprobes-optimization
 # cat ../kprobes/list
 ffffffff810c19d9  k  __secure_computing+0x9
 # echo p __secure_computing+0x12 >> kprobe_events
 (no error)

This is because kprobes doesn't recover the instruction
which is overwritten with a relative jump by another kprobe
when finding instruction boundary.
It only recovers the breakpoint instruction.

This patch fixes kprobes to recover such instructions.

With this fix:

 # echo p __secure_computing+0x9 > kprobe_events
 # echo 1 > events/kprobes/p___secure_computing_9/enable
 # cat ../kprobes/list
 ffffffff810c1aa9  k  __secure_computing+0x9    [OPTIMIZED]
 # echo p __secure_computing+0x12 >> kprobe_events
 # cat ../kprobes/list
 ffffffff810c1aa9  k  __secure_computing+0x9    [OPTIMIZED]
 ffffffff810c1ab2  k  __secure_computing+0x12    [DISABLED]

Changes in v4:
 - Fix a bug to ensure optimized probe is really optimized
   by jump.
 - Remove kprobe_optready() dependency.
 - Cleanup code for preparing optprobe separation.

Changes in v3:
 - Fix a build error when CONFIG_OPTPROBE=n. (Thanks, Ingo!)
   To fix the error, split optprobe instruction recovering
   path from kprobes path.
 - Cleanup comments/styles.

Changes in v2:
 - Fix a bug to recover original instruction address in
   RIP-relative instruction fixup.
 - Moved on tip/master.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: systemtap@sourceware.org
Cc: anderson@redhat.com
Link: http://lkml.kernel.org/r/20120305133209.5982.36568.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 140 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 97 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d8b64c..6bec22f514b5 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -207,13 +207,15 @@ retry:
 	}
 }
 
-/* Recover the probed instruction at addr for further analysis. */
-static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+static unsigned long __recover_probed_insn(kprobe_opcode_t *buf,
+					   unsigned long addr)
 {
 	struct kprobe *kp;
+
 	kp = get_kprobe((void *)addr);
+	/* There is no probe, return original address */
 	if (!kp)
-		return -EINVAL;
+		return addr;
 
 	/*
 	 *  Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +232,76 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 	 */
 	memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 	buf[0] = kp->opcode;
-	return 0;
+	return (unsigned long)buf;
+}
+
+#ifdef CONFIG_OPTPROBES
+static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf,
+					      unsigned long addr)
+{
+	struct optimized_kprobe *op;
+	struct kprobe *kp;
+	long offs;
+	int i;
+
+	for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+		kp = get_kprobe((void *)addr - i);
+		/* This function only handles jump-optimized kprobe */
+		if (kp && kprobe_optimized(kp)) {
+			op = container_of(kp, struct optimized_kprobe, kp);
+			/* If op->list is not empty, op is under optimizing */
+			if (list_empty(&op->list))
+				goto found;
+		}
+	}
+
+	return addr;
+found:
+	/*
+	 * If the kprobe can be optimized, original bytes which can be
+	 * overwritten by jump destination address. In this case, original
+	 * bytes must be recovered from op->optinsn.copied_insn buffer.
+	 */
+	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (addr == (unsigned long)kp->addr) {
+		buf[0] = kp->opcode;
+		memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+	} else {
+		offs = addr - (unsigned long)kp->addr - 1;
+		memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+	}
+
+	return (unsigned long)buf;
+}
+#else
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf,
+						     unsigned long addr)
+{
+	return addr;
+}
+#endif
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption
+ * for preventing to release referencing kprobes.
+ */
+static unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+						unsigned long addr)
+{
+	unsigned long __addr;
+
+	__addr = __recover_optprobed_insn(buf, addr);
+	if (__addr != addr)
+		return __addr;
+
+	return __recover_probed_insn(buf, addr);
 }
 
 /* Check if paddr is at an instruction boundary */
 static int __kprobes can_probe(unsigned long paddr)
 {
-	int ret;
-	unsigned long addr, offset = 0;
+	unsigned long addr, __addr, offset = 0;
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
 
@@ -247,26 +311,24 @@ static int __kprobes can_probe(unsigned long paddr)
 	/* Decode instructions */
 	addr = paddr - offset;
 	while (addr < paddr) {
-		kernel_insn_init(&insn, (void *)addr);
-		insn_get_opcode(&insn);
-
 		/*
 		 * Check if the instruction has been modified by another
 		 * kprobe, in which case we replace the breakpoint by the
 		 * original instruction in our buffer.
+		 * Also, jump optimization will change the breakpoint to
+		 * relative-jump. Since the relative-jump itself is
+		 * normally used, we just go through if there is no kprobe.
 		 */
-		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
-			ret = recover_probed_instruction(buf, addr);
-			if (ret)
-				/*
-				 * Another debugging subsystem might insert
-				 * this breakpoint. In that case, we can't
-				 * recover it.
-				 */
-				return 0;
-			kernel_insn_init(&insn, buf);
-		}
+		__addr = recover_probed_instruction(buf, addr);
+		kernel_insn_init(&insn, (void *)__addr);
 		insn_get_length(&insn);
+
+		/*
+		 * Another debugging subsystem might insert this breakpoint.
+		 * In that case, we can't recover it.
+		 */
+		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+			return 0;
 		addr += insn.length;
 	}
 
@@ -302,21 +364,17 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 {
 	struct insn insn;
-	int ret;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	u8 *orig_src = src;	/* Back up original src for RIP calculation */
+
+	if (recover)
+		src = (u8 *)recover_probed_instruction(buf, (unsigned long)src);
 
 	kernel_insn_init(&insn, src);
-	if (recover) {
-		insn_get_opcode(&insn);
-		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
-			ret = recover_probed_instruction(buf,
-							 (unsigned long)src);
-			if (ret)
-				return 0;
-			kernel_insn_init(&insn, buf);
-		}
-	}
 	insn_get_length(&insn);
+	/* Another subsystem puts a breakpoint, failed to recover */
+	if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+		return 0;
 	memcpy(dest, insn.kaddr, insn.length);
 
 #ifdef CONFIG_X86_64
@@ -337,8 +395,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 		 * extension of the original signed 32-bit displacement would
 		 * have given.
 		 */
-		newdisp = (u8 *) src + (s64) insn.displacement.value -
-			  (u8 *) dest;
+		newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest;
 		BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
 		disp = (u8 *) dest + insn_offset_displacement(&insn);
 		*(s32 *) disp = (s32) newdisp;
@@ -1271,8 +1328,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
 /* Decode whole function to ensure any instructions don't jump into target */
 static int __kprobes can_optimize(unsigned long paddr)
 {
-	int ret;
-	unsigned long addr, size = 0, offset = 0;
+	unsigned long addr, __addr, size = 0, offset = 0;
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
 
@@ -1301,15 +1357,12 @@ static int __kprobes can_optimize(unsigned long paddr)
 			 * we can't optimize kprobe in this function.
 			 */
 			return 0;
-		kernel_insn_init(&insn, (void *)addr);
-		insn_get_opcode(&insn);
-		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
-			ret = recover_probed_instruction(buf, addr);
-			if (ret)
-				return 0;
-			kernel_insn_init(&insn, buf);
-		}
+		__addr = recover_probed_instruction(buf, addr);
+		kernel_insn_init(&insn, (void *)__addr);
 		insn_get_length(&insn);
+		/* Another subsystem puts a breakpoint */
+		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+			return 0;
 		/* Recover address */
 		insn.kaddr = (void *)addr;
 		insn.next_byte = (void *)(addr + insn.length);
@@ -1366,6 +1419,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
 /*
  * Copy replacing target instructions
  * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
  */
 int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
 {
-- 
cgit v1.2.3


From 464846888d9aad186cab3acdae6b654f9eb19772 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 5 Mar 2012 22:32:16 +0900
Subject: x86/kprobes: Fix a bug which can modify kernel code permanently

Fix a bug in kprobes which can modify kernel code
permanently at run-time. In the result, kernel can
crash when it executes the modified code.

This bug can happen when we put two probes enough near
and the first probe is optimized. When the second probe
is set up, it copies a byte which is already modified
by the first probe, and executes it when the probe is hit.
Even worse, the first probe and the second probe are removed
respectively, the second probe writes back the copied
(modified) instruction.

To fix this bug, kprobes always recovers the original
code and copies the first byte from recovered instruction.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: systemtap@sourceware.org
Cc: anderson@redhat.com
Link: http://lkml.kernel.org/r/20120305133215.5982.31991.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 6bec22f514b5..ca6d450bee7e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -361,19 +361,15 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
  * If not, return null.
  * Only applicable to 64-bit x86.
  */
-static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
-	u8 *orig_src = src;	/* Back up original src for RIP calculation */
 
-	if (recover)
-		src = (u8 *)recover_probed_instruction(buf, (unsigned long)src);
-
-	kernel_insn_init(&insn, src);
+	kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
 	insn_get_length(&insn);
 	/* Another subsystem puts a breakpoint, failed to recover */
-	if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
 	memcpy(dest, insn.kaddr, insn.length);
 
@@ -395,7 +391,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 		 * extension of the original signed 32-bit displacement would
 		 * have given.
 		 */
-		newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest;
+		newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
 		BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
 		disp = (u8 *) dest + insn_offset_displacement(&insn);
 		*(s32 *) disp = (s32) newdisp;
@@ -406,18 +402,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 
 static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
+	/* Copy an instruction with recovering if other optprobe modifies it.*/
+	__copy_instruction(p->ainsn.insn, p->addr);
+
 	/*
-	 * Copy an instruction without recovering int3, because it will be
-	 * put by another subsystem.
+	 * __copy_instruction can modify the displacement of the instruction,
+	 * but it doesn't affect boostable check.
 	 */
-	__copy_instruction(p->ainsn.insn, p->addr, 0);
-
-	if (can_boost(p->addr))
+	if (can_boost(p->ainsn.insn))
 		p->ainsn.boostable = 0;
 	else
 		p->ainsn.boostable = -1;
 
-	p->opcode = *p->addr;
+	/* Also, displacement change doesn't affect the first byte */
+	p->opcode = p->ainsn.insn[0];
 }
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -1276,7 +1274,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
 	int len = 0, ret;
 
 	while (len < RELATIVEJUMP_SIZE) {
-		ret = __copy_instruction(dest + len, src + len, 1);
+		ret = __copy_instruction(dest + len, src + len);
 		if (!ret || !can_boost(dest + len))
 			return -EINVAL;
 		len += ret;
@@ -1328,7 +1326,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
 /* Decode whole function to ensure any instructions don't jump into target */
 static int __kprobes can_optimize(unsigned long paddr)
 {
-	unsigned long addr, __addr, size = 0, offset = 0;
+	unsigned long addr, size = 0, offset = 0;
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
 
@@ -1357,8 +1355,7 @@ static int __kprobes can_optimize(unsigned long paddr)
 			 * we can't optimize kprobe in this function.
 			 */
 			return 0;
-		__addr = recover_probed_instruction(buf, addr);
-		kernel_insn_init(&insn, (void *)__addr);
+		kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
 		insn_get_length(&insn);
 		/* Another subsystem puts a breakpoint */
 		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
-- 
cgit v1.2.3


From 3f33ab1c0c741bfab2138c14ba1918a7905a1e8b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 5 Mar 2012 22:32:22 +0900
Subject: x86/kprobes: Split out optprobe related code to kprobes-opt.c

Split out optprobe related code to arch/x86/kernel/kprobes-opt.c
for maintenanceability.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: systemtap@sourceware.org
Cc: anderson@redhat.com
Link: http://lkml.kernel.org/r/20120305133222.5982.54794.stgit@localhost.localdomain
[ Tidied up the code a tiny bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/kprobes-common.h | 102 +++++++
 arch/x86/kernel/kprobes-opt.c    | 512 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/kprobes.c        | 625 ++-------------------------------------
 4 files changed, 646 insertions(+), 594 deletions(-)
 create mode 100644 arch/x86/kernel/kprobes-common.h
 create mode 100644 arch/x86/kernel/kprobes-opt.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_OPTPROBES)		+= kprobes-opt.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 000000000000..3230b68ef29a
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING			\
+	/* Skip cs, ip, orig_ax. */		\
+	"	subq $24, %rsp\n"		\
+	"	pushq %rdi\n"			\
+	"	pushq %rsi\n"			\
+	"	pushq %rdx\n"			\
+	"	pushq %rcx\n"			\
+	"	pushq %rax\n"			\
+	"	pushq %r8\n"			\
+	"	pushq %r9\n"			\
+	"	pushq %r10\n"			\
+	"	pushq %r11\n"			\
+	"	pushq %rbx\n"			\
+	"	pushq %rbp\n"			\
+	"	pushq %r12\n"			\
+	"	pushq %r13\n"			\
+	"	pushq %r14\n"			\
+	"	pushq %r15\n"
+#define RESTORE_REGS_STRING			\
+	"	popq %r15\n"			\
+	"	popq %r14\n"			\
+	"	popq %r13\n"			\
+	"	popq %r12\n"			\
+	"	popq %rbp\n"			\
+	"	popq %rbx\n"			\
+	"	popq %r11\n"			\
+	"	popq %r10\n"			\
+	"	popq %r9\n"			\
+	"	popq %r8\n"			\
+	"	popq %rax\n"			\
+	"	popq %rcx\n"			\
+	"	popq %rdx\n"			\
+	"	popq %rsi\n"			\
+	"	popq %rdi\n"			\
+	/* Skip orig_ax, ip, cs */		\
+	"	addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING			\
+	/* Skip cs, ip, orig_ax and gs. */	\
+	"	subl $16, %esp\n"		\
+	"	pushl %fs\n"			\
+	"	pushl %es\n"			\
+	"	pushl %ds\n"			\
+	"	pushl %eax\n"			\
+	"	pushl %ebp\n"			\
+	"	pushl %edi\n"			\
+	"	pushl %esi\n"			\
+	"	pushl %edx\n"			\
+	"	pushl %ecx\n"			\
+	"	pushl %ebx\n"
+#define RESTORE_REGS_STRING			\
+	"	popl %ebx\n"			\
+	"	popl %ecx\n"			\
+	"	popl %edx\n"			\
+	"	popl %esi\n"			\
+	"	popl %edi\n"			\
+	"	popl %ebp\n"			\
+	"	popl %eax\n"			\
+	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+	"	addl $24, %esp\n"
+#endif
+
+/* Ensure if the instruction can be boostable */
+extern int can_boost(kprobe_opcode_t *instruction);
+/* Recover instruction if given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+					 unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *from, void *to);
+extern void synthesize_relcall(void *from, void *to);
+
+#ifdef	CONFIG_OPTPROBES
+extern int arch_init_optprobes(void);
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else	/* !CONFIG_OPTPROBES */
+static inline int arch_init_optprobes(void)
+{
+	return 0;
+}
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+	return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+	return addr;
+}
+#endif
+#endif
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 000000000000..c5e410eed403
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
+/*
+ *  Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+
+#include "kprobes-common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+	struct optimized_kprobe *op;
+	struct kprobe *kp;
+	long offs;
+	int i;
+
+	for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+		kp = get_kprobe((void *)addr - i);
+		/* This function only handles jump-optimized kprobe */
+		if (kp && kprobe_optimized(kp)) {
+			op = container_of(kp, struct optimized_kprobe, kp);
+			/* If op->list is not empty, op is under optimizing */
+			if (list_empty(&op->list))
+				goto found;
+		}
+	}
+
+	return addr;
+found:
+	/*
+	 * If the kprobe can be optimized, original bytes which can be
+	 * overwritten by jump destination address. In this case, original
+	 * bytes must be recovered from op->optinsn.copied_insn buffer.
+	 */
+	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (addr == (unsigned long)kp->addr) {
+		buf[0] = kp->opcode;
+		memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+	} else {
+		offs = addr - (unsigned long)kp->addr - 1;
+		memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+	}
+
+	return (unsigned long)buf;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+	*addr++ = 0x48;
+	*addr++ = 0xbf;
+#else
+	*addr++ = 0xb8;
+#endif
+	*(unsigned long *)addr = val;
+}
+
+static void __used __kprobes kprobes_optinsn_template_holder(void)
+{
+	asm volatile (
+			".global optprobe_template_entry\n"
+			"optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+			/* We don't bother saving the ss register */
+			"	pushq %rsp\n"
+			"	pushfq\n"
+			SAVE_REGS_STRING
+			"	movq %rsp, %rsi\n"
+			".global optprobe_template_val\n"
+			"optprobe_template_val:\n"
+			ASM_NOP5
+			ASM_NOP5
+			".global optprobe_template_call\n"
+			"optprobe_template_call:\n"
+			ASM_NOP5
+			/* Move flags to rsp */
+			"	movq 144(%rsp), %rdx\n"
+			"	movq %rdx, 152(%rsp)\n"
+			RESTORE_REGS_STRING
+			/* Skip flags entry */
+			"	addq $8, %rsp\n"
+			"	popfq\n"
+#else /* CONFIG_X86_32 */
+			"	pushf\n"
+			SAVE_REGS_STRING
+			"	movl %esp, %edx\n"
+			".global optprobe_template_val\n"
+			"optprobe_template_val:\n"
+			ASM_NOP5
+			".global optprobe_template_call\n"
+			"optprobe_template_call:\n"
+			ASM_NOP5
+			RESTORE_REGS_STRING
+			"	addl $4, %esp\n"	/* skip cs */
+			"	popf\n"
+#endif
+			".global optprobe_template_end\n"
+			"optprobe_template_end:\n");
+}
+
+#define TMPL_MOVE_IDX \
+	((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+	((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+	((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	unsigned long flags;
+
+	/* This is possible if op is under delayed unoptimizing */
+	if (kprobe_disabled(&op->kp))
+		return;
+
+	local_irq_save(flags);
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		/* Save skipped registers */
+#ifdef CONFIG_X86_64
+		regs->cs = __KERNEL_CS;
+#else
+		regs->cs = __KERNEL_CS | get_kernel_rpl();
+		regs->gs = 0;
+#endif
+		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+		regs->orig_ax = ~0UL;
+
+		__this_cpu_write(current_kprobe, &op->kp);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+	local_irq_restore(flags);
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+	int len = 0, ret;
+
+	while (len < RELATIVEJUMP_SIZE) {
+		ret = __copy_instruction(dest + len, src + len);
+		if (!ret || !can_boost(dest + len))
+			return -EINVAL;
+		len += ret;
+	}
+	/* Check whether the address range is reserved */
+	if (ftrace_text_reserved(src, src + len - 1) ||
+	    alternatives_text_reserved(src, src + len - 1) ||
+	    jump_label_text_reserved(src, src + len - 1))
+		return -EBUSY;
+
+	return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+	return ((insn->opcode.bytes[0] == 0xff &&
+		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+	unsigned long target = 0;
+
+	switch (insn->opcode.bytes[0]) {
+	case 0xe0:	/* loopne */
+	case 0xe1:	/* loope */
+	case 0xe2:	/* loop */
+	case 0xe3:	/* jcxz */
+	case 0xe9:	/* near relative jump */
+	case 0xeb:	/* short relative jump */
+		break;
+	case 0x0f:
+		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+			break;
+		return 0;
+	default:
+		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+			break;
+		return 0;
+	}
+	target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+	return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+	unsigned long addr, size = 0, offset = 0;
+	struct insn insn;
+	kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+	/* Lookup symbol including addr */
+	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+		return 0;
+
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long)__entry_text_start) &&
+	    (paddr <  (unsigned long)__entry_text_end))
+		return 0;
+
+	/* Check there is enough space for a relative jump. */
+	if (size - offset < RELATIVEJUMP_SIZE)
+		return 0;
+
+	/* Decode instructions */
+	addr = paddr - offset;
+	while (addr < paddr - offset + size) { /* Decode until function end */
+		if (search_exception_tables(addr))
+			/*
+			 * Since some fixup code will jumps into this function,
+			 * we can't optimize kprobe in this function.
+			 */
+			return 0;
+		kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
+		insn_get_length(&insn);
+		/* Another subsystem puts a breakpoint */
+		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+			return 0;
+		/* Recover address */
+		insn.kaddr = (void *)addr;
+		insn.next_byte = (void *)(addr + insn.length);
+		/* Check any instructions don't jump into target */
+		if (insn_is_indirect_jump(&insn) ||
+		    insn_jump_into_range(&insn, paddr + INT3_SIZE,
+					 RELATIVE_ADDR_SIZE))
+			return 0;
+		addr += insn.length;
+	}
+
+	return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	int i;
+	struct kprobe *p;
+
+	for (i = 1; i < op->optinsn.size; i++) {
+		p = get_kprobe(op->kp.addr + i);
+		if (p && !kprobe_disabled(p))
+			return -EEXIST;
+	}
+
+	return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes
+arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+	if (op->optinsn.insn) {
+		free_optinsn_slot(op->optinsn.insn, dirty);
+		op->optinsn.insn = NULL;
+		op->optinsn.size = 0;
+	}
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	__arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+	u8 *buf;
+	int ret;
+	long rel;
+
+	if (!can_optimize((unsigned long)op->kp.addr))
+		return -EILSEQ;
+
+	op->optinsn.insn = get_optinsn_slot();
+	if (!op->optinsn.insn)
+		return -ENOMEM;
+
+	/*
+	 * Verify if the address gap is in 2GB range, because this uses
+	 * a relative jump.
+	 */
+	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+	if (abs(rel) > 0x7fffffff)
+		return -ERANGE;
+
+	buf = (u8 *)op->optinsn.insn;
+
+	/* Copy instructions into the out-of-line buffer */
+	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+	if (ret < 0) {
+		__arch_remove_optimized_kprobe(op, 0);
+		return ret;
+	}
+	op->optinsn.size = ret;
+
+	/* Copy arch-dep-instance from template */
+	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+	/* Set probe information */
+	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+	/* Set probe function call */
+	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+	/* Set returning jmp instruction at the tail of out-of-line buffer */
+	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+			   (u8 *)op->kp.addr + op->optinsn.size);
+
+	flush_icache_range((unsigned long) buf,
+			   (unsigned long) buf + TMPL_END_IDX +
+			   op->optinsn.size + RELATIVEJUMP_SIZE);
+	return 0;
+}
+
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+	u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+					    u8 *insn_buf,
+					    struct optimized_kprobe *op)
+{
+	s32 rel = (s32)((long)op->optinsn.insn -
+			((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+	/* Backup instructions which will be replaced by jump address */
+	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+	       RELATIVE_ADDR_SIZE);
+
+	insn_buf[0] = RELATIVEJUMP_OPCODE;
+	*(s32 *)(&insn_buf[1]) = rel;
+
+	tprm->addr = op->kp.addr;
+	tprm->opcode = insn_buf;
+	tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op, *tmp;
+	int c = 0;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		WARN_ON(kprobe_disabled(&op->kp));
+		/* Setup param */
+		setup_optimize_kprobe(&jump_poke_params[c],
+				      jump_poke_bufs[c].buf, op);
+		list_del_init(&op->list);
+		if (++c >= MAX_OPTIMIZE_PROBES)
+			break;
+	}
+
+	/*
+	 * text_poke_smp doesn't support NMI/MCE code modifying.
+	 * However, since kprobes itself also doesn't support NMI/MCE
+	 * code probing, it's not a problem.
+	 */
+	text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+					      u8 *insn_buf,
+					      struct optimized_kprobe *op)
+{
+	/* Set int3 to first byte for kprobes */
+	insn_buf[0] = BREAKPOINT_INSTRUCTION;
+	memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+	tprm->addr = op->kp.addr;
+	tprm->opcode = insn_buf;
+	tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+				    struct list_head *done_list)
+{
+	struct optimized_kprobe *op, *tmp;
+	int c = 0;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		/* Setup param */
+		setup_unoptimize_kprobe(&jump_poke_params[c],
+					jump_poke_bufs[c].buf, op);
+		list_move(&op->list, done_list);
+		if (++c >= MAX_OPTIMIZE_PROBES)
+			break;
+	}
+
+	/*
+	 * text_poke_smp doesn't support NMI/MCE code modifying.
+	 * However, since kprobes itself also doesn't support NMI/MCE
+	 * code probing, it's not a problem.
+	 */
+	text_poke_smp_batch(jump_poke_params, c);
+}
+
+/* Replace a relative jump with a breakpoint (int3).  */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	u8 buf[RELATIVEJUMP_SIZE];
+
+	/* Set int3 to first byte for kprobes */
+	buf[0] = BREAKPOINT_INSTRUCTION;
+	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+int  __kprobes
+setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+	struct optimized_kprobe *op;
+
+	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+		/* This kprobe is really able to run optimized path. */
+		op = container_of(p, struct optimized_kprobe, kp);
+		/* Detour through copied instructions */
+		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+		if (!reenter)
+			reset_current_kprobe();
+		preempt_enable_no_resched();
+		return 1;
+	}
+	return 0;
+}
+
+int __kprobes arch_init_optprobes(void)
+{
+	/* Allocate code buffer and parameter array */
+	jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+				 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+	if (!jump_poke_bufs)
+		return -ENOMEM;
+
+	jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+				   MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+	if (!jump_poke_params) {
+		kfree(jump_poke_bufs);
+		jump_poke_bufs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index ca6d450bee7e..e213fc8408d2 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
  *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
  * 2005-May	Rusty Lynch <rusty.lynch@intel.com>
- * 		Added function return probes functionality
+ *		Added function return probes functionality
  * 2006-Feb	Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
- * 		kprobe-booster and kretprobe-booster for i386.
+ *		kprobe-booster and kretprobe-booster for i386.
  * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
- * 		and kretprobe-booster for x86-64
+ *		and kretprobe-booster for x86-64
  * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
- * 		<arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
- * 		unified x86 kprobes code.
+ *		<arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ *		unified x86 kprobes code.
  */
-
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
 #include <linux/string.h>
@@ -59,6 +58,8 @@
 #include <asm/insn.h>
 #include <asm/debugreg.h>
 
+#include "kprobes-common.h"
+
 void jprobe_return_end(void);
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 			      doesn't switch kernel stack.*/
 	{NULL, NULL}	/* Terminator */
 };
+
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
 static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
 }
 
 /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes synthesize_reljump(void *from, void *to)
+void __kprobes synthesize_reljump(void *from, void *to)
 {
 	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 }
 
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void __kprobes synthesize_relcall(void *from, void *to)
+{
+	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
 /*
  * Skip the prefixes of the instruction.
  */
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
  * Returns non-zero if opcode is boostable.
  * RIP relative instructions are adjusted at copying time in 64 bits mode
  */
-static int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int __kprobes can_boost(kprobe_opcode_t *opcodes)
 {
 	kprobe_opcode_t opcode;
 	kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,8 +215,8 @@ retry:
 	}
 }
 
-static unsigned long __recover_probed_insn(kprobe_opcode_t *buf,
-					   unsigned long addr)
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 {
 	struct kprobe *kp;
 
@@ -235,59 +243,12 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf,
 	return (unsigned long)buf;
 }
 
-#ifdef CONFIG_OPTPROBES
-static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf,
-					      unsigned long addr)
-{
-	struct optimized_kprobe *op;
-	struct kprobe *kp;
-	long offs;
-	int i;
-
-	for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
-		kp = get_kprobe((void *)addr - i);
-		/* This function only handles jump-optimized kprobe */
-		if (kp && kprobe_optimized(kp)) {
-			op = container_of(kp, struct optimized_kprobe, kp);
-			/* If op->list is not empty, op is under optimizing */
-			if (list_empty(&op->list))
-				goto found;
-		}
-	}
-
-	return addr;
-found:
-	/*
-	 * If the kprobe can be optimized, original bytes which can be
-	 * overwritten by jump destination address. In this case, original
-	 * bytes must be recovered from op->optinsn.copied_insn buffer.
-	 */
-	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-	if (addr == (unsigned long)kp->addr) {
-		buf[0] = kp->opcode;
-		memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-	} else {
-		offs = addr - (unsigned long)kp->addr - 1;
-		memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
-	}
-
-	return (unsigned long)buf;
-}
-#else
-static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf,
-						     unsigned long addr)
-{
-	return addr;
-}
-#endif
-
 /*
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
  */
-static unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
-						unsigned long addr)
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
 	unsigned long __addr;
 
@@ -361,7 +322,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
  * If not, return null.
  * Only applicable to 64-bit x86.
  */
-static int __kprobes __copy_instruction(u8 *dest, u8 *src)
+int __kprobes __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -497,8 +458,8 @@ static void __kprobes restore_btf(void)
 	}
 }
 
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-				      struct pt_regs *regs)
+void __kprobes
+arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	unsigned long *sara = stack_addr(regs);
 
@@ -508,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 	*sara = (unsigned long) &kretprobe_trampoline;
 }
 
-#ifdef CONFIG_OPTPROBES
-static int  __kprobes setup_detour_execution(struct kprobe *p,
-					     struct pt_regs *regs,
-					     int reenter);
-#else
-#define setup_detour_execution(p, regs, reenter) (0)
-#endif
-
-static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
-				       struct kprobe_ctlblk *kcb, int reenter)
+static void __kprobes
+setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
 {
 	if (setup_detour_execution(p, regs, reenter))
 		return;
@@ -559,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
  * within the handler. We save the original kprobes variables and just single
  * step on the instruction of the new probe without calling any user handlers.
  */
-static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
-				    struct kprobe_ctlblk *kcb)
+static int __kprobes
+reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
 {
 	switch (kcb->kprobe_status) {
 	case KPROBE_HIT_SSDONE:
@@ -655,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	return 0;
 }
 
-#ifdef CONFIG_X86_64
-#define SAVE_REGS_STRING		\
-	/* Skip cs, ip, orig_ax. */	\
-	"	subq $24, %rsp\n"	\
-	"	pushq %rdi\n"		\
-	"	pushq %rsi\n"		\
-	"	pushq %rdx\n"		\
-	"	pushq %rcx\n"		\
-	"	pushq %rax\n"		\
-	"	pushq %r8\n"		\
-	"	pushq %r9\n"		\
-	"	pushq %r10\n"		\
-	"	pushq %r11\n"		\
-	"	pushq %rbx\n"		\
-	"	pushq %rbp\n"		\
-	"	pushq %r12\n"		\
-	"	pushq %r13\n"		\
-	"	pushq %r14\n"		\
-	"	pushq %r15\n"
-#define RESTORE_REGS_STRING		\
-	"	popq %r15\n"		\
-	"	popq %r14\n"		\
-	"	popq %r13\n"		\
-	"	popq %r12\n"		\
-	"	popq %rbp\n"		\
-	"	popq %rbx\n"		\
-	"	popq %r11\n"		\
-	"	popq %r10\n"		\
-	"	popq %r9\n"		\
-	"	popq %r8\n"		\
-	"	popq %rax\n"		\
-	"	popq %rcx\n"		\
-	"	popq %rdx\n"		\
-	"	popq %rsi\n"		\
-	"	popq %rdi\n"		\
-	/* Skip orig_ax, ip, cs */	\
-	"	addq $24, %rsp\n"
-#else
-#define SAVE_REGS_STRING		\
-	/* Skip cs, ip, orig_ax and gs. */	\
-	"	subl $16, %esp\n"	\
-	"	pushl %fs\n"		\
-	"	pushl %es\n"		\
-	"	pushl %ds\n"		\
-	"	pushl %eax\n"		\
-	"	pushl %ebp\n"		\
-	"	pushl %edi\n"		\
-	"	pushl %esi\n"		\
-	"	pushl %edx\n"		\
-	"	pushl %ecx\n"		\
-	"	pushl %ebx\n"
-#define RESTORE_REGS_STRING		\
-	"	popl %ebx\n"		\
-	"	popl %ecx\n"		\
-	"	popl %edx\n"		\
-	"	popl %esi\n"		\
-	"	popl %edi\n"		\
-	"	popl %ebp\n"		\
-	"	popl %eax\n"		\
-	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
-	"	addl $24, %esp\n"
-#endif
-
 /*
  * When a retprobed function returns, this code saves registers and
  * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -871,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
  * jump instruction after the copied instruction, that jumps to the next
  * instruction after the probepoint.
  */
-static void __kprobes resume_execution(struct kprobe *p,
-		struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void __kprobes
+resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
 {
 	unsigned long *tos = stack_addr(regs);
 	unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -1051,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 /*
  * Wrapper routine for handling exceptions.
  */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-				       unsigned long val, void *data)
+int __kprobes
+kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
 {
 	struct die_args *args = data;
 	int ret = NOTIFY_DONE;
@@ -1162,462 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-
-#ifdef CONFIG_OPTPROBES
-
-/* Insert a call instruction at address 'from', which calls address 'to'.*/
-static void __kprobes synthesize_relcall(void *from, void *to)
-{
-	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
-}
-
-/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
-					  unsigned long val)
-{
-#ifdef CONFIG_X86_64
-	*addr++ = 0x48;
-	*addr++ = 0xbf;
-#else
-	*addr++ = 0xb8;
-#endif
-	*(unsigned long *)addr = val;
-}
-
-static void __used __kprobes kprobes_optinsn_template_holder(void)
-{
-	asm volatile (
-			".global optprobe_template_entry\n"
-			"optprobe_template_entry: \n"
-#ifdef CONFIG_X86_64
-			/* We don't bother saving the ss register */
-			"	pushq %rsp\n"
-			"	pushfq\n"
-			SAVE_REGS_STRING
-			"	movq %rsp, %rsi\n"
-			".global optprobe_template_val\n"
-			"optprobe_template_val: \n"
-			ASM_NOP5
-			ASM_NOP5
-			".global optprobe_template_call\n"
-			"optprobe_template_call: \n"
-			ASM_NOP5
-			/* Move flags to rsp */
-			"	movq 144(%rsp), %rdx\n"
-			"	movq %rdx, 152(%rsp)\n"
-			RESTORE_REGS_STRING
-			/* Skip flags entry */
-			"	addq $8, %rsp\n"
-			"	popfq\n"
-#else /* CONFIG_X86_32 */
-			"	pushf\n"
-			SAVE_REGS_STRING
-			"	movl %esp, %edx\n"
-			".global optprobe_template_val\n"
-			"optprobe_template_val: \n"
-			ASM_NOP5
-			".global optprobe_template_call\n"
-			"optprobe_template_call: \n"
-			ASM_NOP5
-			RESTORE_REGS_STRING
-			"	addl $4, %esp\n"	/* skip cs */
-			"	popf\n"
-#endif
-			".global optprobe_template_end\n"
-			"optprobe_template_end: \n");
-}
-
-#define TMPL_MOVE_IDX \
-	((long)&optprobe_template_val - (long)&optprobe_template_entry)
-#define TMPL_CALL_IDX \
-	((long)&optprobe_template_call - (long)&optprobe_template_entry)
-#define TMPL_END_IDX \
-	((long)&optprobe_template_end - (long)&optprobe_template_entry)
-
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
-/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op,
-					 struct pt_regs *regs)
-{
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-	unsigned long flags;
-
-	/* This is possible if op is under delayed unoptimizing */
-	if (kprobe_disabled(&op->kp))
-		return;
-
-	local_irq_save(flags);
-	if (kprobe_running()) {
-		kprobes_inc_nmissed_count(&op->kp);
-	} else {
-		/* Save skipped registers */
-#ifdef CONFIG_X86_64
-		regs->cs = __KERNEL_CS;
-#else
-		regs->cs = __KERNEL_CS | get_kernel_rpl();
-		regs->gs = 0;
-#endif
-		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
-		regs->orig_ax = ~0UL;
-
-		__this_cpu_write(current_kprobe, &op->kp);
-		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-		opt_pre_handler(&op->kp, regs);
-		__this_cpu_write(current_kprobe, NULL);
-	}
-	local_irq_restore(flags);
-}
-
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
-{
-	int len = 0, ret;
-
-	while (len < RELATIVEJUMP_SIZE) {
-		ret = __copy_instruction(dest + len, src + len);
-		if (!ret || !can_boost(dest + len))
-			return -EINVAL;
-		len += ret;
-	}
-	/* Check whether the address range is reserved */
-	if (ftrace_text_reserved(src, src + len - 1) ||
-	    alternatives_text_reserved(src, src + len - 1) ||
-	    jump_label_text_reserved(src, src + len - 1))
-		return -EBUSY;
-
-	return len;
-}
-
-/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
-{
-	return ((insn->opcode.bytes[0] == 0xff &&
-		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
-		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
-}
-
-/* Check whether insn jumps into specified address range */
-static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
-{
-	unsigned long target = 0;
-
-	switch (insn->opcode.bytes[0]) {
-	case 0xe0:	/* loopne */
-	case 0xe1:	/* loope */
-	case 0xe2:	/* loop */
-	case 0xe3:	/* jcxz */
-	case 0xe9:	/* near relative jump */
-	case 0xeb:	/* short relative jump */
-		break;
-	case 0x0f:
-		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
-			break;
-		return 0;
-	default:
-		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
-			break;
-		return 0;
-	}
-	target = (unsigned long)insn->next_byte + insn->immediate.value;
-
-	return (start <= target && target <= start + len);
-}
-
-/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
-{
-	unsigned long addr, size = 0, offset = 0;
-	struct insn insn;
-	kprobe_opcode_t buf[MAX_INSN_SIZE];
-
-	/* Lookup symbol including addr */
-	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
-		return 0;
-
-	/*
-	 * Do not optimize in the entry code due to the unstable
-	 * stack handling.
-	 */
-	if ((paddr >= (unsigned long )__entry_text_start) &&
-	    (paddr <  (unsigned long )__entry_text_end))
-		return 0;
-
-	/* Check there is enough space for a relative jump. */
-	if (size - offset < RELATIVEJUMP_SIZE)
-		return 0;
-
-	/* Decode instructions */
-	addr = paddr - offset;
-	while (addr < paddr - offset + size) { /* Decode until function end */
-		if (search_exception_tables(addr))
-			/*
-			 * Since some fixup code will jumps into this function,
-			 * we can't optimize kprobe in this function.
-			 */
-			return 0;
-		kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
-		insn_get_length(&insn);
-		/* Another subsystem puts a breakpoint */
-		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
-			return 0;
-		/* Recover address */
-		insn.kaddr = (void *)addr;
-		insn.next_byte = (void *)(addr + insn.length);
-		/* Check any instructions don't jump into target */
-		if (insn_is_indirect_jump(&insn) ||
-		    insn_jump_into_range(&insn, paddr + INT3_SIZE,
-					 RELATIVE_ADDR_SIZE))
-			return 0;
-		addr += insn.length;
-	}
-
-	return 1;
-}
-
-/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
-{
-	int i;
-	struct kprobe *p;
-
-	for (i = 1; i < op->optinsn.size; i++) {
-		p = get_kprobe(op->kp.addr + i);
-		if (p && !kprobe_disabled(p))
-			return -EEXIST;
-	}
-
-	return 0;
-}
-
-/* Check the addr is within the optimized instructions. */
-int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
-					   unsigned long addr)
-{
-	return ((unsigned long)op->kp.addr <= addr &&
-		(unsigned long)op->kp.addr + op->optinsn.size > addr);
-}
-
-/* Free optimized instruction slot */
-static __kprobes
-void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
-{
-	if (op->optinsn.insn) {
-		free_optinsn_slot(op->optinsn.insn, dirty);
-		op->optinsn.insn = NULL;
-		op->optinsn.size = 0;
-	}
-}
-
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
-{
-	__arch_remove_optimized_kprobe(op, 1);
-}
-
-/*
- * Copy replacing target instructions
- * Target instructions MUST be relocatable (checked inside)
- * This is called when new aggr(opt)probe is allocated or reused.
- */
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
-{
-	u8 *buf;
-	int ret;
-	long rel;
-
-	if (!can_optimize((unsigned long)op->kp.addr))
-		return -EILSEQ;
-
-	op->optinsn.insn = get_optinsn_slot();
-	if (!op->optinsn.insn)
-		return -ENOMEM;
-
-	/*
-	 * Verify if the address gap is in 2GB range, because this uses
-	 * a relative jump.
-	 */
-	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
-	if (abs(rel) > 0x7fffffff)
-		return -ERANGE;
-
-	buf = (u8 *)op->optinsn.insn;
-
-	/* Copy instructions into the out-of-line buffer */
-	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
-	if (ret < 0) {
-		__arch_remove_optimized_kprobe(op, 0);
-		return ret;
-	}
-	op->optinsn.size = ret;
-
-	/* Copy arch-dep-instance from template */
-	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
-
-	/* Set probe information */
-	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
-
-	/* Set probe function call */
-	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
-
-	/* Set returning jmp instruction at the tail of out-of-line buffer */
-	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
-			   (u8 *)op->kp.addr + op->optinsn.size);
-
-	flush_icache_range((unsigned long) buf,
-			   (unsigned long) buf + TMPL_END_IDX +
-			   op->optinsn.size + RELATIVEJUMP_SIZE);
-	return 0;
-}
-
-#define MAX_OPTIMIZE_PROBES 256
-static struct text_poke_param *jump_poke_params;
-static struct jump_poke_buffer {
-	u8 buf[RELATIVEJUMP_SIZE];
-} *jump_poke_bufs;
-
-static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
-					    u8 *insn_buf,
-					    struct optimized_kprobe *op)
-{
-	s32 rel = (s32)((long)op->optinsn.insn -
-			((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
-	/* Backup instructions which will be replaced by jump address */
-	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
-	       RELATIVE_ADDR_SIZE);
-
-	insn_buf[0] = RELATIVEJUMP_OPCODE;
-	*(s32 *)(&insn_buf[1]) = rel;
-
-	tprm->addr = op->kp.addr;
-	tprm->opcode = insn_buf;
-	tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Replace breakpoints (int3) with relative jumps.
- * Caller must call with locking kprobe_mutex and text_mutex.
- */
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
-{
-	struct optimized_kprobe *op, *tmp;
-	int c = 0;
-
-	list_for_each_entry_safe(op, tmp, oplist, list) {
-		WARN_ON(kprobe_disabled(&op->kp));
-		/* Setup param */
-		setup_optimize_kprobe(&jump_poke_params[c],
-				      jump_poke_bufs[c].buf, op);
-		list_del_init(&op->list);
-		if (++c >= MAX_OPTIMIZE_PROBES)
-			break;
-	}
-
-	/*
-	 * text_poke_smp doesn't support NMI/MCE code modifying.
-	 * However, since kprobes itself also doesn't support NMI/MCE
-	 * code probing, it's not a problem.
-	 */
-	text_poke_smp_batch(jump_poke_params, c);
-}
-
-static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
-					      u8 *insn_buf,
-					      struct optimized_kprobe *op)
-{
-	/* Set int3 to first byte for kprobes */
-	insn_buf[0] = BREAKPOINT_INSTRUCTION;
-	memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
-	tprm->addr = op->kp.addr;
-	tprm->opcode = insn_buf;
-	tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Recover original instructions and breakpoints from relative jumps.
- * Caller must call with locking kprobe_mutex.
- */
-extern void arch_unoptimize_kprobes(struct list_head *oplist,
-				    struct list_head *done_list)
-{
-	struct optimized_kprobe *op, *tmp;
-	int c = 0;
-
-	list_for_each_entry_safe(op, tmp, oplist, list) {
-		/* Setup param */
-		setup_unoptimize_kprobe(&jump_poke_params[c],
-					jump_poke_bufs[c].buf, op);
-		list_move(&op->list, done_list);
-		if (++c >= MAX_OPTIMIZE_PROBES)
-			break;
-	}
-
-	/*
-	 * text_poke_smp doesn't support NMI/MCE code modifying.
-	 * However, since kprobes itself also doesn't support NMI/MCE
-	 * code probing, it's not a problem.
-	 */
-	text_poke_smp_batch(jump_poke_params, c);
-}
-
-/* Replace a relative jump with a breakpoint (int3).  */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
-{
-	u8 buf[RELATIVEJUMP_SIZE];
-
-	/* Set int3 to first byte for kprobes */
-	buf[0] = BREAKPOINT_INSTRUCTION;
-	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
-}
-
-static int  __kprobes setup_detour_execution(struct kprobe *p,
-					     struct pt_regs *regs,
-					     int reenter)
-{
-	struct optimized_kprobe *op;
-
-	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
-		/* This kprobe is really able to run optimized path. */
-		op = container_of(p, struct optimized_kprobe, kp);
-		/* Detour through copied instructions */
-		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
-		if (!reenter)
-			reset_current_kprobe();
-		preempt_enable_no_resched();
-		return 1;
-	}
-	return 0;
-}
-
-static int __kprobes init_poke_params(void)
-{
-	/* Allocate code buffer and parameter array */
-	jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
-				 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
-	if (!jump_poke_bufs)
-		return -ENOMEM;
-
-	jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
-				   MAX_OPTIMIZE_PROBES, GFP_KERNEL);
-	if (!jump_poke_params) {
-		kfree(jump_poke_bufs);
-		jump_poke_bufs = NULL;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-#else	/* !CONFIG_OPTPROBES */
-static int __kprobes init_poke_params(void)
-{
-	return 0;
-}
-#endif
-
 int __init arch_init_kprobes(void)
 {
-	return init_poke_params();
+	return arch_init_optprobes();
 }
 
 int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-- 
cgit v1.2.3


From d1f42e314c9c50541c79a6edf2b4cab63fe02ee3 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Mon, 5 Mar 2012 15:01:00 -0800
Subject: x86/olpc/xo15/sci: Enable lid close wakeup control

Like most systems, OLPC's ACPI LID switch wakes up the system
when the lid is opened, but not when it is closed.

Under OLPC's opportunistic suspend model, the lid may be closed
while the system was oportunistically suspended with the screen
running.  In this event, we want to wake up to turn the screen
off.

Enable control of normal ACPI wakeups through lid close events
through a new sysfs attribute "lid_wake_on_closed".  When set,
and when LID wakeups are enabled through ACPI, the system will
wake up on both open and close lid events.

Signed-off-by: Daniel Drake <dsd@laptop.org>
Cc: Andres Salomon <dilinger@queued.net>
Cc: Matthew Garrett <mjg@redhat.com>
[ Fixed sscanf checking]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-bgt8hxu2wwe0x5p8edhogtf7@git.kernel.org
[ Did very minor readability tweaks ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/olpc/olpc-xo15-sci.c | 72 +++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 2b235b77d9ab..23e5b9d7977b 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -23,7 +23,66 @@
 #define XO15_SCI_CLASS			DRV_NAME
 #define XO15_SCI_DEVICE_NAME		"OLPC XO-1.5 SCI"
 
-static unsigned long xo15_sci_gpe;
+static unsigned long			xo15_sci_gpe;
+static bool				lid_wake_on_close;
+
+/*
+ * The normal ACPI LID wakeup behavior is wake-on-open, but not
+ * wake-on-close. This is implemented as standard by the XO-1.5 DSDT.
+ *
+ * We provide here a sysfs attribute that will additionally enable
+ * wake-on-close behavior. This is useful (e.g.) when we oportunistically
+ * suspend with the display running; if the lid is then closed, we want to
+ * wake up to turn the display off.
+ *
+ * This is controlled through a custom method in the XO-1.5 DSDT.
+ */
+static int set_lid_wake_behavior(bool wake_on_close)
+{
+	struct acpi_object_list arg_list;
+	union acpi_object arg;
+	acpi_status status;
+
+	arg_list.count		= 1;
+	arg_list.pointer	= &arg;
+	arg.type		= ACPI_TYPE_INTEGER;
+	arg.integer.value	= wake_on_close;
+
+	status = acpi_evaluate_object(NULL, "\\_SB.PCI0.LID.LIDW", &arg_list, NULL);
+	if (ACPI_FAILURE(status)) {
+		pr_warning(PFX "failed to set lid behavior\n");
+		return 1;
+	}
+
+	lid_wake_on_close = wake_on_close;
+
+	return 0;
+}
+
+static ssize_t
+lid_wake_on_close_show(struct kobject *s, struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", lid_wake_on_close);
+}
+
+static ssize_t lid_wake_on_close_store(struct kobject *s,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t n)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u", &val) != 1)
+		return -EINVAL;
+
+	set_lid_wake_behavior(!!val);
+
+	return n;
+}
+
+static struct kobj_attribute lid_wake_on_close_attr =
+	__ATTR(lid_wake_on_close, 0644,
+	       lid_wake_on_close_show,
+	       lid_wake_on_close_store);
 
 static void battery_status_changed(void)
 {
@@ -91,6 +150,7 @@ static int xo15_sci_add(struct acpi_device *device)
 {
 	unsigned long long tmp;
 	acpi_status status;
+	int r;
 
 	if (!device)
 		return -EINVAL;
@@ -112,6 +172,10 @@ static int xo15_sci_add(struct acpi_device *device)
 
 	dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
 
+	r = sysfs_create_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
+	if (r)
+		goto err_sysfs;
+
 	/* Flush queue, and enable all SCI events */
 	process_sci_queue();
 	olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -123,6 +187,11 @@ static int xo15_sci_add(struct acpi_device *device)
 		device_init_wakeup(&device->dev, true);
 
 	return 0;
+
+err_sysfs:
+	acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+	cancel_work_sync(&sci_work);
+	return r;
 }
 
 static int xo15_sci_remove(struct acpi_device *device, int type)
@@ -130,6 +199,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
 	acpi_disable_gpe(NULL, xo15_sci_gpe);
 	acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
 	cancel_work_sync(&sci_work);
+	sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 097d59106a8e4b42d07c9892fdd7790f1659c6ff Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 6 Mar 2012 18:23:36 -0800
Subject: vm: avoid using find_vma_prev() unnecessarily

Several users of "find_vma_prev()" were not in fact interested in the
previous vma if there was no primary vma to be found either.  And in
those cases, we're much better off just using the regular "find_vma()",
and then "prev" can be looked up by just checking vma->vm_prev.

The find_vma_prev() semantics are fairly subtle (see Mikulas' recent
commit 83cd904d271b: "mm: fix find_vma_prev"), and the whole "return
prev by reference" means that it generates worse code too.

Thus this "let's avoid using this inconvenient and clearly too subtle
interface when we don't really have to" patch.

Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 4 +++-
 mm/mempolicy.c            | 3 ++-
 mm/mlock.c                | 3 ++-
 mm/mprotect.c             | 3 ++-
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f581a18c0d4d..83e7141c3982 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -333,13 +333,15 @@ try_again:
 		 * Lookup failure means no vma is above this address,
 		 * i.e. return with success:
 		 */
-		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+		vma = find_vma(mm, add);
+		if (!vma)
 			return addr;
 
 		/*
 		 * new region fits between prev_vma->vm_end and
 		 * vma->vm_start, use it:
 		 */
+		prev_vma = vma->vm_prev;
 		if (addr + len <= vma->vm_start &&
 		            (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 06b145fb64ab..47296fee23db 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	unsigned long vmstart;
 	unsigned long vmend;
 
-	vma = find_vma_prev(mm, start, &prev);
+	vma = find_vma(mm, start);
 	if (!vma || vma->vm_start > start)
 		return -EFAULT;
 
+	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f4f53bdc65d..ef726e8aa8e9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	vma = find_vma_prev(current->mm, start, &prev);
+	vma = find_vma(current->mm, start);
 	if (!vma || vma->vm_start > start)
 		return -ENOMEM;
 
+	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5a688a2756be..f437d054c3bf 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 
 	down_write(&current->mm->mmap_sem);
 
-	vma = find_vma_prev(current->mm, start, &prev);
+	vma = find_vma(current->mm, start);
 	error = -ENOMEM;
 	if (!vma)
 		goto out;
+	prev = vma->vm_prev;
 	if (unlikely(grows & PROT_GROWSDOWN)) {
 		if (vma->vm_start >= end)
 			goto out;
-- 
cgit v1.2.3


From 55062d061790b43aee01ab3f9ac57b8596254f19 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 6 Mar 2012 18:48:13 -0800
Subject: x86: fix typo in recent find_vma_prev purge

It turns out that test-compiling this file on x86-64 doesn't really
help, because much of it is x86-32-specific.  And so I hadn't noticed
the slightly over-eager removal of the 'r' from 'addr' variable despite
thinking I had tested it.

Signed-off-by: Linus "oopsie" Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 83e7141c3982..8ecbb4bba4b3 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -333,7 +333,7 @@ try_again:
 		 * Lookup failure means no vma is above this address,
 		 * i.e. return with success:
 		 */
-		vma = find_vma(mm, add);
+		vma = find_vma(mm, addr);
 		if (!vma)
 			return addr;
 
-- 
cgit v1.2.3


From b11e3d782b9c065b3b2fb543bfb0d97801822dc0 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Wed, 7 Mar 2012 11:44:29 +0100
Subject: x86, mce: Fix rcu splat in drain_mce_log_buffer()

While booting, the following message is seen:

[   21.665087] ===============================
[   21.669439] [ INFO: suspicious RCU usage. ]
[   21.673798] 3.2.0-0.0.0.28.36b5ec9-default #2 Not tainted
[   21.681353] -------------------------------
[   21.685864] arch/x86/kernel/cpu/mcheck/mce.c:194 suspicious rcu_dereference_index_check() usage!
[   21.695013]
[   21.695014] other info that might help us debug this:
[   21.695016]
[   21.703488]
[   21.703489] rcu_scheduler_active = 1, debug_locks = 1
[   21.710426] 3 locks held by modprobe/2139:
[   21.714754]  #0:  (&__lockdep_no_validate__){......}, at: [<ffffffff8133afd3>] __driver_attach+0x53/0xa0
[   21.725020]  #1:
[   21.725323] ioatdma: Intel(R) QuickData Technology Driver 4.00
[   21.733206]  (&__lockdep_no_validate__){......}, at: [<ffffffff8133afe1>] __driver_attach+0x61/0xa0
[   21.743015]  #2:  (i7core_edac_lock){+.+.+.}, at: [<ffffffffa01cfa5f>] i7core_probe+0x1f/0x5c0 [i7core_edac]
[   21.753708]
[   21.753709] stack backtrace:
[   21.758429] Pid: 2139, comm: modprobe Not tainted 3.2.0-0.0.0.28.36b5ec9-default #2
[   21.768253] Call Trace:
[   21.770838]  [<ffffffff810977cd>] lockdep_rcu_suspicious+0xcd/0x100
[   21.777366]  [<ffffffff8101aa41>] drain_mcelog_buffer+0x191/0x1b0
[   21.783715]  [<ffffffff8101aa78>] mce_register_decode_chain+0x18/0x20
[   21.790430]  [<ffffffffa01cf8db>] i7core_register_mci+0x2fb/0x3e4 [i7core_edac]
[   21.798003]  [<ffffffffa01cfb14>] i7core_probe+0xd4/0x5c0 [i7core_edac]
[   21.804809]  [<ffffffff8129566b>] local_pci_probe+0x5b/0xe0
[   21.810631]  [<ffffffff812957c9>] __pci_device_probe+0xd9/0xe0
[   21.816650]  [<ffffffff813362e4>] ? get_device+0x14/0x20
[   21.822178]  [<ffffffff81296916>] pci_device_probe+0x36/0x60
[   21.828061]  [<ffffffff8133ac8a>] really_probe+0x7a/0x2b0
[   21.833676]  [<ffffffff8133af23>] driver_probe_device+0x63/0xc0
[   21.839868]  [<ffffffff8133b01b>] __driver_attach+0x9b/0xa0
[   21.845718]  [<ffffffff8133af80>] ? driver_probe_device+0xc0/0xc0
[   21.852027]  [<ffffffff81339168>] bus_for_each_dev+0x68/0x90
[   21.857876]  [<ffffffff8133aa3c>] driver_attach+0x1c/0x20
[   21.863462]  [<ffffffff8133a64d>] bus_add_driver+0x16d/0x2b0
[   21.869377]  [<ffffffff8133b6dc>] driver_register+0x7c/0x160
[   21.875220]  [<ffffffff81296bda>] __pci_register_driver+0x6a/0xf0
[   21.881494]  [<ffffffffa01fe000>] ? 0xffffffffa01fdfff
[   21.886846]  [<ffffffffa01fe047>] i7core_init+0x47/0x1000 [i7core_edac]
[   21.893737]  [<ffffffff810001ce>] do_one_initcall+0x3e/0x180
[   21.899670]  [<ffffffff810a9b95>] sys_init_module+0xc5/0x220
[   21.905542]  [<ffffffff8149bc39>] system_call_fastpath+0x16/0x1b

Fix this by using ACCESS_ONCE() instead of rcu_dereference_check_mce()
over mcelog.next. Since the access to each entry is controlled by the
->finished field, ACCESS_ONCE() should work just fine. An rcu_dereference
is unnecessary here.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..db590aff874c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void)
 {
 	unsigned int next, i, prev = 0;
 
-	next = rcu_dereference_check_mce(mcelog.next);
+	next = ACCESS_ONCE(mcelog.next);
 
 	do {
 		struct mce *m;
-- 
cgit v1.2.3


From 0d2bf4899d04fcc7f3a280b0bc74c084badb4e04 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 8 Mar 2012 09:18:02 +0000
Subject: x86: Tighten dependencies of CPU_SUP_*_32

Building in support for either of these CPUs is pointless when
e.g. M686 was selected (since such a kernel would use cmov
instructions, which aren't available on these older CPUs).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F58875A02000078000770E0@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6443c6f038e8..706e12e9984b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -440,7 +440,7 @@ config CPU_SUP_INTEL
 config CPU_SUP_CYRIX_32
 	default y
 	bool "Support Cyrix processors" if PROCESSOR_SELECT
-	depends on !64BIT
+	depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT)
 	---help---
 	  This enables detection, tunings and quirks for Cyrix processors
 
@@ -494,7 +494,7 @@ config CPU_SUP_TRANSMETA_32
 config CPU_SUP_UMC_32
 	default y
 	bool "Support UMC processors" if PROCESSOR_SELECT
-	depends on !64BIT
+	depends on M386 || M486 || (EXPERT && !64BIT)
 	---help---
 	  This enables detection, tunings and quirks for UMC processors
 
-- 
cgit v1.2.3


From c7e23289a6aa95048a78b252b462f24ca6cf7f96 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 8 Mar 2012 09:23:14 +0000
Subject: x86/32: Print control and debug registers for kerenel context

While for a user mode register dump it may be reasonable to skip
those (albeit x86-64 doesn't do so), for kernel mode dumps these
should be printed to make sure all information possibly
necessary for analysis is available.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F58889202000078000770E7@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c99f9ed013d5..88ec9129271d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs)
 	int i;
 
 	print_modules();
-	__show_regs(regs, 0);
+	__show_regs(regs, !user_mode_vm(regs));
 
 	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
 		TASK_COMM_LEN, current->comm, task_pid_nr(current),
-- 
cgit v1.2.3


From a240ada241dafe290e7532d1ddeb98fdf1419068 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Thu, 8 Mar 2012 09:24:57 +0000
Subject: x86: Include probe_roms.h in probe_roms.c

... to ensure that declarations and definitions are in sync.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/4F5888F902000078000770F1@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/probe_roms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 34e06e84ce31..0bc72e2069e3 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -12,6 +12,7 @@
 #include <linux/pci.h>
 #include <linux/export.h>
 
+#include <asm/probe_roms.h>
 #include <asm/pci-direct.h>
 #include <asm/e820.h>
 #include <asm/mmzone.h>
-- 
cgit v1.2.3


From cc578287e3224d0da196cc1d226bdae6b068faa7 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:50 -0200
Subject: KVM: Infrastructure for software and hardware based TSC rate scaling

This requires some restructuring; rather than use 'virtual_tsc_khz'
to indicate whether hardware rate scaling is in effect, we consider
each VCPU to always have a virtual TSC rate.  Instead, there is new
logic above the vendor-specific hardware scaling that decides whether
it is even necessary to use and updates all rate variables used by
common code.  This means we can simply query the virtual rate at
any point, which is needed for software rate scaling.

There is also now a threshold added to the TSC rate scaling; minor
differences and variations of measured TSC rate can accidentally
provoke rate scaling to be used when it is not needed.  Instead,
we have a tolerance variable called tsc_tolerance_ppm, which is
the maximum variation from user requested rate at which scaling
will be used.  The default is 250ppm, which is the half the
threshold for NTP adjustment, allowing for some hardware variation.

In the event that hardware rate scaling is not available, we can
kludge a bit by forcing TSC catchup to turn on when a faster than
hardware speed has been requested, but there is nothing available
yet for the reverse case; this requires a trap and emulate software
implementation for RDTSC, which is still forthcoming.

[avi: fix 64-bit division on i386]

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  9 +++--
 arch/x86/kvm/lapic.c            |  2 +-
 arch/x86/kvm/svm.c              | 20 ++++++----
 arch/x86/kvm/vmx.c              | 16 +++++---
 arch/x86/kvm/x86.c              | 82 +++++++++++++++++++++--------------------
 5 files changed, 71 insertions(+), 58 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 782d973b0719..ddebbe01fff9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -422,10 +422,11 @@ struct kvm_vcpu_arch {
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
 	u64 last_tsc_write;
-	u32 virtual_tsc_khz;
 	bool tsc_catchup;
-	u32  tsc_catchup_mult;
-	s8   tsc_catchup_shift;
+	bool tsc_always_catchup;
+	s8 virtual_tsc_shift;
+	u32 virtual_tsc_mult;
+	u32 virtual_tsc_khz;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -651,7 +652,7 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3ee1d83c695d..72975f758c83 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
 		struct kvm_vcpu *vcpu = apic->vcpu;
-		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
 
 		if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7bbd17cc3488..e12026e5244e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -964,20 +964,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 	return _tsc;
 }
 
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 ratio;
 	u64 khz;
 
-	/* TSC scaling supported? */
-	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+	/* Guest TSC same frequency as host TSC? */
+	if (!scale) {
+		svm->tsc_ratio = TSC_RATIO_DEFAULT;
 		return;
+	}
 
-	/* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-	if (user_tsc_khz == 0) {
-		vcpu->arch.virtual_tsc_khz = 0;
-		svm->tsc_ratio = TSC_RATIO_DEFAULT;
+	/* TSC scaling supported? */
+	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+		if (user_tsc_khz > tsc_khz) {
+			vcpu->arch.tsc_catchup = 1;
+			vcpu->arch.tsc_always_catchup = 1;
+		} else
+			WARN(1, "user requested TSC rate below hardware speed\n");
 		return;
 	}
 
@@ -992,7 +997,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 				user_tsc_khz);
 		return;
 	}
-	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 	svm->tsc_ratio             = ratio;
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3b4c8d8ad906..e6bf61fa1c03 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1817,13 +1817,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates.  Currently limited to
+ * software catchup for faster rates on slower CPUs.
  */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
-	/* Nothing to do here */
+	if (!scale)
+		return;
+
+	if (user_tsc_khz > tsc_khz) {
+		vcpu->arch.tsc_catchup = 1;
+		vcpu->arch.tsc_always_catchup = 1;
+	} else
+		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2bd77a3a41ed..41bb90acb238 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 u32  kvm_max_guest_tsc_khz;
 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -968,49 +972,50 @@ static inline u64 get_kernel_ns(void)
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
-	int cpu = get_cpu();
-	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-		  cpufreq_quick_get(cpu) != 0;
-	put_cpu();
-	return ret;
+	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+				   vcpu->arch.virtual_tsc_shift);
 }
 
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 {
-	if (vcpu->arch.virtual_tsc_khz)
-		return vcpu->arch.virtual_tsc_khz;
-	else
-		return __this_cpu_read(cpu_tsc_khz);
+	u64 v = (u64)khz * (1000000 + ppm);
+	do_div(v, 1000000);
+	return v;
 }
 
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 {
-	u64 ret;
+	u32 thresh_lo, thresh_hi;
+	int use_scaling = 0;
 
-	WARN_ON(preemptible());
-	if (kvm_tsc_changes_freq())
-		printk_once(KERN_WARNING
-		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	ret = nsec * vcpu_tsc_khz(vcpu);
-	do_div(ret, USEC_PER_SEC);
-	return ret;
-}
-
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-			   &vcpu->arch.tsc_catchup_shift,
-			   &vcpu->arch.tsc_catchup_mult);
+			   &vcpu->arch.virtual_tsc_shift,
+			   &vcpu->arch.virtual_tsc_mult);
+	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+	/*
+	 * Compute the variation in TSC rate which is acceptable
+	 * within the range of tolerance and decide if the
+	 * rate being applied is within that bounds of the hardware
+	 * rate.  If so, no scaling or compensation need be done.
+	 */
+	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+		use_scaling = 1;
+	}
+	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
 	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-				      vcpu->arch.tsc_catchup_mult,
-				      vcpu->arch.tsc_catchup_shift);
+				      vcpu->arch.virtual_tsc_mult,
+				      vcpu->arch.virtual_tsc_shift);
 	tsc += vcpu->arch.last_tsc_write;
 	return tsc;
 }
@@ -1077,7 +1082,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	local_irq_save(flags);
 	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
-	this_tsc_khz = vcpu_tsc_khz(v);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -2804,26 +2809,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		u32 user_tsc_khz;
 
 		r = -EINVAL;
-		if (!kvm_has_tsc_control)
-			break;
-
 		user_tsc_khz = (u32)arg;
 
 		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 			goto out;
 
-		kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+		if (user_tsc_khz == 0)
+			user_tsc_khz = tsc_khz;
+
+		kvm_set_tsc_khz(vcpu, user_tsc_khz);
 
 		r = 0;
 		goto out;
 	}
 	case KVM_GET_TSC_KHZ: {
-		r = -EIO;
-		if (check_tsc_unstable())
-			goto out;
-
-		r = vcpu_tsc_khz(vcpu);
-
+		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 	}
 	default:
@@ -5312,6 +5312,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
+	if (unlikely(vcpu->arch.tsc_always_catchup))
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 	kvm_lapic_sync_from_vapic(vcpu);
 
@@ -6004,7 +6006,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
-	kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+	kvm_set_tsc_khz(vcpu, max_tsc_khz);
 
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
-- 
cgit v1.2.3


From 5d3cb0f6a8e3af018a522ae8d36f8f7d2511b5d8 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:51 -0200
Subject: KVM: Improve TSC offset matching

There are a few improvements that can be made to the TSC offset
matching code.  First, we don't need to call the 128-bit multiply
(especially on a constant number), the code works much nicer to
do computation in nanosecond units.

Second, the way everything is setup with software TSC rate scaling,
we currently have per-cpu rates.  Obviously this isn't too desirable
to use in practice, but if for some reason we do change the rate of
all VCPUs at runtime, then reset the TSCs, we will only want to
match offsets for VCPUs running at the same rate.

Finally, for the case where we have an unstable host TSC, but
rate scaling is being done in hardware, we should call the platform
code to compute the TSC offset, so the math is reorganized to recompute
the base instead, then transform the base into an offset using the
existing API.

[avi: fix 64-bit division on i386]

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

KVM: Fix 64-bit division in kvm_write_tsc()

Breaks i386 build.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 44 +++++++++++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ddebbe01fff9..8a34fca6c572 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -513,6 +513,7 @@ struct kvm_arch {
 	u64 last_tsc_nsec;
 	u64 last_tsc_offset;
 	u64 last_tsc_write;
+	u32 last_tsc_khz;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41bb90acb238..4390f42b371f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1025,33 +1025,46 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 sdiff;
+	s64 nsdiff;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
-	sdiff = data - kvm->arch.last_tsc_write;
-	if (sdiff < 0)
-		sdiff = -sdiff;
+
+	/* n.b - signed multiplication and division required */
+	nsdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+	nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+	/* do_div() only does unsigned */
+	asm("idivl %2; xor %%edx, %%edx"
+	    : "=A"(nsdiff)
+	    : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+	nsdiff -= elapsed;
+	if (nsdiff < 0)
+		nsdiff = -nsdiff;
 
 	/*
-	 * Special case: close write to TSC within 5 seconds of
-	 * another CPU is interpreted as an attempt to synchronize
-	 * The 5 seconds is to accommodate host load / swapping as
-	 * well as any reset of TSC during the boot process.
-	 *
-	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using elapsed value.
-	 */
-	if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
-	    elapsed < 5ULL * NSEC_PER_SEC) {
+	 * Special case: TSC write with a small delta (1 second) of virtual
+	 * cycle time against real time is interpreted as an attempt to
+	 * synchronize the CPU.
+         *
+	 * For a reliable TSC, we can match TSC offsets, and for an unstable
+	 * TSC, we add elapsed time in this computation.  We could let the
+	 * compensation code attempt to catch up if we fall behind, but
+	 * it's better to try to match offsets from the beginning.
+         */
+	if (nsdiff < NSEC_PER_SEC &&
+	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.last_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
-			offset += delta;
+			data += delta;
+			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
 		ns = kvm->arch.last_tsc_nsec;
@@ -1059,6 +1072,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
 	kvm->arch.last_tsc_offset = offset;
+	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 	kvm_x86_ops->write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-- 
cgit v1.2.3


From 4dd7980b21408624e9b6f3df05719c3c61db6e9f Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:52 -0200
Subject: KVM: Leave TSC synchronization window open with each new sync

Currently, when the TSC is written by the guest, the variable
ns is updated to force the current write to appear to have taken
place at the time of the first write in this sync phase.  This
leaves a cliff at the end of the match window where updates will
fall of the end.  There are two scenarios where this can be a
problem in practe - first, on a system with a large number of
VCPUs, the sync period may last for an extended period of time.

The second way this can happen is if the VM reboots very rapidly
and we catch a VCPU TSC synchronization just around the edge.
We may be unaware of the reboot, and thus the first VCPU might
synchronize with an old set of the timer (at, say 0.97 seconds
ago, when first powered on).  The second VCPU can come in 0.04
seconds later to try to synchronize, but it misses the window
because it is just over the threshold.

Instead, stop doing this artificial setback of the ns variable
and just update it with every write of the TSC.

It may be observed that doing so causes values computed by
compute_guest_tsc to diverge slightly across CPUs - note that
the last_tsc_ns and last_tsc_write variable are used here, and
now they last_tsc_ns will be different for each VCPU, reflecting
the actual time of the update.

However, compute_guest_tsc is used only for guests which already
have TSC stability issues, and further, note that the previous
patch has caused last_tsc_write to be incremented by the difference
in nanoseconds, converted back into guest cycles.  As such, only
boundary rounding errors should be visible, which given the
resolution in nanoseconds, is going to only be a few cycles and
only visible in cross-CPU consistency tests.  The problem can be
fixed by adding a new set of variables to track the start offset
and start write value for the current sync cycle.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4390f42b371f..030d495e5c78 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1067,7 +1067,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
-		ns = kvm->arch.last_tsc_nsec;
 	}
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
-- 
cgit v1.2.3


From b183aa580a3a09b5d79224a9022418508532c778 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:53 -0200
Subject: KVM: Fix last_guest_tsc / tsc_offset semantics

The variable last_guest_tsc was being used as an ad-hoc indicator
that guest TSC has been initialized and recorded correctly.  However,
it may not have been, it could be that guest TSC has been set to some
large value, the back to a small value (by, say, a software reboot).

This defeats the logic and causes KVM to falsely assume that the
guest TSC has gone backwards, marking the host TSC unstable, which
is undesirable behavior.

In addition, rather than try to compute an offset adjustment for the
TSC on unstable platforms, just recompute the whole offset.  This
allows us to get rid of one callsite for adjust_tsc_offset, which
is problematic because the units it takes are in guest units, but
here, the computation was originally being done in host units.

Doing this, and also recording last_guest_tsc when the TSC is written
allow us to remove the tricky logic which depended on last_guest_tsc
being zero to indicate a reset of uninitialized value.

Instead, we now have the guarantee that the guest TSC offset is
always at least something which will get us last_guest_tsc.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 030d495e5c78..2a59f76d96f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1079,6 +1079,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
 	vcpu->arch.last_tsc_write = data;
 	vcpu->arch.last_tsc_nsec = ns;
+	vcpu->arch.last_guest_tsc = data;
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
@@ -1147,7 +1148,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * observed by the guest and ensure the new system time is greater.
 	 */
 	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+	if (vcpu->hv_clock.tsc_timestamp) {
 		max_kernel_ns = vcpu->last_guest_tsc -
 				vcpu->hv_clock.tsc_timestamp;
 		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -2257,13 +2258,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		u64 tsc;
 
 		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
-			     tsc - vcpu->arch.last_guest_tsc;
+		tsc_delta = tsc - vcpu->arch.last_guest_tsc;
 
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
-			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+						vcpu->arch.last_guest_tsc);
+			kvm_x86_ops->write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-- 
cgit v1.2.3


From 6f526ec5383dcd5fa5ffc7b3ac1d62099a0b46ad Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:54 -0200
Subject: KVM: Add last_host_tsc tracking back to KVM

The variable last_host_tsc was removed from upstream code.  I am adding
it back for two reasons.  First, it is unnecessary to use guest TSC
computation to conclude information about the host TSC.  The guest may
set the TSC backwards (this case handled by the previous patch), but
the computation of guest TSC (and fetching an MSR) is significanlty more
work and complexity than simply reading the hardware counter.  In addition,
we don't actually need the guest TSC for any part of the computation,
by always recomputing the offset, we can eliminate the need to deal with
the current offset and any scaling factors that may apply.

The second reason is that later on, we are going to be using the host
TSC value to restore TSC offsets after a host S4 suspend, so we need to
be reading the host values, not the guest values here.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 11 +++--------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a34fca6c572..b23682900f41 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -422,6 +422,7 @@ struct kvm_vcpu_arch {
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
 	u64 last_tsc_write;
+	u64 last_host_tsc;
 	bool tsc_catchup;
 	bool tsc_always_catchup;
 	s8 virtual_tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a59f76d96f1..39a57dac884a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2253,13 +2253,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-		/* Make sure TSC doesn't go backwards */
-		s64 tsc_delta;
-		u64 tsc;
-
-		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-		tsc_delta = tsc - vcpu->arch.last_guest_tsc;
-
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
@@ -2282,7 +2277,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From f1e2b26003c41e581243c09ceed7567677449468 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 3 Feb 2012 15:43:55 -0200
Subject: KVM: Allow adjust_tsc_offset to be in host or guest cycles

Redefine the API to take a parameter indicating whether an
adjustment is in host or guest cycles.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 13 ++++++++++++-
 arch/x86/kvm/svm.c              |  6 +++++-
 arch/x86/kvm/vmx.c              |  2 +-
 arch/x86/kvm/x86.c              |  2 +-
 4 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b23682900f41..dd439f13df84 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -646,7 +646,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
-	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
@@ -676,6 +676,17 @@ struct kvm_arch_async_pf {
 
 extern struct kvm_x86_ops *kvm_x86_ops;
 
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+					   s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
+}
+
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e12026e5244e..0b7690ee20bd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1016,10 +1016,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	WARN_ON(adjustment < 0);
+	if (host)
+		adjustment = svm_scale_tsc(vcpu, adjustment);
+
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e6bf61fa1c03..575fb742a6fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	}
 }
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39a57dac884a..3b931302fa55 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1116,7 +1116,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (vcpu->tsc_catchup) {
 		u64 tsc = compute_guest_tsc(v, kernel_ns);
 		if (tsc > tsc_timestamp) {
-			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
 			tsc_timestamp = tsc;
 		}
 	}
-- 
cgit v1.2.3


From 0dd6a6edb0124e6c71931ff575b18e15ed6e8603 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:56 -0200
Subject: KVM: Dont mark TSC unstable due to S4 suspend

During a host suspend, TSC may go backwards, which KVM interprets
as an unstable TSC.  Technically, KVM should not be marking the
TSC unstable, which causes the TSC clocksource to go bad, but we
need to be adjusting the TSC offsets in such a case.

Dealing with this issue is a little tricky as the only place we
can reliably do it is before much of the timekeeping infrastructure
is up and running.  On top of this, we are not in a KVM thread
context, so we may not be able to safely access VCPU fields.
Instead, we compute our best known hardware offset at power-up and
stash it to be applied to all VCPUs when they actually start running.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 93 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd439f13df84..4fbeb84b1818 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -423,6 +423,7 @@ struct kvm_vcpu_arch {
 	u64 last_tsc_nsec;
 	u64 last_tsc_write;
 	u64 last_host_tsc;
+	u64 tsc_offset_adjustment;
 	bool tsc_catchup;
 	bool tsc_always_catchup;
 	s8 virtual_tsc_shift;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3b931302fa55..4e9bd23d522d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+	/* Apply any externally detected TSC adjustments (due to suspend) */
+	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		vcpu->arch.tsc_offset_adjustment = 0;
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+	}
+
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
 				native_read_tsc() - vcpu->arch.last_host_tsc;
@@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage)
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
+	int ret;
+	u64 local_tsc;
+	u64 max_tsc = 0;
+	bool stable, backwards_tsc = false;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			if (vcpu->cpu == smp_processor_id())
-				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-	return kvm_x86_ops->hardware_enable(garbage);
+	ret = kvm_x86_ops->hardware_enable(garbage);
+	if (ret != 0)
+		return ret;
+
+	local_tsc = native_read_tsc();
+	stable = !check_tsc_unstable();
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!stable && vcpu->cpu == smp_processor_id())
+				set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+				backwards_tsc = true;
+				if (vcpu->arch.last_host_tsc > max_tsc)
+					max_tsc = vcpu->arch.last_host_tsc;
+			}
+		}
+	}
+
+	/*
+	 * Sometimes, even reliable TSCs go backwards.  This happens on
+	 * platforms that reset TSC during suspend or hibernate actions, but
+	 * maintain synchronization.  We must compensate.  Fortunately, we can
+	 * detect that condition here, which happens early in CPU bringup,
+	 * before any KVM threads can be running.  Unfortunately, we can't
+	 * bring the TSCs fully up to date with real time, as we aren't yet far
+	 * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * variables that haven't been updated yet.
+	 *
+	 * So we simply find the maximum observed TSC above, then record the
+	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+	 * the adjustment will be applied.  Note that we accumulate
+	 * adjustments, in case multiple suspend cycles happen before some VCPU
+	 * gets a chance to run again.  In the event that no KVM threads get a
+	 * chance to run, we will miss the entire elapsed period, as we'll have
+	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * loose cycle time.  This isn't too big a deal, since the loss will be
+	 * uniform across all VCPUs (not to mention the scenario is extremely
+	 * unlikely). It is possible that a second hibernate recovery happens
+	 * much faster than a first, causing the observed TSC here to be
+	 * smaller; this would require additional padding adjustment, which is
+	 * why we set last_host_tsc to the local tsc observed here.
+	 *
+	 * N.B. - this code below runs only on platforms with reliable TSC,
+	 * as that is the only way backwards_tsc is set above.  Also note
+	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+	 * have the same delta_cyc adjustment applied if backwards_tsc
+	 * is detected.  Note further, this adjustment is only done once,
+	 * as we reset last_host_tsc on all VCPUs to stop this from being
+	 * called multiple times (one for each physical CPU bringup).
+	 *
+	 * Platforms with unnreliable TSCs don't have to deal with this, they
+	 * will be compensated by the logic in vcpu_load, which sets the TSC to
+	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
+	 * guarantee that they stay in perfect synchronization.
+	 */
+	if (backwards_tsc) {
+		u64 delta_cyc = max_tsc - local_tsc;
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				vcpu->arch.tsc_offset_adjustment += delta_cyc;
+				vcpu->arch.last_host_tsc = local_tsc;
+			}
+
+			/*
+			 * We have to disable TSC offset matching.. if you were
+			 * booting a VM while issuing an S4 host suspend....
+			 * you may have some problem.  Solving this issue is
+			 * left as an exercise to the reader.
+			 */
+			kvm->arch.last_tsc_nsec = 0;
+			kvm->arch.last_tsc_write = 0;
+		}
+
+	}
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
-- 
cgit v1.2.3


From e26101b116a6235bcd80b3a4c38c9fe91286cd79 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@gmail.com>
Date: Fri, 3 Feb 2012 15:43:57 -0200
Subject: KVM: Track TSC synchronization in generations

This allows us to track the original nanosecond and counter values
at each phase of TSC writing by the guest.  This gets us perfect
offset matching for stable TSC systems, and perfect software
computed TSC matching for machines with unstable TSC.

Signed-off-by: Zachary Amsden <zamsden@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 10 +++++++---
 arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4fbeb84b1818..c24125cd0c63 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -420,10 +420,11 @@ struct kvm_vcpu_arch {
 
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
-	u64 last_tsc_nsec;
-	u64 last_tsc_write;
 	u64 last_host_tsc;
 	u64 tsc_offset_adjustment;
+	u64 this_tsc_nsec;
+	u64 this_tsc_write;
+	u8  this_tsc_generation;
 	bool tsc_catchup;
 	bool tsc_always_catchup;
 	s8 virtual_tsc_shift;
@@ -513,9 +514,12 @@ struct kvm_arch {
 	s64 kvmclock_offset;
 	raw_spinlock_t tsc_write_lock;
 	u64 last_tsc_nsec;
-	u64 last_tsc_offset;
 	u64 last_tsc_write;
 	u32 last_tsc_khz;
+	u64 cur_tsc_nsec;
+	u64 cur_tsc_write;
+	u64 cur_tsc_offset;
+	u8  cur_tsc_generation;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4e9bd23d522d..e86f9b22eaca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1013,10 +1013,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
-	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
 				      vcpu->arch.virtual_tsc_mult,
 				      vcpu->arch.virtual_tsc_shift);
-	tsc += vcpu->arch.last_tsc_write;
+	tsc += vcpu->arch.this_tsc_write;
 	return tsc;
 }
 
@@ -1059,7 +1059,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	if (nsdiff < NSEC_PER_SEC &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
-			offset = kvm->arch.last_tsc_offset;
+			offset = kvm->arch.cur_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
@@ -1067,20 +1067,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
+	} else {
+		/*
+		 * We split periods of matched TSC writes into generations.
+		 * For each generation, we track the original measured
+		 * nanosecond time, offset, and write, so if TSCs are in
+		 * sync, we can match exact offset, and if not, we can match
+		 * exact software computaion in compute_guest_tsc()
+		 *
+		 * These values are tracked in kvm->arch.cur_xxx variables.
+		 */
+		kvm->arch.cur_tsc_generation++;
+		kvm->arch.cur_tsc_nsec = ns;
+		kvm->arch.cur_tsc_write = data;
+		kvm->arch.cur_tsc_offset = offset;
+		pr_debug("kvm: new tsc generation %u, clock %llu\n",
+			 kvm->arch.cur_tsc_generation, data);
 	}
+
+	/*
+	 * We also track th most recent recorded KHZ, write and time to
+	 * allow the matching interval to be extended at each write.
+	 */
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
-	kvm->arch.last_tsc_offset = offset;
 	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
 	/* Reset of TSC must disable overshoot protection below */
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
-	vcpu->arch.last_tsc_write = data;
-	vcpu->arch.last_tsc_nsec = ns;
 	vcpu->arch.last_guest_tsc = data;
+
+	/* Keep track of which generation this VCPU has synchronized to */
+	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
+
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
-- 
cgit v1.2.3


From 10166744b80a41c30d82bc6e11140f5b28d257ab Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Tue, 7 Feb 2012 23:19:20 +0530
Subject: KVM: VMX: remove yield_on_hlt

yield_on_hlt was introduced for CPU bandwidth capping. Now it is
redundant with CFS hardlimit.

yield_on_hlt also complicates the scenario in paravirtual environment,
that needs to trap halt. for e.g. paravirtualized ticket spinlocks.

Acked-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 575fb742a6fc..d2bd719925a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-	/* Ensure that we clear the HLT state in the VMCS.  We don't need to
-	 * explicitly skip the instruction because if the HLT state is set, then
-	 * the instruction is already executing and RIP has already been
-	 * advanced. */
-	if (!yield_on_hlt &&
-	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-	vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -2405,7 +2390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
-	min =
+	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -2420,9 +2405,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 
-	if (yield_on_hlt)
-		min |= CPU_BASED_HLT_EXITING;
-
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4009,7 +3991,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-	vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4041,7 +4022,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-	vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 6dbf79e7164e9a86c1e466062c48498142ae6128 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Sun, 5 Feb 2012 20:42:41 +0900
Subject: KVM: Fix write protection race during dirty logging

This patch fixes a race introduced by:

  commit 95d4c16ce78cb6b7549a09159c409d52ddd18dae
  KVM: Optimize dirty logging by rmap_write_protect()

During protecting pages for dirty logging, other threads may also try
to protect a page in mmu_sync_children() or kvm_mmu_get_page().

In such a case, because get_dirty_log releases mmu_lock before flushing
TLB's, the following race condition can happen:

  A (get_dirty_log)     B (another thread)

  lock(mmu_lock)
  clear pte.w
  unlock(mmu_lock)
                        lock(mmu_lock)
                        pte.w is already cleared
                        unlock(mmu_lock)
                        skip TLB flush
                        return
  ...
  TLB flush

Though thread B assumes the page has already been protected when it
returns, the remaining TLB entry will break that assumption.

This patch fixes this problem by making get_dirty_log hold the mmu_lock
until it flushes the TLB's.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e86f9b22eaca..3df0b7a140b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3065,6 +3065,8 @@ static void write_protect_slot(struct kvm *kvm,
 			       unsigned long *dirty_bitmap,
 			       unsigned long nr_dirty_pages)
 {
+	spin_lock(&kvm->mmu_lock);
+
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
 		unsigned long gfn_offset;
@@ -3072,16 +3074,13 @@ static void write_protect_slot(struct kvm *kvm,
 		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
 			unsigned long gfn = memslot->base_gfn + gfn_offset;
 
-			spin_lock(&kvm->mmu_lock);
 			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-			spin_unlock(&kvm->mmu_lock);
 		}
 		kvm_flush_remote_tlbs(kvm);
-	} else {
-		spin_lock(&kvm->mmu_lock);
+	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-		spin_unlock(&kvm->mmu_lock);
-	}
+
+	spin_unlock(&kvm->mmu_lock);
 }
 
 /*
-- 
cgit v1.2.3


From fb03cb6f44236f4bef62a0dda8e025ff5ca51417 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 8 Feb 2012 12:59:10 +0900
Subject: KVM: Introduce gfn_to_index() which returns the index for a given
 level

This patch cleans up the code and removes the "(void)level;" warning
suppressor.

Note that we can also use this for PT_PAGE_TABLE_LEVEL to treat every
level uniformly later.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 3 +--
 include/linux/kvm_host.h | 7 +++++++
 virt/kvm/kvm_main.c      | 7 +------
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ae76cc3392e1..37e7f100a0e0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,8 +688,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 	unsigned long idx;
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
 	return &slot->lpage_info[level - 2][idx];
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9698080c902b..7a08496b974a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -681,6 +681,13 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return gfn_to_memslot(kvm, gfn)->id;
 }
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+	/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 					       gfn_t gfn)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 470e30520fe8..415fe816fc15 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -784,15 +784,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		int lpages;
 		int level = i + 2;
 
-		/* Avoid unused variable warning if no large pages */
-		(void)level;
-
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + ((base_gfn + npages - 1)
-			     >> KVM_HPAGE_GFN_SHIFT(level));
-		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+		lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;
 
 		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
 
-- 
cgit v1.2.3


From db3fe4eb45f3555d91a7124e18cf3a2f2a30eb90 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 8 Feb 2012 13:02:18 +0900
Subject: KVM: Introduce kvm_memory_slot::arch and move lpage_info into it

Some members of kvm_memory_slot are not used by every architecture.

This patch is the first step to make this difference clear by
introducing kvm_memory_slot::arch;  lpage_info is moved into it.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h    |  3 ++
 arch/ia64/kvm/kvm-ia64.c            | 10 ++++++
 arch/powerpc/include/asm/kvm_host.h |  3 ++
 arch/powerpc/kvm/powerpc.c          | 10 ++++++
 arch/s390/include/asm/kvm_host.h    |  3 ++
 arch/s390/kvm/kvm-s390.c            | 10 ++++++
 arch/x86/include/asm/kvm_host.h     |  9 +++++
 arch/x86/kvm/mmu.c                  |  2 +-
 arch/x86/kvm/x86.c                  | 59 +++++++++++++++++++++++++++++++
 include/linux/kvm_host.h            | 11 +++---
 virt/kvm/kvm_main.c                 | 70 +++++--------------------------------
 11 files changed, 122 insertions(+), 68 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2689ee54a1c9..e35b3a84a40b 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -459,6 +459,9 @@ struct kvm_sal_data {
 	unsigned long boot_gp;
 };
 
+struct kvm_arch_memory_slot {
+};
+
 struct kvm_arch {
 	spinlock_t dirty_log_lock;
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8ca7261e7b3d..d8ddbba6fe7d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1571,6 +1571,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		struct kvm_memory_slot *memslot,
 		struct kvm_memory_slot old,
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1843d5d2a3be..52eb9c1f4fe0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -213,6 +213,9 @@ struct revmap_entry {
 #define KVMPPC_PAGE_WRITETHRU	HPTE_R_W	/* 0x40 */
 #define KVMPPC_GOT_PAGE		0x80
 
+struct kvm_arch_memory_slot {
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 0e21d155eea7..00d7e345b3fe 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -281,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -EINVAL;
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_memory_slot old,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index e6304268ea28..7343872890a2 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -245,6 +245,9 @@ struct kvm_vm_stat {
 	u32 remote_tlb_flush;
 };
 
+struct kvm_arch_memory_slot {
+};
+
 struct kvm_arch{
 	struct sca_block *sca;
 	debug_info_t *dbf;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index cf3c0a91d046..17ad69d596fd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -814,6 +814,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c24125cd0c63..74c9edf2bb18 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -483,6 +483,15 @@ struct kvm_vcpu_arch {
 	} osvw;
 };
 
+struct kvm_lpage_info {
+	unsigned long rmap_pde;
+	int write_count;
+};
+
+struct kvm_arch_memory_slot {
+	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+};
+
 struct kvm_arch {
 	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 37e7f100a0e0..ff053ca32303 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -689,7 +689,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 	unsigned long idx;
 
 	idx = gfn_to_index(gfn, slot->base_gfn, level);
-	return &slot->lpage_info[level - 2][idx];
+	return &slot->arch.lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3df0b7a140b0..ca74c1dadf3a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6239,6 +6239,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		put_page(kvm->arch.ept_identity_pagetable);
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+			vfree(free->arch.lpage_info[i]);
+			free->arch.lpage_info[i] = NULL;
+		}
+	}
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		unsigned long ugfn;
+		int lpages;
+		int level = i + 2;
+
+		lpages = gfn_to_index(slot->base_gfn + npages - 1,
+				      slot->base_gfn, level) + 1;
+
+		slot->arch.lpage_info[i] =
+			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+		if (!slot->arch.lpage_info[i])
+			goto out_free;
+
+		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][0].write_count = 1;
+		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+		ugfn = slot->userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
+		 */
+		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+		    !kvm_largepages_enabled()) {
+			unsigned long j;
+
+			for (j = 0; j < lpages; ++j)
+				slot->arch.lpage_info[i][j].write_count = 1;
+		}
+	}
+
+	return 0;
+
+out_free:
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		vfree(slot->arch.lpage_info[i]);
+		slot->arch.lpage_info[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7a08496b974a..355e44555c39 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -171,11 +171,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
-struct kvm_lpage_info {
-	unsigned long rmap_pde;
-	int write_count;
-};
-
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	unsigned long npages;
@@ -184,7 +179,7 @@ struct kvm_memory_slot {
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_head;
 	unsigned long nr_dirty_pages;
-	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
 	int user_alloc;
 	int id;
@@ -376,6 +371,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
 			    int user_alloc);
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont);
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
@@ -385,6 +383,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
 				int user_alloc);
+bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a30447c5eb4a..8340e0e62034 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -535,21 +535,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 				  struct kvm_memory_slot *dont)
 {
-	int i;
-
 	if (!dont || free->rmap != dont->rmap)
 		vfree(free->rmap);
 
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		kvm_destroy_dirty_bitmap(free);
 
-
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
-			vfree(free->lpage_info[i]);
-			free->lpage_info[i] = NULL;
-		}
-	}
+	kvm_arch_free_memslot(free, dont);
 
 	free->npages = 0;
 	free->rmap = NULL;
@@ -685,53 +677,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 	slots->generation++;
 }
 
-#ifndef CONFIG_S390
-static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages)
-{
-	int i;
-
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		unsigned long ugfn;
-		int lpages;
-		int level = i + 2;
-
-		lpages = gfn_to_index(slot->base_gfn + npages - 1,
-				      slot->base_gfn, level) + 1;
-
-		slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i]));
-		if (!slot->lpage_info[i])
-			goto out_free;
-
-		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->lpage_info[i][0].write_count = 1;
-		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->lpage_info[i][lpages - 1].write_count = 1;
-		ugfn = slot->userspace_addr >> PAGE_SHIFT;
-		/*
-		 * If the gfn and userspace address are not aligned wrt each
-		 * other, or if explicitly asked to, disable large page
-		 * support for this slot
-		 */
-		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-		    !largepages_enabled) {
-			unsigned long j;
-
-			for (j = 0; j < lpages; ++j)
-				slot->lpage_info[i][j].write_count = 1;
-		}
-	}
-
-	return 0;
-
-out_free:
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		vfree(slot->lpage_info[i]);
-		slot->lpage_info[i] = NULL;
-	}
-	return -ENOMEM;
-}
-#endif /* not defined CONFIG_S390 */
-
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -819,10 +764,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		new.rmap = vzalloc(npages * sizeof(*new.rmap));
 		if (!new.rmap)
 			goto out_free;
-
-		if (create_lpage_info(&new, npages))
-			goto out_free;
 #endif /* not defined CONFIG_S390 */
+		if (kvm_arch_create_memslot(&new, npages))
+			goto out_free;
 	}
 
 	/* Allocate page dirty bitmap if needed */
@@ -880,8 +824,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (!npages) {
 		new.rmap = NULL;
 		new.dirty_bitmap = NULL;
-		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
-			new.lpage_info[i] = NULL;
+		memset(&new.arch, 0, sizeof(new.arch));
 	}
 
 	update_memslots(slots, &new);
@@ -968,6 +911,11 @@ out:
 	return r;
 }
 
+bool kvm_largepages_enabled(void)
+{
+	return largepages_enabled;
+}
+
 void kvm_disable_largepages(void)
 {
 	largepages_enabled = false;
-- 
cgit v1.2.3


From 270c6c79f4e15e599f47174ecedad932463af7a2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 16 Feb 2012 14:44:11 +0200
Subject: KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction
 emulation

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f393..3e48c1d3edcd 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -413,7 +413,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
 	struct kvm_pmc *counters;
 	u64 ctr;
 
-	pmc &= (3u << 30) - 1;
+	pmc &= ~(3u << 30);
 	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 		return 1;
 	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
-- 
cgit v1.2.3


From 7f3d35fddd173e52886d03bc34b5b5d6f5bea343 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 8 Feb 2012 14:34:38 +0100
Subject: KVM: x86 emulator: Fix task switch privilege checks

Currently, all task switches check privileges against the DPL of the
TSS. This is only correct for jmp/call to a TSS. If a task gate is used,
the DPL of this take gate is used for the check instead. Exceptions,
external interrupts and iret shouldn't perform any check.

[avi: kill kvm-kmod remnants]

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  2 +-
 arch/x86/include/asm/kvm_host.h    |  4 +--
 arch/x86/kvm/emulate.c             | 53 +++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/svm.c                 |  5 +++-
 arch/x86/kvm/vmx.c                 |  8 +++---
 arch/x86/kvm/x86.c                 |  6 ++---
 6 files changed, 61 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7b9cfc4878af..df437b68f42b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -388,7 +388,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_INTERCEPTED 2
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code);
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74c9edf2bb18..e216ba066e79 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -768,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code);
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 71450aca3b86..fa310a48591c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1152,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 1;
 }
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+				     u16 index, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	ulong addr;
+
+	ctxt->ops->get_idt(ctxt, &dt);
+
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, index << 3 | 0x2);
+
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, struct desc_ptr *dt)
 {
@@ -2421,7 +2437,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   u16 tss_selector, int reason,
+				   u16 tss_selector, int idt_index, int reason,
 				   bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2443,12 +2459,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	/* FIXME: check that next_tss_desc is tss */
 
-	if (reason != TASK_SWITCH_IRET) {
-		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt) > next_tss_desc.dpl)
-			return emulate_gp(ctxt, 0);
+	/*
+	 * Check privileges. The three cases are task switch caused by...
+	 *
+	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
+	 * 2. Exception/IRQ/iret: No check is performed
+	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 */
+	if (reason == TASK_SWITCH_GATE) {
+		if (idt_index != -1) {
+			/* Software interrupts */
+			struct desc_struct task_gate_desc;
+			int dpl;
+
+			ret = read_interrupt_descriptor(ctxt, idt_index,
+							&task_gate_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+
+			dpl = task_gate_desc.dpl;
+			if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+				return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+		}
+	} else if (reason != TASK_SWITCH_IRET) {
+		int dpl = next_tss_desc.dpl;
+		if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+			return emulate_gp(ctxt, tss_selector);
 	}
 
+
 	desc_limit = desc_limit_scaled(&next_tss_desc);
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2501,7 +2540,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code)
 {
 	int rc;
@@ -2509,7 +2548,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	ctxt->_eip = ctxt->eip;
 	ctxt->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0b7690ee20bd..95cdeaf9c718 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2799,7 +2799,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+		int_vec = -1;
+
+	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
 				has_error_code, error_code) == EMULATE_FAIL) {
 		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d2bd719925a6..124a0952a040 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4658,9 +4658,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 	bool has_error_code = false;
 	u32 error_code = 0;
 	u16 tss_selector;
-	int reason, type, idt_v;
+	int reason, type, idt_v, idt_index;
 
 	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4698,8 +4699,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (kvm_task_switch(vcpu, tss_selector, reason,
-				has_error_code, error_code) == EMULATE_FAIL) {
+	if (kvm_task_switch(vcpu, tss_selector,
+			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+			    has_error_code, error_code) == EMULATE_FAIL) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca74c1dadf3a..490a1b1a255f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5655,15 +5655,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(ctxt, tss_selector, reason,
+	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
 
 	if (ret)
-- 
cgit v1.2.3


From 66b0ab8fac1031ffc70eb77491048339f2717a54 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 8 Feb 2012 14:34:39 +0100
Subject: KVM: x86 emulator: VM86 segments must have DPL 3

Setting the segment DPL to 0 for at least the VM86 code segment makes
the VM entry fail on VMX.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fa310a48591c..b19e9fffe582 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1244,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		seg_desc.type = 3;
 		seg_desc.p = 1;
 		seg_desc.s = 1;
+		if (ctxt->mode == X86EMUL_MODE_VM86)
+			seg_desc.dpl = 3;
 		goto load;
 	}
 
-- 
cgit v1.2.3


From ea5e97e8bf1d56a4d9461c39e082b9c31a7be4ff Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 8 Feb 2012 14:34:40 +0100
Subject: KVM: SVM: Fix CPL updates

Keep CPL at 0 in real mode and at 3 in VM86. In protected/long mode, use
RPL rather than DPL of the code segment.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 95cdeaf9c718..ab39d84dee00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1332,6 +1332,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int cpl;
+
+	if (!is_protmode(vcpu))
+		cpl = 0;
+	else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+		cpl = 3;
+	else
+		cpl = svm->vmcb->save.cs.selector & 0x3;
+
+	svm->vmcb->save.cpl = cpl;
+}
+
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1607,9 +1622,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 	}
 	if (seg == VCPU_SREG_CS)
-		svm->vmcb->save.cpl
-			= (svm->vmcb->save.cs.attrib
-			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
+		svm_update_cpl(vcpu);
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
-- 
cgit v1.2.3


From 4cee4798a304ee1ea579423ca048f16ceaccdfb5 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 8 Feb 2012 14:34:41 +0100
Subject: KVM: x86 emulator: Allow PM/VM86 switch during task switch

Task switches can switch between Protected Mode and VM86. The current
mode must be updated during the task switch emulation so that the new
segment selectors are interpreted correctly.

In order to let privilege checks succeed, rflags needs to be updated in
the vcpu struct as this causes a CPL update.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 20 ++++++++++++++++++++
 arch/x86/kvm/svm.c                 |  4 ++++
 arch/x86/kvm/x86.c                 |  6 ++++++
 4 files changed, 31 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index df437b68f42b..c222e1a1b12a 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -176,6 +176,7 @@ struct x86_emulate_ops {
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
 	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+	void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b19e9fffe582..83756223f8aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2344,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
+
+	/* General purpose registers */
 	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2365,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
 	set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
+	/*
+	 * If we're switching between Protected Mode and VM86, we need to make
+	 * sure to update the mode before loading the segment descriptors so
+	 * that the selectors are interpreted correctly.
+	 *
+	 * Need to get rflags to the vcpu struct immediately because it
+	 * influences the CPL which is checked at least when loading the segment
+	 * descriptors and when pushing an error code to the new kernel stack.
+	 *
+	 * TODO Introduce a separate ctxt->ops->set_cpl callback
+	 */
+	if (ctxt->eflags & X86_EFLAGS_VM)
+		ctxt->mode = X86EMUL_MODE_VM86;
+	else
+		ctxt->mode = X86EMUL_MODE_PROT32;
+
+	ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
 	/*
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab39d84dee00..53efd597f39e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1354,7 +1354,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
+	if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+		svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 490a1b1a255f..03a1fd47a6d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4129,6 +4129,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 	return res;
 }
 
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+	kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4310,6 +4315,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_idt	     = emulator_set_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
+	.set_rflags          = emulator_set_rflags,
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
-- 
cgit v1.2.3


From 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 5 Mar 2012 14:23:29 +0200
Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings

If some vcpus are created before KVM_CREATE_IRQCHIP, then
irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading
to potential NULL pointer dereferences.

Fix by:
- ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called
- ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP

This is somewhat long winded because vcpu->arch.apic is created without
kvm->lock held.

Based on earlier patch by Michael Ellerman.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 5 +++++
 arch/x86/kvm/x86.c       | 8 ++++++++
 include/linux/kvm_host.h | 7 +++++++
 virt/kvm/kvm_main.c      | 4 ++++
 4 files changed, 24 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d8ddbba6fe7d..f5104b7c52cd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1172,6 +1172,11 @@ out:
 
 #define PALE_RESET_ENTRY    0x80000000ffffffb0UL
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu *v;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03a1fd47a6d3..9477dc6cccae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3199,6 +3199,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EEXIST;
 		if (kvm->arch.vpic)
 			goto create_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto create_irqchip_unlock;
 		r = -ENOMEM;
 		vpic = kvm_create_pic(kvm);
 		if (vpic) {
@@ -6107,6 +6110,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 355e44555c39..e42d85ae8541 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -805,6 +805,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
+
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
+
+#else
+
+static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
+
 #endif
 
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e4431ada5947..94e148e38719 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1651,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto vcpu_destroy;
 
 	mutex_lock(&kvm->lock);
+	if (!kvm_vcpu_compatible(vcpu)) {
+		r = -EINVAL;
+		goto unlock_vcpu_destroy;
+	}
 	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
 		r = -EINVAL;
 		goto unlock_vcpu_destroy;
-- 
cgit v1.2.3


From 07700a94b00a4fcbbfb07d1b72dc112a0e036735 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 28 Feb 2012 14:19:54 +0100
Subject: KVM: Allow host IRQ sharing for assigned PCI 2.3 devices

PCI 2.3 allows to generically disable IRQ sources at device level. This
enables us to share legacy IRQs of such devices with other host devices
when passing them to a guest.

The new IRQ sharing feature introduced here is optional, user space has
to request it explicitly. Moreover, user space can inform us about its
view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
interrupt and signaling it if the guest masked it via the virtualized
PCI config space.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  41 ++++++++
 arch/x86/kvm/x86.c                |   1 +
 include/linux/kvm.h               |   6 ++
 include/linux/kvm_host.h          |   2 +
 virt/kvm/assigned-dev.c           | 209 ++++++++++++++++++++++++++++++++------
 5 files changed, 230 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 59a38264a0ed..6386f8c0482e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1169,6 +1169,14 @@ following flags are specified:
 
 /* Depends on KVM_CAP_IOMMU */
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
+assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
 
 The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
 isolation of the device.  Usages not specifying this flag are deprecated.
@@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it
 should skip processing the bitmap and just invalidate everything.  It must
 be set to the number of set bits in the bitmap.
 
+4.60 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Allows userspace to mask PCI INTx interrupts from the assigned device.  The
+kernel will not deliver INTx interrupts to the guest between setting and
+clearing of KVM_ASSIGN_SET_INTX_MASK via this interface.  This enables use of
+and emulation of PCI 2.3 INTx disable command register behavior.
+
+This may be used for both PCI 2.3 devices supporting INTx disable natively and
+older devices lacking this support. Userspace is responsible for emulating the
+read value of the INTx disable bit in the guest visible PCI command register.
+When modifying the INTx disable state, userspace should precede updating the
+physical device command register by calling this ioctl to inform the kernel of
+the new intended INTx mask state.
+
+Note that the kernel uses the device INTx disable bit to internally manage the
+device interrupt state for PCI 2.3 devices.  Reads of this register may
+therefore not match the expected value.  Writes should always use the guest
+intended INTx disable value rather than attempting to read-copy-update the
+current physical device state.  Races between user and kernel updates to the
+INTx disable bit are handled lazily in the kernel.  It's possible the device
+may generate unintended interrupts, but they will not be injected into the
+guest.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
+by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
 4.62 KVM_CREATE_SPAPR_TCE
 
 Capability: KVM_CAP_SPAPR_TCE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9477dc6cccae..6866083a48c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_PCI_2_3:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index acbe42939089..6c322a90b92f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_TSC_DEADLINE_TIMER 72
 #define KVM_CAP_S390_UCONTROL 73
 #define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_TSC_CONTROL */
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+				       struct kvm_assigned_pci_dev)
 
 /*
  * ioctls for vcpu fds
@@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
 
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e42d85ae8541..ec171c1d0878 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
 	unsigned int entries_nr;
 	int host_irq;
 	bool host_irq_disabled;
+	bool pci_2_3;
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	struct msix_entry *guest_msix_entries;
@@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct kvm *kvm;
 	spinlock_t intx_lock;
+	struct mutex intx_mask_lock;
 	char irq_name[32];
 	struct pci_saved_state *pci_saved_state;
 };
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ece80612b594..08e05715df72 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 	return index;
 }
 
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int ret;
 
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
-		spin_lock(&assigned_dev->intx_lock);
+	spin_lock(&assigned_dev->intx_lock);
+	if (pci_check_and_mask_intx(assigned_dev->dev)) {
+		assigned_dev->host_irq_disabled = true;
+		ret = IRQ_WAKE_THREAD;
+	} else
+		ret = IRQ_NONE;
+	spin_unlock(&assigned_dev->intx_lock);
+
+	return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+				 int vector)
+{
+	if (unlikely(assigned_dev->irq_requested_type &
+		     KVM_DEV_IRQ_GUEST_INTX)) {
+		mutex_lock(&assigned_dev->intx_mask_lock);
+		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id, vector, 1);
+		mutex_unlock(&assigned_dev->intx_mask_lock);
+	} else
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+		spin_lock_irq(&assigned_dev->intx_lock);
 		disable_irq_nosync(irq);
 		assigned_dev->host_irq_disabled = true;
-		spin_unlock(&assigned_dev->intx_lock);
+		spin_unlock_irq(&assigned_dev->intx_lock);
 	}
 
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
+	kvm_assigned_dev_raise_guest_irq(assigned_dev,
+					 assigned_dev->guest_irq);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+	kvm_assigned_dev_raise_guest_irq(assigned_dev,
+					 assigned_dev->guest_irq);
 
 	return IRQ_HANDLED;
 }
+#endif
 
 #ifdef __KVM_HAVE_MSIX
 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
@@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 
 	if (index >= 0) {
 		vector = assigned_dev->guest_msix_entries[index].vector;
-		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    vector, 1);
+		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
 	}
 
 	return IRQ_HANDLED;
@@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 
-	/* The guest irq may be shared so this ack may be
-	 * from another device.
-	 */
-	spin_lock(&dev->intx_lock);
-	if (dev->host_irq_disabled) {
-		enable_irq(dev->host_irq);
-		dev->host_irq_disabled = false;
+	mutex_lock(&dev->intx_mask_lock);
+
+	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+		bool reassert = false;
+
+		spin_lock_irq(&dev->intx_lock);
+		/*
+		 * The guest IRQ may be shared so this ack can come from an
+		 * IRQ for another guest device.
+		 */
+		if (dev->host_irq_disabled) {
+			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+				enable_irq(dev->host_irq);
+			else if (!pci_check_and_unmask_intx(dev->dev))
+				reassert = true;
+			dev->host_irq_disabled = reassert;
+		}
+		spin_unlock_irq(&dev->intx_lock);
+
+		if (reassert)
+			kvm_set_irq(dev->kvm, dev->irq_source_id,
+				    dev->guest_irq, 1);
 	}
-	spin_unlock(&dev->intx_lock);
+
+	mutex_unlock(&dev->intx_mask_lock);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm,
 		pci_disable_msix(assigned_dev->dev);
 	} else {
 		/* Deal with MSI and INTx */
-		disable_irq(assigned_dev->host_irq);
+		if ((assigned_dev->irq_requested_type &
+		     KVM_DEV_IRQ_HOST_INTX) &&
+		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+			spin_lock_irq(&assigned_dev->intx_lock);
+			pci_intx(assigned_dev->dev, false);
+			spin_unlock_irq(&assigned_dev->intx_lock);
+			synchronize_irq(assigned_dev->host_irq);
+		} else
+			disable_irq(assigned_dev->host_irq);
 
 		free_irq(assigned_dev->host_irq, assigned_dev);
 
@@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 static int assigned_device_enable_host_intx(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 {
+	irq_handler_t irq_handler;
+	unsigned long flags;
+
 	dev->host_irq = dev->dev->irq;
-	/* Even though this is PCI, we don't want to use shared
-	 * interrupts. Sharing host devices with guest-assigned devices
-	 * on the same interrupt line is not a happy situation: there
-	 * are going to be long delays in accepting, acking, etc.
+
+	/*
+	 * We can only share the IRQ line with other host devices if we are
+	 * able to disable the IRQ source at device-level - independently of
+	 * the guest driver. Otherwise host devices may suffer from unbounded
+	 * IRQ latencies when the guest keeps the line asserted.
 	 */
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 IRQF_ONESHOT, dev->irq_name, dev))
+	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		irq_handler = kvm_assigned_dev_intx;
+		flags = IRQF_SHARED;
+	} else {
+		irq_handler = NULL;
+		flags = IRQF_ONESHOT;
+	}
+	if (request_threaded_irq(dev->host_irq, irq_handler,
+				 kvm_assigned_dev_thread_intx, flags,
+				 dev->irq_name, dev))
 		return -EIO;
+
+	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		spin_lock_irq(&dev->intx_lock);
+		pci_intx(dev->dev, true);
+		spin_unlock_irq(&dev->intx_lock);
+	}
 	return 0;
 }
 
@@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 	}
 
 	dev->host_irq = dev->dev->irq;
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, dev->irq_name, dev)) {
+	if (request_threaded_irq(dev->host_irq, NULL,
+				 kvm_assigned_dev_thread_msi, 0,
+				 dev->irq_name, dev)) {
 		pci_disable_msi(dev->dev);
 		return -EIO;
 	}
@@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
 	default:
 		r = -EINVAL;
 	}
+	dev->host_irq_disabled = false;
 
 	if (!r)
 		dev->irq_requested_type |= host_irq_type;
@@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 {
 	int r = -ENODEV;
 	struct kvm_assigned_dev_kernel *match;
+	unsigned long irq_type;
 
 	mutex_lock(&kvm->lock);
 
@@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 	if (!match)
 		goto out;
 
-	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+					  KVM_DEV_IRQ_GUEST_MASK);
+	r = kvm_deassign_irq(kvm, match, irq_type);
 out:
 	mutex_unlock(&kvm->lock);
 	return r;
@@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	if (!match->pci_saved_state)
 		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
 		       __func__, dev_name(&dev->dev));
+
+	if (!pci_intx_mask_supported(dev))
+		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_segnr = assigned_dev->segnr;
 	match->host_busnr = assigned_dev->busnr;
@@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
 	spin_lock_init(&match->intx_lock);
+	mutex_init(&match->intx_mask_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -759,6 +853,55 @@ msix_entry_out:
 }
 #endif
 
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (!match) {
+		r = -ENODEV;
+		goto out;
+	}
+
+	mutex_lock(&match->intx_mask_lock);
+
+	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+			kvm_set_irq(match->kvm, match->irq_source_id,
+				    match->guest_irq, 0);
+			/*
+			 * Masking at hardware-level is performed on demand,
+			 * i.e. when an IRQ actually arrives at the host.
+			 */
+		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+			/*
+			 * Unmask the IRQ line if required. Unmasking at
+			 * device level will be performed by user space.
+			 */
+			spin_lock_irq(&match->intx_lock);
+			if (match->host_irq_disabled) {
+				enable_irq(match->host_irq);
+				match->host_irq_disabled = false;
+			}
+			spin_unlock_irq(&match->intx_lock);
+		}
+	}
+
+	mutex_unlock(&match->intx_mask_lock);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 				  unsigned long arg)
 {
@@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 		break;
 	}
 #endif
+	case KVM_ASSIGN_SET_INTX_MASK: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 		break;
@@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 out:
 	return r;
 }
-
-- 
cgit v1.2.3


From 9ee73970c03edb68146ceb1ba2a7033c99a5e017 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 6 Mar 2012 14:16:33 +0200
Subject: KVM: VMX: Fix delayed load of shared MSRs

Shared MSRs (MSR_*STAR and related) are stored in both vmx->guest_msrs
and in the CPU registers, but vmx_set_msr() only updated memory. Prior
to 46199f33c2953, this didn't matter, since we called vmx_load_host_state(),
which scheduled a vmx_save_host_state(), which re-synchronized the CPU
state, but now we don't, so the CPU state will not be synchronized until
the next exit to host userspace.  This mostly affects nested vmx workloads,
which play with these MSRs a lot.

Fix by loading the MSR eagerly.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 124a0952a040..4a722a0b8e13 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2210,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			msr->data = data;
+			if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+				kvm_set_shared_msr(msr->index, msr->data,
+						   msr->mask);
 			break;
 		}
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
-- 
cgit v1.2.3


From a7b9d2ccc3d86303ee9314612d301966e04011c7 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 26 Feb 2012 16:55:40 +0200
Subject: KVM: PMU: warn when pin control is set in eventsel msr

Print warning once if pin control bit is set in eventsel msr since
emulation does not support it yet.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/perf_event.h | 1 +
 arch/x86/kvm/pmu.c                | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 096c975e099f..f1f71823f682 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -23,6 +23,7 @@
 #define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
 #define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
 #define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL		(1ULL << 19)
 #define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
 #define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
 #define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3e48c1d3edcd..6af9a542e541 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -210,6 +210,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	unsigned config, type = PERF_TYPE_RAW;
 	u8 event_select, unit_mask;
 
+	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+		printk_once("kvm pmu: pin control bit is ignored\n");
+
 	pmc->eventsel = eventsel;
 
 	stop_counter(pmc);
-- 
cgit v1.2.3


From fac3368310765ade6bbdf07c9acdb04210e8b5b0 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 26 Feb 2012 16:55:41 +0200
Subject: KVM: PMU: Fix raw event check

If eventsel has EDGE, INV or CMASK set we should create raw counter for
it, but the check is done on a wrong variable. Fix it.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 6af9a542e541..b52a8ed283b2 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -223,7 +223,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
-	if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 				ARCH_PERFMON_EVENTSEL_INV |
 				ARCH_PERFMON_EVENTSEL_CMASK))) {
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
-- 
cgit v1.2.3


From 62079d8a431287a4da81db64e002c71f0e06ca83 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 26 Feb 2012 16:55:42 +0200
Subject: KVM: PMU: add proper support for fixed counter 2

Currently pmu emulation emulates fixed counter 2 as bus cycles
architectural counter, but since commit 9c1497ea591b25d perf has
pseudo encoding for it. Use it.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/pmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index b52a8ed283b2..a73f0c104813 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
 	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+	[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
-- 
cgit v1.2.3


From 4d6931c380a976753f7566a96b58690010ef1413 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Mon, 5 Mar 2012 16:53:06 +0100
Subject: KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask

The reset_rsvds_bits_mask() function can use the guest walker's root level
number instead of using a separate 'level' variable.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff053ca32303..4cb164268846 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3185,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context,
-				  int level)
+				  struct kvm_mmu *context)
 {
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
-	switch (level) {
+	switch (context->root_level) {
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
@@ -3251,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					int level)
 {
 	context->nx = is_nx(vcpu);
+	context->root_level = level;
 
-	reset_rsvds_bits_mask(vcpu, context, level);
+	reset_rsvds_bits_mask(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -3262,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
 	context->free = paging_free;
-	context->root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
@@ -3279,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu *context)
 {
 	context->nx = false;
+	context->root_level = PT32_ROOT_LEVEL;
 
-	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+	reset_rsvds_bits_mask(vcpu, context);
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -3289,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
-	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
@@ -3327,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else if (is_pae(vcpu)) {
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else {
 		context->nx = false;
-		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
-		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 
 	return 0;
@@ -3402,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 		g_context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 		g_context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else {
 		g_context->nx = false;
-		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 		g_context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 
-- 
cgit v1.2.3


From a223c313cb13e9ab71051fc5b70610a2829a4082 Mon Sep 17 00:00:00 2001
From: Nicolae Mogoreanu <mogoreanu@gmail.com>
Date: Tue, 21 Feb 2012 13:44:21 -0800
Subject: KVM: Ignore the writes to MSR_K7_HWCR(3)

When CPUID Fn8000_0001_EAX reports 0x00100f22 Windows 7 x64 guest
tries to set bit 3 in MSRC001_0015 in nt!KiDisableCacheErrataSource
and fails. This patch will ignore this step and allow things to move
on without having to fake CPUID value.

Signed-off-by: Nicolae Mogoreanu <mogoreanu@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6866083a48c1..32096cf6c6c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1547,6 +1547,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
+		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
-- 
cgit v1.2.3


From 9587190107d0c0cbaccbf7bf6b0245d29095a9ae Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@math.technion.ac.il>
Date: Tue, 6 Mar 2012 16:39:22 +0200
Subject: KVM: nVMX: Fix erroneous exception bitmap check

The code which checks whether to inject a pagefault to L1 or L2 (in
nested VMX) was wrong, incorrect in how it checked the PF_VECTOR bit.
Thanks to Dan Carpenter for spotting this.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a722a0b8e13..2c22fc788da2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1664,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-	if (!(vmcs12->exception_bitmap & PF_VECTOR))
+	if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
 		return 0;
 
 	nested_vmx_vmexit(vcpu);
-- 
cgit v1.2.3


From bfcfaa77bdf0f775263e906015982a608df01c76 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 6 Mar 2012 11:16:17 -0800
Subject: vfs: use 'unsigned long' accesses for dcache name comparison and
 hashing

Ok, this is hacky, and only works on little-endian machines with goo
unaligned handling.  And even then only with CONFIG_DEBUG_PAGEALLOC
disabled, since it can access up to 7 bytes after the pathname.

But it runs like a bat out of hell.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig |   1 +
 fs/Kconfig       |   4 ++
 fs/dcache.c      |  23 +++++++++++
 fs/namei.c       | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 150 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..09675d3e0ac3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
+	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9e..aa195265362f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
 
 menu "File systems"
 
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+       bool
+
 if BLOCK
 
 source "fs/ext2/Kconfig"
diff --git a/fs/dcache.c b/fs/dcache.c
index bcbdb33fcc20..ffd47a16d870 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 				const unsigned char *ct, size_t tcount)
 {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+	unsigned long a,b,mask;
+
+	if (unlikely(scount != tcount))
+		return 1;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = *(unsigned long *)ct;
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = ~(~0ul << tcount*8);
+	return unlikely(!!((a ^ b) & mask));
+#else
 	if (scount != tcount)
 		return 1;
 
@@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 		tcount--;
 	} while (tcount);
 	return 0;
+#endif
 }
 
 static void __d_free(struct rcu_head *head)
diff --git a/fs/namei.c b/fs/namei.c
index e2ba62820a0f..378497a744b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode *inode)
 	return 1;
 }
 
+/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ *   of low bytes efficiently). Again, we *could* do a byte
+ *   swapping load on big-endian architectures if that is not
+ *   expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a page
+ *   crossing operation.
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with a
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+	return mask*0x0001020304050608 >> 56;
+}
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+	hash += hash >> (8*sizeof(int));
+	return hash;
+}
+
+#else	/* 32-bit case */
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
+}
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = *(unsigned long *)name;
+		hash *= 9;
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = ~(~0ul << len*8);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+#define ONEBYTES	0x0101010101010101ul
+#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
+#define HIGHBITS	0x8080808080808080ul
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - ONEBYTES) & ~a) & HIGHBITS;
+}
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+	unsigned long a, mask, hash, len;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = *(unsigned long *)(name+len);
+		/* Do we have any NUL or '/' bytes in this word? */
+		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+	} while (!mask);
+
+	/* The mask *below* the first high bit set */
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	hash += a & mask;
+	*hashp = fold_hash(hash);
+
+	return len + count_masked_bytes(mask);
+}
+
+#else
+
 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
@@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	return len;
 }
 
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
-- 
cgit v1.2.3


From a7f4255f906f60f72e00aad2fb000939449ff32e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Mar 2012 20:55:10 +0100
Subject: x86: Derandom delay_tsc for 64 bit

Commit f0fbf0abc093 ("x86: integrate delay functions") converted
delay_tsc() into a random delay generator for 64 bit.  The reason is
that it merged the mostly identical versions of delay_32.c and
delay_64.c.  Though the subtle difference of the result was:

 static void delay_tsc(unsigned long loops)
 {
-	unsigned bclock, now;
+	unsigned long bclock, now;

Now the function uses rdtscl() which returns the lower 32bit of the
TSC. On 32bit that's not problematic as unsigned long is 32bit. On 64
bit this fails when the lower 32bit are close to wrap around when
bclock is read, because the following check

       if ((now - bclock) >= loops)
       	  	break;

evaluated to true on 64bit for e.g. bclock = 0xffffffff and now = 0
because the unsigned long (now - bclock) of these values results in
0xffffffff00000001 which is definitely larger than the loops
value. That explains Tvortkos observation:

"Because I am seeing udelay(500) (_occasionally_) being short, and
 that by delaying for some duration between 0us (yep) and 491us."

Make those variables explicitely u32 again, so this works for both 32
and 64 bit.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org # >= 2.6.27
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/delay.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index fc45ba887d05..e395693abdb1 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -48,9 +48,9 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static void delay_tsc(unsigned long __loops)
 {
-	unsigned long bclock, now;
+	u32 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
-- 
cgit v1.2.3


From c94082656dac74257f63e91f78d5d458ac781fa5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 9 Mar 2012 16:07:10 -0800
Subject: x86: Use enum instead of literals for trap values

The traps are referred to by their numbers and it can be difficult to
understand them while reading the code without context. This patch adds
enumeration of the trap numbers and replaces the numbers with the correct
enum for x86.

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20120310000710.GA32667@www.outflux.net
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/traps.h |  25 +++++++++
 arch/x86/kernel/irqinit.c    |   2 +-
 arch/x86/kernel/traps.c      | 123 +++++++++++++++++++++++--------------------
 3 files changed, 91 insertions(+), 59 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0012d0902c5f..88eae2aec619 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -89,4 +89,29 @@ asmlinkage void smp_thermal_interrupt(void);
 asmlinkage void mce_threshold_interrupt(void);
 #endif
 
+/* Interrupts/Exceptions */
+enum {
+	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
+	X86_TRAP_DB,		/*  1, Debug */
+	X86_TRAP_NMI,		/*  2, Non-maskable Interrupt */
+	X86_TRAP_BP,		/*  3, Breakpoint */
+	X86_TRAP_OF,		/*  4, Overflow */
+	X86_TRAP_BR,		/*  5, Bound Range Exceeded */
+	X86_TRAP_UD,		/*  6, Invalid Opcode */
+	X86_TRAP_NM,		/*  7, Device Not Available */
+	X86_TRAP_DF,		/*  8, Double Fault */
+	X86_TRAP_OLD_MF,	/*  9, Coprocessor Segment Overrun */
+	X86_TRAP_TS,		/* 10, Invalid TSS */
+	X86_TRAP_NP,		/* 11, Segment Not Present */
+	X86_TRAP_SS,		/* 12, Stack Segment Fault */
+	X86_TRAP_GP,		/* 13, General Protection Fault */
+	X86_TRAP_PF,		/* 14, Page Fault */
+	X86_TRAP_SPURIOUS,	/* 15, Spurious Interrupt */
+	X86_TRAP_MF,		/* 16, x87 Floating-Point Exception */
+	X86_TRAP_AC,		/* 17, Alignment Check */
+	X86_TRAP_MC,		/* 18, Machine Check */
+	X86_TRAP_XF,		/* 19, SIMD Floating-Point Exception */
+	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
+};
+
 #endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5cddbce..7b77062dea11 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -61,7 +61,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 	outb(0, 0xF0);
 	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
 		return IRQ_NONE;
-	math_error(get_irq_regs(), 0, 16);
+	math_error(get_irq_regs(), 0, X86_TRAP_MF);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bbe04d96744..037fc2bc5316 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -119,7 +119,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 		 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 		 * On nmi (interrupt 2), do_trap should not be called.
 		 */
-		if (trapnr < 6)
+		if (trapnr < X86_TRAP_UD)
 			goto vm86_trap;
 		goto trap_signal;
 	}
@@ -203,27 +203,31 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
 	do_trap(trapnr, signr, str, regs, error_code, &info);		\
 }
 
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
+		regs->ip)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
+		regs->ip)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
+		coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
 #ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
 #endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
+		BUS_ADRALN, 0)
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
 {
 	if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
-			12, SIGBUS) == NOTIFY_STOP)
+			X86_TRAP_SS, SIGBUS) == NOTIFY_STOP)
 		return;
 	preempt_conditional_sti(regs);
-	do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+	do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
 }
 
@@ -233,10 +237,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	struct task_struct *tsk = current;
 
 	/* Return not checked because double check cannot be ignored */
-	notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 8;
+	tsk->thread.trap_no = X86_TRAP_DF;
 
 	/*
 	 * This is always a kernel trap and never fixable (and thus must
@@ -264,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		goto gp_in_kernel;
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
+	tsk->thread.trap_no = X86_TRAP_GP;
 
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 			printk_ratelimit()) {
@@ -291,9 +295,9 @@ gp_in_kernel:
 		return;
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
-	if (notify_die(DIE_GPF, "general protection fault", regs,
-				error_code, 13, SIGSEGV) == NOTIFY_STOP)
+	tsk->thread.trap_no = X86_TRAP_GP;
+	if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+			X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
 		return;
 	die("general protection fault", regs, error_code);
 }
@@ -302,13 +306,13 @@ gp_in_kernel:
 dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
-	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-			== NOTIFY_STOP)
+	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+				SIGTRAP) == NOTIFY_STOP)
 		return;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 
-	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-			== NOTIFY_STOP)
+	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+			SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/*
@@ -317,7 +321,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 	 */
 	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
-	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 }
@@ -422,8 +426,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	preempt_conditional_sti(regs);
 
 	if (regs->flags & X86_VM_MASK) {
-		handle_vm86_trap((struct kernel_vm86_regs *) regs,
-				error_code, 1);
+		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+					X86_TRAP_DB);
 		preempt_conditional_cli(regs);
 		debug_stack_usage_dec();
 		return;
@@ -460,7 +464,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	struct task_struct *task = current;
 	siginfo_t info;
 	unsigned short err;
-	char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+						"simd exception";
 
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
 		return;
@@ -485,7 +490,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
 	info.si_addr = (void __user *)regs->ip;
-	if (trapnr == 16) {
+	if (trapnr == X86_TRAP_MF) {
 		unsigned short cwd, swd;
 		/*
 		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -529,10 +534,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 		info.si_code = FPE_FLTRES;
 	} else {
 		/*
-		 * If we're using IRQ 13, or supposedly even some trap 16
-		 * implementations, it's possible we get a spurious trap...
+		 * If we're using IRQ 13, or supposedly even some trap
+		 * X86_TRAP_MF implementations, it's possible
+		 * we get a spurious trap, which is not an error.
 		 */
-		return;		/* Spurious trap, no error */
+		return;
 	}
 	force_sig_info(SIGFPE, &info, task);
 }
@@ -543,13 +549,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 	ignore_fpu_irq = 1;
 #endif
 
-	math_error(regs, error_code, 16);
+	math_error(regs, error_code, X86_TRAP_MF);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	math_error(regs, error_code, 19);
+	math_error(regs, error_code, X86_TRAP_XF);
 }
 
 dotraplinkage void
@@ -643,20 +649,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 	info.si_errno = 0;
 	info.si_code = ILL_BADSTK;
 	info.si_addr = NULL;
-	if (notify_die(DIE_TRAP, "iret exception",
-			regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+	if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+			X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
 		return;
-	do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+	do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+		&info);
 }
 #endif
 
 /* Set of traps needed for early debugging. */
 void __init early_trap_init(void)
 {
-	set_intr_gate_ist(1, &debug, DEBUG_STACK);
+	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 	/* int3 can be called from all */
-	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
-	set_intr_gate(14, &page_fault);
+	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+	set_intr_gate(X86_TRAP_PF, &page_fault);
 	load_idt(&idt_descr);
 }
 
@@ -672,30 +679,30 @@ void __init trap_init(void)
 	early_iounmap(p, 4);
 #endif
 
-	set_intr_gate(0, &divide_error);
-	set_intr_gate_ist(2, &nmi, NMI_STACK);
+	set_intr_gate(X86_TRAP_DE, &divide_error);
+	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
 	/* int4 can be called from all */
-	set_system_intr_gate(4, &overflow);
-	set_intr_gate(5, &bounds);
-	set_intr_gate(6, &invalid_op);
-	set_intr_gate(7, &device_not_available);
+	set_system_intr_gate(X86_TRAP_OF, &overflow);
+	set_intr_gate(X86_TRAP_BR, &bounds);
+	set_intr_gate(X86_TRAP_UD, &invalid_op);
+	set_intr_gate(X86_TRAP_NM, &device_not_available);
 #ifdef CONFIG_X86_32
-	set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
 #else
-	set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
 #endif
-	set_intr_gate(9, &coprocessor_segment_overrun);
-	set_intr_gate(10, &invalid_TSS);
-	set_intr_gate(11, &segment_not_present);
-	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
-	set_intr_gate(13, &general_protection);
-	set_intr_gate(15, &spurious_interrupt_bug);
-	set_intr_gate(16, &coprocessor_error);
-	set_intr_gate(17, &alignment_check);
+	set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
+	set_intr_gate(X86_TRAP_TS, &invalid_TSS);
+	set_intr_gate(X86_TRAP_NP, &segment_not_present);
+	set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+	set_intr_gate(X86_TRAP_GP, &general_protection);
+	set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
+	set_intr_gate(X86_TRAP_MF, &coprocessor_error);
+	set_intr_gate(X86_TRAP_AC, &alignment_check);
 #ifdef CONFIG_X86_MCE
-	set_intr_gate_ist(18, &machine_check, MCE_STACK);
+	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
 #endif
-	set_intr_gate(19, &simd_coprocessor_error);
+	set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
 
 	/* Reserve all the builtin and the syscall vector: */
 	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
@@ -720,7 +727,7 @@ void __init trap_init(void)
 
 #ifdef CONFIG_X86_64
 	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
-	set_nmi_gate(1, &debug);
-	set_nmi_gate(3, &int3);
+	set_nmi_gate(X86_TRAP_DB, &debug);
+	set_nmi_gate(X86_TRAP_BP, &int3);
 #endif
 }
-- 
cgit v1.2.3


From 026abc333205c1fff80138b8c2cac3d0347685f4 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 8 Mar 2012 16:02:20 +0000
Subject: gma500: initial medfield merge

We need to merge this ahead of some of the cleanup because a lot of needed
cleanup spans both new and old chips. If we try and clean up and the merge
we end up fighting ourselves.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
[With a load of the cleanup stuff folded in, register stuff reworked sanely]
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/platform/mrst/mrst.c                 |   16 +
 drivers/gpu/drm/gma500/Kconfig                |    7 +
 drivers/gpu/drm/gma500/Makefile               |   10 +
 drivers/gpu/drm/gma500/mdfld_device.c         |  691 ++++++++++++++
 drivers/gpu/drm/gma500/mdfld_dsi_dpi.c        | 1024 +++++++++++++++++++++
 drivers/gpu/drm/gma500/mdfld_dsi_dpi.h        |   79 ++
 drivers/gpu/drm/gma500/mdfld_dsi_output.c     |  635 +++++++++++++
 drivers/gpu/drm/gma500/mdfld_dsi_output.h     |  389 ++++++++
 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c |  694 ++++++++++++++
 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h |   92 ++
 drivers/gpu/drm/gma500/mdfld_intel_display.c  | 1192 +++++++++++++++++++++++++
 drivers/gpu/drm/gma500/mdfld_output.c         |   77 ++
 drivers/gpu/drm/gma500/mdfld_output.h         |   77 ++
 drivers/gpu/drm/gma500/mdfld_tmd_vid.c        |  201 +++++
 drivers/gpu/drm/gma500/mdfld_tpo_vid.c        |  124 +++
 drivers/gpu/drm/gma500/psb_drv.c              |   10 +
 drivers/gpu/drm/gma500/psb_drv.h              |   89 ++
 drivers/gpu/drm/gma500/psb_irq.c              |   58 ++
 drivers/gpu/drm/gma500/psb_irq.h              |    2 +
 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c    |  829 +++++++++++++++++
 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h    |   38 +
 include/linux/i2c/tc35876x.h                  |   11 +
 22 files changed, 6345 insertions(+)
 create mode 100644 drivers/gpu/drm/gma500/mdfld_device.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_dpi.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_dpi.h
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_output.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_output.h
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h
 create mode 100644 drivers/gpu/drm/gma500/mdfld_intel_display.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_output.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_output.h
 create mode 100644 drivers/gpu/drm/gma500/mdfld_tmd_vid.c
 create mode 100644 drivers/gpu/drm/gma500/mdfld_tpo_vid.c
 create mode 100644 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c
 create mode 100644 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h
 create mode 100644 include/linux/i2c/tc35876x.h

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 475e2cd0f3c3..b930cc43a235 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -28,6 +28,8 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/mfd/intel_msic.h>
+#include <linux/gpio.h>
+#include <linux/i2c/tc35876x.h>
 
 #include <asm/setup.h>
 #include <asm/mpspec_def.h>
@@ -686,6 +688,19 @@ static void *msic_ocd_platform_data(void *info)
 	return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
 }
 
+/* tc35876x DSI-LVDS bridge chip and panel platform data */
+static void *tc35876x_platform_data(void *data)
+{
+       static struct tc35876x_platform_data pdata;
+
+       /* gpio pins set to -1 will not be used by the driver */
+       pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
+       pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
+       pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
+
+       return &pdata;
+}
+
 static const struct devs_id __initconst device_ids[] = {
 	{"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
 	{"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
@@ -698,6 +713,7 @@ static const struct devs_id __initconst device_ids[] = {
 	{"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
 	{"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
 	{"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data},
+	{"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data},
 
 	/* MSIC subdevices */
 	{"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data},
diff --git a/drivers/gpu/drm/gma500/Kconfig b/drivers/gpu/drm/gma500/Kconfig
index f92a7f4208d1..42e665c7e90a 100644
--- a/drivers/gpu/drm/gma500/Kconfig
+++ b/drivers/gpu/drm/gma500/Kconfig
@@ -24,3 +24,10 @@ config DRM_GMA3600
 	help
 	  Say yes to include basic support for Intel GMA3600/3650 (Intel
 	  Cedar Trail) platforms.
+
+config DRM_MEDFIELD
+	bool "Intel Medfield support (Experimental)"
+	depends on DRM_GMA500 && X86_INTEL_MID
+	help
+	  Say yes to include support for the Intel Medfield platform.
+
diff --git a/drivers/gpu/drm/gma500/Makefile b/drivers/gpu/drm/gma500/Makefile
index 81c103be5e21..1583982917ce 100644
--- a/drivers/gpu/drm/gma500/Makefile
+++ b/drivers/gpu/drm/gma500/Makefile
@@ -37,4 +37,14 @@ gma500_gfx-$(CONFIG_DRM_GMA600) += oaktrail_device.o \
 	  oaktrail_hdmi.o \
 	  oaktrail_hdmi_i2c.o
 
+gma500_gfx-$(CONFIG_DRM_MEDFIELD) += mdfld_device.o \
+	  mdfld_output.o \
+	  mdfld_intel_display.o \
+	  mdfld_dsi_output.o \
+	  mdfld_dsi_dpi.o \
+	  mdfld_dsi_pkg_sender.o \
+	  mdfld_tpo_vid.o \
+	  mdfld_tmd_vid.o \
+	  tc35876x-dsi-lvds.o
+
 obj-$(CONFIG_DRM_GMA500) += gma500_gfx.o
diff --git a/drivers/gpu/drm/gma500/mdfld_device.c b/drivers/gpu/drm/gma500/mdfld_device.c
new file mode 100644
index 000000000000..6cfdda90eef1
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_device.c
@@ -0,0 +1,691 @@
+/**************************************************************************
+ * Copyright (c) 2011, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ **************************************************************************/
+
+#include "psb_drv.h"
+#include "mid_bios.h"
+#include "mdfld_output.h"
+#include "mdfld_dsi_output.h"
+#include "tc35876x-dsi-lvds.h"
+
+#include <asm/intel_scu_ipc.h>
+
+#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
+
+#define MRST_BLC_MAX_PWM_REG_FREQ	    0xFFFF
+#define BLC_PWM_PRECISION_FACTOR 100	/* 10000000 */
+#define BLC_PWM_FREQ_CALC_CONSTANT 32
+#define MHz 1000000
+#define BRIGHTNESS_MIN_LEVEL 1
+#define BRIGHTNESS_MAX_LEVEL 100
+#define BRIGHTNESS_MASK	0xFF
+#define BLC_POLARITY_NORMAL 0
+#define BLC_POLARITY_INVERSE 1
+#define BLC_ADJUSTMENT_MAX 100
+
+#define MDFLD_BLC_PWM_PRECISION_FACTOR    10
+#define MDFLD_BLC_MAX_PWM_REG_FREQ        0xFFFE
+#define MDFLD_BLC_MIN_PWM_REG_FREQ        0x2
+
+#define MDFLD_BACKLIGHT_PWM_POLARITY_BIT_CLEAR (0xFFFE)
+#define MDFLD_BACKLIGHT_PWM_CTL_SHIFT	(16)
+
+static struct backlight_device *mdfld_backlight_device;
+
+int mdfld_set_brightness(struct backlight_device *bd)
+{
+	struct drm_device *dev =
+		(struct drm_device *)bl_get_data(mdfld_backlight_device);
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	int level = bd->props.brightness;
+
+	DRM_DEBUG_DRIVER("backlight level set to %d\n", level);
+
+	/* Perform value bounds checking */
+	if (level < BRIGHTNESS_MIN_LEVEL)
+		level = BRIGHTNESS_MIN_LEVEL;
+
+	if (gma_power_begin(dev, false)) {
+		u32 adjusted_level = 0;
+
+		/*
+		 * Adjust the backlight level with the percent in
+		 * dev_priv->blc_adj2
+		 */
+		adjusted_level = level * dev_priv->blc_adj2;
+		adjusted_level = adjusted_level / BLC_ADJUSTMENT_MAX;
+		dev_priv->brightness_adjusted = adjusted_level;
+
+		if (mdfld_get_panel_type(dev, 0) == TC35876X) {
+			if (dev_priv->dpi_panel_on[0] ||
+					dev_priv->dpi_panel_on[2])
+				tc35876x_brightness_control(dev,
+						dev_priv->brightness_adjusted);
+		} else {
+			if (dev_priv->dpi_panel_on[0])
+				mdfld_dsi_brightness_control(dev, 0,
+						dev_priv->brightness_adjusted);
+		}
+
+		if (dev_priv->dpi_panel_on[2])
+			mdfld_dsi_brightness_control(dev, 2,
+					dev_priv->brightness_adjusted);
+		gma_power_end(dev);
+	}
+
+	/* cache the brightness for later use */
+	dev_priv->brightness = level;
+	return 0;
+}
+
+int mdfld_get_brightness(struct backlight_device *bd)
+{
+	struct drm_device *dev =
+		(struct drm_device *)bl_get_data(mdfld_backlight_device);
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	DRM_DEBUG_DRIVER("brightness = 0x%x \n", dev_priv->brightness);
+
+	/* return locally cached var instead of HW read (due to DPST etc.) */
+	return dev_priv->brightness;
+}
+
+static const struct backlight_ops mdfld_ops = {
+	.get_brightness = mdfld_get_brightness,
+	.update_status  = mdfld_set_brightness,
+};
+
+static int device_backlight_init(struct drm_device *dev)
+{
+	struct drm_psb_private *dev_priv = (struct drm_psb_private *)
+		dev->dev_private;
+
+	dev_priv->blc_adj1 = BLC_ADJUSTMENT_MAX;
+	dev_priv->blc_adj2 = BLC_ADJUSTMENT_MAX;
+
+	return 0;
+}
+
+int mdfld_backlight_init(struct drm_device *dev)
+{
+	struct backlight_properties props;
+	int ret = 0;
+
+	memset(&props, 0, sizeof(struct backlight_properties));
+	props.max_brightness = BRIGHTNESS_MAX_LEVEL;
+	props.type = BACKLIGHT_PLATFORM;
+	mdfld_backlight_device = backlight_device_register("mdfld-bl",
+				NULL, (void *)dev, &mdfld_ops, &props);
+
+	if (IS_ERR(mdfld_backlight_device))
+		return PTR_ERR(mdfld_backlight_device);
+
+	ret = device_backlight_init(dev);
+	if (ret)
+		return ret;
+
+	mdfld_backlight_device->props.brightness = BRIGHTNESS_MAX_LEVEL;
+	mdfld_backlight_device->props.max_brightness = BRIGHTNESS_MAX_LEVEL;
+	backlight_update_status(mdfld_backlight_device);
+	return 0;
+}
+#endif
+
+struct backlight_device *mdfld_get_backlight_device(void)
+{
+#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
+	return mdfld_backlight_device;
+#else
+	return NULL;
+#endif
+}
+
+/*
+ * mdfld_save_display_registers
+ *
+ * Description: We are going to suspend so save current display
+ * register state.
+ *
+ * Notes: FIXME_JLIU7 need to add the support for DPI MIPI & HDMI audio
+ */
+static int mdfld_save_display_registers(struct drm_device *dev, int pipe)
+{
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct medfield_state *regs = &dev_priv->regs.mdfld;
+	int i;
+
+	/* register */
+	u32 dpll_reg = MRST_DPLL_A;
+	u32 fp_reg = MRST_FPA0;
+	u32 pipeconf_reg = PIPEACONF;
+	u32 htot_reg = HTOTAL_A;
+	u32 hblank_reg = HBLANK_A;
+	u32 hsync_reg = HSYNC_A;
+	u32 vtot_reg = VTOTAL_A;
+	u32 vblank_reg = VBLANK_A;
+	u32 vsync_reg = VSYNC_A;
+	u32 pipesrc_reg = PIPEASRC;
+	u32 dspstride_reg = DSPASTRIDE;
+	u32 dsplinoff_reg = DSPALINOFF;
+	u32 dsptileoff_reg = DSPATILEOFF;
+	u32 dspsize_reg = DSPASIZE;
+	u32 dsppos_reg = DSPAPOS;
+	u32 dspsurf_reg = DSPASURF;
+	u32 mipi_reg = MIPI;
+	u32 dspcntr_reg = DSPACNTR;
+	u32 dspstatus_reg = PIPEASTAT;
+	u32 palette_reg = PALETTE_A;
+
+	/* pointer to values */
+	u32 *dpll_val = &regs->saveDPLL_A;
+	u32 *fp_val = &regs->saveFPA0;
+	u32 *pipeconf_val = &regs->savePIPEACONF;
+	u32 *htot_val = &regs->saveHTOTAL_A;
+	u32 *hblank_val = &regs->saveHBLANK_A;
+	u32 *hsync_val = &regs->saveHSYNC_A;
+	u32 *vtot_val = &regs->saveVTOTAL_A;
+	u32 *vblank_val = &regs->saveVBLANK_A;
+	u32 *vsync_val = &regs->saveVSYNC_A;
+	u32 *pipesrc_val = &regs->savePIPEASRC;
+	u32 *dspstride_val = &regs->saveDSPASTRIDE;
+	u32 *dsplinoff_val = &regs->saveDSPALINOFF;
+	u32 *dsptileoff_val = &regs->saveDSPATILEOFF;
+	u32 *dspsize_val = &regs->saveDSPASIZE;
+	u32 *dsppos_val = &regs->saveDSPAPOS;
+	u32 *dspsurf_val = &regs->saveDSPASURF;
+	u32 *mipi_val = &regs->saveMIPI;
+	u32 *dspcntr_val = &regs->saveDSPACNTR;
+	u32 *dspstatus_val = &regs->saveDSPASTATUS;
+	u32 *palette_val = regs->save_palette_a;
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		/* regester */
+		dpll_reg = MDFLD_DPLL_B;
+		fp_reg = MDFLD_DPLL_DIV0;
+		pipeconf_reg = PIPEBCONF;
+		htot_reg = HTOTAL_B;
+		hblank_reg = HBLANK_B;
+		hsync_reg = HSYNC_B;
+		vtot_reg = VTOTAL_B;
+		vblank_reg = VBLANK_B;
+		vsync_reg = VSYNC_B;
+		pipesrc_reg = PIPEBSRC;
+		dspstride_reg = DSPBSTRIDE;
+		dsplinoff_reg = DSPBLINOFF;
+		dsptileoff_reg = DSPBTILEOFF;
+		dspsize_reg = DSPBSIZE;
+		dsppos_reg = DSPBPOS;
+		dspsurf_reg = DSPBSURF;
+		dspcntr_reg = DSPBCNTR;
+		dspstatus_reg = PIPEBSTAT;
+		palette_reg = PALETTE_B;
+
+		/* values */
+		dpll_val = &regs->saveDPLL_B;
+		fp_val = &regs->saveFPB0;
+		pipeconf_val = &regs->savePIPEBCONF;
+		htot_val = &regs->saveHTOTAL_B;
+		hblank_val = &regs->saveHBLANK_B;
+		hsync_val = &regs->saveHSYNC_B;
+		vtot_val = &regs->saveVTOTAL_B;
+		vblank_val = &regs->saveVBLANK_B;
+		vsync_val = &regs->saveVSYNC_B;
+		pipesrc_val = &regs->savePIPEBSRC;
+		dspstride_val = &regs->saveDSPBSTRIDE;
+		dsplinoff_val = &regs->saveDSPBLINOFF;
+		dsptileoff_val = &regs->saveDSPBTILEOFF;
+		dspsize_val = &regs->saveDSPBSIZE;
+		dsppos_val = &regs->saveDSPBPOS;
+		dspsurf_val = &regs->saveDSPBSURF;
+		dspcntr_val = &regs->saveDSPBCNTR;
+		dspstatus_val = &regs->saveDSPBSTATUS;
+		palette_val = regs->save_palette_b;
+		break;
+	case 2:
+		/* register */
+		pipeconf_reg = PIPECCONF;
+		htot_reg = HTOTAL_C;
+		hblank_reg = HBLANK_C;
+		hsync_reg = HSYNC_C;
+		vtot_reg = VTOTAL_C;
+		vblank_reg = VBLANK_C;
+		vsync_reg = VSYNC_C;
+		pipesrc_reg = PIPECSRC;
+		dspstride_reg = DSPCSTRIDE;
+		dsplinoff_reg = DSPCLINOFF;
+		dsptileoff_reg = DSPCTILEOFF;
+		dspsize_reg = DSPCSIZE;
+		dsppos_reg = DSPCPOS;
+		dspsurf_reg = DSPCSURF;
+		mipi_reg = MIPI_C;
+		dspcntr_reg = DSPCCNTR;
+		dspstatus_reg = PIPECSTAT;
+		palette_reg = PALETTE_C;
+
+		/* pointer to values */
+		pipeconf_val = &regs->savePIPECCONF;
+		htot_val = &regs->saveHTOTAL_C;
+		hblank_val = &regs->saveHBLANK_C;
+		hsync_val = &regs->saveHSYNC_C;
+		vtot_val = &regs->saveVTOTAL_C;
+		vblank_val = &regs->saveVBLANK_C;
+		vsync_val = &regs->saveVSYNC_C;
+		pipesrc_val = &regs->savePIPECSRC;
+		dspstride_val = &regs->saveDSPCSTRIDE;
+		dsplinoff_val = &regs->saveDSPCLINOFF;
+		dsptileoff_val = &regs->saveDSPCTILEOFF;
+		dspsize_val = &regs->saveDSPCSIZE;
+		dsppos_val = &regs->saveDSPCPOS;
+		dspsurf_val = &regs->saveDSPCSURF;
+		mipi_val = &regs->saveMIPI_C;
+		dspcntr_val = &regs->saveDSPCCNTR;
+		dspstatus_val = &regs->saveDSPCSTATUS;
+		palette_val = regs->save_palette_c;
+		break;
+	default:
+		DRM_ERROR("%s, invalid pipe number.\n", __func__);
+		return -EINVAL;
+	}
+
+	/* Pipe & plane A info */
+	*dpll_val = PSB_RVDC32(dpll_reg);
+	*fp_val = PSB_RVDC32(fp_reg);
+	*pipeconf_val = PSB_RVDC32(pipeconf_reg);
+	*htot_val = PSB_RVDC32(htot_reg);
+	*hblank_val = PSB_RVDC32(hblank_reg);
+	*hsync_val = PSB_RVDC32(hsync_reg);
+	*vtot_val = PSB_RVDC32(vtot_reg);
+	*vblank_val = PSB_RVDC32(vblank_reg);
+	*vsync_val = PSB_RVDC32(vsync_reg);
+	*pipesrc_val = PSB_RVDC32(pipesrc_reg);
+	*dspstride_val = PSB_RVDC32(dspstride_reg);
+	*dsplinoff_val = PSB_RVDC32(dsplinoff_reg);
+	*dsptileoff_val = PSB_RVDC32(dsptileoff_reg);
+	*dspsize_val = PSB_RVDC32(dspsize_reg);
+	*dsppos_val = PSB_RVDC32(dsppos_reg);
+	*dspsurf_val = PSB_RVDC32(dspsurf_reg);
+	*dspcntr_val = PSB_RVDC32(dspcntr_reg);
+	*dspstatus_val = PSB_RVDC32(dspstatus_reg);
+
+	/*save palette (gamma) */
+	for (i = 0; i < 256; i++)
+		palette_val[i] = PSB_RVDC32(palette_reg + (i << 2));
+
+	if (pipe == 1) {
+		regs->savePFIT_CONTROL = PSB_RVDC32(PFIT_CONTROL);
+		regs->savePFIT_PGM_RATIOS = PSB_RVDC32(PFIT_PGM_RATIOS);
+
+		regs->saveHDMIPHYMISCCTL = PSB_RVDC32(HDMIPHYMISCCTL);
+		regs->saveHDMIB_CONTROL = PSB_RVDC32(HDMIB_CONTROL);
+		return 0;
+	}
+
+	*mipi_val = PSB_RVDC32(mipi_reg);
+	return 0;
+}
+
+/*
+ * mdfld_restore_display_registers
+ *
+ * Description: We are going to resume so restore display register state.
+ *
+ * Notes: FIXME_JLIU7 need to add the support for DPI MIPI & HDMI audio
+ */
+static int mdfld_restore_display_registers(struct drm_device *dev, int pipe)
+{
+	/* To get  panel out of ULPS mode. */
+	u32 temp = 0;
+	u32 device_ready_reg = DEVICE_READY_REG;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct mdfld_dsi_config *dsi_config = NULL;
+	struct medfield_state *regs = &dev_priv->regs.mdfld;
+	u32 i = 0;
+	u32 dpll = 0;
+	u32 timeout = 0;
+
+	/* regester */
+	u32 dpll_reg = MRST_DPLL_A;
+	u32 fp_reg = MRST_FPA0;
+	u32 pipeconf_reg = PIPEACONF;
+	u32 htot_reg = HTOTAL_A;
+	u32 hblank_reg = HBLANK_A;
+	u32 hsync_reg = HSYNC_A;
+	u32 vtot_reg = VTOTAL_A;
+	u32 vblank_reg = VBLANK_A;
+	u32 vsync_reg = VSYNC_A;
+	u32 pipesrc_reg = PIPEASRC;
+	u32 dspstride_reg = DSPASTRIDE;
+	u32 dsplinoff_reg = DSPALINOFF;
+	u32 dsptileoff_reg = DSPATILEOFF;
+	u32 dspsize_reg = DSPASIZE;
+	u32 dsppos_reg = DSPAPOS;
+	u32 dspsurf_reg = DSPASURF;
+	u32 dspstatus_reg = PIPEASTAT;
+	u32 mipi_reg = MIPI;
+	u32 dspcntr_reg = DSPACNTR;
+	u32 palette_reg = PALETTE_A;
+
+	/* values */
+	u32 dpll_val = regs->saveDPLL_A & ~DPLL_VCO_ENABLE;
+	u32 fp_val = regs->saveFPA0;
+	u32 pipeconf_val = regs->savePIPEACONF;
+	u32 htot_val = regs->saveHTOTAL_A;
+	u32 hblank_val = regs->saveHBLANK_A;
+	u32 hsync_val = regs->saveHSYNC_A;
+	u32 vtot_val = regs->saveVTOTAL_A;
+	u32 vblank_val = regs->saveVBLANK_A;
+	u32 vsync_val = regs->saveVSYNC_A;
+	u32 pipesrc_val = regs->savePIPEASRC;
+	u32 dspstride_val = regs->saveDSPASTRIDE;
+	u32 dsplinoff_val = regs->saveDSPALINOFF;
+	u32 dsptileoff_val = regs->saveDSPATILEOFF;
+	u32 dspsize_val = regs->saveDSPASIZE;
+	u32 dsppos_val = regs->saveDSPAPOS;
+	u32 dspsurf_val = regs->saveDSPASURF;
+	u32 dspstatus_val = regs->saveDSPASTATUS;
+	u32 mipi_val = regs->saveMIPI;
+	u32 dspcntr_val = regs->saveDSPACNTR;
+	u32 *palette_val = regs->save_palette_a;
+
+	switch (pipe) {
+	case 0:
+		dsi_config = dev_priv->dsi_configs[0];
+		break;
+	case 1:
+		/* regester */
+		dpll_reg = MDFLD_DPLL_B;
+		fp_reg = MDFLD_DPLL_DIV0;
+		pipeconf_reg = PIPEBCONF;
+		htot_reg = HTOTAL_B;
+		hblank_reg = HBLANK_B;
+		hsync_reg = HSYNC_B;
+		vtot_reg = VTOTAL_B;
+		vblank_reg = VBLANK_B;
+		vsync_reg = VSYNC_B;
+		pipesrc_reg = PIPEBSRC;
+		dspstride_reg = DSPBSTRIDE;
+		dsplinoff_reg = DSPBLINOFF;
+		dsptileoff_reg = DSPBTILEOFF;
+		dspsize_reg = DSPBSIZE;
+		dsppos_reg = DSPBPOS;
+		dspsurf_reg = DSPBSURF;
+		dspcntr_reg = DSPBCNTR;
+		dspstatus_reg = PIPEBSTAT;
+		palette_reg = PALETTE_B;
+
+		/* values */
+		dpll_val = regs->saveDPLL_B & ~DPLL_VCO_ENABLE;
+		fp_val = regs->saveFPB0;
+		pipeconf_val = regs->savePIPEBCONF;
+		htot_val = regs->saveHTOTAL_B;
+		hblank_val = regs->saveHBLANK_B;
+		hsync_val = regs->saveHSYNC_B;
+		vtot_val = regs->saveVTOTAL_B;
+		vblank_val = regs->saveVBLANK_B;
+		vsync_val = regs->saveVSYNC_B;
+		pipesrc_val = regs->savePIPEBSRC;
+		dspstride_val = regs->saveDSPBSTRIDE;
+		dsplinoff_val = regs->saveDSPBLINOFF;
+		dsptileoff_val = regs->saveDSPBTILEOFF;
+		dspsize_val = regs->saveDSPBSIZE;
+		dsppos_val = regs->saveDSPBPOS;
+		dspsurf_val = regs->saveDSPBSURF;
+		dspcntr_val = regs->saveDSPBCNTR;
+		dspstatus_val = regs->saveDSPBSTATUS;
+		palette_val = regs->save_palette_b;
+		break;
+	case 2:
+		/* regester */
+		pipeconf_reg = PIPECCONF;
+		htot_reg = HTOTAL_C;
+		hblank_reg = HBLANK_C;
+		hsync_reg = HSYNC_C;
+		vtot_reg = VTOTAL_C;
+		vblank_reg = VBLANK_C;
+		vsync_reg = VSYNC_C;
+		pipesrc_reg = PIPECSRC;
+		dspstride_reg = DSPCSTRIDE;
+		dsplinoff_reg = DSPCLINOFF;
+		dsptileoff_reg = DSPCTILEOFF;
+		dspsize_reg = DSPCSIZE;
+		dsppos_reg = DSPCPOS;
+		dspsurf_reg = DSPCSURF;
+		mipi_reg = MIPI_C;
+		dspcntr_reg = DSPCCNTR;
+		dspstatus_reg = PIPECSTAT;
+		palette_reg = PALETTE_C;
+
+		/* values */
+		pipeconf_val = regs->savePIPECCONF;
+		htot_val = regs->saveHTOTAL_C;
+		hblank_val = regs->saveHBLANK_C;
+		hsync_val = regs->saveHSYNC_C;
+		vtot_val = regs->saveVTOTAL_C;
+		vblank_val = regs->saveVBLANK_C;
+		vsync_val = regs->saveVSYNC_C;
+		pipesrc_val = regs->savePIPECSRC;
+		dspstride_val = regs->saveDSPCSTRIDE;
+		dsplinoff_val = regs->saveDSPCLINOFF;
+		dsptileoff_val = regs->saveDSPCTILEOFF;
+		dspsize_val = regs->saveDSPCSIZE;
+		dsppos_val = regs->saveDSPCPOS;
+		dspsurf_val = regs->saveDSPCSURF;
+		mipi_val = regs->saveMIPI_C;
+		dspcntr_val = regs->saveDSPCCNTR;
+		dspstatus_val = regs->saveDSPCSTATUS;
+		palette_val = regs->save_palette_c;
+
+		dsi_config = dev_priv->dsi_configs[1];
+		break;
+	default:
+		DRM_ERROR("%s, invalid pipe number.\n", __func__);
+		return -EINVAL;
+	}
+
+	/*make sure VGA plane is off. it initializes to on after reset!*/
+	PSB_WVDC32(0x80000000, VGACNTRL);
+
+	if (pipe == 1) {
+		PSB_WVDC32(dpll_val & ~DPLL_VCO_ENABLE, dpll_reg);
+		PSB_RVDC32(dpll_reg);
+
+		PSB_WVDC32(fp_val, fp_reg);
+	} else {
+
+		dpll = PSB_RVDC32(dpll_reg);
+
+		if (!(dpll & DPLL_VCO_ENABLE)) {
+
+			/* When ungating power of DPLL, needs to wait 0.5us
+			   before enable the VCO */
+			if (dpll & MDFLD_PWR_GATE_EN) {
+				dpll &= ~MDFLD_PWR_GATE_EN;
+				PSB_WVDC32(dpll, dpll_reg);
+				/* FIXME_MDFLD PO - change 500 to 1 after PO */
+				udelay(500);
+			}
+
+			PSB_WVDC32(fp_val, fp_reg);
+			PSB_WVDC32(dpll_val, dpll_reg);
+			/* FIXME_MDFLD PO - change 500 to 1 after PO */
+			udelay(500);
+
+			dpll_val |= DPLL_VCO_ENABLE;
+			PSB_WVDC32(dpll_val, dpll_reg);
+			PSB_RVDC32(dpll_reg);
+
+			/* wait for DSI PLL to lock */
+			while (timeout < 20000 &&
+			  !(PSB_RVDC32(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) {
+				udelay(150);
+				timeout++;
+			}
+
+			if (timeout == 20000) {
+				DRM_ERROR("%s, can't lock DSIPLL.\n",
+								__func__);
+				return -EINVAL;
+			}
+		}
+	}
+	/* Restore mode */
+	PSB_WVDC32(htot_val, htot_reg);
+	PSB_WVDC32(hblank_val, hblank_reg);
+	PSB_WVDC32(hsync_val, hsync_reg);
+	PSB_WVDC32(vtot_val, vtot_reg);
+	PSB_WVDC32(vblank_val, vblank_reg);
+	PSB_WVDC32(vsync_val, vsync_reg);
+	PSB_WVDC32(pipesrc_val, pipesrc_reg);
+	PSB_WVDC32(dspstatus_val, dspstatus_reg);
+
+	/*set up the plane*/
+	PSB_WVDC32(dspstride_val, dspstride_reg);
+	PSB_WVDC32(dsplinoff_val, dsplinoff_reg);
+	PSB_WVDC32(dsptileoff_val, dsptileoff_reg);
+	PSB_WVDC32(dspsize_val, dspsize_reg);
+	PSB_WVDC32(dsppos_val, dsppos_reg);
+	PSB_WVDC32(dspsurf_val, dspsurf_reg);
+
+	if (pipe == 1) {
+		/* restore palette (gamma) */
+		/*DRM_UDELAY(50000); */
+		for (i = 0; i < 256; i++)
+			PSB_WVDC32(palette_val[i], palette_reg + (i << 2));
+
+		PSB_WVDC32(regs->savePFIT_CONTROL, PFIT_CONTROL);
+		PSB_WVDC32(regs->savePFIT_PGM_RATIOS, PFIT_PGM_RATIOS);
+
+		/*TODO: resume HDMI port */
+
+		/*TODO: resume pipe*/
+
+		/*enable the plane*/
+		PSB_WVDC32(dspcntr_val & ~DISPLAY_PLANE_ENABLE, dspcntr_reg);
+
+		return 0;
+	}
+
+	/*set up pipe related registers*/
+	PSB_WVDC32(mipi_val, mipi_reg);
+
+	/*setup MIPI adapter + MIPI IP registers*/
+	if (dsi_config)
+		mdfld_dsi_controller_init(dsi_config, pipe);
+
+	if (in_atomic() || in_interrupt())
+		mdelay(20);
+	else
+		msleep(20);
+
+	/*enable the plane*/
+	PSB_WVDC32(dspcntr_val, dspcntr_reg);
+
+	if (in_atomic() || in_interrupt())
+		mdelay(20);
+	else
+		msleep(20);
+
+	/* LP Hold Release */
+	temp = REG_READ(mipi_reg);
+	temp |= LP_OUTPUT_HOLD_RELEASE;
+	REG_WRITE(mipi_reg, temp);
+	mdelay(1);
+
+
+	/* Set DSI host to exit from Utra Low Power State */
+	temp = REG_READ(device_ready_reg);
+	temp &= ~ULPS_MASK;
+	temp |= 0x3;
+	temp |= EXIT_ULPS_DEV_READY;
+	REG_WRITE(device_ready_reg, temp);
+	mdelay(1);
+
+	temp = REG_READ(device_ready_reg);
+	temp &= ~ULPS_MASK;
+	temp |= EXITING_ULPS;
+	REG_WRITE(device_ready_reg, temp);
+	mdelay(1);
+
+	/*enable the pipe*/
+	PSB_WVDC32(pipeconf_val, pipeconf_reg);
+
+	/* restore palette (gamma) */
+	/*DRM_UDELAY(50000); */
+	for (i = 0; i < 256; i++)
+		PSB_WVDC32(palette_val[i], palette_reg + (i << 2));
+
+	return 0;
+}
+
+static int mdfld_save_registers(struct drm_device *dev)
+{
+	/* mdfld_save_cursor_overlay_registers(dev); */
+	mdfld_save_display_registers(dev, 0);
+	mdfld_save_display_registers(dev, 2);
+	mdfld_disable_crtc(dev, 0);
+	mdfld_disable_crtc(dev, 2);
+
+	return 0;
+}
+
+static int mdfld_restore_registers(struct drm_device *dev)
+{
+	mdfld_restore_display_registers(dev, 2);
+	mdfld_restore_display_registers(dev, 0);
+	/* mdfld_restore_cursor_overlay_registers(dev); */
+
+	return 0;
+}
+
+static int mdfld_power_down(struct drm_device *dev)
+{
+	/* FIXME */
+	return 0;
+}
+
+static int mdfld_power_up(struct drm_device *dev)
+{
+	/* FIXME */
+	return 0;
+}
+
+const struct psb_ops mdfld_chip_ops = {
+	.name = "mdfld",
+	.accel_2d = 0,
+	.pipes = 3,
+	.crtcs = 3,
+	.sgx_offset = MRST_SGX_OFFSET,
+
+	.chip_setup = mid_chip_setup,
+	.crtc_helper = &mdfld_helper_funcs,
+	.crtc_funcs = &psb_intel_crtc_funcs,
+
+	.output_init = mdfld_output_init,
+
+#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
+	.backlight_init = mdfld_backlight_init,
+#endif
+
+	.save_regs = mdfld_save_registers,
+	.restore_regs = mdfld_restore_registers,
+	.power_down = mdfld_power_down,
+	.power_up = mdfld_power_up,
+};
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c
new file mode 100644
index 000000000000..fc0df28a668c
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * jim liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#include "mdfld_dsi_dpi.h"
+#include "mdfld_output.h"
+#include "mdfld_dsi_pkg_sender.h"
+#include "psb_drv.h"
+#include "tc35876x-dsi-lvds.h"
+
+static void mdfld_dsi_dpi_shut_down(struct mdfld_dsi_dpi_output *output,
+								int pipe);
+
+static void mdfld_wait_for_HS_DATA_FIFO(struct drm_device *dev, u32 pipe)
+{
+	u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe);
+	int timeout = 0;
+
+	udelay(500);
+
+	/* This will time out after approximately 2+ seconds */
+	while ((timeout < 20000) &&
+		(REG_READ(gen_fifo_stat_reg) & DSI_FIFO_GEN_HS_DATA_FULL)) {
+		udelay(100);
+		timeout++;
+	}
+
+	if (timeout == 20000)
+		DRM_INFO("MIPI: HS Data FIFO was never cleared!\n");
+}
+
+static void mdfld_wait_for_HS_CTRL_FIFO(struct drm_device *dev, u32 pipe)
+{
+	u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe);
+	int timeout = 0;
+
+	udelay(500);
+
+	/* This will time out after approximately 2+ seconds */
+	while ((timeout < 20000) && (REG_READ(gen_fifo_stat_reg)
+					& DSI_FIFO_GEN_HS_CTRL_FULL)) {
+		udelay(100);
+		timeout++;
+	}
+	if (timeout == 20000)
+		DRM_INFO("MIPI: HS CMD FIFO was never cleared!\n");
+}
+
+static void mdfld_wait_for_DPI_CTRL_FIFO(struct drm_device *dev, u32 pipe)
+{
+	u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe);
+	int timeout = 0;
+
+	udelay(500);
+
+	/* This will time out after approximately 2+ seconds */
+	while ((timeout < 20000) && ((REG_READ(gen_fifo_stat_reg) &
+					DPI_FIFO_EMPTY) != DPI_FIFO_EMPTY)) {
+		udelay(100);
+		timeout++;
+	}
+
+	if (timeout == 20000)
+		DRM_ERROR("MIPI: DPI FIFO was never cleared\n");
+}
+
+static void mdfld_wait_for_SPL_PKG_SENT(struct drm_device *dev, u32 pipe)
+{
+	u32 intr_stat_reg = MIPI_INTR_STAT_REG(pipe);
+	int timeout = 0;
+
+	udelay(500);
+
+	/* This will time out after approximately 2+ seconds */
+	while ((timeout < 20000) && (!(REG_READ(intr_stat_reg)
+					& DSI_INTR_STATE_SPL_PKG_SENT))) {
+		udelay(100);
+		timeout++;
+	}
+
+	if (timeout == 20000)
+                DRM_ERROR("MIPI: SPL_PKT_SENT_INTERRUPT was not sent successfully!\n");
+}
+
+/* For TC35876X */
+
+static void dsi_set_device_ready_state(struct drm_device *dev, int state,
+				int pipe)
+{
+	REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), !!state, 0, 0);
+}
+
+static void dsi_set_pipe_plane_enable_state(struct drm_device *dev,
+							int state, int pipe)
+{
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	u32 pipeconf_reg = PIPEACONF;
+	u32 dspcntr_reg = DSPACNTR;
+
+	u32 dspcntr = dev_priv->dspcntr[pipe];
+	u32 mipi = MIPI_PORT_EN | PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX;
+
+	if (pipe) {
+		pipeconf_reg = PIPECCONF;
+		dspcntr_reg = DSPCCNTR;
+	} else
+		mipi &= (~0x03);
+
+	if (state) {
+		/*Set up pipe */
+		REG_WRITE(pipeconf_reg, BIT(31));
+
+		if (REG_BIT_WAIT(pipeconf_reg, 1, 30))
+			dev_err(&dev->pdev->dev, "%s: Pipe enable timeout\n",
+				__func__);
+
+		/*Set up display plane */
+		REG_WRITE(dspcntr_reg, dspcntr);
+	} else {
+		u32 dspbase_reg = pipe ? MDFLD_DSPCBASE : MRST_DSPABASE;
+
+		/* Put DSI lanes to ULPS to disable pipe */
+		REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 2, 2, 1);
+		REG_READ(MIPI_DEVICE_READY_REG(pipe)); /* posted write? */
+
+		/* LP Hold */
+		REG_FLD_MOD(MIPI_PORT_CONTROL(pipe), 0, 16, 16);
+		REG_READ(MIPI_PORT_CONTROL(pipe)); /* posted write? */
+
+		/* Disable display plane */
+		REG_FLD_MOD(dspcntr_reg, 0, 31, 31);
+
+		/* Flush the plane changes ??? posted write? */
+		REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+		REG_READ(dspbase_reg);
+
+		/* Disable PIPE */
+		REG_FLD_MOD(pipeconf_reg, 0, 31, 31);
+
+		if (REG_BIT_WAIT(pipeconf_reg, 0, 30))
+			dev_err(&dev->pdev->dev, "%s: Pipe disable timeout\n",
+				__func__);
+
+		if (REG_BIT_WAIT(MIPI_GEN_FIFO_STAT_REG(pipe), 1, 28))
+			dev_err(&dev->pdev->dev, "%s: FIFO not empty\n",
+				__func__);
+	}
+}
+
+static void mdfld_dsi_configure_down(struct mdfld_dsi_encoder *dsi_encoder,
+								int pipe)
+{
+	struct mdfld_dsi_dpi_output *dpi_output =
+				MDFLD_DSI_DPI_OUTPUT(dsi_encoder);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_encoder_get_config(dsi_encoder);
+	struct drm_device *dev = dsi_config->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	if (!dev_priv->dpi_panel_on[pipe]) {
+		dev_err(dev->dev, "DPI panel is already off\n");
+		return;
+	}
+	tc35876x_toshiba_bridge_panel_off(dev);
+	tc35876x_set_bridge_reset_state(dev, 1);
+	dsi_set_pipe_plane_enable_state(dev, 0, pipe);
+	mdfld_dsi_dpi_shut_down(dpi_output, pipe);
+	dsi_set_device_ready_state(dev, 0, pipe);
+}
+
+static void mdfld_dsi_configure_up(struct mdfld_dsi_encoder *dsi_encoder,
+								int pipe)
+{
+	struct mdfld_dsi_dpi_output *dpi_output =
+				MDFLD_DSI_DPI_OUTPUT(dsi_encoder);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_encoder_get_config(dsi_encoder);
+	struct drm_device *dev = dsi_config->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	if (dev_priv->dpi_panel_on[pipe]) {
+		dev_err(dev->dev, "DPI panel is already on\n");
+		return;
+	}
+
+	/* For resume path sequence */
+	mdfld_dsi_dpi_shut_down(dpi_output, pipe);
+	dsi_set_device_ready_state(dev, 0, pipe);
+
+	dsi_set_device_ready_state(dev, 1, pipe);
+	tc35876x_set_bridge_reset_state(dev, 0);
+	tc35876x_configure_lvds_bridge(dev);
+	mdfld_dsi_dpi_turn_on(dpi_output, pipe);  /* Send turn on command */
+	dsi_set_pipe_plane_enable_state(dev, 1, pipe);
+}
+/* End for TC35876X */
+
+/* ************************************************************************* *\
+ * FUNCTION: mdfld_dsi_tpo_ic_init
+ *
+ * DESCRIPTION:  This function is called only by mrst_dsi_mode_set and
+ *               restore_display_registers.  since this function does not
+ *               acquire the mutex, it is important that the calling function
+ *               does!
+\* ************************************************************************* */
+static void mdfld_dsi_tpo_ic_init(struct mdfld_dsi_config *dsi_config, u32 pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+	u32 dcsChannelNumber = dsi_config->channel_num;
+	u32 gen_data_reg = MIPI_HS_GEN_DATA_REG(pipe);
+	u32 gen_ctrl_reg = MIPI_HS_GEN_CTRL_REG(pipe);
+	u32 gen_ctrl_val = GEN_LONG_WRITE;
+
+	DRM_INFO("Enter mrst init TPO MIPI display.\n");
+
+	gen_ctrl_val |= dcsChannelNumber << DCS_CHANNEL_NUMBER_POS;
+
+	/* Flip page order */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00008036);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x02 << WORD_COUNTS_POS));
+
+	/* 0xF0 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x005a5af0);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS));
+
+	/* Write protection key */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x005a5af1);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS));
+
+	/* 0xFC */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x005a5afc);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS));
+
+	/* 0xB7 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x770000b7);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000044);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x05 << WORD_COUNTS_POS));
+
+	/* 0xB6 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x000a0ab6);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS));
+
+	/* 0xF2 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x081010f2);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x4a070708);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x000000c5);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS));
+
+	/* 0xF8 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x024003f8);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x01030a04);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x0e020220);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000004);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x0d << WORD_COUNTS_POS));
+
+	/* 0xE2 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x398fc3e2);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x0000916f);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x06 << WORD_COUNTS_POS));
+
+	/* 0xB0 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x000000b0);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x02 << WORD_COUNTS_POS));
+
+	/* 0xF4 */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x240242f4);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x78ee2002);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x2a071050);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x507fee10);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x10300710);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x14 << WORD_COUNTS_POS));
+
+	/* 0xBA */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x19fe07ba);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x101c0a31);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000010);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS));
+
+	/* 0xBB */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x28ff07bb);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x24280a31);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000034);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS));
+
+	/* 0xFB */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x535d05fb);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1b1a2130);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x221e180e);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x131d2120);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x535d0508);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1c1a2131);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x231f160d);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x111b2220);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x535c2008);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1f1d2433);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x2c251a10);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x2c34372d);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000023);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x31 << WORD_COUNTS_POS));
+
+	/* 0xFA */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x525c0bfa);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1c1c232f);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x2623190e);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x18212625);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x545d0d0e);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1e1d2333);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x26231a10);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x1a222725);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x545d280f);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x21202635);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x31292013);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x31393d33);
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x00000029);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x31 << WORD_COUNTS_POS));
+
+	/* Set DM */
+	mdfld_wait_for_HS_DATA_FIFO(dev, pipe);
+	REG_WRITE(gen_data_reg, 0x000100f7);
+	mdfld_wait_for_HS_CTRL_FIFO(dev, pipe);
+	REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS));
+}
+
+static u16 mdfld_dsi_dpi_to_byte_clock_count(int pixel_clock_count,
+						int num_lane, int bpp)
+{
+	return (u16)((pixel_clock_count * bpp) / (num_lane * 8));
+}
+
+/*
+ * Calculate the dpi time basing on a given drm mode @mode
+ * return 0 on success.
+ * FIXME: I was using proposed mode value for calculation, may need to
+ * use crtc mode values later
+ */
+int mdfld_dsi_dpi_timing_calculation(struct drm_display_mode *mode,
+				struct mdfld_dsi_dpi_timing *dpi_timing,
+				int num_lane, int bpp)
+{
+	int pclk_hsync, pclk_hfp, pclk_hbp, pclk_hactive;
+	int pclk_vsync, pclk_vfp, pclk_vbp, pclk_vactive;
+
+	pclk_hactive = mode->hdisplay;
+	pclk_hfp = mode->hsync_start - mode->hdisplay;
+	pclk_hsync = mode->hsync_end - mode->hsync_start;
+	pclk_hbp = mode->htotal - mode->hsync_end;
+
+	pclk_vactive = mode->vdisplay;
+	pclk_vfp = mode->vsync_start - mode->vdisplay;
+	pclk_vsync = mode->vsync_end - mode->vsync_start;
+	pclk_vbp = mode->vtotal - mode->vsync_end;
+
+	/*
+	 * byte clock counts were calculated by following formula
+	 * bclock_count = pclk_count * bpp / num_lane / 8
+	 */
+	dpi_timing->hsync_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_hsync, num_lane, bpp);
+	dpi_timing->hbp_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_hbp, num_lane, bpp);
+	dpi_timing->hfp_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_hfp, num_lane, bpp);
+	dpi_timing->hactive_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_hactive, num_lane, bpp);
+	dpi_timing->vsync_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_vsync, num_lane, bpp);
+	dpi_timing->vbp_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_vbp, num_lane, bpp);
+	dpi_timing->vfp_count = mdfld_dsi_dpi_to_byte_clock_count(
+						pclk_vfp, num_lane, bpp);
+
+	return 0;
+}
+
+void mdfld_dsi_dpi_controller_init(struct mdfld_dsi_config *dsi_config,
+								int pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+	int lane_count = dsi_config->lane_count;
+	struct mdfld_dsi_dpi_timing dpi_timing;
+	struct drm_display_mode *mode = dsi_config->mode;
+	u32 val;
+
+	/*un-ready device*/
+	REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 0, 0, 0);
+
+	/*init dsi adapter before kicking off*/
+	REG_WRITE(MIPI_CTRL_REG(pipe), 0x00000018);
+
+	/*enable all interrupts*/
+	REG_WRITE(MIPI_INTR_EN_REG(pipe), 0xffffffff);
+
+	/*set up func_prg*/
+	val = lane_count;
+	val |= dsi_config->channel_num << DSI_DPI_VIRT_CHANNEL_OFFSET;
+
+	switch (dsi_config->bpp) {
+	case 16:
+		val |= DSI_DPI_COLOR_FORMAT_RGB565;
+		break;
+	case 18:
+		val |= DSI_DPI_COLOR_FORMAT_RGB666;
+		break;
+	case 24:
+		val |= DSI_DPI_COLOR_FORMAT_RGB888;
+		break;
+	default:
+		DRM_ERROR("unsupported color format, bpp = %d\n",
+							dsi_config->bpp);
+	}
+	REG_WRITE(MIPI_DSI_FUNC_PRG_REG(pipe), val);
+
+	REG_WRITE(MIPI_HS_TX_TIMEOUT_REG(pipe),
+			(mode->vtotal * mode->htotal * dsi_config->bpp /
+				(8 * lane_count)) & DSI_HS_TX_TIMEOUT_MASK);
+	REG_WRITE(MIPI_LP_RX_TIMEOUT_REG(pipe),
+				0xffff & DSI_LP_RX_TIMEOUT_MASK);
+
+	/*max value: 20 clock cycles of txclkesc*/
+	REG_WRITE(MIPI_TURN_AROUND_TIMEOUT_REG(pipe),
+				0x14 & DSI_TURN_AROUND_TIMEOUT_MASK);
+
+	/*min 21 txclkesc, max: ffffh*/
+	REG_WRITE(MIPI_DEVICE_RESET_TIMER_REG(pipe),
+				0xffff & DSI_RESET_TIMER_MASK);
+
+	REG_WRITE(MIPI_DPI_RESOLUTION_REG(pipe),
+				mode->vdisplay << 16 | mode->hdisplay);
+
+	/*set DPI timing registers*/
+	mdfld_dsi_dpi_timing_calculation(mode, &dpi_timing,
+				dsi_config->lane_count, dsi_config->bpp);
+
+	REG_WRITE(MIPI_HSYNC_COUNT_REG(pipe),
+			dpi_timing.hsync_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HBP_COUNT_REG(pipe),
+			dpi_timing.hbp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HFP_COUNT_REG(pipe),
+			dpi_timing.hfp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HACTIVE_COUNT_REG(pipe),
+			dpi_timing.hactive_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VSYNC_COUNT_REG(pipe),
+			dpi_timing.vsync_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VBP_COUNT_REG(pipe),
+			dpi_timing.vbp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VFP_COUNT_REG(pipe),
+			dpi_timing.vfp_count & DSI_DPI_TIMING_MASK);
+
+	REG_WRITE(MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe), 0x46);
+
+	/*min: 7d0 max: 4e20*/
+	REG_WRITE(MIPI_INIT_COUNT_REG(pipe), 0x000007d0);
+
+	/*set up video mode*/
+	val = dsi_config->video_mode | DSI_DPI_COMPLETE_LAST_LINE;
+	REG_WRITE(MIPI_VIDEO_MODE_FORMAT_REG(pipe), val);
+
+	REG_WRITE(MIPI_EOT_DISABLE_REG(pipe), 0x00000000);
+
+	REG_WRITE(MIPI_LP_BYTECLK_REG(pipe), 0x00000004);
+
+	/*TODO: figure out how to setup these registers*/
+	if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x2A0c6008);
+	else
+		REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x150c3408);
+
+	REG_WRITE(MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe), (0xa << 16) | 0x14);
+
+	if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		tc35876x_set_bridge_reset_state(dev, 0);  /*Pull High Reset */
+
+	/*set device ready*/
+	REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 1, 0, 0);
+}
+
+void mdfld_dsi_dpi_turn_on(struct mdfld_dsi_dpi_output *output, int pipe)
+{
+	struct drm_device *dev = output->dev;
+
+	/* clear special packet sent bit */
+	if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & DSI_INTR_STATE_SPL_PKG_SENT)
+		REG_WRITE(MIPI_INTR_STAT_REG(pipe),
+					DSI_INTR_STATE_SPL_PKG_SENT);
+
+	/*send turn on package*/
+	REG_WRITE(MIPI_DPI_CONTROL_REG(pipe), DSI_DPI_CTRL_HS_TURN_ON);
+
+	/*wait for SPL_PKG_SENT interrupt*/
+	mdfld_wait_for_SPL_PKG_SENT(dev, pipe);
+
+	if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & DSI_INTR_STATE_SPL_PKG_SENT)
+		REG_WRITE(MIPI_INTR_STAT_REG(pipe),
+					DSI_INTR_STATE_SPL_PKG_SENT);
+
+	output->panel_on = 1;
+
+	/* FIXME the following is disabled to WA the X slow start issue
+	   for TMD panel
+	if (pipe == 2)
+		dev_priv->dpi_panel_on2 = true;
+	else if (pipe == 0)
+		dev_priv->dpi_panel_on = true; */
+}
+
+static void mdfld_dsi_dpi_shut_down(struct mdfld_dsi_dpi_output *output,
+								int pipe)
+{
+	struct drm_device *dev = output->dev;
+
+	/*if output is on, or mode setting didn't happen, ignore this*/
+	if ((!output->panel_on) || output->first_boot) {
+		output->first_boot = 0;
+		return;
+	}
+
+	/* Wait for dpi fifo to empty */
+	mdfld_wait_for_DPI_CTRL_FIFO(dev, pipe);
+
+	/* Clear the special packet interrupt bit if set */
+	if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & DSI_INTR_STATE_SPL_PKG_SENT)
+		REG_WRITE(MIPI_INTR_STAT_REG(pipe),
+					DSI_INTR_STATE_SPL_PKG_SENT);
+
+	if (REG_READ(MIPI_DPI_CONTROL_REG(pipe)) == DSI_DPI_CTRL_HS_SHUTDOWN)
+		goto shutdown_out;
+
+	REG_WRITE(MIPI_DPI_CONTROL_REG(pipe), DSI_DPI_CTRL_HS_SHUTDOWN);
+
+shutdown_out:
+	output->panel_on = 0;
+	output->first_boot = 0;
+
+	/* FIXME the following is disabled to WA the X slow start issue
+	   for TMD panel
+	if (pipe == 2)
+		dev_priv->dpi_panel_on2 = false;
+	else if (pipe == 0)
+		dev_priv->dpi_panel_on = false;	 */
+}
+
+static void mdfld_dsi_dpi_set_power(struct drm_encoder *encoder, bool on)
+{
+	struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder);
+	struct mdfld_dsi_dpi_output *dpi_output =
+				MDFLD_DSI_DPI_OUTPUT(dsi_encoder);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_encoder_get_config(dsi_encoder);
+	int pipe = mdfld_dsi_encoder_get_pipe(dsi_encoder);
+	struct drm_device *dev = dsi_config->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	u32 pipeconf_reg = PIPEACONF;
+
+	if (pipe)
+		pipeconf_reg = PIPECCONF;
+
+	/*start up display island if it was shutdown*/
+	if (!gma_power_begin(dev, true))
+		return;
+
+	if (on) {
+		if (mdfld_get_panel_type(dev, pipe) == TMD_VID)
+			mdfld_dsi_dpi_turn_on(dpi_output, pipe);
+		else if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+			mdfld_dsi_configure_up(dsi_encoder, pipe);
+		else {
+			/*enable mipi port*/
+			REG_WRITE(MIPI_PORT_CONTROL(pipe),
+				REG_READ(MIPI_PORT_CONTROL(pipe)) | BIT(31));
+			REG_READ(MIPI_PORT_CONTROL(pipe));
+
+			mdfld_dsi_dpi_turn_on(dpi_output, pipe);
+			mdfld_dsi_tpo_ic_init(dsi_config, pipe);
+		}
+		dev_priv->dpi_panel_on[pipe] = true;
+	} else {
+		if (mdfld_get_panel_type(dev, pipe) == TMD_VID)
+			mdfld_dsi_dpi_shut_down(dpi_output, pipe);
+		else if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+			mdfld_dsi_configure_down(dsi_encoder, pipe);
+		else {
+			mdfld_dsi_dpi_shut_down(dpi_output, pipe);
+
+			/*disable mipi port*/
+			REG_WRITE(MIPI_PORT_CONTROL(pipe),
+				REG_READ(MIPI_PORT_CONTROL(pipe)) & ~BIT(31));
+			REG_READ(MIPI_PORT_CONTROL(pipe));
+		}
+		dev_priv->dpi_panel_on[pipe] = false;
+	}
+	gma_power_end(dev);
+}
+
+void mdfld_dsi_dpi_dpms(struct drm_encoder *encoder, int mode)
+{
+	mdfld_dsi_dpi_set_power(encoder, mode == DRM_MODE_DPMS_ON);
+}
+
+bool mdfld_dsi_dpi_mode_fixup(struct drm_encoder *encoder,
+				     struct drm_display_mode *mode,
+				     struct drm_display_mode *adjusted_mode)
+{
+	struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_encoder_get_config(dsi_encoder);
+	struct drm_display_mode *fixed_mode = dsi_config->fixed_mode;
+
+	if (fixed_mode) {
+		adjusted_mode->hdisplay = fixed_mode->hdisplay;
+		adjusted_mode->hsync_start = fixed_mode->hsync_start;
+		adjusted_mode->hsync_end = fixed_mode->hsync_end;
+		adjusted_mode->htotal = fixed_mode->htotal;
+		adjusted_mode->vdisplay = fixed_mode->vdisplay;
+		adjusted_mode->vsync_start = fixed_mode->vsync_start;
+		adjusted_mode->vsync_end = fixed_mode->vsync_end;
+		adjusted_mode->vtotal = fixed_mode->vtotal;
+		adjusted_mode->clock = fixed_mode->clock;
+		drm_mode_set_crtcinfo(adjusted_mode, CRTC_INTERLACE_HALVE_V);
+	}
+	return true;
+}
+
+void mdfld_dsi_dpi_prepare(struct drm_encoder *encoder)
+{
+	mdfld_dsi_dpi_set_power(encoder, false);
+}
+
+void mdfld_dsi_dpi_commit(struct drm_encoder *encoder)
+{
+	mdfld_dsi_dpi_set_power(encoder, true);
+}
+
+/* For TC35876X */
+/* This functionality was implemented in FW in iCDK */
+/* But removed in DV0 and later. So need to add here. */
+static void mipi_set_properties(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+
+	REG_WRITE(MIPI_CTRL_REG(pipe), 0x00000018);
+	REG_WRITE(MIPI_INTR_EN_REG(pipe), 0xffffffff);
+	REG_WRITE(MIPI_HS_TX_TIMEOUT_REG(pipe), 0xffffff);
+	REG_WRITE(MIPI_LP_RX_TIMEOUT_REG(pipe), 0xffffff);
+	REG_WRITE(MIPI_TURN_AROUND_TIMEOUT_REG(pipe), 0x14);
+	REG_WRITE(MIPI_DEVICE_RESET_TIMER_REG(pipe), 0xff);
+	REG_WRITE(MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe), 0x25);
+	REG_WRITE(MIPI_INIT_COUNT_REG(pipe), 0xf0);
+	REG_WRITE(MIPI_EOT_DISABLE_REG(pipe), 0x00000000);
+	REG_WRITE(MIPI_LP_BYTECLK_REG(pipe), 0x00000004);
+	REG_WRITE(MIPI_DBI_BW_CTRL_REG(pipe), 0x00000820);
+	REG_WRITE(MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe), (0xa << 16) | 0x14);
+}
+
+static void mdfld_mipi_set_video_timing(struct mdfld_dsi_config *dsi_config,
+					int pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+	struct mdfld_dsi_dpi_timing dpi_timing;
+	struct drm_display_mode *mode = dsi_config->mode;
+
+	mdfld_dsi_dpi_timing_calculation(mode, &dpi_timing,
+					dsi_config->lane_count,
+					dsi_config->bpp);
+
+	REG_WRITE(MIPI_DPI_RESOLUTION_REG(pipe),
+		mode->vdisplay << 16 | mode->hdisplay);
+	REG_WRITE(MIPI_HSYNC_COUNT_REG(pipe),
+		dpi_timing.hsync_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HBP_COUNT_REG(pipe),
+		dpi_timing.hbp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HFP_COUNT_REG(pipe),
+		dpi_timing.hfp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_HACTIVE_COUNT_REG(pipe),
+		dpi_timing.hactive_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VSYNC_COUNT_REG(pipe),
+		dpi_timing.vsync_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VBP_COUNT_REG(pipe),
+		dpi_timing.vbp_count & DSI_DPI_TIMING_MASK);
+	REG_WRITE(MIPI_VFP_COUNT_REG(pipe),
+		dpi_timing.vfp_count & DSI_DPI_TIMING_MASK);
+}
+
+static void mdfld_mipi_config(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+	int lane_count = dsi_config->lane_count;
+
+	if (pipe) {
+		REG_WRITE(MIPI_PORT_CONTROL(0), 0x00000002);
+		REG_WRITE(MIPI_PORT_CONTROL(2), 0x80000000);
+	} else {
+		REG_WRITE(MIPI_PORT_CONTROL(0), 0x80010000);
+		REG_WRITE(MIPI_PORT_CONTROL(2), 0x00);
+	}
+
+	REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x150A600F);
+	REG_WRITE(MIPI_VIDEO_MODE_FORMAT_REG(pipe), 0x0000000F);
+
+	/* lane_count = 3 */
+	REG_WRITE(MIPI_DSI_FUNC_PRG_REG(pipe), 0x00000200 | lane_count);
+
+	mdfld_mipi_set_video_timing(dsi_config, pipe);
+}
+
+static void mdfld_set_pipe_timing(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	struct drm_device *dev = dsi_config->dev;
+	struct drm_display_mode *mode = dsi_config->mode;
+
+	REG_WRITE(HTOTAL_A, ((mode->htotal - 1) << 16) | (mode->hdisplay - 1));
+	REG_WRITE(HBLANK_A, ((mode->htotal - 1) << 16) | (mode->hdisplay - 1));
+	REG_WRITE(HSYNC_A,
+		((mode->hsync_end - 1) << 16) | (mode->hsync_start - 1));
+
+	REG_WRITE(VTOTAL_A, ((mode->vtotal - 1) << 16) | (mode->vdisplay - 1));
+	REG_WRITE(VBLANK_A, ((mode->vtotal - 1) << 16) | (mode->vdisplay - 1));
+	REG_WRITE(VSYNC_A,
+		((mode->vsync_end - 1) << 16) | (mode->vsync_start - 1));
+
+	REG_WRITE(PIPEASRC,
+		((mode->hdisplay - 1) << 16) | (mode->vdisplay - 1));
+}
+/* End for TC35876X */
+
+void mdfld_dsi_dpi_mode_set(struct drm_encoder *encoder,
+				   struct drm_display_mode *mode,
+				   struct drm_display_mode *adjusted_mode)
+{
+	struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder);
+	struct mdfld_dsi_dpi_output *dpi_output =
+					MDFLD_DSI_DPI_OUTPUT(dsi_encoder);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_encoder_get_config(dsi_encoder);
+	struct drm_device *dev = dsi_config->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	int pipe = mdfld_dsi_encoder_get_pipe(dsi_encoder);
+
+	u32 pipeconf_reg = PIPEACONF;
+	u32 dspcntr_reg = DSPACNTR;
+
+	u32 pipeconf = dev_priv->pipeconf[pipe];
+	u32 dspcntr = dev_priv->dspcntr[pipe];
+	u32 mipi = MIPI_PORT_EN | PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX;
+
+	if (pipe) {
+		pipeconf_reg = PIPECCONF;
+		dspcntr_reg = DSPCCNTR;
+	} else {
+		if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+			mipi &= (~0x03); /* Use all four lanes */
+		else
+			mipi |= 2;
+	}
+
+	/*start up display island if it was shutdown*/
+	if (!gma_power_begin(dev, true))
+		return;
+
+	if (mdfld_get_panel_type(dev, pipe) == TC35876X) {
+		/*
+		 * The following logic is required to reset the bridge and
+		 * configure. This also starts the DSI clock at 200MHz.
+		 */
+		tc35876x_set_bridge_reset_state(dev, 0);  /*Pull High Reset */
+		tc35876x_toshiba_bridge_panel_on(dev);
+		udelay(100);
+		/* Now start the DSI clock */
+		REG_WRITE(MRST_DPLL_A, 0x00);
+		REG_WRITE(MRST_FPA0, 0xC1);
+		REG_WRITE(MRST_DPLL_A, 0x00800000);
+		udelay(500);
+		REG_WRITE(MRST_DPLL_A, 0x80800000);
+
+		if (REG_BIT_WAIT(pipeconf_reg, 1, 29))
+			dev_err(&dev->pdev->dev, "%s: DSI PLL lock timeout\n",
+				__func__);
+
+		REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x2A0c6008);
+
+		mipi_set_properties(dsi_config, pipe);
+		mdfld_mipi_config(dsi_config, pipe);
+		mdfld_set_pipe_timing(dsi_config, pipe);
+
+		REG_WRITE(DSPABASE, 0x00);
+		REG_WRITE(DSPASTRIDE, (mode->hdisplay * 4));
+		REG_WRITE(DSPASIZE,
+			((mode->vdisplay - 1) << 16) | (mode->hdisplay - 1));
+
+		REG_WRITE(DSPACNTR, 0x98000000);
+		REG_WRITE(DSPASURF, 0x00);
+
+		REG_WRITE(VGACNTRL, 0x80000000);
+		REG_WRITE(DEVICE_READY_REG, 0x00000001);
+
+		REG_WRITE(MIPI_PORT_CONTROL(pipe), 0x80810000);
+	} else {
+		/*set up mipi port FIXME: do at init time */
+		REG_WRITE(MIPI_PORT_CONTROL(pipe), mipi);
+	}
+	REG_READ(MIPI_PORT_CONTROL(pipe));
+
+	if (mdfld_get_panel_type(dev, pipe) == TMD_VID) {
+		/* NOP */
+	} else if (mdfld_get_panel_type(dev, pipe) == TC35876X) {
+		/* set up DSI controller DPI interface */
+		mdfld_dsi_dpi_controller_init(dsi_config, pipe);
+
+		/* Configure MIPI Bridge and Panel */
+		tc35876x_configure_lvds_bridge(dev);
+		dev_priv->dpi_panel_on[pipe] = true;
+	} else {
+		/*turn on DPI interface*/
+		mdfld_dsi_dpi_turn_on(dpi_output, pipe);
+	}
+
+	/*set up pipe*/
+	REG_WRITE(pipeconf_reg, pipeconf);
+	REG_READ(pipeconf_reg);
+
+	/*set up display plane*/
+	REG_WRITE(dspcntr_reg, dspcntr);
+	REG_READ(dspcntr_reg);
+
+	msleep(20); /* FIXME: this should wait for vblank */
+
+	if (mdfld_get_panel_type(dev, pipe) == TMD_VID) {
+		/* NOP */
+	} else if (mdfld_get_panel_type(dev, pipe) == TC35876X) {
+		mdfld_dsi_dpi_turn_on(dpi_output, pipe);
+	} else {
+		/* init driver ic */
+		mdfld_dsi_tpo_ic_init(dsi_config, pipe);
+		/*init backlight*/
+		mdfld_dsi_brightness_init(dsi_config, pipe);
+	}
+
+	gma_power_end(dev);
+}
+
+/*
+ * Init DSI DPI encoder.
+ * Allocate an mdfld_dsi_encoder and attach it to given @dsi_connector
+ * return pointer of newly allocated DPI encoder, NULL on error
+ */
+struct mdfld_dsi_encoder *mdfld_dsi_dpi_init(struct drm_device *dev,
+				struct mdfld_dsi_connector *dsi_connector,
+				const struct panel_funcs *p_funcs)
+{
+	struct mdfld_dsi_dpi_output *dpi_output = NULL;
+	struct mdfld_dsi_config *dsi_config;
+	struct drm_connector *connector = NULL;
+	struct drm_encoder *encoder = NULL;
+	struct drm_display_mode *fixed_mode = NULL;
+	int pipe;
+	u32 data;
+	int ret;
+
+	pipe = dsi_connector->pipe;
+
+	if (mdfld_get_panel_type(dev, pipe) != TC35876X) {
+		dsi_config = mdfld_dsi_get_config(dsi_connector);
+
+		/* panel hard-reset */
+		if (p_funcs->reset) {
+			ret = p_funcs->reset(pipe);
+			if (ret) {
+				DRM_ERROR("Panel %d hard-reset failed\n", pipe);
+				return NULL;
+			}
+		}
+
+		/* panel drvIC init */
+		if (p_funcs->drv_ic_init)
+			p_funcs->drv_ic_init(dsi_config, pipe);
+
+		/* panel power mode detect */
+		ret = mdfld_dsi_get_power_mode(dsi_config, &data, false);
+		if (ret) {
+			DRM_ERROR("Panel %d get power mode failed\n", pipe);
+			dsi_connector->status = connector_status_disconnected;
+		} else {
+			DRM_INFO("pipe %d power mode 0x%x\n", pipe, data);
+			dsi_connector->status = connector_status_connected;
+		}
+	}
+
+	dpi_output = kzalloc(sizeof(struct mdfld_dsi_dpi_output), GFP_KERNEL);
+	if (!dpi_output) {
+		DRM_ERROR("No memory\n");
+		return NULL;
+	}
+
+	if (dsi_connector->pipe)
+		dpi_output->panel_on = 0;
+	else
+		dpi_output->panel_on = 0;
+
+	dpi_output->dev = dev;
+	if (mdfld_get_panel_type(dev, pipe) != TC35876X)
+		dpi_output->p_funcs = p_funcs;
+	dpi_output->first_boot = 1;
+
+	/*get fixed mode*/
+	dsi_config = mdfld_dsi_get_config(dsi_connector);
+	fixed_mode = dsi_config->fixed_mode;
+
+	/*create drm encoder object*/
+	connector = &dsi_connector->base.base;
+	encoder = &dpi_output->base.base.base;
+	drm_encoder_init(dev,
+			encoder,
+			p_funcs->encoder_funcs,
+			DRM_MODE_ENCODER_LVDS);
+	drm_encoder_helper_add(encoder,
+				p_funcs->encoder_helper_funcs);
+
+	/*attach to given connector*/
+	drm_mode_connector_attach_encoder(connector, encoder);
+
+	/*set possible crtcs and clones*/
+	if (dsi_connector->pipe) {
+		encoder->possible_crtcs = (1 << 2);
+		encoder->possible_clones = (1 << 1);
+	} else {
+		encoder->possible_crtcs = (1 << 0);
+		encoder->possible_clones = (1 << 0);
+	}
+
+	dsi_connector->base.encoder = &dpi_output->base.base;
+
+	return &dpi_output->base;
+}
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h
new file mode 100644
index 000000000000..6f762478b959
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * jim liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#ifndef __MDFLD_DSI_DPI_H__
+#define __MDFLD_DSI_DPI_H__
+
+#include "mdfld_dsi_output.h"
+#include "mdfld_output.h"
+
+struct mdfld_dsi_dpi_timing {
+	u16 hsync_count;
+	u16 hbp_count;
+	u16 hfp_count;
+	u16 hactive_count;
+	u16 vsync_count;
+	u16 vbp_count;
+	u16 vfp_count;
+};
+
+struct mdfld_dsi_dpi_output {
+	struct mdfld_dsi_encoder base;
+	struct drm_device *dev;
+
+	int panel_on;
+	int first_boot;
+
+	const struct panel_funcs *p_funcs;
+};
+
+#define MDFLD_DSI_DPI_OUTPUT(dsi_encoder)\
+	container_of(dsi_encoder, struct mdfld_dsi_dpi_output, base)
+
+/* Export functions */
+extern int mdfld_dsi_dpi_timing_calculation(struct drm_display_mode *mode,
+				struct mdfld_dsi_dpi_timing *dpi_timing,
+				int num_lane, int bpp);
+extern struct mdfld_dsi_encoder *mdfld_dsi_dpi_init(struct drm_device *dev,
+				struct mdfld_dsi_connector *dsi_connector,
+				const struct panel_funcs *p_funcs);
+
+/* MDFLD DPI helper functions */
+extern void mdfld_dsi_dpi_dpms(struct drm_encoder *encoder, int mode);
+extern bool mdfld_dsi_dpi_mode_fixup(struct drm_encoder *encoder,
+				struct drm_display_mode *mode,
+				struct drm_display_mode *adjusted_mode);
+extern void mdfld_dsi_dpi_prepare(struct drm_encoder *encoder);
+extern void mdfld_dsi_dpi_commit(struct drm_encoder *encoder);
+extern void mdfld_dsi_dpi_mode_set(struct drm_encoder *encoder,
+				struct drm_display_mode *mode,
+				struct drm_display_mode *adjusted_mode);
+extern void mdfld_dsi_dpi_turn_on(struct mdfld_dsi_dpi_output *output,
+				int pipe);
+extern void mdfld_dsi_dpi_controller_init(struct mdfld_dsi_config *dsi_config,
+				int pipe);
+#endif /*__MDFLD_DSI_DPI_H__*/
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_output.c b/drivers/gpu/drm/gma500/mdfld_dsi_output.c
new file mode 100644
index 000000000000..9338c28f3999
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_output.c
@@ -0,0 +1,635 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * jim liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#include <linux/module.h>
+
+#include "mdfld_dsi_output.h"
+#include "mdfld_dsi_dpi.h"
+#include "mdfld_output.h"
+#include "mdfld_dsi_pkg_sender.h"
+#include "tc35876x-dsi-lvds.h"
+#include <linux/pm_runtime.h>
+#include <asm/intel_scu_ipc.h>
+
+/* get the LABC from command line. */
+static int LABC_control = 1;
+
+#ifdef MODULE
+module_param(LABC_control, int, 0644);
+#else
+
+static int __init parse_LABC_control(char *arg)
+{
+	/* LABC control can be passed in as a cmdline parameter */
+	/* to enable this feature add LABC=1 to cmdline */
+	/* to disable this feature add LABC=0 to cmdline */
+	if (!arg)
+		return -EINVAL;
+
+	if (!strcasecmp(arg, "0"))
+		LABC_control = 0;
+	else if (!strcasecmp(arg, "1"))
+		LABC_control = 1;
+
+	return 0;
+}
+early_param("LABC", parse_LABC_control);
+#endif
+
+/**
+ * Check and see if the generic control or data buffer is empty and ready.
+ */
+void mdfld_dsi_gen_fifo_ready(struct drm_device *dev, u32 gen_fifo_stat_reg,
+							u32 fifo_stat)
+{
+	u32 GEN_BF_time_out_count;
+
+	/* Check MIPI Adatper command registers */
+	for (GEN_BF_time_out_count = 0;
+			GEN_BF_time_out_count < GEN_FB_TIME_OUT;
+			GEN_BF_time_out_count++) {
+		if ((REG_READ(gen_fifo_stat_reg) & fifo_stat) == fifo_stat)
+			break;
+		udelay(100);
+	}
+
+	if (GEN_BF_time_out_count == GEN_FB_TIME_OUT)
+		DRM_ERROR("mdfld_dsi_gen_fifo_ready, Timeout. gen_fifo_stat_reg = 0x%x.\n",
+					gen_fifo_stat_reg);
+}
+
+/**
+ * Manage the DSI MIPI keyboard and display brightness.
+ * FIXME: this is exported to OSPM code. should work out an specific
+ * display interface to OSPM.
+ */
+
+void mdfld_dsi_brightness_init(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	struct mdfld_dsi_pkg_sender *sender =
+				mdfld_dsi_get_pkg_sender(dsi_config);
+	struct drm_device *dev = sender->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	u32 gen_ctrl_val;
+
+	if (!sender) {
+		DRM_ERROR("No sender found\n");
+		return;
+	}
+
+	/* Set default display backlight value to 85% (0xd8)*/
+	mdfld_dsi_send_mcs_short(sender, write_display_brightness, 0xd8, 1,
+				true);
+
+	/* Set minimum brightness setting of CABC function to 20% (0x33)*/
+	mdfld_dsi_send_mcs_short(sender, write_cabc_min_bright, 0x33, 1, true);
+
+	/* Enable backlight or/and LABC */
+	gen_ctrl_val = BRIGHT_CNTL_BLOCK_ON | DISPLAY_DIMMING_ON |
+								BACKLIGHT_ON;
+	if (LABC_control == 1)
+		gen_ctrl_val |= DISPLAY_DIMMING_ON | DISPLAY_BRIGHTNESS_AUTO
+								| GAMMA_AUTO;
+
+	if (LABC_control == 1)
+		gen_ctrl_val |= AMBIENT_LIGHT_SENSE_ON;
+
+	dev_priv->mipi_ctrl_display = gen_ctrl_val;
+
+	mdfld_dsi_send_mcs_short(sender, write_ctrl_display, (u8)gen_ctrl_val,
+				1, true);
+
+	mdfld_dsi_send_mcs_short(sender, write_ctrl_cabc, UI_IMAGE, 1, true);
+}
+
+void mdfld_dsi_brightness_control(struct drm_device *dev, int pipe, int level)
+{
+	struct mdfld_dsi_pkg_sender *sender;
+	struct drm_psb_private *dev_priv;
+	struct mdfld_dsi_config *dsi_config;
+	u32 gen_ctrl_val = 0;
+	int p_type = TMD_VID;
+
+	if (!dev || (pipe != 0 && pipe != 2)) {
+		DRM_ERROR("Invalid parameter\n");
+		return;
+	}
+
+	p_type = mdfld_get_panel_type(dev, 0);
+
+	dev_priv = dev->dev_private;
+
+	if (pipe)
+		dsi_config = dev_priv->dsi_configs[1];
+	else
+		dsi_config = dev_priv->dsi_configs[0];
+
+	sender = mdfld_dsi_get_pkg_sender(dsi_config);
+
+	if (!sender) {
+		DRM_ERROR("No sender found\n");
+		return;
+	}
+
+	gen_ctrl_val = (level * 0xff / MDFLD_DSI_BRIGHTNESS_MAX_LEVEL) & 0xff;
+
+	dev_dbg(sender->dev->dev, "pipe = %d, gen_ctrl_val = %d.\n",
+							pipe, gen_ctrl_val);
+
+	if (p_type == TMD_VID) {
+		/* Set display backlight value */
+		mdfld_dsi_send_mcs_short(sender, tmd_write_display_brightness,
+					(u8)gen_ctrl_val, 1, true);
+	} else {
+		/* Set display backlight value */
+		mdfld_dsi_send_mcs_short(sender, write_display_brightness,
+					(u8)gen_ctrl_val, 1, true);
+
+		/* Enable backlight control */
+		if (level == 0)
+			gen_ctrl_val = 0;
+		else
+			gen_ctrl_val = dev_priv->mipi_ctrl_display;
+
+		mdfld_dsi_send_mcs_short(sender, write_ctrl_display,
+					(u8)gen_ctrl_val, 1, true);
+	}
+}
+
+static int mdfld_dsi_get_panel_status(struct mdfld_dsi_config *dsi_config,
+				u8 dcs, u32 *data, bool hs)
+{
+	struct mdfld_dsi_pkg_sender *sender
+		= mdfld_dsi_get_pkg_sender(dsi_config);
+
+	if (!sender || !data) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	return mdfld_dsi_read_mcs(sender, dcs, data, 1, hs);
+}
+
+int mdfld_dsi_get_power_mode(struct mdfld_dsi_config *dsi_config, u32 *mode,
+			bool hs)
+{
+	if (!dsi_config || !mode) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	return mdfld_dsi_get_panel_status(dsi_config, 0x0a, mode, hs);
+}
+
+int mdfld_dsi_get_diagnostic_result(struct mdfld_dsi_config *dsi_config,
+				u32 *result, bool hs)
+{
+	if (!dsi_config || !result) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	return mdfld_dsi_get_panel_status(dsi_config, 0x0f, result, hs);
+}
+
+/*
+ * NOTE: this function was used by OSPM.
+ * TODO: will be removed later, should work out display interfaces for OSPM
+ */
+void mdfld_dsi_controller_init(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	if (!dsi_config || ((pipe != 0) && (pipe != 2))) {
+		DRM_ERROR("Invalid parameters\n");
+		return;
+	}
+
+	mdfld_dsi_dpi_controller_init(dsi_config, pipe);
+}
+
+static void mdfld_dsi_connector_save(struct drm_connector *connector)
+{
+}
+
+static void mdfld_dsi_connector_restore(struct drm_connector *connector)
+{
+}
+
+/* FIXME: start using the force parameter */
+static enum drm_connector_status
+mdfld_dsi_connector_detect(struct drm_connector *connector, bool force)
+{
+	struct mdfld_dsi_connector *dsi_connector
+		= mdfld_dsi_connector(connector);
+
+	dsi_connector->status = connector_status_connected;
+
+	return dsi_connector->status;
+}
+
+static int mdfld_dsi_connector_set_property(struct drm_connector *connector,
+				struct drm_property *property,
+				uint64_t value)
+{
+	struct drm_encoder *encoder = connector->encoder;
+	struct backlight_device *psb_bd;
+
+	if (!strcmp(property->name, "scaling mode") && encoder) {
+		struct psb_intel_crtc *psb_crtc =
+					to_psb_intel_crtc(encoder->crtc);
+		bool centerechange;
+		uint64_t val;
+
+		if (!psb_crtc)
+			goto set_prop_error;
+
+		switch (value) {
+		case DRM_MODE_SCALE_FULLSCREEN:
+			break;
+		case DRM_MODE_SCALE_NO_SCALE:
+			break;
+		case DRM_MODE_SCALE_ASPECT:
+			break;
+		default:
+			goto set_prop_error;
+		}
+
+		if (drm_connector_property_get_value(connector, property, &val))
+			goto set_prop_error;
+
+		if (val == value)
+			goto set_prop_done;
+
+		if (drm_connector_property_set_value(connector,
+							property, value))
+			goto set_prop_error;
+
+		centerechange = (val == DRM_MODE_SCALE_NO_SCALE) ||
+			(value == DRM_MODE_SCALE_NO_SCALE);
+
+		if (psb_crtc->saved_mode.hdisplay != 0 &&
+		    psb_crtc->saved_mode.vdisplay != 0) {
+			if (centerechange) {
+				if (!drm_crtc_helper_set_mode(encoder->crtc,
+						&psb_crtc->saved_mode,
+						encoder->crtc->x,
+						encoder->crtc->y,
+						encoder->crtc->fb))
+					goto set_prop_error;
+			} else {
+				struct drm_encoder_helper_funcs *funcs =
+						encoder->helper_private;
+				funcs->mode_set(encoder,
+					&psb_crtc->saved_mode,
+					&psb_crtc->saved_adjusted_mode);
+			}
+		}
+	} else if (!strcmp(property->name, "backlight") && encoder) {
+		if (drm_connector_property_set_value(connector, property,
+									value))
+			goto set_prop_error;
+		else {
+			psb_bd = mdfld_get_backlight_device();
+			if (psb_bd) {
+				psb_bd->props.brightness = value;
+				mdfld_set_brightness(psb_bd);
+			}
+		}
+	}
+set_prop_done:
+	return 0;
+set_prop_error:
+	return -1;
+}
+
+static void mdfld_dsi_connector_destroy(struct drm_connector *connector)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+					mdfld_dsi_connector(connector);
+	struct mdfld_dsi_pkg_sender *sender;
+
+	if (!dsi_connector)
+		return;
+	drm_sysfs_connector_remove(connector);
+	drm_connector_cleanup(connector);
+	sender = dsi_connector->pkg_sender;
+	mdfld_dsi_pkg_sender_destroy(sender);
+	kfree(dsi_connector);
+}
+
+static int mdfld_dsi_connector_get_modes(struct drm_connector *connector)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+				mdfld_dsi_connector(connector);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_get_config(dsi_connector);
+	struct drm_display_mode *fixed_mode = dsi_config->fixed_mode;
+	struct drm_display_mode *dup_mode = NULL;
+	struct drm_device *dev = connector->dev;
+
+	connector->display_info.min_vfreq = 0;
+	connector->display_info.max_vfreq = 200;
+	connector->display_info.min_hfreq = 0;
+	connector->display_info.max_hfreq = 200;
+
+	if (fixed_mode) {
+		dev_dbg(dev->dev, "fixed_mode %dx%d\n",
+				fixed_mode->hdisplay, fixed_mode->vdisplay);
+		dup_mode = drm_mode_duplicate(dev, fixed_mode);
+		drm_mode_probed_add(connector, dup_mode);
+		return 1;
+	}
+	DRM_ERROR("Didn't get any modes!\n");
+	return 0;
+}
+
+static int mdfld_dsi_connector_mode_valid(struct drm_connector *connector,
+						struct drm_display_mode *mode)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+					mdfld_dsi_connector(connector);
+	struct mdfld_dsi_config *dsi_config =
+					mdfld_dsi_get_config(dsi_connector);
+	struct drm_display_mode *fixed_mode = dsi_config->fixed_mode;
+
+	if (mode->flags & DRM_MODE_FLAG_DBLSCAN)
+		return MODE_NO_DBLESCAN;
+
+	if (mode->flags & DRM_MODE_FLAG_INTERLACE)
+		return MODE_NO_INTERLACE;
+
+	/**
+	 * FIXME: current DC has no fitting unit, reject any mode setting
+	 * request
+	 * Will figure out a way to do up-scaling(pannel fitting) later.
+	 **/
+	if (fixed_mode) {
+		if (mode->hdisplay != fixed_mode->hdisplay)
+			return MODE_PANEL;
+
+		if (mode->vdisplay != fixed_mode->vdisplay)
+			return MODE_PANEL;
+	}
+
+	return MODE_OK;
+}
+
+static void mdfld_dsi_connector_dpms(struct drm_connector *connector, int mode)
+{
+	if (mode == connector->dpms)
+		return;
+
+	/*first, execute dpms*/
+
+	drm_helper_connector_dpms(connector, mode);
+}
+
+static struct drm_encoder *mdfld_dsi_connector_best_encoder(
+				struct drm_connector *connector)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+				mdfld_dsi_connector(connector);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_get_config(dsi_connector);
+	return &dsi_config->encoder->base.base;
+}
+
+/*DSI connector funcs*/
+static const struct drm_connector_funcs mdfld_dsi_connector_funcs = {
+	.dpms = /*drm_helper_connector_dpms*/mdfld_dsi_connector_dpms,
+	.save = mdfld_dsi_connector_save,
+	.restore = mdfld_dsi_connector_restore,
+	.detect = mdfld_dsi_connector_detect,
+	.fill_modes = drm_helper_probe_single_connector_modes,
+	.set_property = mdfld_dsi_connector_set_property,
+	.destroy = mdfld_dsi_connector_destroy,
+};
+
+/*DSI connector helper funcs*/
+static const struct drm_connector_helper_funcs
+	mdfld_dsi_connector_helper_funcs = {
+	.get_modes = mdfld_dsi_connector_get_modes,
+	.mode_valid = mdfld_dsi_connector_mode_valid,
+	.best_encoder = mdfld_dsi_connector_best_encoder,
+};
+
+static int mdfld_dsi_get_default_config(struct drm_device *dev,
+				struct mdfld_dsi_config *config, int pipe)
+{
+	if (!dev || !config) {
+		DRM_ERROR("Invalid parameters");
+		return -EINVAL;
+	}
+
+	config->bpp = 24;
+	if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		config->lane_count = 4;
+	else
+		config->lane_count = 2;
+	config->channel_num = 0;
+
+	if (mdfld_get_panel_type(dev, pipe) == TMD_VID)
+		config->video_mode = MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_PULSE;
+	else if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		config->video_mode =
+				MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_EVENTS;
+	else
+		config->video_mode = MDFLD_DSI_VIDEO_BURST_MODE;
+
+	return 0;
+}
+
+int mdfld_dsi_panel_reset(int pipe)
+{
+	unsigned gpio;
+	int ret = 0;
+
+	switch (pipe) {
+	case 0:
+		gpio = 128;
+		break;
+	case 2:
+		gpio = 34;
+		break;
+	default:
+		DRM_ERROR("Invalid output\n");
+		return -EINVAL;
+	}
+
+	ret = gpio_request(gpio, "gfx");
+	if (ret) {
+		DRM_ERROR("gpio_rqueset failed\n");
+		return ret;
+	}
+
+	ret = gpio_direction_output(gpio, 1);
+	if (ret) {
+		DRM_ERROR("gpio_direction_output failed\n");
+		goto gpio_error;
+	}
+
+	gpio_get_value(128);
+
+gpio_error:
+	if (gpio_is_valid(gpio))
+		gpio_free(gpio);
+
+	return ret;
+}
+
+/*
+ * MIPI output init
+ * @dev drm device
+ * @pipe pipe number. 0 or 2
+ * @config
+ *
+ * Do the initialization of a MIPI output, including create DRM mode objects
+ * initialization of DSI output on @pipe
+ */
+void mdfld_dsi_output_init(struct drm_device *dev,
+			   int pipe,
+			   struct mdfld_dsi_config *config,
+			   const struct panel_funcs *p_vid_funcs)
+{
+	struct mdfld_dsi_config *dsi_config;
+	struct mdfld_dsi_connector *dsi_connector;
+	struct drm_connector *connector;
+	struct mdfld_dsi_encoder *encoder;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct panel_info dsi_panel_info;
+	u32 width_mm, height_mm;
+
+	dev_dbg(dev->dev, "init DSI output on pipe %d\n", pipe);
+
+	if (!dev || ((pipe != 0) && (pipe != 2))) {
+		DRM_ERROR("Invalid parameter\n");
+		return;
+	}
+
+	/*create a new connetor*/
+	dsi_connector = kzalloc(sizeof(struct mdfld_dsi_connector), GFP_KERNEL);
+	if (!dsi_connector) {
+		DRM_ERROR("No memory");
+		return;
+	}
+
+	dsi_connector->pipe =  pipe;
+
+	/*set DSI config*/
+	if (config)
+		dsi_config = config;
+	else {
+		dsi_config = kzalloc(sizeof(struct mdfld_dsi_config),
+								GFP_KERNEL);
+		if (!dsi_config) {
+			DRM_ERROR("cannot allocate memory for DSI config\n");
+			goto dsi_init_err0;
+		}
+		mdfld_dsi_get_default_config(dev, dsi_config, pipe);
+	}
+
+	dsi_connector->private = dsi_config;
+
+	dsi_config->changed = 1;
+	dsi_config->dev = dev;
+
+	dsi_config->fixed_mode = p_vid_funcs->get_config_mode(dev);
+	if (p_vid_funcs->get_panel_info(dev, pipe, &dsi_panel_info))
+			goto dsi_init_err0;
+
+	width_mm = dsi_panel_info.width_mm;
+	height_mm = dsi_panel_info.height_mm;
+
+	dsi_config->mode = dsi_config->fixed_mode;
+	dsi_config->connector = dsi_connector;
+
+	if (!dsi_config->fixed_mode) {
+		DRM_ERROR("No pannel fixed mode was found\n");
+		goto dsi_init_err0;
+	}
+
+	if (pipe && dev_priv->dsi_configs[0]) {
+		dsi_config->dvr_ic_inited = 0;
+		dev_priv->dsi_configs[1] = dsi_config;
+	} else if (pipe == 0) {
+		dsi_config->dvr_ic_inited = 1;
+		dev_priv->dsi_configs[0] = dsi_config;
+	} else {
+		DRM_ERROR("Trying to init MIPI1 before MIPI0\n");
+		goto dsi_init_err0;
+	}
+
+
+	connector = &dsi_connector->base.base;
+	drm_connector_init(dev, connector, &mdfld_dsi_connector_funcs,
+						DRM_MODE_CONNECTOR_LVDS);
+	drm_connector_helper_add(connector, &mdfld_dsi_connector_helper_funcs);
+
+	connector->display_info.subpixel_order = SubPixelHorizontalRGB;
+	connector->display_info.width_mm = width_mm;
+	connector->display_info.height_mm = height_mm;
+	connector->interlace_allowed = false;
+	connector->doublescan_allowed = false;
+
+	/*attach properties*/
+	drm_connector_attach_property(connector,
+				dev->mode_config.scaling_mode_property,
+				DRM_MODE_SCALE_FULLSCREEN);
+	drm_connector_attach_property(connector,
+				dev_priv->backlight_property,
+				MDFLD_DSI_BRIGHTNESS_MAX_LEVEL);
+
+	/*init DSI package sender on this output*/
+	if (mdfld_dsi_pkg_sender_init(dsi_connector, pipe)) {
+		DRM_ERROR("Package Sender initialization failed on pipe %d\n",
+									pipe);
+		goto dsi_init_err0;
+	}
+
+	encoder = mdfld_dsi_dpi_init(dev, dsi_connector, p_vid_funcs);
+	if (!encoder) {
+		DRM_ERROR("Create DPI encoder failed\n");
+		goto dsi_init_err1;
+	}
+	encoder->private = dsi_config;
+	dsi_config->encoder = encoder;
+	encoder->base.type = (pipe == 0) ? INTEL_OUTPUT_MIPI :
+		INTEL_OUTPUT_MIPI2;
+	drm_sysfs_connector_add(connector);
+	return;
+
+	/*TODO: add code to destroy outputs on error*/
+dsi_init_err1:
+	/*destroy sender*/
+	mdfld_dsi_pkg_sender_destroy(dsi_connector->pkg_sender);
+
+	drm_connector_cleanup(connector);
+
+	kfree(dsi_config->fixed_mode);
+	kfree(dsi_config);
+dsi_init_err0:
+	kfree(dsi_connector);
+}
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_output.h b/drivers/gpu/drm/gma500/mdfld_dsi_output.h
new file mode 100644
index 000000000000..2cdf666536df
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_output.h
@@ -0,0 +1,389 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * jim liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#ifndef __MDFLD_DSI_OUTPUT_H__
+#define __MDFLD_DSI_OUTPUT_H__
+
+#include <linux/backlight.h>
+#include <linux/version.h>
+#include <drm/drmP.h>
+#include <drm/drm.h>
+#include <drm/drm_crtc.h>
+#include <drm/drm_edid.h>
+
+#include "psb_drv.h"
+#include "psb_intel_drv.h"
+#include "psb_intel_reg.h"
+#include "mdfld_output.h"
+
+#include <asm/mrst.h>
+
+#define FLD_MASK(start, end)	(((1 << ((start) - (end) + 1)) - 1) << (end))
+#define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end))
+#define FLD_GET(val, start, end) (((val) & FLD_MASK(start, end)) >> (end))
+#define FLD_MOD(orig, val, start, end) \
+	(((orig) & ~FLD_MASK(start, end)) | FLD_VAL(val, start, end))
+
+#define REG_FLD_MOD(reg, val, start, end) \
+	REG_WRITE(reg, FLD_MOD(REG_READ(reg), val, start, end))
+
+static inline int REGISTER_FLD_WAIT(struct drm_device *dev, u32 reg,
+		u32 val, int start, int end)
+{
+	int t = 100000;
+
+	while (FLD_GET(REG_READ(reg), start, end) != val) {
+		if (--t == 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+#define REG_FLD_WAIT(reg, val, start, end) \
+	REGISTER_FLD_WAIT(dev, reg, val, start, end)
+
+#define REG_BIT_WAIT(reg, val, bitnum) \
+	REGISTER_FLD_WAIT(dev, reg, val, bitnum, bitnum)
+
+#define MDFLD_DSI_BRIGHTNESS_MAX_LEVEL 100
+
+#ifdef DEBUG
+#define CHECK_PIPE(pipe) ({			\
+	const typeof(pipe) __pipe = (pipe);	\
+	BUG_ON(__pipe != 0 && __pipe != 2);	\
+	__pipe;	})
+#else
+#define CHECK_PIPE(pipe) (pipe)
+#endif
+
+/*
+ * Actual MIPIA->MIPIC reg offset is 0x800, value 0x400 is valid for 0 and 2
+ */
+#define REG_OFFSET(pipe) (CHECK_PIPE(pipe) * 0x400)
+
+/* mdfld DSI controller registers */
+#define MIPI_DEVICE_READY_REG(pipe)		(0xb000 + REG_OFFSET(pipe))
+#define MIPI_INTR_STAT_REG(pipe)		(0xb004 + REG_OFFSET(pipe))
+#define MIPI_INTR_EN_REG(pipe)			(0xb008 + REG_OFFSET(pipe))
+#define MIPI_DSI_FUNC_PRG_REG(pipe)		(0xb00c + REG_OFFSET(pipe))
+#define MIPI_HS_TX_TIMEOUT_REG(pipe)		(0xb010 + REG_OFFSET(pipe))
+#define MIPI_LP_RX_TIMEOUT_REG(pipe)		(0xb014 + REG_OFFSET(pipe))
+#define MIPI_TURN_AROUND_TIMEOUT_REG(pipe)	(0xb018 + REG_OFFSET(pipe))
+#define MIPI_DEVICE_RESET_TIMER_REG(pipe)	(0xb01c + REG_OFFSET(pipe))
+#define MIPI_DPI_RESOLUTION_REG(pipe)		(0xb020 + REG_OFFSET(pipe))
+#define MIPI_DBI_FIFO_THROTTLE_REG(pipe)	(0xb024 + REG_OFFSET(pipe))
+#define MIPI_HSYNC_COUNT_REG(pipe)		(0xb028 + REG_OFFSET(pipe))
+#define MIPI_HBP_COUNT_REG(pipe)		(0xb02c + REG_OFFSET(pipe))
+#define MIPI_HFP_COUNT_REG(pipe)		(0xb030 + REG_OFFSET(pipe))
+#define MIPI_HACTIVE_COUNT_REG(pipe)		(0xb034 + REG_OFFSET(pipe))
+#define MIPI_VSYNC_COUNT_REG(pipe)		(0xb038 + REG_OFFSET(pipe))
+#define MIPI_VBP_COUNT_REG(pipe)		(0xb03c + REG_OFFSET(pipe))
+#define MIPI_VFP_COUNT_REG(pipe)		(0xb040 + REG_OFFSET(pipe))
+#define MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe)	(0xb044 + REG_OFFSET(pipe))
+#define MIPI_DPI_CONTROL_REG(pipe)		(0xb048 + REG_OFFSET(pipe))
+#define MIPI_DPI_DATA_REG(pipe)			(0xb04c + REG_OFFSET(pipe))
+#define MIPI_INIT_COUNT_REG(pipe)		(0xb050 + REG_OFFSET(pipe))
+#define MIPI_MAX_RETURN_PACK_SIZE_REG(pipe)	(0xb054 + REG_OFFSET(pipe))
+#define MIPI_VIDEO_MODE_FORMAT_REG(pipe)	(0xb058 + REG_OFFSET(pipe))
+#define MIPI_EOT_DISABLE_REG(pipe)		(0xb05c + REG_OFFSET(pipe))
+#define MIPI_LP_BYTECLK_REG(pipe)		(0xb060 + REG_OFFSET(pipe))
+#define MIPI_LP_GEN_DATA_REG(pipe)		(0xb064 + REG_OFFSET(pipe))
+#define MIPI_HS_GEN_DATA_REG(pipe)		(0xb068 + REG_OFFSET(pipe))
+#define MIPI_LP_GEN_CTRL_REG(pipe)		(0xb06c + REG_OFFSET(pipe))
+#define MIPI_HS_GEN_CTRL_REG(pipe)		(0xb070 + REG_OFFSET(pipe))
+#define MIPI_GEN_FIFO_STAT_REG(pipe)		(0xb074 + REG_OFFSET(pipe))
+#define MIPI_HS_LS_DBI_ENABLE_REG(pipe)		(0xb078 + REG_OFFSET(pipe))
+#define MIPI_DPHY_PARAM_REG(pipe)		(0xb080 + REG_OFFSET(pipe))
+#define MIPI_DBI_BW_CTRL_REG(pipe)		(0xb084 + REG_OFFSET(pipe))
+#define MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe)	(0xb088 + REG_OFFSET(pipe))
+
+#define MIPI_CTRL_REG(pipe)			(0xb104 + REG_OFFSET(pipe))
+#define MIPI_DATA_ADD_REG(pipe)			(0xb108 + REG_OFFSET(pipe))
+#define MIPI_DATA_LEN_REG(pipe)			(0xb10c + REG_OFFSET(pipe))
+#define MIPI_CMD_ADD_REG(pipe)			(0xb110 + REG_OFFSET(pipe))
+#define MIPI_CMD_LEN_REG(pipe)			(0xb114 + REG_OFFSET(pipe))
+
+/* non-uniform reg offset */
+#define MIPI_PORT_CONTROL(pipe)		(CHECK_PIPE(pipe) ? MIPI_C : MIPI)
+
+#define DSI_DEVICE_READY				(0x1)
+#define DSI_POWER_STATE_ULPS_ENTER			(0x2 << 1)
+#define DSI_POWER_STATE_ULPS_EXIT			(0x1 << 1)
+#define DSI_POWER_STATE_ULPS_OFFSET			(0x1)
+
+
+#define DSI_ONE_DATA_LANE					(0x1)
+#define DSI_TWO_DATA_LANE					(0x2)
+#define DSI_THREE_DATA_LANE					(0X3)
+#define DSI_FOUR_DATA_LANE					(0x4)
+#define DSI_DPI_VIRT_CHANNEL_OFFSET			(0x3)
+#define DSI_DBI_VIRT_CHANNEL_OFFSET			(0x5)
+#define DSI_DPI_COLOR_FORMAT_RGB565			(0x01 << 7)
+#define DSI_DPI_COLOR_FORMAT_RGB666			(0x02 << 7)
+#define DSI_DPI_COLOR_FORMAT_RGB666_UNPACK		(0x03 << 7)
+#define DSI_DPI_COLOR_FORMAT_RGB888			(0x04 << 7)
+#define DSI_DBI_COLOR_FORMAT_OPTION2			(0x05 << 13)
+
+#define DSI_INTR_STATE_RXSOTERROR			BIT(0)
+
+#define DSI_INTR_STATE_SPL_PKG_SENT			BIT(30)
+#define DSI_INTR_STATE_TE				BIT(31)
+
+#define DSI_HS_TX_TIMEOUT_MASK				(0xffffff)
+
+#define DSI_LP_RX_TIMEOUT_MASK				(0xffffff)
+
+#define DSI_TURN_AROUND_TIMEOUT_MASK		(0x3f)
+
+#define DSI_RESET_TIMER_MASK				(0xffff)
+
+#define DSI_DBI_FIFO_WM_HALF				(0x0)
+#define DSI_DBI_FIFO_WM_QUARTER				(0x1)
+#define DSI_DBI_FIFO_WM_LOW					(0x2)
+
+#define DSI_DPI_TIMING_MASK					(0xffff)
+
+#define DSI_INIT_TIMER_MASK					(0xffff)
+
+#define DSI_DBI_RETURN_PACK_SIZE_MASK		(0x3ff)
+
+#define DSI_LP_BYTECLK_MASK					(0x0ffff)
+
+#define DSI_HS_CTRL_GEN_SHORT_W0			(0x03)
+#define DSI_HS_CTRL_GEN_SHORT_W1			(0x13)
+#define DSI_HS_CTRL_GEN_SHORT_W2			(0x23)
+#define DSI_HS_CTRL_GEN_R0					(0x04)
+#define DSI_HS_CTRL_GEN_R1					(0x14)
+#define DSI_HS_CTRL_GEN_R2					(0x24)
+#define DSI_HS_CTRL_GEN_LONG_W				(0x29)
+#define DSI_HS_CTRL_MCS_SHORT_W0			(0x05)
+#define DSI_HS_CTRL_MCS_SHORT_W1			(0x15)
+#define DSI_HS_CTRL_MCS_R0					(0x06)
+#define DSI_HS_CTRL_MCS_LONG_W				(0x39)
+#define DSI_HS_CTRL_VC_OFFSET				(0x06)
+#define DSI_HS_CTRL_WC_OFFSET				(0x08)
+
+#define	DSI_FIFO_GEN_HS_DATA_FULL			BIT(0)
+#define DSI_FIFO_GEN_HS_DATA_HALF_EMPTY		BIT(1)
+#define DSI_FIFO_GEN_HS_DATA_EMPTY			BIT(2)
+#define DSI_FIFO_GEN_LP_DATA_FULL			BIT(8)
+#define DSI_FIFO_GEN_LP_DATA_HALF_EMPTY		BIT(9)
+#define DSI_FIFO_GEN_LP_DATA_EMPTY			BIT(10)
+#define DSI_FIFO_GEN_HS_CTRL_FULL			BIT(16)
+#define DSI_FIFO_GEN_HS_CTRL_HALF_EMPTY		BIT(17)
+#define DSI_FIFO_GEN_HS_CTRL_EMPTY			BIT(18)
+#define DSI_FIFO_GEN_LP_CTRL_FULL			BIT(24)
+#define DSI_FIFO_GEN_LP_CTRL_HALF_EMPTY		BIT(25)
+#define DSI_FIFO_GEN_LP_CTRL_EMPTY			BIT(26)
+#define DSI_FIFO_DBI_EMPTY					BIT(27)
+#define DSI_FIFO_DPI_EMPTY					BIT(28)
+
+#define DSI_DBI_HS_LP_SWITCH_MASK			(0x1)
+
+#define DSI_HS_LP_SWITCH_COUNTER_OFFSET		(0x0)
+#define DSI_LP_HS_SWITCH_COUNTER_OFFSET		(0x16)
+
+#define DSI_DPI_CTRL_HS_SHUTDOWN			(0x00000001)
+#define DSI_DPI_CTRL_HS_TURN_ON				(0x00000002)
+
+/*dsi power modes*/
+#define DSI_POWER_MODE_DISPLAY_ON	BIT(2)
+#define DSI_POWER_MODE_NORMAL_ON	BIT(3)
+#define DSI_POWER_MODE_SLEEP_OUT	BIT(4)
+#define DSI_POWER_MODE_PARTIAL_ON	BIT(5)
+#define DSI_POWER_MODE_IDLE_ON		BIT(6)
+
+enum {
+	MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_PULSE = 1,
+	MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_EVENTS = 2,
+	MDFLD_DSI_VIDEO_BURST_MODE = 3,
+};
+
+#define DSI_DPI_COMPLETE_LAST_LINE			BIT(2)
+#define DSI_DPI_DISABLE_BTA					BIT(3)
+
+struct mdfld_dsi_connector_state {
+	u32 mipi_ctrl_reg;
+};
+
+struct mdfld_dsi_encoder_state {
+
+};
+
+struct mdfld_dsi_connector {
+	struct psb_intel_connector base;
+
+	int pipe;
+	void *private;
+	void *pkg_sender;
+
+	/* Connection status */
+	enum drm_connector_status status;
+};
+
+struct mdfld_dsi_encoder {
+	struct psb_intel_encoder base;
+	void *private;
+};
+
+/*
+ * DSI config, consists of one DSI connector, two DSI encoders.
+ * DRM will pick up on DSI encoder basing on differents configs.
+ */
+struct mdfld_dsi_config {
+	struct drm_device *dev;
+	struct drm_display_mode *fixed_mode;
+	struct drm_display_mode *mode;
+
+	struct mdfld_dsi_connector *connector;
+	struct mdfld_dsi_encoder *encoder;
+
+	int changed;
+
+	int bpp;
+	int lane_count;
+	/*Virtual channel number for this encoder*/
+	int channel_num;
+	/*video mode configure*/
+	int video_mode;
+
+	int dvr_ic_inited;
+};
+
+static inline struct mdfld_dsi_connector *mdfld_dsi_connector(
+		struct drm_connector *connector)
+{
+	struct psb_intel_connector *psb_connector;
+
+	psb_connector = to_psb_intel_connector(connector);
+
+	return container_of(psb_connector, struct mdfld_dsi_connector, base);
+}
+
+static inline struct mdfld_dsi_encoder *mdfld_dsi_encoder(
+		struct drm_encoder *encoder)
+{
+	struct psb_intel_encoder *psb_encoder;
+
+	psb_encoder = to_psb_intel_encoder(encoder);
+
+	return container_of(psb_encoder, struct mdfld_dsi_encoder, base);
+}
+
+static inline struct mdfld_dsi_config *
+	mdfld_dsi_get_config(struct mdfld_dsi_connector *connector)
+{
+	if (!connector)
+		return NULL;
+	return (struct mdfld_dsi_config *)connector->private;
+}
+
+static inline void *mdfld_dsi_get_pkg_sender(struct mdfld_dsi_config *config)
+{
+	struct mdfld_dsi_connector *dsi_connector;
+
+	if (!config)
+		return NULL;
+
+	dsi_connector = config->connector;
+
+	if (!dsi_connector)
+		return NULL;
+
+	return dsi_connector->pkg_sender;
+}
+
+static inline struct mdfld_dsi_config *
+	mdfld_dsi_encoder_get_config(struct mdfld_dsi_encoder *encoder)
+{
+	if (!encoder)
+		return NULL;
+	return (struct mdfld_dsi_config *)encoder->private;
+}
+
+static inline struct mdfld_dsi_connector *
+	mdfld_dsi_encoder_get_connector(struct mdfld_dsi_encoder *encoder)
+{
+	struct mdfld_dsi_config *config;
+
+	if (!encoder)
+		return NULL;
+
+	config = mdfld_dsi_encoder_get_config(encoder);
+	if (!config)
+		return NULL;
+
+	return config->connector;
+}
+
+static inline void *mdfld_dsi_encoder_get_pkg_sender(
+				struct mdfld_dsi_encoder *encoder)
+{
+	struct mdfld_dsi_config *dsi_config;
+
+	dsi_config = mdfld_dsi_encoder_get_config(encoder);
+	if (!dsi_config)
+		return NULL;
+
+	return mdfld_dsi_get_pkg_sender(dsi_config);
+}
+
+static inline int mdfld_dsi_encoder_get_pipe(struct mdfld_dsi_encoder *encoder)
+{
+	struct mdfld_dsi_connector *connector;
+
+	if (!encoder)
+		return -1;
+
+	connector = mdfld_dsi_encoder_get_connector(encoder);
+	if (!connector)
+		return -1;
+	return connector->pipe;
+}
+
+/* Export functions */
+extern void mdfld_dsi_gen_fifo_ready(struct drm_device *dev,
+					u32 gen_fifo_stat_reg, u32 fifo_stat);
+extern void mdfld_dsi_brightness_init(struct mdfld_dsi_config *dsi_config,
+					int pipe);
+extern void mdfld_dsi_brightness_control(struct drm_device *dev, int pipe,
+					int level);
+extern void mdfld_dsi_output_init(struct drm_device *dev,
+					int pipe,
+					struct mdfld_dsi_config *config,
+					const struct panel_funcs *p_vid_funcs);
+extern void mdfld_dsi_controller_init(struct mdfld_dsi_config *dsi_config,
+					int pipe);
+
+extern int mdfld_dsi_get_power_mode(struct mdfld_dsi_config *dsi_config,
+					u32 *mode, bool hs);
+extern int mdfld_dsi_get_diagnostic_result(struct mdfld_dsi_config *dsi_config,
+					u32 *result, bool hs);
+extern int mdfld_dsi_panel_reset(int pipe);
+
+#endif /*__MDFLD_DSI_OUTPUT_H__*/
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c
new file mode 100644
index 000000000000..f193acec657e
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#include <linux/freezer.h>
+
+#include "mdfld_dsi_output.h"
+#include "mdfld_dsi_pkg_sender.h"
+#include "mdfld_dsi_dpi.h"
+
+#define MDFLD_DSI_READ_MAX_COUNT		5000
+
+enum data_type {
+	DSI_DT_GENERIC_SHORT_WRITE_0	= 0x03,
+	DSI_DT_GENERIC_SHORT_WRITE_1	= 0x13,
+	DSI_DT_GENERIC_SHORT_WRITE_2	= 0x23,
+	DSI_DT_GENERIC_READ_0		= 0x04,
+	DSI_DT_GENERIC_READ_1		= 0x14,
+	DSI_DT_GENERIC_READ_2		= 0x24,
+	DSI_DT_GENERIC_LONG_WRITE	= 0x29,
+	DSI_DT_DCS_SHORT_WRITE_0	= 0x05,
+	DSI_DT_DCS_SHORT_WRITE_1	= 0x15,
+	DSI_DT_DCS_READ			= 0x06,
+	DSI_DT_DCS_LONG_WRITE		= 0x39,
+};
+
+enum {
+	MDFLD_DSI_PANEL_MODE_SLEEP = 0x1,
+};
+
+enum {
+	MDFLD_DSI_PKG_SENDER_FREE = 0x0,
+	MDFLD_DSI_PKG_SENDER_BUSY = 0x1,
+};
+
+static const char *const dsi_errors[] = {
+	"RX SOT Error",
+	"RX SOT Sync Error",
+	"RX EOT Sync Error",
+	"RX Escape Mode Entry Error",
+	"RX LP TX Sync Error",
+	"RX HS Receive Timeout Error",
+	"RX False Control Error",
+	"RX ECC Single Bit Error",
+	"RX ECC Multibit Error",
+	"RX Checksum Error",
+	"RX DSI Data Type Not Recognised",
+	"RX DSI VC ID Invalid",
+	"TX False Control Error",
+	"TX ECC Single Bit Error",
+	"TX ECC Multibit Error",
+	"TX Checksum Error",
+	"TX DSI Data Type Not Recognised",
+	"TX DSI VC ID invalid",
+	"High Contention",
+	"Low contention",
+	"DPI FIFO Under run",
+	"HS TX Timeout",
+	"LP RX Timeout",
+	"Turn Around ACK Timeout",
+	"ACK With No Error",
+	"RX Invalid TX Length",
+	"RX Prot Violation",
+	"HS Generic Write FIFO Full",
+	"LP Generic Write FIFO Full",
+	"Generic Read Data Avail"
+	"Special Packet Sent",
+	"Tearing Effect",
+};
+
+static inline int wait_for_gen_fifo_empty(struct mdfld_dsi_pkg_sender *sender,
+						u32 mask)
+{
+	struct drm_device *dev = sender->dev;
+	u32 gen_fifo_stat_reg = sender->mipi_gen_fifo_stat_reg;
+	int retry = 0xffff;
+
+	while (retry--) {
+		if ((mask & REG_READ(gen_fifo_stat_reg)) == mask)
+			return 0;
+		udelay(100);
+	}
+	DRM_ERROR("fifo is NOT empty 0x%08x\n", REG_READ(gen_fifo_stat_reg));
+	return -EIO;
+}
+
+static int wait_for_all_fifos_empty(struct mdfld_dsi_pkg_sender *sender)
+{
+	return wait_for_gen_fifo_empty(sender, (BIT(2) | BIT(10) | BIT(18) |
+						BIT(26) | BIT(27) | BIT(28)));
+}
+
+static int wait_for_lp_fifos_empty(struct mdfld_dsi_pkg_sender *sender)
+{
+	return wait_for_gen_fifo_empty(sender, (BIT(10) | BIT(26)));
+}
+
+static int wait_for_hs_fifos_empty(struct mdfld_dsi_pkg_sender *sender)
+{
+	return wait_for_gen_fifo_empty(sender, (BIT(2) | BIT(18)));
+}
+
+static int handle_dsi_error(struct mdfld_dsi_pkg_sender *sender, u32 mask)
+{
+	u32 intr_stat_reg = sender->mipi_intr_stat_reg;
+	struct drm_device *dev = sender->dev;
+
+	dev_dbg(sender->dev->dev, "Handling error 0x%08x\n", mask);
+
+	switch (mask) {
+	case BIT(0):
+	case BIT(1):
+	case BIT(2):
+	case BIT(3):
+	case BIT(4):
+	case BIT(5):
+	case BIT(6):
+	case BIT(7):
+	case BIT(8):
+	case BIT(9):
+	case BIT(10):
+	case BIT(11):
+	case BIT(12):
+	case BIT(13):
+		dev_dbg(sender->dev->dev, "No Action required\n");
+		break;
+	case BIT(14):
+		/*wait for all fifo empty*/
+		/*wait_for_all_fifos_empty(sender)*/;
+		break;
+	case BIT(15):
+		dev_dbg(sender->dev->dev, "No Action required\n");
+		break;
+	case BIT(16):
+		break;
+	case BIT(17):
+		break;
+	case BIT(18):
+	case BIT(19):
+		dev_dbg(sender->dev->dev, "High/Low contention detected\n");
+		/*wait for contention recovery time*/
+		/*mdelay(10);*/
+		/*wait for all fifo empty*/
+		if (0)
+			wait_for_all_fifos_empty(sender);
+		break;
+	case BIT(20):
+		dev_dbg(sender->dev->dev, "No Action required\n");
+		break;
+	case BIT(21):
+		/*wait for all fifo empty*/
+		/*wait_for_all_fifos_empty(sender);*/
+		break;
+	case BIT(22):
+		break;
+	case BIT(23):
+	case BIT(24):
+	case BIT(25):
+	case BIT(26):
+	case BIT(27):
+		dev_dbg(sender->dev->dev, "HS Gen fifo full\n");
+		REG_WRITE(intr_stat_reg, mask);
+		wait_for_hs_fifos_empty(sender);
+		break;
+	case BIT(28):
+		dev_dbg(sender->dev->dev, "LP Gen fifo full\n");
+		REG_WRITE(intr_stat_reg, mask);
+		wait_for_lp_fifos_empty(sender);
+		break;
+	case BIT(29):
+	case BIT(30):
+	case BIT(31):
+		dev_dbg(sender->dev->dev, "No Action required\n");
+		break;
+	}
+
+	if (mask & REG_READ(intr_stat_reg))
+		dev_dbg(sender->dev->dev,
+				"Cannot clean interrupt 0x%08x\n", mask);
+	return 0;
+}
+
+static int dsi_error_handler(struct mdfld_dsi_pkg_sender *sender)
+{
+	struct drm_device *dev = sender->dev;
+	u32 intr_stat_reg = sender->mipi_intr_stat_reg;
+	u32 mask;
+	u32 intr_stat;
+	int i;
+	int err = 0;
+
+	intr_stat = REG_READ(intr_stat_reg);
+
+	for (i = 0; i < 32; i++) {
+		mask = (0x00000001UL) << i;
+		if (intr_stat & mask) {
+			dev_dbg(sender->dev->dev, "[DSI]: %s\n", dsi_errors[i]);
+			err = handle_dsi_error(sender, mask);
+			if (err)
+				DRM_ERROR("Cannot handle error\n");
+		}
+	}
+	return err;
+}
+
+static int send_short_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+			u8 cmd, u8 param, bool hs)
+{
+	struct drm_device *dev = sender->dev;
+	u32 ctrl_reg;
+	u32 val;
+	u8 virtual_channel = 0;
+
+	if (hs) {
+		ctrl_reg = sender->mipi_hs_gen_ctrl_reg;
+
+		/* FIXME: wait_for_hs_fifos_empty(sender); */
+	} else {
+		ctrl_reg = sender->mipi_lp_gen_ctrl_reg;
+
+		/* FIXME: wait_for_lp_fifos_empty(sender); */
+	}
+
+	val = FLD_VAL(param, 23, 16) | FLD_VAL(cmd, 15, 8) |
+		FLD_VAL(virtual_channel, 7, 6) | FLD_VAL(data_type, 5, 0);
+
+	REG_WRITE(ctrl_reg, val);
+
+	return 0;
+}
+
+static int send_long_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+			u8 *data, int len, bool hs)
+{
+	struct drm_device *dev = sender->dev;
+	u32 ctrl_reg;
+	u32 data_reg;
+	u32 val;
+	u8 *p;
+	u8 b1, b2, b3, b4;
+	u8 virtual_channel = 0;
+	int i;
+
+	if (hs) {
+		ctrl_reg = sender->mipi_hs_gen_ctrl_reg;
+		data_reg = sender->mipi_hs_gen_data_reg;
+
+		/* FIXME: wait_for_hs_fifos_empty(sender); */
+	} else {
+		ctrl_reg = sender->mipi_lp_gen_ctrl_reg;
+		data_reg = sender->mipi_lp_gen_data_reg;
+
+		/* FIXME: wait_for_lp_fifos_empty(sender); */
+	}
+
+	p = data;
+	for (i = 0; i < len / 4; i++) {
+		b1 = *p++;
+		b2 = *p++;
+		b3 = *p++;
+		b4 = *p++;
+
+		REG_WRITE(data_reg, b4 << 24 | b3 << 16 | b2 << 8 | b1);
+	}
+
+	i = len % 4;
+	if (i) {
+		b1 = 0; b2 = 0; b3 = 0;
+
+		switch (i) {
+		case 3:
+			b1 = *p++;
+			b2 = *p++;
+			b3 = *p++;
+			break;
+		case 2:
+			b1 = *p++;
+			b2 = *p++;
+			break;
+		case 1:
+			b1 = *p++;
+			break;
+		}
+
+		REG_WRITE(data_reg, b3 << 16 | b2 << 8 | b1);
+	}
+
+	val = FLD_VAL(len, 23, 8) | FLD_VAL(virtual_channel, 7, 6) |
+		FLD_VAL(data_type, 5, 0);
+
+	REG_WRITE(ctrl_reg, val);
+
+	return 0;
+}
+
+static int send_pkg_prepare(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+			u8 *data, u16 len)
+{
+	u8 cmd;
+
+	switch (data_type) {
+	case DSI_DT_DCS_SHORT_WRITE_0:
+	case DSI_DT_DCS_SHORT_WRITE_1:
+	case DSI_DT_DCS_LONG_WRITE:
+		cmd = *data;
+		break;
+	default:
+		return 0;
+	}
+
+	/*this prevents other package sending while doing msleep*/
+	sender->status = MDFLD_DSI_PKG_SENDER_BUSY;
+
+	/*wait for 120 milliseconds in case exit_sleep_mode just be sent*/
+	if (unlikely(cmd == DCS_ENTER_SLEEP_MODE)) {
+		/*TODO: replace it with msleep later*/
+		mdelay(120);
+	}
+
+	if (unlikely(cmd == DCS_EXIT_SLEEP_MODE)) {
+		/*TODO: replace it with msleep later*/
+		mdelay(120);
+	}
+	return 0;
+}
+
+static int send_pkg_done(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+			u8 *data, u16 len)
+{
+	u8 cmd;
+
+	switch (data_type) {
+	case DSI_DT_DCS_SHORT_WRITE_0:
+	case DSI_DT_DCS_SHORT_WRITE_1:
+	case DSI_DT_DCS_LONG_WRITE:
+		cmd = *data;
+		break;
+	default:
+		return 0;
+	}
+
+	/*update panel status*/
+	if (unlikely(cmd == DCS_ENTER_SLEEP_MODE)) {
+		sender->panel_mode |= MDFLD_DSI_PANEL_MODE_SLEEP;
+		/*TODO: replace it with msleep later*/
+		mdelay(120);
+	} else if (unlikely(cmd == DCS_EXIT_SLEEP_MODE)) {
+		sender->panel_mode &= ~MDFLD_DSI_PANEL_MODE_SLEEP;
+		/*TODO: replace it with msleep later*/
+		mdelay(120);
+	} else if (unlikely(cmd == DCS_SOFT_RESET)) {
+		/*TODO: replace it with msleep later*/
+		mdelay(5);
+	}
+
+	sender->status = MDFLD_DSI_PKG_SENDER_FREE;
+
+	return 0;
+}
+
+static int send_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+		u8 *data, u16 len, bool hs)
+{
+	int ret;
+
+	/*handle DSI error*/
+	ret = dsi_error_handler(sender);
+	if (ret) {
+		DRM_ERROR("Error handling failed\n");
+		return -EAGAIN;
+	}
+
+	/* send pkg */
+	if (sender->status == MDFLD_DSI_PKG_SENDER_BUSY) {
+		DRM_ERROR("sender is busy\n");
+		return -EAGAIN;
+	}
+
+	ret = send_pkg_prepare(sender, data_type, data, len);
+	if (ret) {
+		DRM_ERROR("send_pkg_prepare error\n");
+		return ret;
+	}
+
+	switch (data_type) {
+	case DSI_DT_GENERIC_SHORT_WRITE_0:
+	case DSI_DT_GENERIC_SHORT_WRITE_1:
+	case DSI_DT_GENERIC_SHORT_WRITE_2:
+	case DSI_DT_GENERIC_READ_0:
+	case DSI_DT_GENERIC_READ_1:
+	case DSI_DT_GENERIC_READ_2:
+	case DSI_DT_DCS_SHORT_WRITE_0:
+	case DSI_DT_DCS_SHORT_WRITE_1:
+	case DSI_DT_DCS_READ:
+		ret = send_short_pkg(sender, data_type, data[0], data[1], hs);
+		break;
+	case DSI_DT_GENERIC_LONG_WRITE:
+	case DSI_DT_DCS_LONG_WRITE:
+		ret = send_long_pkg(sender, data_type, data, len, hs);
+		break;
+	}
+
+	send_pkg_done(sender, data_type, data, len);
+
+	/*FIXME: should I query complete and fifo empty here?*/
+
+	return ret;
+}
+
+int mdfld_dsi_send_mcs_long(struct mdfld_dsi_pkg_sender *sender, u8 *data,
+			u32 len, bool hs)
+{
+	unsigned long flags;
+
+	if (!sender || !data || !len) {
+		DRM_ERROR("Invalid parameters\n");
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&sender->lock, flags);
+	send_pkg(sender, DSI_DT_DCS_LONG_WRITE, data, len, hs);
+	spin_unlock_irqrestore(&sender->lock, flags);
+
+	return 0;
+}
+
+int mdfld_dsi_send_mcs_short(struct mdfld_dsi_pkg_sender *sender, u8 cmd,
+			u8 param, u8 param_num, bool hs)
+{
+	u8 data[2];
+	unsigned long flags;
+	u8 data_type;
+
+	if (!sender) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	data[0] = cmd;
+
+	if (param_num) {
+		data_type = DSI_DT_DCS_SHORT_WRITE_1;
+		data[1] = param;
+	} else {
+		data_type = DSI_DT_DCS_SHORT_WRITE_0;
+		data[1] = 0;
+	}
+
+	spin_lock_irqsave(&sender->lock, flags);
+	send_pkg(sender, data_type, data, sizeof(data), hs);
+	spin_unlock_irqrestore(&sender->lock, flags);
+
+	return 0;
+}
+
+int mdfld_dsi_send_gen_short(struct mdfld_dsi_pkg_sender *sender, u8 param0,
+			u8 param1, u8 param_num, bool hs)
+{
+	u8 data[2];
+	unsigned long flags;
+	u8 data_type;
+
+	if (!sender || param_num < 0 || param_num > 2) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	switch (param_num) {
+	case 0:
+		data_type = DSI_DT_GENERIC_SHORT_WRITE_0;
+		data[0] = 0;
+		data[1] = 0;
+		break;
+	case 1:
+		data_type = DSI_DT_GENERIC_SHORT_WRITE_1;
+		data[0] = param0;
+		data[1] = 0;
+		break;
+	case 2:
+		data_type = DSI_DT_GENERIC_SHORT_WRITE_2;
+		data[0] = param0;
+		data[1] = param1;
+		break;
+	}
+
+	spin_lock_irqsave(&sender->lock, flags);
+	send_pkg(sender, data_type, data, sizeof(data), hs);
+	spin_unlock_irqrestore(&sender->lock, flags);
+
+	return 0;
+}
+
+int mdfld_dsi_send_gen_long(struct mdfld_dsi_pkg_sender *sender, u8 *data,
+			u32 len, bool hs)
+{
+	unsigned long flags;
+
+	if (!sender || !data || !len) {
+		DRM_ERROR("Invalid parameters\n");
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&sender->lock, flags);
+	send_pkg(sender, DSI_DT_GENERIC_LONG_WRITE, data, len, hs);
+	spin_unlock_irqrestore(&sender->lock, flags);
+
+	return 0;
+}
+
+static int __read_panel_data(struct mdfld_dsi_pkg_sender *sender, u8 data_type,
+			u8 *data, u16 len, u32 *data_out, u16 len_out, bool hs)
+{
+	unsigned long flags;
+	struct drm_device *dev = sender->dev;
+	int i;
+	u32 gen_data_reg;
+	int retry = MDFLD_DSI_READ_MAX_COUNT;
+
+	if (!sender || !data_out || !len_out) {
+		DRM_ERROR("Invalid parameters\n");
+		return -EINVAL;
+	}
+
+	/**
+	 * do reading.
+	 * 0) send out generic read request
+	 * 1) polling read data avail interrupt
+	 * 2) read data
+	 */
+	spin_lock_irqsave(&sender->lock, flags);
+
+	REG_WRITE(sender->mipi_intr_stat_reg, BIT(29));
+
+	if ((REG_READ(sender->mipi_intr_stat_reg) & BIT(29)))
+		DRM_ERROR("Can NOT clean read data valid interrupt\n");
+
+	/*send out read request*/
+	send_pkg(sender, data_type, data, len, hs);
+
+	/*polling read data avail interrupt*/
+	while (retry && !(REG_READ(sender->mipi_intr_stat_reg) & BIT(29))) {
+		udelay(100);
+		retry--;
+	}
+
+	if (!retry) {
+		spin_unlock_irqrestore(&sender->lock, flags);
+		return -ETIMEDOUT;
+	}
+
+	REG_WRITE(sender->mipi_intr_stat_reg, BIT(29));
+
+	/*read data*/
+	if (hs)
+		gen_data_reg = sender->mipi_hs_gen_data_reg;
+	else
+		gen_data_reg = sender->mipi_lp_gen_data_reg;
+
+	for (i = 0; i < len_out; i++)
+		*(data_out + i) = REG_READ(gen_data_reg);
+
+	spin_unlock_irqrestore(&sender->lock, flags);
+
+	return 0;
+}
+
+int mdfld_dsi_read_mcs(struct mdfld_dsi_pkg_sender *sender, u8 cmd,
+		u32 *data, u16 len, bool hs)
+{
+	if (!sender || !data || !len) {
+		DRM_ERROR("Invalid parameters\n");
+		return -EINVAL;
+	}
+
+	return __read_panel_data(sender, DSI_DT_DCS_READ, &cmd, 1,
+				data, len, hs);
+}
+
+int mdfld_dsi_pkg_sender_init(struct mdfld_dsi_connector *dsi_connector,
+								int pipe)
+{
+	struct mdfld_dsi_pkg_sender *pkg_sender;
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_get_config(dsi_connector);
+	struct drm_device *dev = dsi_config->dev;
+	u32 mipi_val = 0;
+
+	if (!dsi_connector) {
+		DRM_ERROR("Invalid parameter\n");
+		return -EINVAL;
+	}
+
+	pkg_sender = dsi_connector->pkg_sender;
+
+	if (!pkg_sender || IS_ERR(pkg_sender)) {
+		pkg_sender = kzalloc(sizeof(struct mdfld_dsi_pkg_sender),
+								GFP_KERNEL);
+		if (!pkg_sender) {
+			DRM_ERROR("Create DSI pkg sender failed\n");
+			return -ENOMEM;
+		}
+		dsi_connector->pkg_sender = (void *)pkg_sender;
+	}
+
+	pkg_sender->dev = dev;
+	pkg_sender->dsi_connector = dsi_connector;
+	pkg_sender->pipe = pipe;
+	pkg_sender->pkg_num = 0;
+	pkg_sender->panel_mode = 0;
+	pkg_sender->status = MDFLD_DSI_PKG_SENDER_FREE;
+
+	/*init regs*/
+	if (pipe == 0) {
+		pkg_sender->dpll_reg = MRST_DPLL_A;
+		pkg_sender->dspcntr_reg = DSPACNTR;
+		pkg_sender->pipeconf_reg = PIPEACONF;
+		pkg_sender->dsplinoff_reg = DSPALINOFF;
+		pkg_sender->dspsurf_reg = DSPASURF;
+		pkg_sender->pipestat_reg = PIPEASTAT;
+	} else if (pipe == 2) {
+		pkg_sender->dpll_reg = MRST_DPLL_A;
+		pkg_sender->dspcntr_reg = DSPCCNTR;
+		pkg_sender->pipeconf_reg = PIPECCONF;
+		pkg_sender->dsplinoff_reg = DSPCLINOFF;
+		pkg_sender->dspsurf_reg = DSPCSURF;
+		pkg_sender->pipestat_reg = PIPECSTAT;
+	}
+
+	pkg_sender->mipi_intr_stat_reg = MIPI_INTR_STAT_REG(pipe);
+	pkg_sender->mipi_lp_gen_data_reg = MIPI_LP_GEN_DATA_REG(pipe);
+	pkg_sender->mipi_hs_gen_data_reg = MIPI_HS_GEN_DATA_REG(pipe);
+	pkg_sender->mipi_lp_gen_ctrl_reg = MIPI_LP_GEN_CTRL_REG(pipe);
+	pkg_sender->mipi_hs_gen_ctrl_reg = MIPI_HS_GEN_CTRL_REG(pipe);
+	pkg_sender->mipi_gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe);
+	pkg_sender->mipi_data_addr_reg = MIPI_DATA_ADD_REG(pipe);
+	pkg_sender->mipi_data_len_reg = MIPI_DATA_LEN_REG(pipe);
+	pkg_sender->mipi_cmd_addr_reg = MIPI_CMD_ADD_REG(pipe);
+	pkg_sender->mipi_cmd_len_reg = MIPI_CMD_LEN_REG(pipe);
+
+	/*init lock*/
+	spin_lock_init(&pkg_sender->lock);
+
+	if (mdfld_get_panel_type(dev, pipe) != TC35876X) {
+		/**
+		 * For video mode, don't enable DPI timing output here,
+		 * will init the DPI timing output during mode setting.
+		 */
+		mipi_val = PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX;
+
+		if (pipe == 0)
+			mipi_val |= 0x2;
+
+		REG_WRITE(MIPI_PORT_CONTROL(pipe), mipi_val);
+		REG_READ(MIPI_PORT_CONTROL(pipe));
+
+		/* do dsi controller init */
+		mdfld_dsi_controller_init(dsi_config, pipe);
+	}
+
+	return 0;
+}
+
+void mdfld_dsi_pkg_sender_destroy(struct mdfld_dsi_pkg_sender *sender)
+{
+	if (!sender || IS_ERR(sender))
+		return;
+
+	/*free*/
+	kfree(sender);
+}
+
+
diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h
new file mode 100644
index 000000000000..459cd7ea8b81
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jackie Li<yaodong.li@intel.com>
+ */
+#ifndef __MDFLD_DSI_PKG_SENDER_H__
+#define __MDFLD_DSI_PKG_SENDER_H__
+
+#include <linux/kthread.h>
+
+#define MDFLD_MAX_DCS_PARAM	8
+
+struct mdfld_dsi_pkg_sender {
+	struct drm_device *dev;
+	struct mdfld_dsi_connector *dsi_connector;
+	u32 status;
+	u32 panel_mode;
+
+	int pipe;
+
+	spinlock_t lock;
+
+	u32 pkg_num;
+
+	/* Registers */
+	u32 dpll_reg;
+	u32 dspcntr_reg;
+	u32 pipeconf_reg;
+	u32 pipestat_reg;
+	u32 dsplinoff_reg;
+	u32 dspsurf_reg;
+
+	u32 mipi_intr_stat_reg;
+	u32 mipi_lp_gen_data_reg;
+	u32 mipi_hs_gen_data_reg;
+	u32 mipi_lp_gen_ctrl_reg;
+	u32 mipi_hs_gen_ctrl_reg;
+	u32 mipi_gen_fifo_stat_reg;
+	u32 mipi_data_addr_reg;
+	u32 mipi_data_len_reg;
+	u32 mipi_cmd_addr_reg;
+	u32 mipi_cmd_len_reg;
+};
+
+/* DCS definitions */
+#define DCS_SOFT_RESET			0x01
+#define DCS_ENTER_SLEEP_MODE		0x10
+#define DCS_EXIT_SLEEP_MODE		0x11
+#define DCS_SET_DISPLAY_OFF		0x28
+#define DCS_SET_DISPLAY_ON		0x29
+#define DCS_SET_COLUMN_ADDRESS		0x2a
+#define DCS_SET_PAGE_ADDRESS		0x2b
+#define DCS_WRITE_MEM_START		0x2c
+#define DCS_SET_TEAR_OFF		0x34
+#define DCS_SET_TEAR_ON			0x35
+
+extern int mdfld_dsi_pkg_sender_init(struct mdfld_dsi_connector *dsi_connector,
+					int pipe);
+extern void mdfld_dsi_pkg_sender_destroy(struct mdfld_dsi_pkg_sender *sender);
+int mdfld_dsi_send_mcs_short(struct mdfld_dsi_pkg_sender *sender, u8 cmd,
+					u8 param, u8 param_num, bool hs);
+int mdfld_dsi_send_mcs_long(struct mdfld_dsi_pkg_sender *sender, u8 *data,
+					u32 len, bool hs);
+int mdfld_dsi_send_gen_short(struct mdfld_dsi_pkg_sender *sender, u8 param0,
+					u8 param1, u8 param_num, bool hs);
+int mdfld_dsi_send_gen_long(struct mdfld_dsi_pkg_sender *sender, u8 *data,
+					u32 len, bool hs);
+/* Read interfaces */
+int mdfld_dsi_read_mcs(struct mdfld_dsi_pkg_sender *sender, u8 cmd,
+		u32 *data, u16 len, bool hs);
+
+#endif
diff --git a/drivers/gpu/drm/gma500/mdfld_intel_display.c b/drivers/gpu/drm/gma500/mdfld_intel_display.c
new file mode 100644
index 000000000000..55e5af5cbd3d
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_intel_display.c
@@ -0,0 +1,1192 @@
+/*
+ * Copyright Â© 2006-2007 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Authors:
+ *	Eric Anholt <eric@anholt.net>
+ */
+
+#include <linux/i2c.h>
+#include <linux/pm_runtime.h>
+
+#include <drm/drmP.h>
+#include "psb_intel_reg.h"
+#include "psb_intel_display.h"
+#include "framebuffer.h"
+#include "mdfld_output.h"
+#include "mdfld_dsi_output.h"
+
+/* Hardcoded currently */
+static int ksel = KSEL_CRYSTAL_19;
+
+struct psb_intel_range_t {
+	int min, max;
+};
+
+struct mrst_limit_t {
+	struct psb_intel_range_t dot, m, p1;
+};
+
+struct mrst_clock_t {
+	/* derived values */
+	int dot;
+	int m;
+	int p1;
+};
+
+#define COUNT_MAX 0x10000000
+
+void mdfldWaitForPipeDisable(struct drm_device *dev, int pipe)
+{
+	int count, temp;
+	u32 pipeconf_reg = PIPEACONF;
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		pipeconf_reg = PIPEBCONF;
+		break;
+	case 2:
+		pipeconf_reg = PIPECCONF;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return;
+	}
+
+	/* FIXME JLIU7_PO */
+	psb_intel_wait_for_vblank(dev);
+	return;
+
+	/* Wait for for the pipe disable to take effect. */
+	for (count = 0; count < COUNT_MAX; count++) {
+		temp = REG_READ(pipeconf_reg);
+		if ((temp & PIPEACONF_PIPE_STATE) == 0)
+			break;
+	}
+}
+
+void mdfldWaitForPipeEnable(struct drm_device *dev, int pipe)
+{
+	int count, temp;
+	u32 pipeconf_reg = PIPEACONF;
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		pipeconf_reg = PIPEBCONF;
+		break;
+	case 2:
+		pipeconf_reg = PIPECCONF;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return;
+	}
+
+	/* FIXME JLIU7_PO */
+	psb_intel_wait_for_vblank(dev);
+	return;
+
+	/* Wait for for the pipe enable to take effect. */
+	for (count = 0; count < COUNT_MAX; count++) {
+		temp = REG_READ(pipeconf_reg);
+		if ((temp & PIPEACONF_PIPE_STATE) == 1)
+			break;
+	}
+}
+
+static void psb_intel_crtc_prepare(struct drm_crtc *crtc)
+{
+	struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+	crtc_funcs->dpms(crtc, DRM_MODE_DPMS_OFF);
+}
+
+static void psb_intel_crtc_commit(struct drm_crtc *crtc)
+{
+	struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+	crtc_funcs->dpms(crtc, DRM_MODE_DPMS_ON);
+}
+
+static bool psb_intel_crtc_mode_fixup(struct drm_crtc *crtc,
+				  struct drm_display_mode *mode,
+				  struct drm_display_mode *adjusted_mode)
+{
+	return true;
+}
+
+/**
+ * Return the pipe currently connected to the panel fitter,
+ * or -1 if the panel fitter is not present or not in use
+ */
+static int psb_intel_panel_fitter_pipe(struct drm_device *dev)
+{
+	u32 pfit_control;
+
+	pfit_control = REG_READ(PFIT_CONTROL);
+
+	/* See if the panel fitter is in use */
+	if ((pfit_control & PFIT_ENABLE) == 0)
+		return -1;
+
+	/* 965 can place panel fitter on either pipe */
+	return (pfit_control >> 29) & 0x3;
+}
+
+static struct drm_device globle_dev;
+
+void mdfld__intel_plane_set_alpha(int enable)
+{
+	struct drm_device *dev = &globle_dev;
+	int dspcntr_reg = DSPACNTR;
+	u32 dspcntr;
+
+	dspcntr = REG_READ(dspcntr_reg);
+
+	if (enable) {
+		dspcntr &= ~DISPPLANE_32BPP_NO_ALPHA;
+		dspcntr |= DISPPLANE_32BPP;
+	} else {
+		dspcntr &= ~DISPPLANE_32BPP;
+		dspcntr |= DISPPLANE_32BPP_NO_ALPHA;
+	}
+
+	REG_WRITE(dspcntr_reg, dspcntr);
+}
+
+static int check_fb(struct drm_framebuffer *fb)
+{
+	if (!fb)
+		return 0;
+
+	switch (fb->bits_per_pixel) {
+	case 8:
+	case 16:
+	case 24:
+	case 32:
+		return 0;
+	default:
+		DRM_ERROR("Unknown color depth\n");
+		return -EINVAL;
+	}
+}
+
+static int mdfld__intel_pipe_set_base(struct drm_crtc *crtc, int x, int y,
+				struct drm_framebuffer *old_fb)
+{
+	struct drm_device *dev = crtc->dev;
+	/* struct drm_i915_master_private *master_priv; */
+	struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc);
+	struct psb_framebuffer *psbfb = to_psb_fb(crtc->fb);
+	int pipe = psb_intel_crtc->pipe;
+	unsigned long start, offset;
+	int dsplinoff = DSPALINOFF;
+	int dspsurf = DSPASURF;
+	int dspstride = DSPASTRIDE;
+	int dspcntr_reg = DSPACNTR;
+	u32 dspcntr;
+	int ret;
+
+	memcpy(&globle_dev, dev, sizeof(struct drm_device));
+
+	dev_dbg(dev->dev, "pipe = 0x%x.\n", pipe);
+
+	/* no fb bound */
+	if (!crtc->fb) {
+		dev_dbg(dev->dev, "No FB bound\n");
+		return 0;
+	}
+
+	ret = check_fb(crtc->fb);
+	if (ret)
+		return ret;
+
+	switch (pipe) {
+	case 0:
+		dsplinoff = DSPALINOFF;
+		break;
+	case 1:
+		dsplinoff = DSPBLINOFF;
+		dspsurf = DSPBSURF;
+		dspstride = DSPBSTRIDE;
+		dspcntr_reg = DSPBCNTR;
+		break;
+	case 2:
+		dsplinoff = DSPCLINOFF;
+		dspsurf = DSPCSURF;
+		dspstride = DSPCSTRIDE;
+		dspcntr_reg = DSPCCNTR;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return -EINVAL;
+	}
+
+	if (!gma_power_begin(dev, true))
+		return 0;
+
+	start = psbfb->gtt->offset;
+	offset = y * crtc->fb->pitches[0] + x * (crtc->fb->bits_per_pixel / 8);
+
+	REG_WRITE(dspstride, crtc->fb->pitches[0]);
+	dspcntr = REG_READ(dspcntr_reg);
+	dspcntr &= ~DISPPLANE_PIXFORMAT_MASK;
+
+	switch (crtc->fb->bits_per_pixel) {
+	case 8:
+		dspcntr |= DISPPLANE_8BPP;
+		break;
+	case 16:
+		if (crtc->fb->depth == 15)
+			dspcntr |= DISPPLANE_15_16BPP;
+		else
+			dspcntr |= DISPPLANE_16BPP;
+		break;
+	case 24:
+	case 32:
+		dspcntr |= DISPPLANE_32BPP_NO_ALPHA;
+		break;
+	}
+	REG_WRITE(dspcntr_reg, dspcntr);
+
+	dev_dbg(dev->dev, "Writing base %08lX %08lX %d %d\n",
+						start, offset, x, y);
+	REG_WRITE(dsplinoff, offset);
+	REG_READ(dsplinoff);
+	REG_WRITE(dspsurf, start);
+	REG_READ(dspsurf);
+
+	gma_power_end(dev);
+
+	return 0;
+}
+
+/*
+ * Disable the pipe, plane and pll.
+ *
+ */
+void mdfld_disable_crtc(struct drm_device *dev, int pipe)
+{
+	int dpll_reg = MRST_DPLL_A;
+	int dspcntr_reg = DSPACNTR;
+	int dspbase_reg = MRST_DSPABASE;
+	int pipeconf_reg = PIPEACONF;
+	u32 temp;
+
+	dev_dbg(dev->dev, "pipe = %d\n", pipe);
+
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		dpll_reg = MDFLD_DPLL_B;
+		dspcntr_reg = DSPBCNTR;
+		dspbase_reg = DSPBSURF;
+		pipeconf_reg = PIPEBCONF;
+		break;
+	case 2:
+		dpll_reg = MRST_DPLL_A;
+		dspcntr_reg = DSPCCNTR;
+		dspbase_reg = MDFLD_DSPCBASE;
+		pipeconf_reg = PIPECCONF;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return;
+	}
+
+	if (pipe != 1)
+		mdfld_dsi_gen_fifo_ready(dev, MIPI_GEN_FIFO_STAT_REG(pipe),
+				HS_CTRL_FIFO_EMPTY | HS_DATA_FIFO_EMPTY);
+
+	/* Disable display plane */
+	temp = REG_READ(dspcntr_reg);
+	if ((temp & DISPLAY_PLANE_ENABLE) != 0) {
+		REG_WRITE(dspcntr_reg,
+			  temp & ~DISPLAY_PLANE_ENABLE);
+		/* Flush the plane changes */
+		REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+		REG_READ(dspbase_reg);
+	}
+
+	/* FIXME_JLIU7 MDFLD_PO revisit */
+
+	/* Next, disable display pipes */
+	temp = REG_READ(pipeconf_reg);
+	if ((temp & PIPEACONF_ENABLE) != 0) {
+		temp &= ~PIPEACONF_ENABLE;
+		temp |= PIPECONF_PLANE_OFF | PIPECONF_CURSOR_OFF;
+		REG_WRITE(pipeconf_reg, temp);
+		REG_READ(pipeconf_reg);
+
+		/* Wait for for the pipe disable to take effect. */
+		mdfldWaitForPipeDisable(dev, pipe);
+	}
+
+	temp = REG_READ(dpll_reg);
+	if (temp & DPLL_VCO_ENABLE) {
+		if ((pipe != 1 &&
+			!((REG_READ(PIPEACONF) | REG_READ(PIPECCONF))
+				& PIPEACONF_ENABLE)) || pipe == 1) {
+			temp &= ~(DPLL_VCO_ENABLE);
+			REG_WRITE(dpll_reg, temp);
+			REG_READ(dpll_reg);
+			/* Wait for the clocks to turn off. */
+			/* FIXME_MDFLD PO may need more delay */
+			udelay(500);
+
+			if (!(temp & MDFLD_PWR_GATE_EN)) {
+				/* gating power of DPLL */
+				REG_WRITE(dpll_reg, temp | MDFLD_PWR_GATE_EN);
+				/* FIXME_MDFLD PO - change 500 to 1 after PO */
+				udelay(5000);
+			}
+		}
+	}
+
+}
+
+/**
+ * Sets the power management mode of the pipe and plane.
+ *
+ * This code should probably grow support for turning the cursor off and back
+ * on appropriately at the same time as we're turning the pipe off/on.
+ */
+static void mdfld_crtc_dpms(struct drm_crtc *crtc, int mode)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc);
+	int pipe = psb_intel_crtc->pipe;
+	int dpll_reg = MRST_DPLL_A;
+	int dspcntr_reg = DSPACNTR;
+	int dspbase_reg = MRST_DSPABASE;
+	int pipeconf_reg = PIPEACONF;
+	u32 pipestat_reg = PIPEASTAT;
+	u32 pipeconf = dev_priv->pipeconf[pipe];
+	u32 temp;
+	bool enabled;
+	int timeout = 0;
+
+	dev_dbg(dev->dev, "mode = %d, pipe = %d\n", mode, pipe);
+
+/* FIXME_JLIU7 MDFLD_PO replaced w/ the following function */
+/* mdfld_dbi_dpms (struct drm_device *dev, int pipe, bool enabled) */
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		dpll_reg = DPLL_B;
+		dspcntr_reg = DSPBCNTR;
+		dspbase_reg = MRST_DSPBBASE;
+		pipeconf_reg = PIPEBCONF;
+		dpll_reg = MDFLD_DPLL_B;
+		break;
+	case 2:
+		dpll_reg = MRST_DPLL_A;
+		dspcntr_reg = DSPCCNTR;
+		dspbase_reg = MDFLD_DSPCBASE;
+		pipeconf_reg = PIPECCONF;
+		pipestat_reg = PIPECSTAT;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return;
+	}
+
+	if (!gma_power_begin(dev, true))
+		return;
+
+	/* XXX: When our outputs are all unaware of DPMS modes other than off
+	 * and on, we should map those modes to DRM_MODE_DPMS_OFF in the CRTC.
+	 */
+	switch (mode) {
+	case DRM_MODE_DPMS_ON:
+	case DRM_MODE_DPMS_STANDBY:
+	case DRM_MODE_DPMS_SUSPEND:
+		/* Enable the DPLL */
+		temp = REG_READ(dpll_reg);
+
+		if ((temp & DPLL_VCO_ENABLE) == 0) {
+			/* When ungating power of DPLL, needs to wait 0.5us
+			   before enable the VCO */
+			if (temp & MDFLD_PWR_GATE_EN) {
+				temp &= ~MDFLD_PWR_GATE_EN;
+				REG_WRITE(dpll_reg, temp);
+				/* FIXME_MDFLD PO - change 500 to 1 after PO */
+				udelay(500);
+			}
+
+			REG_WRITE(dpll_reg, temp);
+			REG_READ(dpll_reg);
+			/* FIXME_MDFLD PO - change 500 to 1 after PO */
+			udelay(500);
+
+			REG_WRITE(dpll_reg, temp | DPLL_VCO_ENABLE);
+			REG_READ(dpll_reg);
+
+			/**
+			 * wait for DSI PLL to lock
+			 * NOTE: only need to poll status of pipe 0 and pipe 1,
+			 * since both MIPI pipes share the same PLL.
+			 */
+			while ((pipe != 2) && (timeout < 20000) &&
+			  !(REG_READ(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) {
+				udelay(150);
+				timeout++;
+			}
+		}
+
+		/* Enable the plane */
+		temp = REG_READ(dspcntr_reg);
+		if ((temp & DISPLAY_PLANE_ENABLE) == 0) {
+			REG_WRITE(dspcntr_reg,
+				temp | DISPLAY_PLANE_ENABLE);
+			/* Flush the plane changes */
+			REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+		}
+
+		/* Enable the pipe */
+		temp = REG_READ(pipeconf_reg);
+		if ((temp & PIPEACONF_ENABLE) == 0) {
+			REG_WRITE(pipeconf_reg, pipeconf);
+
+			/* Wait for for the pipe enable to take effect. */
+			mdfldWaitForPipeEnable(dev, pipe);
+		}
+
+		/*workaround for sighting 3741701 Random X blank display*/
+		/*perform w/a in video mode only on pipe A or C*/
+		if (pipe == 0 || pipe == 2) {
+			REG_WRITE(pipestat_reg, REG_READ(pipestat_reg));
+			msleep(100);
+			if (PIPE_VBLANK_STATUS & REG_READ(pipestat_reg))
+				dev_dbg(dev->dev, "OK");
+			else {
+				dev_dbg(dev->dev, "STUCK!!!!");
+				/*shutdown controller*/
+				temp = REG_READ(dspcntr_reg);
+				REG_WRITE(dspcntr_reg,
+						temp & ~DISPLAY_PLANE_ENABLE);
+				REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+				/*mdfld_dsi_dpi_shut_down(dev, pipe);*/
+				REG_WRITE(0xb048, 1);
+				msleep(100);
+				temp = REG_READ(pipeconf_reg);
+				temp &= ~PIPEACONF_ENABLE;
+				REG_WRITE(pipeconf_reg, temp);
+				msleep(100); /*wait for pipe disable*/
+				REG_WRITE(MIPI_DEVICE_READY_REG(pipe), 0);
+				msleep(100);
+				REG_WRITE(0xb004, REG_READ(0xb004));
+				/* try to bring the controller back up again*/
+				REG_WRITE(MIPI_DEVICE_READY_REG(pipe), 1);
+				temp = REG_READ(dspcntr_reg);
+				REG_WRITE(dspcntr_reg,
+						temp | DISPLAY_PLANE_ENABLE);
+				REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+				/*mdfld_dsi_dpi_turn_on(dev, pipe);*/
+				REG_WRITE(0xb048, 2);
+				msleep(100);
+				temp = REG_READ(pipeconf_reg);
+				temp |= PIPEACONF_ENABLE;
+				REG_WRITE(pipeconf_reg, temp);
+			}
+		}
+
+		psb_intel_crtc_load_lut(crtc);
+
+		/* Give the overlay scaler a chance to enable
+		   if it's on this pipe */
+		/* psb_intel_crtc_dpms_video(crtc, true); TODO */
+
+		break;
+	case DRM_MODE_DPMS_OFF:
+		/* Give the overlay scaler a chance to disable
+		 * if it's on this pipe */
+		/* psb_intel_crtc_dpms_video(crtc, FALSE); TODO */
+		if (pipe != 1)
+			mdfld_dsi_gen_fifo_ready(dev,
+				MIPI_GEN_FIFO_STAT_REG(pipe),
+				HS_CTRL_FIFO_EMPTY | HS_DATA_FIFO_EMPTY);
+
+		/* Disable the VGA plane that we never use */
+		REG_WRITE(VGACNTRL, VGA_DISP_DISABLE);
+
+		/* Disable display plane */
+		temp = REG_READ(dspcntr_reg);
+		if ((temp & DISPLAY_PLANE_ENABLE) != 0) {
+			REG_WRITE(dspcntr_reg,
+				  temp & ~DISPLAY_PLANE_ENABLE);
+			/* Flush the plane changes */
+			REG_WRITE(dspbase_reg, REG_READ(dspbase_reg));
+			REG_READ(dspbase_reg);
+		}
+
+		/* Next, disable display pipes */
+		temp = REG_READ(pipeconf_reg);
+		if ((temp & PIPEACONF_ENABLE) != 0) {
+			temp &= ~PIPEACONF_ENABLE;
+			temp |= PIPECONF_PLANE_OFF | PIPECONF_CURSOR_OFF;
+			REG_WRITE(pipeconf_reg, temp);
+			REG_READ(pipeconf_reg);
+
+			/* Wait for for the pipe disable to take effect. */
+			mdfldWaitForPipeDisable(dev, pipe);
+		}
+
+		temp = REG_READ(dpll_reg);
+		if (temp & DPLL_VCO_ENABLE) {
+			if ((pipe != 1 && !((REG_READ(PIPEACONF)
+				| REG_READ(PIPECCONF)) & PIPEACONF_ENABLE))
+					|| pipe == 1) {
+				temp &= ~(DPLL_VCO_ENABLE);
+				REG_WRITE(dpll_reg, temp);
+				REG_READ(dpll_reg);
+				/* Wait for the clocks to turn off. */
+				/* FIXME_MDFLD PO may need more delay */
+				udelay(500);
+			}
+		}
+		break;
+	}
+	enabled = crtc->enabled && mode != DRM_MODE_DPMS_OFF;
+	gma_power_end(dev);
+}
+
+
+#define MDFLD_LIMT_DPLL_19	    0
+#define MDFLD_LIMT_DPLL_25	    1
+#define MDFLD_LIMT_DPLL_83	    2
+#define MDFLD_LIMT_DPLL_100	    3
+#define MDFLD_LIMT_DSIPLL_19	    4
+#define MDFLD_LIMT_DSIPLL_25	    5
+#define MDFLD_LIMT_DSIPLL_83	    6
+#define MDFLD_LIMT_DSIPLL_100	    7
+
+#define MDFLD_DOT_MIN		  19750
+#define MDFLD_DOT_MAX		  120000
+#define MDFLD_DPLL_M_MIN_19	    113
+#define MDFLD_DPLL_M_MAX_19	    155
+#define MDFLD_DPLL_P1_MIN_19	    2
+#define MDFLD_DPLL_P1_MAX_19	    10
+#define MDFLD_DPLL_M_MIN_25	    101
+#define MDFLD_DPLL_M_MAX_25	    130
+#define MDFLD_DPLL_P1_MIN_25	    2
+#define MDFLD_DPLL_P1_MAX_25	    10
+#define MDFLD_DPLL_M_MIN_83	    64
+#define MDFLD_DPLL_M_MAX_83	    64
+#define MDFLD_DPLL_P1_MIN_83	    2
+#define MDFLD_DPLL_P1_MAX_83	    2
+#define MDFLD_DPLL_M_MIN_100	    64
+#define MDFLD_DPLL_M_MAX_100	    64
+#define MDFLD_DPLL_P1_MIN_100	    2
+#define MDFLD_DPLL_P1_MAX_100	    2
+#define MDFLD_DSIPLL_M_MIN_19	    131
+#define MDFLD_DSIPLL_M_MAX_19	    175
+#define MDFLD_DSIPLL_P1_MIN_19	    3
+#define MDFLD_DSIPLL_P1_MAX_19	    8
+#define MDFLD_DSIPLL_M_MIN_25	    97
+#define MDFLD_DSIPLL_M_MAX_25	    140
+#define MDFLD_DSIPLL_P1_MIN_25	    3
+#define MDFLD_DSIPLL_P1_MAX_25	    9
+#define MDFLD_DSIPLL_M_MIN_83	    33
+#define MDFLD_DSIPLL_M_MAX_83	    92
+#define MDFLD_DSIPLL_P1_MIN_83	    2
+#define MDFLD_DSIPLL_P1_MAX_83	    3
+#define MDFLD_DSIPLL_M_MIN_100	    97
+#define MDFLD_DSIPLL_M_MAX_100	    140
+#define MDFLD_DSIPLL_P1_MIN_100	    3
+#define MDFLD_DSIPLL_P1_MAX_100	    9
+
+static const struct mrst_limit_t mdfld_limits[] = {
+	{			/* MDFLD_LIMT_DPLL_19 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DPLL_M_MIN_19, .max = MDFLD_DPLL_M_MAX_19},
+	 .p1 = {.min = MDFLD_DPLL_P1_MIN_19, .max = MDFLD_DPLL_P1_MAX_19},
+	 },
+	{			/* MDFLD_LIMT_DPLL_25 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DPLL_M_MIN_25, .max = MDFLD_DPLL_M_MAX_25},
+	 .p1 = {.min = MDFLD_DPLL_P1_MIN_25, .max = MDFLD_DPLL_P1_MAX_25},
+	 },
+	{			/* MDFLD_LIMT_DPLL_83 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DPLL_M_MIN_83, .max = MDFLD_DPLL_M_MAX_83},
+	 .p1 = {.min = MDFLD_DPLL_P1_MIN_83, .max = MDFLD_DPLL_P1_MAX_83},
+	 },
+	{			/* MDFLD_LIMT_DPLL_100 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DPLL_M_MIN_100, .max = MDFLD_DPLL_M_MAX_100},
+	 .p1 = {.min = MDFLD_DPLL_P1_MIN_100, .max = MDFLD_DPLL_P1_MAX_100},
+	 },
+	{			/* MDFLD_LIMT_DSIPLL_19 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DSIPLL_M_MIN_19, .max = MDFLD_DSIPLL_M_MAX_19},
+	 .p1 = {.min = MDFLD_DSIPLL_P1_MIN_19, .max = MDFLD_DSIPLL_P1_MAX_19},
+	 },
+	{			/* MDFLD_LIMT_DSIPLL_25 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DSIPLL_M_MIN_25, .max = MDFLD_DSIPLL_M_MAX_25},
+	 .p1 = {.min = MDFLD_DSIPLL_P1_MIN_25, .max = MDFLD_DSIPLL_P1_MAX_25},
+	 },
+	{			/* MDFLD_LIMT_DSIPLL_83 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DSIPLL_M_MIN_83, .max = MDFLD_DSIPLL_M_MAX_83},
+	 .p1 = {.min = MDFLD_DSIPLL_P1_MIN_83, .max = MDFLD_DSIPLL_P1_MAX_83},
+	 },
+	{			/* MDFLD_LIMT_DSIPLL_100 */
+	 .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX},
+	 .m = {.min = MDFLD_DSIPLL_M_MIN_100, .max = MDFLD_DSIPLL_M_MAX_100},
+	 .p1 = {.min = MDFLD_DSIPLL_P1_MIN_100, .max = MDFLD_DSIPLL_P1_MAX_100},
+	 },
+};
+
+#define MDFLD_M_MIN	    21
+#define MDFLD_M_MAX	    180
+static const u32 mdfld_m_converts[] = {
+/* M configuration table from 9-bit LFSR table */
+	224, 368, 440, 220, 366, 439, 219, 365, 182, 347, /* 21 - 30 */
+	173, 342, 171, 85, 298, 149, 74, 37, 18, 265,   /* 31 - 40 */
+	388, 194, 353, 432, 216, 108, 310, 155, 333, 166, /* 41 - 50 */
+	83, 41, 276, 138, 325, 162, 337, 168, 340, 170, /* 51 - 60 */
+	341, 426, 469, 234, 373, 442, 221, 110, 311, 411, /* 61 - 70 */
+	461, 486, 243, 377, 188, 350, 175, 343, 427, 213, /* 71 - 80 */
+	106, 53, 282, 397, 354, 227, 113, 56, 284, 142, /* 81 - 90 */
+	71, 35, 273, 136, 324, 418, 465, 488, 500, 506, /* 91 - 100 */
+	253, 126, 63, 287, 399, 455, 483, 241, 376, 444, /* 101 - 110 */
+	478, 495, 503, 251, 381, 446, 479, 239, 375, 443, /* 111 - 120 */
+	477, 238, 119, 315, 157, 78, 295, 147, 329, 420, /* 121 - 130 */
+	210, 105, 308, 154, 77, 38, 275, 137, 68, 290, /* 131 - 140 */
+	145, 328, 164, 82, 297, 404, 458, 485, 498, 249, /* 141 - 150 */
+	380, 190, 351, 431, 471, 235, 117, 314, 413, 206, /* 151 - 160 */
+	103, 51, 25, 12, 262, 387, 193, 96, 48, 280, /* 161 - 170 */
+	396, 198, 99, 305, 152, 76, 294, 403, 457, 228, /* 171 - 180 */
+};
+
+static const struct mrst_limit_t *mdfld_limit(struct drm_crtc *crtc)
+{
+	const struct mrst_limit_t *limit = NULL;
+	struct drm_device *dev = crtc->dev;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	if (psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_MIPI)
+	    || psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_MIPI2)) {
+		if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19))
+			limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_19];
+		else if (ksel == KSEL_BYPASS_25)
+			limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_25];
+		else if ((ksel == KSEL_BYPASS_83_100) &&
+				(dev_priv->core_freq == 166))
+			limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_83];
+		else if ((ksel == KSEL_BYPASS_83_100) &&
+			 (dev_priv->core_freq == 100 ||
+				dev_priv->core_freq == 200))
+			limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_100];
+	} else if (psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_HDMI)) {
+		if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19))
+			limit = &mdfld_limits[MDFLD_LIMT_DPLL_19];
+		else if (ksel == KSEL_BYPASS_25)
+			limit = &mdfld_limits[MDFLD_LIMT_DPLL_25];
+		else if ((ksel == KSEL_BYPASS_83_100) &&
+				(dev_priv->core_freq == 166))
+			limit = &mdfld_limits[MDFLD_LIMT_DPLL_83];
+		else if ((ksel == KSEL_BYPASS_83_100) &&
+				 (dev_priv->core_freq == 100 ||
+				 dev_priv->core_freq == 200))
+			limit = &mdfld_limits[MDFLD_LIMT_DPLL_100];
+	} else {
+		limit = NULL;
+		dev_dbg(dev->dev, "mdfld_limit Wrong display type.\n");
+	}
+
+	return limit;
+}
+
+/** Derive the pixel clock for the given refclk and divisors for 8xx chips. */
+static void mdfld_clock(int refclk, struct mrst_clock_t *clock)
+{
+	clock->dot = (refclk * clock->m) / clock->p1;
+}
+
+/**
+ * Returns a set of divisors for the desired target clock with the given refclk,
+ * or FALSE.  Divisor values are the actual divisors for
+ */
+static bool
+mdfldFindBestPLL(struct drm_crtc *crtc, int target, int refclk,
+		struct mrst_clock_t *best_clock)
+{
+	struct mrst_clock_t clock;
+	const struct mrst_limit_t *limit = mdfld_limit(crtc);
+	int err = target;
+
+	memset(best_clock, 0, sizeof(*best_clock));
+
+	for (clock.m = limit->m.min; clock.m <= limit->m.max; clock.m++) {
+		for (clock.p1 = limit->p1.min; clock.p1 <= limit->p1.max;
+		     clock.p1++) {
+			int this_err;
+
+			mdfld_clock(refclk, &clock);
+
+			this_err = abs(clock.dot - target);
+			if (this_err < err) {
+				*best_clock = clock;
+				err = this_err;
+			}
+		}
+	}
+	return err != target;
+}
+
+static int mdfld_crtc_mode_set(struct drm_crtc *crtc,
+			      struct drm_display_mode *mode,
+			      struct drm_display_mode *adjusted_mode,
+			      int x, int y,
+			      struct drm_framebuffer *old_fb)
+{
+	struct drm_device *dev = crtc->dev;
+	struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc);
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	int pipe = psb_intel_crtc->pipe;
+	int fp_reg = MRST_FPA0;
+	int dpll_reg = MRST_DPLL_A;
+	int dspcntr_reg = DSPACNTR;
+	int pipeconf_reg = PIPEACONF;
+	int htot_reg = HTOTAL_A;
+	int hblank_reg = HBLANK_A;
+	int hsync_reg = HSYNC_A;
+	int vtot_reg = VTOTAL_A;
+	int vblank_reg = VBLANK_A;
+	int vsync_reg = VSYNC_A;
+	int dspsize_reg = DSPASIZE;
+	int dsppos_reg = DSPAPOS;
+	int pipesrc_reg = PIPEASRC;
+	u32 *pipeconf = &dev_priv->pipeconf[pipe];
+	u32 *dspcntr = &dev_priv->dspcntr[pipe];
+	int refclk = 0;
+	int clk_n = 0, clk_p2 = 0, clk_byte = 1, clk = 0, m_conv = 0,
+								clk_tmp = 0;
+	struct mrst_clock_t clock;
+	bool ok;
+	u32 dpll = 0, fp = 0;
+	bool is_crt = false, is_lvds = false, is_tv = false;
+	bool is_mipi = false, is_mipi2 = false, is_hdmi = false;
+	struct drm_mode_config *mode_config = &dev->mode_config;
+	struct psb_intel_encoder *psb_intel_encoder = NULL;
+	uint64_t scalingType = DRM_MODE_SCALE_FULLSCREEN;
+	struct drm_encoder *encoder;
+	struct drm_connector *connector;
+	int timeout = 0;
+	int ret;
+
+	dev_dbg(dev->dev, "pipe = 0x%x\n", pipe);
+
+#if 0
+	if (pipe == 1) {
+		if (!gma_power_begin(dev, true))
+			return 0;
+		android_hdmi_crtc_mode_set(crtc, mode, adjusted_mode,
+			x, y, old_fb);
+		goto mrst_crtc_mode_set_exit;
+	}
+#endif
+
+	switch (pipe) {
+	case 0:
+		break;
+	case 1:
+		fp_reg = FPB0;
+		dpll_reg = DPLL_B;
+		dspcntr_reg = DSPBCNTR;
+		pipeconf_reg = PIPEBCONF;
+		htot_reg = HTOTAL_B;
+		hblank_reg = HBLANK_B;
+		hsync_reg = HSYNC_B;
+		vtot_reg = VTOTAL_B;
+		vblank_reg = VBLANK_B;
+		vsync_reg = VSYNC_B;
+		dspsize_reg = DSPBSIZE;
+		dsppos_reg = DSPBPOS;
+		pipesrc_reg = PIPEBSRC;
+		fp_reg = MDFLD_DPLL_DIV0;
+		dpll_reg = MDFLD_DPLL_B;
+		break;
+	case 2:
+		dpll_reg = MRST_DPLL_A;
+		dspcntr_reg = DSPCCNTR;
+		pipeconf_reg = PIPECCONF;
+		htot_reg = HTOTAL_C;
+		hblank_reg = HBLANK_C;
+		hsync_reg = HSYNC_C;
+		vtot_reg = VTOTAL_C;
+		vblank_reg = VBLANK_C;
+		vsync_reg = VSYNC_C;
+		dspsize_reg = DSPCSIZE;
+		dsppos_reg = DSPCPOS;
+		pipesrc_reg = PIPECSRC;
+		break;
+	default:
+		DRM_ERROR("Illegal Pipe Number.\n");
+		return 0;
+	}
+
+	ret = check_fb(crtc->fb);
+	if (ret)
+		return ret;
+
+	dev_dbg(dev->dev, "adjusted_hdisplay = %d\n",
+		 adjusted_mode->hdisplay);
+	dev_dbg(dev->dev, "adjusted_vdisplay = %d\n",
+		 adjusted_mode->vdisplay);
+	dev_dbg(dev->dev, "adjusted_hsync_start = %d\n",
+		 adjusted_mode->hsync_start);
+	dev_dbg(dev->dev, "adjusted_hsync_end = %d\n",
+		 adjusted_mode->hsync_end);
+	dev_dbg(dev->dev, "adjusted_htotal = %d\n",
+		 adjusted_mode->htotal);
+	dev_dbg(dev->dev, "adjusted_vsync_start = %d\n",
+		 adjusted_mode->vsync_start);
+	dev_dbg(dev->dev, "adjusted_vsync_end = %d\n",
+		 adjusted_mode->vsync_end);
+	dev_dbg(dev->dev, "adjusted_vtotal = %d\n",
+		 adjusted_mode->vtotal);
+	dev_dbg(dev->dev, "adjusted_clock = %d\n",
+		 adjusted_mode->clock);
+	dev_dbg(dev->dev, "hdisplay = %d\n",
+		 mode->hdisplay);
+	dev_dbg(dev->dev, "vdisplay = %d\n",
+		 mode->vdisplay);
+
+	if (!gma_power_begin(dev, true))
+		return 0;
+
+	memcpy(&psb_intel_crtc->saved_mode, mode,
+					sizeof(struct drm_display_mode));
+	memcpy(&psb_intel_crtc->saved_adjusted_mode, adjusted_mode,
+					sizeof(struct drm_display_mode));
+
+	list_for_each_entry(connector, &mode_config->connector_list, head) {
+		if (!connector)
+			continue;
+
+		encoder = connector->encoder;
+
+		if (!encoder)
+			continue;
+
+		if (encoder->crtc != crtc)
+			continue;
+
+		psb_intel_encoder = psb_intel_attached_encoder(connector);
+
+		switch (psb_intel_encoder->type) {
+		case INTEL_OUTPUT_LVDS:
+			is_lvds = true;
+			break;
+		case INTEL_OUTPUT_TVOUT:
+			is_tv = true;
+			break;
+		case INTEL_OUTPUT_ANALOG:
+			is_crt = true;
+			break;
+		case INTEL_OUTPUT_MIPI:
+			is_mipi = true;
+			break;
+		case INTEL_OUTPUT_MIPI2:
+			is_mipi2 = true;
+			break;
+		case INTEL_OUTPUT_HDMI:
+			is_hdmi = true;
+			break;
+		}
+	}
+
+	/* Disable the VGA plane that we never use */
+	REG_WRITE(VGACNTRL, VGA_DISP_DISABLE);
+
+	/* Disable the panel fitter if it was on our pipe */
+	if (psb_intel_panel_fitter_pipe(dev) == pipe)
+		REG_WRITE(PFIT_CONTROL, 0);
+
+	/* pipesrc and dspsize control the size that is scaled from,
+	 * which should always be the user's requested size.
+	 */
+	if (pipe == 1) {
+		/* FIXME: To make HDMI display with 864x480 (TPO), 480x864
+		 * (PYR) or 480x854 (TMD), set the sprite width/height and
+		 * souce image size registers with the adjusted mode for
+		 * pipe B.
+		 */
+
+		/*
+		 * The defined sprite rectangle must always be completely
+		 * contained within the displayable area of the screen image
+		 * (frame buffer).
+		 */
+		REG_WRITE(dspsize_reg, ((min(mode->crtc_vdisplay, adjusted_mode->crtc_vdisplay) - 1) << 16)
+				| (min(mode->crtc_hdisplay, adjusted_mode->crtc_hdisplay) - 1));
+		/* Set the CRTC with encoder mode. */
+		REG_WRITE(pipesrc_reg, ((mode->crtc_hdisplay - 1) << 16)
+				 | (mode->crtc_vdisplay - 1));
+	} else {
+		REG_WRITE(dspsize_reg,
+				((mode->crtc_vdisplay - 1) << 16) |
+						(mode->crtc_hdisplay - 1));
+		REG_WRITE(pipesrc_reg,
+				((mode->crtc_hdisplay - 1) << 16) |
+						(mode->crtc_vdisplay - 1));
+	}
+
+	REG_WRITE(dsppos_reg, 0);
+
+	if (psb_intel_encoder)
+		drm_connector_property_get_value(connector,
+			dev->mode_config.scaling_mode_property, &scalingType);
+
+	if (scalingType == DRM_MODE_SCALE_NO_SCALE) {
+		/* Medfield doesn't have register support for centering so we
+		 * need to mess with the h/vblank and h/vsync start and ends
+		 * to get centering
+		 */
+		int offsetX = 0, offsetY = 0;
+
+		offsetX = (adjusted_mode->crtc_hdisplay -
+					mode->crtc_hdisplay) / 2;
+		offsetY = (adjusted_mode->crtc_vdisplay -
+					mode->crtc_vdisplay) / 2;
+
+		REG_WRITE(htot_reg, (mode->crtc_hdisplay - 1) |
+			((adjusted_mode->crtc_htotal - 1) << 16));
+		REG_WRITE(vtot_reg, (mode->crtc_vdisplay - 1) |
+			((adjusted_mode->crtc_vtotal - 1) << 16));
+		REG_WRITE(hblank_reg, (adjusted_mode->crtc_hblank_start -
+								offsetX - 1) |
+			((adjusted_mode->crtc_hblank_end - offsetX - 1) << 16));
+		REG_WRITE(hsync_reg, (adjusted_mode->crtc_hsync_start -
+								offsetX - 1) |
+			((adjusted_mode->crtc_hsync_end - offsetX - 1) << 16));
+		REG_WRITE(vblank_reg, (adjusted_mode->crtc_vblank_start -
+								offsetY - 1) |
+			((adjusted_mode->crtc_vblank_end - offsetY - 1) << 16));
+		REG_WRITE(vsync_reg, (adjusted_mode->crtc_vsync_start -
+								offsetY - 1) |
+			((adjusted_mode->crtc_vsync_end - offsetY - 1) << 16));
+	} else {
+		REG_WRITE(htot_reg, (adjusted_mode->crtc_hdisplay - 1) |
+			((adjusted_mode->crtc_htotal - 1) << 16));
+		REG_WRITE(vtot_reg, (adjusted_mode->crtc_vdisplay - 1) |
+			((adjusted_mode->crtc_vtotal - 1) << 16));
+		REG_WRITE(hblank_reg, (adjusted_mode->crtc_hblank_start - 1) |
+			((adjusted_mode->crtc_hblank_end - 1) << 16));
+		REG_WRITE(hsync_reg, (adjusted_mode->crtc_hsync_start - 1) |
+			((adjusted_mode->crtc_hsync_end - 1) << 16));
+		REG_WRITE(vblank_reg, (adjusted_mode->crtc_vblank_start - 1) |
+			((adjusted_mode->crtc_vblank_end - 1) << 16));
+		REG_WRITE(vsync_reg, (adjusted_mode->crtc_vsync_start - 1) |
+			((adjusted_mode->crtc_vsync_end - 1) << 16));
+	}
+
+	/* Flush the plane changes */
+	{
+		struct drm_crtc_helper_funcs *crtc_funcs =
+		    crtc->helper_private;
+		crtc_funcs->mode_set_base(crtc, x, y, old_fb);
+	}
+
+	/* setup pipeconf */
+	*pipeconf = PIPEACONF_ENABLE; /* FIXME_JLIU7 REG_READ(pipeconf_reg); */
+
+	/* Set up the display plane register */
+	*dspcntr = REG_READ(dspcntr_reg);
+	*dspcntr |= pipe << DISPPLANE_SEL_PIPE_POS;
+	*dspcntr |= DISPLAY_PLANE_ENABLE;
+
+	if (is_mipi2)
+		goto mrst_crtc_mode_set_exit;
+	clk = adjusted_mode->clock;
+
+	if (is_hdmi) {
+		if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19)) {
+			refclk = 19200;
+
+			if (is_mipi || is_mipi2)
+				clk_n = 1, clk_p2 = 8;
+			else if (is_hdmi)
+				clk_n = 1, clk_p2 = 10;
+		} else if (ksel == KSEL_BYPASS_25) {
+			refclk = 25000;
+
+			if (is_mipi || is_mipi2)
+				clk_n = 1, clk_p2 = 8;
+			else if (is_hdmi)
+				clk_n = 1, clk_p2 = 10;
+		} else if ((ksel == KSEL_BYPASS_83_100) &&
+					dev_priv->core_freq == 166) {
+			refclk = 83000;
+
+			if (is_mipi || is_mipi2)
+				clk_n = 4, clk_p2 = 8;
+			else if (is_hdmi)
+				clk_n = 4, clk_p2 = 10;
+		} else if ((ksel == KSEL_BYPASS_83_100) &&
+					(dev_priv->core_freq == 100 ||
+					dev_priv->core_freq == 200)) {
+			refclk = 100000;
+			if (is_mipi || is_mipi2)
+				clk_n = 4, clk_p2 = 8;
+			else if (is_hdmi)
+				clk_n = 4, clk_p2 = 10;
+		}
+
+		if (is_mipi)
+			clk_byte = dev_priv->bpp / 8;
+		else if (is_mipi2)
+			clk_byte = dev_priv->bpp2 / 8;
+
+		clk_tmp = clk * clk_n * clk_p2 * clk_byte;
+
+		dev_dbg(dev->dev, "clk = %d, clk_n = %d, clk_p2 = %d.\n",
+					clk, clk_n, clk_p2);
+		dev_dbg(dev->dev, "adjusted_mode->clock = %d, clk_tmp = %d.\n",
+					adjusted_mode->clock, clk_tmp);
+
+		ok = mdfldFindBestPLL(crtc, clk_tmp, refclk, &clock);
+
+		if (!ok) {
+			DRM_ERROR
+			    ("mdfldFindBestPLL fail in mdfld_crtc_mode_set.\n");
+		} else {
+			m_conv = mdfld_m_converts[(clock.m - MDFLD_M_MIN)];
+
+			dev_dbg(dev->dev, "dot clock = %d,"
+				 "m = %d, p1 = %d, m_conv = %d.\n",
+					clock.dot, clock.m,
+					clock.p1, m_conv);
+		}
+
+		dpll = REG_READ(dpll_reg);
+
+		if (dpll & DPLL_VCO_ENABLE) {
+			dpll &= ~DPLL_VCO_ENABLE;
+			REG_WRITE(dpll_reg, dpll);
+			REG_READ(dpll_reg);
+
+			/* FIXME jliu7 check the DPLL lock bit PIPEACONF[29] */
+			/* FIXME_MDFLD PO - change 500 to 1 after PO */
+			udelay(500);
+
+			/* reset M1, N1 & P1 */
+			REG_WRITE(fp_reg, 0);
+			dpll &= ~MDFLD_P1_MASK;
+			REG_WRITE(dpll_reg, dpll);
+			/* FIXME_MDFLD PO - change 500 to 1 after PO */
+			udelay(500);
+		}
+
+		/* When ungating power of DPLL, needs to wait 0.5us before
+		 * enable the VCO */
+		if (dpll & MDFLD_PWR_GATE_EN) {
+			dpll &= ~MDFLD_PWR_GATE_EN;
+			REG_WRITE(dpll_reg, dpll);
+			/* FIXME_MDFLD PO - change 500 to 1 after PO */
+			udelay(500);
+		}
+		dpll = 0;
+
+#if 0 /* FIXME revisit later */
+		if (ksel == KSEL_CRYSTAL_19 || ksel == KSEL_BYPASS_19 ||
+						ksel == KSEL_BYPASS_25)
+			dpll &= ~MDFLD_INPUT_REF_SEL;
+		else if (ksel == KSEL_BYPASS_83_100)
+			dpll |= MDFLD_INPUT_REF_SEL;
+#endif /* FIXME revisit later */
+
+		if (is_hdmi)
+			dpll |= MDFLD_VCO_SEL;
+
+		fp = (clk_n / 2) << 16;
+		fp |= m_conv;
+
+		/* compute bitmask from p1 value */
+		dpll |= (1 << (clock.p1 - 2)) << 17;
+
+#if 0 /* 1080p30 & 720p */
+		dpll = 0x00050000;
+		fp = 0x000001be;
+#endif
+#if 0 /* 480p */
+		dpll = 0x02010000;
+		fp = 0x000000d2;
+#endif
+	} else {
+#if 0 /*DBI_TPO_480x864*/
+		dpll = 0x00020000;
+		fp = 0x00000156;
+#endif /* DBI_TPO_480x864 */ /* get from spec. */
+
+		dpll = 0x00800000;
+		fp = 0x000000c1;
+	}
+
+	REG_WRITE(fp_reg, fp);
+	REG_WRITE(dpll_reg, dpll);
+	/* FIXME_MDFLD PO - change 500 to 1 after PO */
+	udelay(500);
+
+	dpll |= DPLL_VCO_ENABLE;
+	REG_WRITE(dpll_reg, dpll);
+	REG_READ(dpll_reg);
+
+	/* wait for DSI PLL to lock */
+	while (timeout < 20000 &&
+			!(REG_READ(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) {
+		udelay(150);
+		timeout++;
+	}
+
+	if (is_mipi)
+		goto mrst_crtc_mode_set_exit;
+
+	dev_dbg(dev->dev, "is_mipi = 0x%x\n", is_mipi);
+
+	REG_WRITE(pipeconf_reg, *pipeconf);
+	REG_READ(pipeconf_reg);
+
+	/* Wait for for the pipe enable to take effect. */
+	REG_WRITE(dspcntr_reg, *dspcntr);
+	psb_intel_wait_for_vblank(dev);
+
+mrst_crtc_mode_set_exit:
+
+	gma_power_end(dev);
+
+	return 0;
+}
+
+const struct drm_crtc_helper_funcs mdfld_helper_funcs = {
+	.dpms = mdfld_crtc_dpms,
+	.mode_fixup = psb_intel_crtc_mode_fixup,
+	.mode_set = mdfld_crtc_mode_set,
+	.mode_set_base = mdfld__intel_pipe_set_base,
+	.prepare = psb_intel_crtc_prepare,
+	.commit = psb_intel_crtc_commit,
+};
+
diff --git a/drivers/gpu/drm/gma500/mdfld_output.c b/drivers/gpu/drm/gma500/mdfld_output.c
new file mode 100644
index 000000000000..de0ce0765738
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_output.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c)  2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicensen
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Thomas Eaton <thomas.g.eaton@intel.com>
+ * Scott Rowe <scott.m.rowe@intel.com>
+*/
+
+#include "mdfld_output.h"
+#include "mdfld_dsi_dpi.h"
+#include "mdfld_dsi_output.h"
+
+#include "tc35876x-dsi-lvds.h"
+
+int mdfld_get_panel_type(struct drm_device *dev, int pipe)
+{
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	return dev_priv->mdfld_panel_id;
+}
+
+static void mdfld_init_panel(struct drm_device *dev, int mipi_pipe,
+								int p_type)
+{
+	switch (p_type) {
+	case TPO_VID:
+		mdfld_dsi_output_init(dev, mipi_pipe, NULL,
+				&mdfld_tpo_vid_funcs);
+		break;
+	case TC35876X:
+		tc35876x_init(dev);
+		mdfld_dsi_output_init(dev, mipi_pipe, NULL,
+				&mdfld_tc35876x_funcs);
+		break;
+	case TMD_VID:
+		mdfld_dsi_output_init(dev, mipi_pipe, NULL,
+				&mdfld_tmd_vid_funcs);
+		break;
+	case HDMI:
+/*		if (dev_priv->mdfld_hdmi_present)
+			mdfld_hdmi_init(dev, &dev_priv->mode_dev); */
+		break;
+	}
+}
+
+
+int mdfld_output_init(struct drm_device *dev)
+{
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	/* FIXME: hardcoded for now */
+	dev_priv->mdfld_panel_id = TC35876X;
+	/* MIPI panel 1 */
+	mdfld_init_panel(dev, 0, dev_priv->mdfld_panel_id);
+	/* HDMI panel */
+	mdfld_init_panel(dev, 1, HDMI);
+	return 0;
+}
+
diff --git a/drivers/gpu/drm/gma500/mdfld_output.h b/drivers/gpu/drm/gma500/mdfld_output.h
new file mode 100644
index 000000000000..ab2b27c0f037
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_output.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c)  2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicensen
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Thomas Eaton <thomas.g.eaton@intel.com>
+ * Scott Rowe <scott.m.rowe@intel.com>
+*/
+
+#ifndef MDFLD_OUTPUT_H
+#define MDFLD_OUTPUT_H
+
+#include "psb_drv.h"
+
+#define TPO_PANEL_WIDTH		84
+#define TPO_PANEL_HEIGHT	46
+#define TMD_PANEL_WIDTH		39
+#define TMD_PANEL_HEIGHT	71
+
+struct mdfld_dsi_config;
+
+enum panel_type {
+	TPO_VID,
+	TMD_VID,
+	HDMI,
+	TC35876X,
+};
+
+struct panel_info {
+	u32 width_mm;
+	u32 height_mm;
+	/* Other info */
+};
+
+struct panel_funcs {
+	const struct drm_encoder_funcs *encoder_funcs;
+	const struct drm_encoder_helper_funcs *encoder_helper_funcs;
+	struct drm_display_mode * (*get_config_mode)(struct drm_device *);
+	int (*get_panel_info)(struct drm_device *, int, struct panel_info *);
+	int (*reset)(int pipe);
+	void (*drv_ic_init)(struct mdfld_dsi_config *dsi_config, int pipe);
+};
+
+int mdfld_output_init(struct drm_device *dev);
+
+struct backlight_device *mdfld_get_backlight_device(void);
+int mdfld_set_brightness(struct backlight_device *bd);
+
+int mdfld_get_panel_type(struct drm_device *dev, int pipe);
+
+extern const struct drm_crtc_helper_funcs mdfld_helper_funcs;
+
+extern const struct panel_funcs mdfld_tmd_vid_funcs;
+extern const struct panel_funcs mdfld_tpo_vid_funcs;
+
+extern void mdfld_disable_crtc(struct drm_device *dev, int pipe);
+extern void mdfldWaitForPipeEnable(struct drm_device *dev, int pipe);
+extern void mdfldWaitForPipeDisable(struct drm_device *dev, int pipe);
+#endif
diff --git a/drivers/gpu/drm/gma500/mdfld_tmd_vid.c b/drivers/gpu/drm/gma500/mdfld_tmd_vid.c
new file mode 100644
index 000000000000..dc0c6c3d3d29
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_tmd_vid.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jim Liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ * Gideon Eaton <eaton.
+ * Scott Rowe <scott.m.rowe@intel.com>
+ */
+
+#include "mdfld_dsi_dpi.h"
+#include "mdfld_dsi_pkg_sender.h"
+
+static struct drm_display_mode *tmd_vid_get_config_mode(struct drm_device *dev)
+{
+	struct drm_display_mode *mode;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct oaktrail_timing_info *ti = &dev_priv->gct_data.DTD;
+	bool use_gct = false; /*Disable GCT for now*/
+
+	mode = kzalloc(sizeof(*mode), GFP_KERNEL);
+	if (!mode)
+		return NULL;
+
+	if (use_gct) {
+		mode->hdisplay = (ti->hactive_hi << 8) | ti->hactive_lo;
+		mode->vdisplay = (ti->vactive_hi << 8) | ti->vactive_lo;
+		mode->hsync_start = mode->hdisplay + \
+				((ti->hsync_offset_hi << 8) | \
+				ti->hsync_offset_lo);
+		mode->hsync_end = mode->hsync_start + \
+				((ti->hsync_pulse_width_hi << 8) | \
+				ti->hsync_pulse_width_lo);
+		mode->htotal = mode->hdisplay + ((ti->hblank_hi << 8) | \
+								ti->hblank_lo);
+		mode->vsync_start = \
+			mode->vdisplay + ((ti->vsync_offset_hi << 8) | \
+						ti->vsync_offset_lo);
+		mode->vsync_end = \
+			mode->vsync_start + ((ti->vsync_pulse_width_hi << 8) | \
+						ti->vsync_pulse_width_lo);
+		mode->vtotal = mode->vdisplay + \
+				((ti->vblank_hi << 8) | ti->vblank_lo);
+		mode->clock = ti->pixel_clock * 10;
+
+		dev_dbg(dev->dev, "hdisplay is %d\n", mode->hdisplay);
+		dev_dbg(dev->dev, "vdisplay is %d\n", mode->vdisplay);
+		dev_dbg(dev->dev, "HSS is %d\n", mode->hsync_start);
+		dev_dbg(dev->dev, "HSE is %d\n", mode->hsync_end);
+		dev_dbg(dev->dev, "htotal is %d\n", mode->htotal);
+		dev_dbg(dev->dev, "VSS is %d\n", mode->vsync_start);
+		dev_dbg(dev->dev, "VSE is %d\n", mode->vsync_end);
+		dev_dbg(dev->dev, "vtotal is %d\n", mode->vtotal);
+		dev_dbg(dev->dev, "clock is %d\n", mode->clock);
+	} else {
+		mode->hdisplay = 480;
+		mode->vdisplay = 854;
+		mode->hsync_start = 487;
+		mode->hsync_end = 490;
+		mode->htotal = 499;
+		mode->vsync_start = 861;
+		mode->vsync_end = 865;
+		mode->vtotal = 873;
+		mode->clock = 33264;
+	}
+
+	drm_mode_set_name(mode);
+	drm_mode_set_crtcinfo(mode, 0);
+
+	mode->type |= DRM_MODE_TYPE_PREFERRED;
+
+	return mode;
+}
+
+static int tmd_vid_get_panel_info(struct drm_device *dev,
+				int pipe,
+				struct panel_info *pi)
+{
+	if (!dev || !pi)
+		return -EINVAL;
+
+	pi->width_mm = TMD_PANEL_WIDTH;
+	pi->height_mm = TMD_PANEL_HEIGHT;
+
+	return 0;
+}
+
+/* ************************************************************************* *\
+ * FUNCTION: mdfld_init_TMD_MIPI
+ *
+ * DESCRIPTION:  This function is called only by mrst_dsi_mode_set and
+ *               restore_display_registers.  since this function does not
+ *               acquire the mutex, it is important that the calling function
+ *               does!
+\* ************************************************************************* */
+
+/* FIXME: make the below data u8 instead of u32; note byte order! */
+static u32 tmd_cmd_mcap_off[] = {0x000000b2};
+static u32 tmd_cmd_enable_lane_switch[] = {0x000101ef};
+static u32 tmd_cmd_set_lane_num[] = {0x006360ef};
+static u32 tmd_cmd_pushing_clock0[] = {0x00cc2fef};
+static u32 tmd_cmd_pushing_clock1[] = {0x00dd6eef};
+static u32 tmd_cmd_set_mode[] = {0x000000b3};
+static u32 tmd_cmd_set_sync_pulse_mode[] = {0x000961ef};
+static u32 tmd_cmd_set_column[] = {0x0100002a, 0x000000df};
+static u32 tmd_cmd_set_page[] = {0x0300002b, 0x00000055};
+static u32 tmd_cmd_set_video_mode[] = {0x00000153};
+/*no auto_bl,need add in furture*/
+static u32 tmd_cmd_enable_backlight[] = {0x00005ab4};
+static u32 tmd_cmd_set_backlight_dimming[] = {0x00000ebd};
+
+static void mdfld_dsi_tmd_drv_ic_init(struct mdfld_dsi_config *dsi_config,
+				      int pipe)
+{
+	struct mdfld_dsi_pkg_sender *sender
+			= mdfld_dsi_get_pkg_sender(dsi_config);
+
+	DRM_INFO("Enter mdfld init TMD MIPI display.\n");
+
+	if (!sender) {
+		DRM_ERROR("Cannot get sender\n");
+		return;
+	}
+
+	if (dsi_config->dvr_ic_inited)
+		return;
+
+	msleep(3);
+
+	/* FIXME: make the below data u8 instead of u32; note byte order! */
+
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_mcap_off,
+				sizeof(tmd_cmd_mcap_off), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_enable_lane_switch,
+				sizeof(tmd_cmd_enable_lane_switch), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_lane_num,
+				sizeof(tmd_cmd_set_lane_num), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_pushing_clock0,
+				sizeof(tmd_cmd_pushing_clock0), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_pushing_clock1,
+				sizeof(tmd_cmd_pushing_clock1), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_mode,
+				sizeof(tmd_cmd_set_mode), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_sync_pulse_mode,
+				sizeof(tmd_cmd_set_sync_pulse_mode), false);
+	mdfld_dsi_send_mcs_long(sender, (u8 *) tmd_cmd_set_column,
+				sizeof(tmd_cmd_set_column), false);
+	mdfld_dsi_send_mcs_long(sender, (u8 *) tmd_cmd_set_page,
+				sizeof(tmd_cmd_set_page), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_video_mode,
+				sizeof(tmd_cmd_set_video_mode), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_enable_backlight,
+				sizeof(tmd_cmd_enable_backlight), false);
+	mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_backlight_dimming,
+				sizeof(tmd_cmd_set_backlight_dimming), false);
+
+	dsi_config->dvr_ic_inited = 1;
+}
+
+/*TPO DPI encoder helper funcs*/
+static const struct drm_encoder_helper_funcs
+				mdfld_tpo_dpi_encoder_helper_funcs = {
+	.dpms = mdfld_dsi_dpi_dpms,
+	.mode_fixup = mdfld_dsi_dpi_mode_fixup,
+	.prepare = mdfld_dsi_dpi_prepare,
+	.mode_set = mdfld_dsi_dpi_mode_set,
+	.commit = mdfld_dsi_dpi_commit,
+};
+
+/*TPO DPI encoder funcs*/
+static const struct drm_encoder_funcs mdfld_tpo_dpi_encoder_funcs = {
+	.destroy = drm_encoder_cleanup,
+};
+
+const struct panel_funcs mdfld_tmd_vid_funcs = {
+	.encoder_funcs = &mdfld_tpo_dpi_encoder_funcs,
+	.encoder_helper_funcs = &mdfld_tpo_dpi_encoder_helper_funcs,
+	.get_config_mode = &tmd_vid_get_config_mode,
+	.get_panel_info = tmd_vid_get_panel_info,
+	.reset = mdfld_dsi_panel_reset,
+	.drv_ic_init = mdfld_dsi_tmd_drv_ic_init,
+};
diff --git a/drivers/gpu/drm/gma500/mdfld_tpo_vid.c b/drivers/gpu/drm/gma500/mdfld_tpo_vid.c
new file mode 100644
index 000000000000..d8d4170725b2
--- /dev/null
+++ b/drivers/gpu/drm/gma500/mdfld_tpo_vid.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * jim liu <jim.liu@intel.com>
+ * Jackie Li<yaodong.li@intel.com>
+ */
+
+#include "mdfld_dsi_dpi.h"
+
+static struct drm_display_mode *tpo_vid_get_config_mode(struct drm_device *dev)
+{
+	struct drm_display_mode *mode;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+	struct oaktrail_timing_info *ti = &dev_priv->gct_data.DTD;
+	bool use_gct = false;
+
+	mode = kzalloc(sizeof(*mode), GFP_KERNEL);
+	if (!mode)
+		return NULL;
+
+	if (use_gct) {
+		mode->hdisplay = (ti->hactive_hi << 8) | ti->hactive_lo;
+		mode->vdisplay = (ti->vactive_hi << 8) | ti->vactive_lo;
+		mode->hsync_start = mode->hdisplay +
+				((ti->hsync_offset_hi << 8) |
+				ti->hsync_offset_lo);
+		mode->hsync_end = mode->hsync_start +
+				((ti->hsync_pulse_width_hi << 8) |
+				ti->hsync_pulse_width_lo);
+		mode->htotal = mode->hdisplay + ((ti->hblank_hi << 8) |
+								ti->hblank_lo);
+		mode->vsync_start =
+			mode->vdisplay + ((ti->vsync_offset_hi << 8) |
+						ti->vsync_offset_lo);
+		mode->vsync_end =
+			mode->vsync_start + ((ti->vsync_pulse_width_hi << 8) |
+						ti->vsync_pulse_width_lo);
+		mode->vtotal = mode->vdisplay +
+				((ti->vblank_hi << 8) | ti->vblank_lo);
+		mode->clock = ti->pixel_clock * 10;
+
+		dev_dbg(dev->dev, "hdisplay is %d\n", mode->hdisplay);
+		dev_dbg(dev->dev, "vdisplay is %d\n", mode->vdisplay);
+		dev_dbg(dev->dev, "HSS is %d\n", mode->hsync_start);
+		dev_dbg(dev->dev, "HSE is %d\n", mode->hsync_end);
+		dev_dbg(dev->dev, "htotal is %d\n", mode->htotal);
+		dev_dbg(dev->dev, "VSS is %d\n", mode->vsync_start);
+		dev_dbg(dev->dev, "VSE is %d\n", mode->vsync_end);
+		dev_dbg(dev->dev, "vtotal is %d\n", mode->vtotal);
+		dev_dbg(dev->dev, "clock is %d\n", mode->clock);
+	} else {
+		mode->hdisplay = 864;
+		mode->vdisplay = 480;
+		mode->hsync_start = 873;
+		mode->hsync_end = 876;
+		mode->htotal = 887;
+		mode->vsync_start = 487;
+		mode->vsync_end = 490;
+		mode->vtotal = 499;
+		mode->clock = 33264;
+	}
+
+	drm_mode_set_name(mode);
+	drm_mode_set_crtcinfo(mode, 0);
+
+	mode->type |= DRM_MODE_TYPE_PREFERRED;
+
+	return mode;
+}
+
+static int tpo_vid_get_panel_info(struct drm_device *dev,
+				int pipe,
+				struct panel_info *pi)
+{
+	if (!dev || !pi)
+		return -EINVAL;
+
+	pi->width_mm = TPO_PANEL_WIDTH;
+	pi->height_mm = TPO_PANEL_HEIGHT;
+
+	return 0;
+}
+
+/*TPO DPI encoder helper funcs*/
+static const struct drm_encoder_helper_funcs
+				mdfld_tpo_dpi_encoder_helper_funcs = {
+	.dpms = mdfld_dsi_dpi_dpms,
+	.mode_fixup = mdfld_dsi_dpi_mode_fixup,
+	.prepare = mdfld_dsi_dpi_prepare,
+	.mode_set = mdfld_dsi_dpi_mode_set,
+	.commit = mdfld_dsi_dpi_commit,
+};
+
+/*TPO DPI encoder funcs*/
+static const struct drm_encoder_funcs mdfld_tpo_dpi_encoder_funcs = {
+	.destroy = drm_encoder_cleanup,
+};
+
+const struct panel_funcs mdfld_tpo_vid_funcs = {
+	.encoder_funcs = &mdfld_tpo_dpi_encoder_funcs,
+	.encoder_helper_funcs = &mdfld_tpo_dpi_encoder_helper_funcs,
+	.get_config_mode = &tpo_vid_get_config_mode,
+	.get_panel_info = tpo_vid_get_panel_info,
+};
diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c
index 1f57aac2cf80..fc3293049fe7 100644
--- a/drivers/gpu/drm/gma500/psb_drv.c
+++ b/drivers/gpu/drm/gma500/psb_drv.c
@@ -60,6 +60,16 @@ static DEFINE_PCI_DEVICE_TABLE(pciidlist) = {
 	/* Atom E620 */
 	{ 0x8086, 0x4108, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &oaktrail_chip_ops},
 #endif
+#if defined(CONFIG_DRM_MEDFIELD)
+	{0x8086, 0x0130, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0131, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0132, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0133, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0134, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0135, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0136, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+	{0x8086, 0x0137, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops},
+#endif
 #if defined(CONFIG_DRM_GMA3600)
 	{ 0x8086, 0x0be0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &cdv_chip_ops},
 	{ 0x8086, 0x0be1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &cdv_chip_ops},
diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h
index 3c0bf7be2738..af1c99752000 100644
--- a/drivers/gpu/drm/gma500/psb_drv.h
+++ b/drivers/gpu/drm/gma500/psb_drv.h
@@ -389,11 +389,79 @@ struct psb_state {
 	uint32_t savePWM_CONTROL_LOGIC;
 };
 
+struct medfield_state {
+	uint32_t saveDPLL_A;
+	uint32_t saveFPA0;
+	uint32_t savePIPEACONF;
+	uint32_t saveHTOTAL_A;
+	uint32_t saveHBLANK_A;
+	uint32_t saveHSYNC_A;
+	uint32_t saveVTOTAL_A;
+	uint32_t saveVBLANK_A;
+	uint32_t saveVSYNC_A;
+	uint32_t savePIPEASRC;
+	uint32_t saveDSPASTRIDE;
+	uint32_t saveDSPALINOFF;
+	uint32_t saveDSPATILEOFF;
+	uint32_t saveDSPASIZE;
+	uint32_t saveDSPAPOS;
+	uint32_t saveDSPASURF;
+	uint32_t saveDSPACNTR;
+	uint32_t saveDSPASTATUS;
+	uint32_t save_palette_a[256];
+	uint32_t saveMIPI;
+
+	uint32_t saveDPLL_B;
+	uint32_t saveFPB0;
+	uint32_t savePIPEBCONF;
+	uint32_t saveHTOTAL_B;
+	uint32_t saveHBLANK_B;
+	uint32_t saveHSYNC_B;
+	uint32_t saveVTOTAL_B;
+	uint32_t saveVBLANK_B;
+	uint32_t saveVSYNC_B;
+	uint32_t savePIPEBSRC;
+	uint32_t saveDSPBSTRIDE;
+	uint32_t saveDSPBLINOFF;
+	uint32_t saveDSPBTILEOFF;
+	uint32_t saveDSPBSIZE;
+	uint32_t saveDSPBPOS;
+	uint32_t saveDSPBSURF;
+	uint32_t saveDSPBCNTR;
+	uint32_t saveDSPBSTATUS;
+	uint32_t save_palette_b[256];
+
+	uint32_t savePIPECCONF;
+	uint32_t saveHTOTAL_C;
+	uint32_t saveHBLANK_C;
+	uint32_t saveHSYNC_C;
+	uint32_t saveVTOTAL_C;
+	uint32_t saveVBLANK_C;
+	uint32_t saveVSYNC_C;
+	uint32_t savePIPECSRC;
+	uint32_t saveDSPCSTRIDE;
+	uint32_t saveDSPCLINOFF;
+	uint32_t saveDSPCTILEOFF;
+	uint32_t saveDSPCSIZE;
+	uint32_t saveDSPCPOS;
+	uint32_t saveDSPCSURF;
+	uint32_t saveDSPCCNTR;
+	uint32_t saveDSPCSTATUS;
+	uint32_t save_palette_c[256];
+	uint32_t saveMIPI_C;
+
+	uint32_t savePFIT_CONTROL;
+	uint32_t savePFIT_PGM_RATIOS;
+	uint32_t saveHDMIPHYMISCCTL;
+	uint32_t saveHDMIB_CONTROL;
+};
+
 struct psb_save_area {
 	uint32_t saveBSM;
 	uint32_t saveVBT;
 	union {
 	        struct psb_state psb;
+		struct medfield_state mdfld;
 	};
 	uint32_t saveBLC_PWM_CTL2;
 	uint32_t saveBLC_PWM_CTL;
@@ -563,6 +631,24 @@ struct drm_psb_private {
 
 	/* 2D acceleration */
 	spinlock_t lock_2d;
+
+	/*
+	 * Panel brightness
+	 */
+	int brightness;
+	int brightness_adjusted;
+
+	bool dsr_enable;
+	u32 dsr_fb_update;
+	bool dpi_panel_on[3];
+	void *dsi_configs[2];
+	u32 bpp;
+	u32 bpp2;
+
+	u32 pipeconf[3];
+	u32 dspcntr[3];
+
+	int mdfld_panel_id;
 };
 
 
@@ -758,6 +844,9 @@ extern const struct psb_ops psb_chip_ops;
 /* oaktrail_device.c */
 extern const struct psb_ops oaktrail_chip_ops;
 
+/* mdlfd_device.c */
+extern const struct psb_ops mdfld_chip_ops;
+
 /* cdv_device.c */
 extern const struct psb_ops cdv_chip_ops;
 
diff --git a/drivers/gpu/drm/gma500/psb_irq.c b/drivers/gpu/drm/gma500/psb_irq.c
index 7be802baceb5..a86fc3c4bf3a 100644
--- a/drivers/gpu/drm/gma500/psb_irq.c
+++ b/drivers/gpu/drm/gma500/psb_irq.c
@@ -27,6 +27,8 @@
 #include "psb_reg.h"
 #include "psb_intel_reg.h"
 #include "power.h"
+#include "psb_irq.h"
+#include "mdfld_output.h"
 
 /*
  * inline functions
@@ -453,6 +455,11 @@ int psb_enable_vblank(struct drm_device *dev, int pipe)
 	uint32_t reg_val = 0;
 	uint32_t pipeconf_reg = mid_pipeconf(pipe);
 
+	/* Medfield is different - we should perhaps extract out vblank
+	   and blacklight etc ops */
+	if (IS_MFLD(dev))
+		return mdfld_enable_te(dev, pipe);
+
 	if (gma_power_begin(dev, false)) {
 		reg_val = REG_READ(pipeconf_reg);
 		gma_power_end(dev);
@@ -485,6 +492,8 @@ void psb_disable_vblank(struct drm_device *dev, int pipe)
 	struct drm_psb_private *dev_priv = dev->dev_private;
 	unsigned long irqflags;
 
+	if (IS_MFLD(dev))
+		mdfld_disable_te(dev, pipe);
 	spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags);
 
 	if (pipe == 0)
@@ -499,6 +508,55 @@ void psb_disable_vblank(struct drm_device *dev, int pipe)
 	spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags);
 }
 
+/*
+ * It is used to enable TE interrupt
+ */
+int mdfld_enable_te(struct drm_device *dev, int pipe)
+{
+	struct drm_psb_private *dev_priv =
+		(struct drm_psb_private *) dev->dev_private;
+	unsigned long irqflags;
+	uint32_t reg_val = 0;
+	uint32_t pipeconf_reg = mid_pipeconf(pipe);
+
+	if (gma_power_begin(dev, false)) {
+		reg_val = REG_READ(pipeconf_reg);
+		gma_power_end(dev);
+	}
+
+	if (!(reg_val & PIPEACONF_ENABLE))
+		return -EINVAL;
+
+	spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags);
+
+	mid_enable_pipe_event(dev_priv, pipe);
+	psb_enable_pipestat(dev_priv, pipe, PIPE_TE_ENABLE);
+
+	spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * It is used to disable TE interrupt
+ */
+void mdfld_disable_te(struct drm_device *dev, int pipe)
+{
+	struct drm_psb_private *dev_priv =
+		(struct drm_psb_private *) dev->dev_private;
+	unsigned long irqflags;
+
+	if (!dev_priv->dsr_enable)
+		return;
+
+	spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags);
+
+	mid_disable_pipe_event(dev_priv, pipe);
+	psb_disable_pipestat(dev_priv, pipe, PIPE_TE_ENABLE);
+
+	spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags);
+}
+
 /* Called from drm generic code, passed a 'crtc', which
  * we use as a pipe index
  */
diff --git a/drivers/gpu/drm/gma500/psb_irq.h b/drivers/gpu/drm/gma500/psb_irq.h
index 216fda38b57d..603045bee58a 100644
--- a/drivers/gpu/drm/gma500/psb_irq.h
+++ b/drivers/gpu/drm/gma500/psb_irq.h
@@ -42,4 +42,6 @@ int  psb_enable_vblank(struct drm_device *dev, int pipe);
 void psb_disable_vblank(struct drm_device *dev, int pipe);
 u32  psb_get_vblank_counter(struct drm_device *dev, int pipe);
 
+int mdfld_enable_te(struct drm_device *dev, int pipe);
+void mdfld_disable_te(struct drm_device *dev, int pipe);
 #endif /* _SYSIRQ_H_ */
diff --git a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c
new file mode 100644
index 000000000000..4a07ab596174
--- /dev/null
+++ b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "mdfld_dsi_dpi.h"
+#include "mdfld_output.h"
+#include "mdfld_dsi_pkg_sender.h"
+#include "tc35876x-dsi-lvds.h"
+#include <linux/i2c/tc35876x.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/intel_scu_ipc.h>
+
+static struct i2c_client *tc35876x_client;
+static struct i2c_client *cmi_lcd_i2c_client;
+
+#define FLD_MASK(start, end)	(((1 << ((start) - (end) + 1)) - 1) << (end))
+#define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end))
+
+/* DSI D-PHY Layer Registers */
+#define D0W_DPHYCONTTX		0x0004
+#define CLW_DPHYCONTRX		0x0020
+#define D0W_DPHYCONTRX		0x0024
+#define D1W_DPHYCONTRX		0x0028
+#define D2W_DPHYCONTRX		0x002C
+#define D3W_DPHYCONTRX		0x0030
+#define COM_DPHYCONTRX		0x0038
+#define CLW_CNTRL		0x0040
+#define D0W_CNTRL		0x0044
+#define D1W_CNTRL		0x0048
+#define D2W_CNTRL		0x004C
+#define D3W_CNTRL		0x0050
+#define DFTMODE_CNTRL		0x0054
+
+/* DSI PPI Layer Registers */
+#define PPI_STARTPPI		0x0104
+#define PPI_BUSYPPI		0x0108
+#define PPI_LINEINITCNT		0x0110
+#define PPI_LPTXTIMECNT		0x0114
+#define PPI_LANEENABLE		0x0134
+#define PPI_TX_RX_TA		0x013C
+#define PPI_CLS_ATMR		0x0140
+#define PPI_D0S_ATMR		0x0144
+#define PPI_D1S_ATMR		0x0148
+#define PPI_D2S_ATMR		0x014C
+#define PPI_D3S_ATMR		0x0150
+#define PPI_D0S_CLRSIPOCOUNT	0x0164
+#define PPI_D1S_CLRSIPOCOUNT	0x0168
+#define PPI_D2S_CLRSIPOCOUNT	0x016C
+#define PPI_D3S_CLRSIPOCOUNT	0x0170
+#define CLS_PRE			0x0180
+#define D0S_PRE			0x0184
+#define D1S_PRE			0x0188
+#define D2S_PRE			0x018C
+#define D3S_PRE			0x0190
+#define CLS_PREP		0x01A0
+#define D0S_PREP		0x01A4
+#define D1S_PREP		0x01A8
+#define D2S_PREP		0x01AC
+#define D3S_PREP		0x01B0
+#define CLS_ZERO		0x01C0
+#define D0S_ZERO		0x01C4
+#define D1S_ZERO		0x01C8
+#define D2S_ZERO		0x01CC
+#define D3S_ZERO		0x01D0
+#define PPI_CLRFLG		0x01E0
+#define PPI_CLRSIPO		0x01E4
+#define HSTIMEOUT		0x01F0
+#define HSTIMEOUTENABLE		0x01F4
+
+/* DSI Protocol Layer Registers */
+#define DSI_STARTDSI		0x0204
+#define DSI_BUSYDSI		0x0208
+#define DSI_LANEENABLE		0x0210
+#define DSI_LANESTATUS0		0x0214
+#define DSI_LANESTATUS1		0x0218
+#define DSI_INTSTATUS		0x0220
+#define DSI_INTMASK		0x0224
+#define DSI_INTCLR		0x0228
+#define DSI_LPTXTO		0x0230
+
+/* DSI General Registers */
+#define DSIERRCNT		0x0300
+
+/* DSI Application Layer Registers */
+#define APLCTRL			0x0400
+#define RDPKTLN			0x0404
+
+/* Video Path Registers */
+#define VPCTRL			0x0450
+#define HTIM1			0x0454
+#define HTIM2			0x0458
+#define VTIM1			0x045C
+#define VTIM2			0x0460
+#define VFUEN			0x0464
+
+/* LVDS Registers */
+#define LVMX0003		0x0480
+#define LVMX0407		0x0484
+#define LVMX0811		0x0488
+#define LVMX1215		0x048C
+#define LVMX1619		0x0490
+#define LVMX2023		0x0494
+#define LVMX2427		0x0498
+#define LVCFG			0x049C
+#define LVPHY0			0x04A0
+#define LVPHY1			0x04A4
+
+/* System Registers */
+#define SYSSTAT			0x0500
+#define SYSRST			0x0504
+
+/* GPIO Registers */
+/*#define GPIOC			0x0520*/
+#define GPIOO			0x0524
+#define GPIOI			0x0528
+
+/* I2C Registers */
+#define I2CTIMCTRL		0x0540
+#define I2CMADDR		0x0544
+#define WDATAQ			0x0548
+#define RDATAQ			0x054C
+
+/* Chip/Rev Registers */
+#define IDREG			0x0580
+
+/* Debug Registers */
+#define DEBUG00			0x05A0
+#define DEBUG01			0x05A4
+
+/* Panel CABC registers */
+#define PANEL_PWM_CONTROL	0x90
+#define PANEL_FREQ_DIVIDER_HI	0x91
+#define PANEL_FREQ_DIVIDER_LO	0x92
+#define PANEL_DUTY_CONTROL	0x93
+#define PANEL_MODIFY_RGB	0x94
+#define PANEL_FRAMERATE_CONTROL	0x96
+#define PANEL_PWM_MIN		0x97
+#define PANEL_PWM_REF		0x98
+#define PANEL_PWM_MAX		0x99
+#define PANEL_ALLOW_DISTORT	0x9A
+#define PANEL_BYPASS_PWMI	0x9B
+
+/* Panel color management registers */
+#define PANEL_CM_ENABLE		0x700
+#define PANEL_CM_HUE		0x701
+#define PANEL_CM_SATURATION	0x702
+#define PANEL_CM_INTENSITY	0x703
+#define PANEL_CM_BRIGHTNESS	0x704
+#define PANEL_CM_CE_ENABLE	0x705
+#define PANEL_CM_PEAK_EN	0x710
+#define PANEL_CM_GAIN		0x711
+#define PANEL_CM_HUETABLE_START	0x730
+#define PANEL_CM_HUETABLE_END	0x747 /* inclusive */
+
+/* Input muxing for registers LVMX0003...LVMX2427 */
+enum {
+	INPUT_R0,	/* 0 */
+	INPUT_R1,
+	INPUT_R2,
+	INPUT_R3,
+	INPUT_R4,
+	INPUT_R5,
+	INPUT_R6,
+	INPUT_R7,
+	INPUT_G0,	/* 8 */
+	INPUT_G1,
+	INPUT_G2,
+	INPUT_G3,
+	INPUT_G4,
+	INPUT_G5,
+	INPUT_G6,
+	INPUT_G7,
+	INPUT_B0,	/* 16 */
+	INPUT_B1,
+	INPUT_B2,
+	INPUT_B3,
+	INPUT_B4,
+	INPUT_B5,
+	INPUT_B6,
+	INPUT_B7,
+	INPUT_HSYNC,	/* 24 */
+	INPUT_VSYNC,
+	INPUT_DE,
+	LOGIC_0,
+	/* 28...31 undefined */
+};
+
+#define INPUT_MUX(lvmx03, lvmx02, lvmx01, lvmx00)		\
+	(FLD_VAL(lvmx03, 29, 24) | FLD_VAL(lvmx02, 20, 16) |	\
+	FLD_VAL(lvmx01, 12, 8) | FLD_VAL(lvmx00, 4, 0))
+
+/**
+ * tc35876x_regw - Write DSI-LVDS bridge register using I2C
+ * @client: struct i2c_client to use
+ * @reg: register address
+ * @value: value to write
+ *
+ * Returns 0 on success, or a negative error value.
+ */
+static int tc35876x_regw(struct i2c_client *client, u16 reg, u32 value)
+{
+	int r;
+	u8 tx_data[] = {
+		/* NOTE: Register address big-endian, data little-endian. */
+		(reg >> 8) & 0xff,
+		reg & 0xff,
+		value & 0xff,
+		(value >> 8) & 0xff,
+		(value >> 16) & 0xff,
+		(value >> 24) & 0xff,
+	};
+	struct i2c_msg msgs[] = {
+		{
+			.addr = client->addr,
+			.flags = 0,
+			.buf = tx_data,
+			.len = ARRAY_SIZE(tx_data),
+		},
+	};
+
+	r = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
+	if (r < 0) {
+		dev_err(&client->dev, "%s: reg 0x%04x val 0x%08x error %d\n",
+			__func__, reg, value, r);
+		return r;
+	}
+
+	if (r < ARRAY_SIZE(msgs)) {
+		dev_err(&client->dev, "%s: reg 0x%04x val 0x%08x msgs %d\n",
+			__func__, reg, value, r);
+		return -EAGAIN;
+	}
+
+	dev_dbg(&client->dev, "%s: reg 0x%04x val 0x%08x\n",
+			__func__, reg, value);
+
+	return 0;
+}
+
+/**
+ * tc35876x_regr - Read DSI-LVDS bridge register using I2C
+ * @client: struct i2c_client to use
+ * @reg: register address
+ * @value: pointer for storing the value
+ *
+ * Returns 0 on success, or a negative error value.
+ */
+static int tc35876x_regr(struct i2c_client *client, u16 reg, u32 *value)
+{
+	int r;
+	u8 tx_data[] = {
+		(reg >> 8) & 0xff,
+		reg & 0xff,
+	};
+	u8 rx_data[4];
+	struct i2c_msg msgs[] = {
+		{
+			.addr = client->addr,
+			.flags = 0,
+			.buf = tx_data,
+			.len = ARRAY_SIZE(tx_data),
+		},
+		{
+			.addr = client->addr,
+			.flags = I2C_M_RD,
+			.buf = rx_data,
+			.len = ARRAY_SIZE(rx_data),
+		 },
+	};
+
+	r = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
+	if (r < 0) {
+		dev_err(&client->dev, "%s: reg 0x%04x error %d\n", __func__,
+			reg, r);
+		return r;
+	}
+
+	if (r < ARRAY_SIZE(msgs)) {
+		dev_err(&client->dev, "%s: reg 0x%04x msgs %d\n", __func__,
+			reg, r);
+		return -EAGAIN;
+	}
+
+	*value = rx_data[0] << 24 | rx_data[1] << 16 |
+		rx_data[2] << 8 | rx_data[3];
+
+	dev_dbg(&client->dev, "%s: reg 0x%04x value 0x%08x\n", __func__,
+		reg, *value);
+
+	return 0;
+}
+
+void tc35876x_set_bridge_reset_state(struct drm_device *dev, int state)
+{
+	struct tc35876x_platform_data *pdata;
+
+	if (WARN(!tc35876x_client, "%s called before probe", __func__))
+		return;
+
+	dev_dbg(&tc35876x_client->dev, "%s: state %d\n", __func__, state);
+
+	pdata = dev_get_platdata(&tc35876x_client->dev);
+
+	if (pdata->gpio_bridge_reset == -1)
+		return;
+
+	if (state) {
+		gpio_set_value_cansleep(pdata->gpio_bridge_reset, 0);
+		mdelay(10);
+	} else {
+		/* Pull MIPI Bridge reset pin to Low */
+		gpio_set_value_cansleep(pdata->gpio_bridge_reset, 0);
+		mdelay(20);
+		/* Pull MIPI Bridge reset pin to High */
+		gpio_set_value_cansleep(pdata->gpio_bridge_reset, 1);
+		mdelay(40);
+	}
+}
+
+void tc35876x_configure_lvds_bridge(struct drm_device *dev)
+{
+	struct i2c_client *i2c = tc35876x_client;
+	u32 ppi_lptxtimecnt;
+	u32 txtagocnt;
+	u32 txtasurecnt;
+	u32 id;
+
+	if (WARN(!tc35876x_client, "%s called before probe", __func__))
+		return;
+
+	dev_dbg(&tc35876x_client->dev, "%s\n", __func__);
+
+	if (!tc35876x_regr(i2c, IDREG, &id))
+		dev_info(&tc35876x_client->dev, "tc35876x ID 0x%08x\n", id);
+	else
+		dev_err(&tc35876x_client->dev, "Cannot read ID\n");
+
+	ppi_lptxtimecnt = 4;
+	txtagocnt = (5 * ppi_lptxtimecnt - 3) / 4;
+	txtasurecnt = 3 * ppi_lptxtimecnt / 2;
+	tc35876x_regw(i2c, PPI_TX_RX_TA, FLD_VAL(txtagocnt, 26, 16) |
+		FLD_VAL(txtasurecnt, 10, 0));
+	tc35876x_regw(i2c, PPI_LPTXTIMECNT, FLD_VAL(ppi_lptxtimecnt, 10, 0));
+
+	tc35876x_regw(i2c, PPI_D0S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0));
+	tc35876x_regw(i2c, PPI_D1S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0));
+	tc35876x_regw(i2c, PPI_D2S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0));
+	tc35876x_regw(i2c, PPI_D3S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0));
+
+	/* Enabling MIPI & PPI lanes, Enable 4 lanes */
+	tc35876x_regw(i2c, PPI_LANEENABLE,
+		BIT(4) | BIT(3) | BIT(2) | BIT(1) | BIT(0));
+	tc35876x_regw(i2c, DSI_LANEENABLE,
+		BIT(4) | BIT(3) | BIT(2) | BIT(1) | BIT(0));
+	tc35876x_regw(i2c, PPI_STARTPPI, BIT(0));
+	tc35876x_regw(i2c, DSI_STARTDSI, BIT(0));
+
+	/* Setting LVDS output frequency */
+	tc35876x_regw(i2c, LVPHY0, FLD_VAL(1, 20, 16) |
+		FLD_VAL(2, 15, 14) | FLD_VAL(6, 4, 0)); /* 0x00048006 */
+
+	/* Setting video panel control register,0x00000120 VTGen=ON ?!?!? */
+	tc35876x_regw(i2c, VPCTRL, BIT(8) | BIT(5));
+
+	/* Horizontal back porch and horizontal pulse width. 0x00280028 */
+	tc35876x_regw(i2c, HTIM1, FLD_VAL(40, 24, 16) | FLD_VAL(40, 8, 0));
+
+	/* Horizontal front porch and horizontal active video size. 0x00500500*/
+	tc35876x_regw(i2c, HTIM2, FLD_VAL(80, 24, 16) | FLD_VAL(1280, 10, 0));
+
+	/* Vertical back porch and vertical sync pulse width. 0x000e000a */
+	tc35876x_regw(i2c, VTIM1, FLD_VAL(14, 23, 16) | FLD_VAL(10, 7, 0));
+
+	/* Vertical front porch and vertical display size. 0x000e0320 */
+	tc35876x_regw(i2c, VTIM2, FLD_VAL(14, 23, 16) | FLD_VAL(800, 10, 0));
+
+	/* Set above HTIM1, HTIM2, VTIM1, and VTIM2 at next VSYNC. */
+	tc35876x_regw(i2c, VFUEN, BIT(0));
+
+	/* Soft reset LCD controller. */
+	tc35876x_regw(i2c, SYSRST, BIT(2));
+
+	/* LVDS-TX input muxing */
+	tc35876x_regw(i2c, LVMX0003,
+		INPUT_MUX(INPUT_R5, INPUT_R4, INPUT_R3, INPUT_R2));
+	tc35876x_regw(i2c, LVMX0407,
+		INPUT_MUX(INPUT_G2, INPUT_R7, INPUT_R1, INPUT_R6));
+	tc35876x_regw(i2c, LVMX0811,
+		INPUT_MUX(INPUT_G1, INPUT_G0, INPUT_G4, INPUT_G3));
+	tc35876x_regw(i2c, LVMX1215,
+		INPUT_MUX(INPUT_B2, INPUT_G7, INPUT_G6, INPUT_G5));
+	tc35876x_regw(i2c, LVMX1619,
+		INPUT_MUX(INPUT_B4, INPUT_B3, INPUT_B1, INPUT_B0));
+	tc35876x_regw(i2c, LVMX2023,
+		INPUT_MUX(LOGIC_0,  INPUT_B7, INPUT_B6, INPUT_B5));
+	tc35876x_regw(i2c, LVMX2427,
+		INPUT_MUX(INPUT_R0, INPUT_DE, INPUT_VSYNC, INPUT_HSYNC));
+
+	/* Enable LVDS transmitter. */
+	tc35876x_regw(i2c, LVCFG, BIT(0));
+
+	/* Clear notifications. Don't write reserved bits. Was write 0xffffffff
+	 * to 0x0288, must be in error?! */
+	tc35876x_regw(i2c, DSI_INTCLR, FLD_MASK(31, 30) | FLD_MASK(22, 0));
+}
+
+#define GPIOPWMCTRL	0x38F
+#define PWM0CLKDIV0	0x62 /* low byte */
+#define PWM0CLKDIV1	0x61 /* high byte */
+
+#define SYSTEMCLK	19200000UL /* 19.2 MHz */
+#define PWM_FREQUENCY	9600 /* Hz */
+
+/* f = baseclk / (clkdiv + 1) => clkdiv = (baseclk - f) / f */
+static inline u16 calc_clkdiv(unsigned long baseclk, unsigned int f)
+{
+	return (baseclk - f) / f;
+}
+
+static void tc35876x_brightness_init(struct drm_device *dev)
+{
+	int ret;
+	u8 pwmctrl;
+	u16 clkdiv;
+
+	/* Make sure the PWM reference is the 19.2 MHz system clock. Read first
+	 * instead of setting directly to catch potential conflicts between PWM
+	 * users. */
+	ret = intel_scu_ipc_ioread8(GPIOPWMCTRL, &pwmctrl);
+	if (ret || pwmctrl != 0x01) {
+		if (ret)
+			dev_err(&dev->pdev->dev, "GPIOPWMCTRL read failed\n");
+		else
+			dev_warn(&dev->pdev->dev, "GPIOPWMCTRL was not set to system clock (pwmctrl = 0x%02x)\n", pwmctrl);
+
+		ret = intel_scu_ipc_iowrite8(GPIOPWMCTRL, 0x01);
+		if (ret)
+			dev_err(&dev->pdev->dev, "GPIOPWMCTRL set failed\n");
+	}
+
+	clkdiv = calc_clkdiv(SYSTEMCLK, PWM_FREQUENCY);
+
+	ret = intel_scu_ipc_iowrite8(PWM0CLKDIV1, (clkdiv >> 8) & 0xff);
+	if (!ret)
+		ret = intel_scu_ipc_iowrite8(PWM0CLKDIV0, clkdiv & 0xff);
+
+	if (ret)
+		dev_err(&dev->pdev->dev, "PWM0CLKDIV set failed\n");
+	else
+		dev_dbg(&dev->pdev->dev, "PWM0CLKDIV set to 0x%04x (%d Hz)\n",
+			clkdiv, PWM_FREQUENCY);
+}
+
+#define PWM0DUTYCYCLE			0x67
+
+void tc35876x_brightness_control(struct drm_device *dev, int level)
+{
+	int ret;
+	u8 duty_val;
+	u8 panel_duty_val;
+
+	level = clamp(level, 0, MDFLD_DSI_BRIGHTNESS_MAX_LEVEL);
+
+	/* PWM duty cycle 0x00...0x63 corresponds to 0...99% */
+	duty_val = level * 0x63 / MDFLD_DSI_BRIGHTNESS_MAX_LEVEL;
+
+	/* I won't pretend to understand this formula. The panel spec is quite
+	 * bad engrish.
+	 */
+	panel_duty_val = (2 * level - 100) * 0xA9 /
+			 MDFLD_DSI_BRIGHTNESS_MAX_LEVEL + 0x56;
+
+	ret = intel_scu_ipc_iowrite8(PWM0DUTYCYCLE, duty_val);
+	if (ret)
+		dev_err(&tc35876x_client->dev, "%s: ipc write fail\n",
+			__func__);
+
+	if (cmi_lcd_i2c_client) {
+		ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client,
+						PANEL_PWM_MAX, panel_duty_val);
+		if (ret < 0)
+			dev_err(&cmi_lcd_i2c_client->dev, "%s: i2c write failed\n",
+				__func__);
+	}
+}
+
+void tc35876x_toshiba_bridge_panel_off(struct drm_device *dev)
+{
+	struct tc35876x_platform_data *pdata;
+
+	if (WARN(!tc35876x_client, "%s called before probe", __func__))
+		return;
+
+	dev_dbg(&tc35876x_client->dev, "%s\n", __func__);
+
+	pdata = dev_get_platdata(&tc35876x_client->dev);
+
+	if (pdata->gpio_panel_bl_en != -1)
+		gpio_set_value_cansleep(pdata->gpio_panel_bl_en, 0);
+
+	if (pdata->gpio_panel_vadd != -1)
+		gpio_set_value_cansleep(pdata->gpio_panel_vadd, 0);
+}
+
+void tc35876x_toshiba_bridge_panel_on(struct drm_device *dev)
+{
+	struct tc35876x_platform_data *pdata;
+	struct drm_psb_private *dev_priv = dev->dev_private;
+
+	if (WARN(!tc35876x_client, "%s called before probe", __func__))
+		return;
+
+	dev_dbg(&tc35876x_client->dev, "%s\n", __func__);
+
+	pdata = dev_get_platdata(&tc35876x_client->dev);
+
+	if (pdata->gpio_panel_vadd != -1) {
+		gpio_set_value_cansleep(pdata->gpio_panel_vadd, 1);
+		msleep(260);
+	}
+
+	if (cmi_lcd_i2c_client) {
+		int ret;
+		dev_dbg(&cmi_lcd_i2c_client->dev, "setting TCON\n");
+		/* Bit 4 is average_saving. Setting it to 1, the brightness is
+		 * referenced to the average of the frame content. 0 means
+		 * reference to the maximum of frame contents. Bits 3:0 are
+		 * allow_distort. When set to a nonzero value, all color values
+		 * between 255-allow_distort*2 and 255 are mapped to the
+		 * 255-allow_distort*2 value.
+		 */
+		ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client,
+						PANEL_ALLOW_DISTORT, 0x10);
+		if (ret < 0)
+			dev_err(&cmi_lcd_i2c_client->dev,
+				"i2c write failed (%d)\n", ret);
+		ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client,
+						PANEL_BYPASS_PWMI, 0);
+		if (ret < 0)
+			dev_err(&cmi_lcd_i2c_client->dev,
+				"i2c write failed (%d)\n", ret);
+		/* Set minimum brightness value - this is tunable */
+		ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client,
+						PANEL_PWM_MIN, 0x35);
+		if (ret < 0)
+			dev_err(&cmi_lcd_i2c_client->dev,
+				"i2c write failed (%d)\n", ret);
+	}
+
+	if (pdata->gpio_panel_bl_en != -1)
+		gpio_set_value_cansleep(pdata->gpio_panel_bl_en, 1);
+
+	tc35876x_brightness_control(dev, dev_priv->brightness_adjusted);
+}
+
+static struct drm_display_mode *tc35876x_get_config_mode(struct drm_device *dev)
+{
+	struct drm_display_mode *mode;
+
+	dev_dbg(&dev->pdev->dev, "%s\n", __func__);
+
+	mode = kzalloc(sizeof(*mode), GFP_KERNEL);
+	if (!mode)
+		return NULL;
+
+	/* FIXME: do this properly. */
+	mode->hdisplay = 1280;
+	mode->vdisplay = 800;
+	mode->hsync_start = 1360;
+	mode->hsync_end = 1400;
+	mode->htotal = 1440;
+	mode->vsync_start = 814;
+	mode->vsync_end = 824;
+	mode->vtotal = 838;
+	mode->clock = 33324 << 1;
+
+	dev_info(&dev->pdev->dev, "hdisplay(w) = %d\n", mode->hdisplay);
+	dev_info(&dev->pdev->dev, "vdisplay(h) = %d\n", mode->vdisplay);
+	dev_info(&dev->pdev->dev, "HSS = %d\n", mode->hsync_start);
+	dev_info(&dev->pdev->dev, "HSE = %d\n", mode->hsync_end);
+	dev_info(&dev->pdev->dev, "htotal = %d\n", mode->htotal);
+	dev_info(&dev->pdev->dev, "VSS = %d\n", mode->vsync_start);
+	dev_info(&dev->pdev->dev, "VSE = %d\n", mode->vsync_end);
+	dev_info(&dev->pdev->dev, "vtotal = %d\n", mode->vtotal);
+	dev_info(&dev->pdev->dev, "clock = %d\n", mode->clock);
+
+	drm_mode_set_name(mode);
+	drm_mode_set_crtcinfo(mode, 0);
+
+	mode->type |= DRM_MODE_TYPE_PREFERRED;
+
+	return mode;
+}
+
+/* DV1 Active area 216.96 x 135.6 mm */
+#define DV1_PANEL_WIDTH 217
+#define DV1_PANEL_HEIGHT 136
+
+static int tc35876x_get_panel_info(struct drm_device *dev, int pipe,
+				struct panel_info *pi)
+{
+	if (!dev || !pi)
+		return -EINVAL;
+
+	pi->width_mm = DV1_PANEL_WIDTH;
+	pi->height_mm = DV1_PANEL_HEIGHT;
+
+	return 0;
+}
+
+static int tc35876x_bridge_probe(struct i2c_client *client,
+				const struct i2c_device_id *id)
+{
+	struct tc35876x_platform_data *pdata;
+
+	dev_info(&client->dev, "%s\n", __func__);
+
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+		dev_err(&client->dev, "%s: i2c_check_functionality() failed\n",
+			__func__);
+		return -ENODEV;
+	}
+
+	pdata = dev_get_platdata(&client->dev);
+	if (!pdata) {
+		dev_err(&client->dev, "%s: no platform data\n", __func__);
+		return -ENODEV;
+	}
+
+	if (pdata->gpio_bridge_reset != -1) {
+		gpio_request(pdata->gpio_bridge_reset, "tc35876x bridge reset");
+		gpio_direction_output(pdata->gpio_bridge_reset, 0);
+	}
+
+	if (pdata->gpio_panel_bl_en != -1) {
+		gpio_request(pdata->gpio_panel_bl_en, "tc35876x panel bl en");
+		gpio_direction_output(pdata->gpio_panel_bl_en, 0);
+	}
+
+	if (pdata->gpio_panel_vadd != -1) {
+		gpio_request(pdata->gpio_panel_vadd, "tc35876x panel vadd");
+		gpio_direction_output(pdata->gpio_panel_vadd, 0);
+	}
+
+	tc35876x_client = client;
+
+	return 0;
+}
+
+static int tc35876x_bridge_remove(struct i2c_client *client)
+{
+	struct tc35876x_platform_data *pdata = dev_get_platdata(&client->dev);
+
+	dev_dbg(&client->dev, "%s\n", __func__);
+
+	if (pdata->gpio_bridge_reset != -1)
+		gpio_free(pdata->gpio_bridge_reset);
+
+	if (pdata->gpio_panel_bl_en != -1)
+		gpio_free(pdata->gpio_panel_bl_en);
+
+	if (pdata->gpio_panel_vadd != -1)
+		gpio_free(pdata->gpio_panel_vadd);
+
+	tc35876x_client = NULL;
+
+	return 0;
+}
+
+static const struct i2c_device_id tc35876x_bridge_id[] = {
+	{ "i2c_disp_brig", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, tc35876x_bridge_id);
+
+static struct i2c_driver tc35876x_bridge_i2c_driver = {
+	.driver = {
+		.name = "i2c_disp_brig",
+	},
+	.id_table = tc35876x_bridge_id,
+	.probe = tc35876x_bridge_probe,
+	.remove = __devexit_p(tc35876x_bridge_remove),
+};
+
+/* LCD panel I2C */
+static int cmi_lcd_i2c_probe(struct i2c_client *client,
+			     const struct i2c_device_id *id)
+{
+	dev_info(&client->dev, "%s\n", __func__);
+
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+		dev_err(&client->dev, "%s: i2c_check_functionality() failed\n",
+			__func__);
+		return -ENODEV;
+	}
+
+	cmi_lcd_i2c_client = client;
+
+	return 0;
+}
+
+static int cmi_lcd_i2c_remove(struct i2c_client *client)
+{
+	dev_dbg(&client->dev, "%s\n", __func__);
+
+	cmi_lcd_i2c_client = NULL;
+
+	return 0;
+}
+
+static const struct i2c_device_id cmi_lcd_i2c_id[] = {
+	{ "cmi-lcd", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, cmi_lcd_i2c_id);
+
+static struct i2c_driver cmi_lcd_i2c_driver = {
+	.driver = {
+		.name = "cmi-lcd",
+	},
+	.id_table = cmi_lcd_i2c_id,
+	.probe = cmi_lcd_i2c_probe,
+	.remove = __devexit_p(cmi_lcd_i2c_remove),
+};
+
+/* HACK to create I2C device while it's not created by platform code */
+#define CMI_LCD_I2C_ADAPTER	2
+#define CMI_LCD_I2C_ADDR	0x60
+
+static int cmi_lcd_hack_create_device(void)
+{
+	struct i2c_adapter *adapter;
+	struct i2c_client *client;
+	struct i2c_board_info info = {
+		.type = "cmi-lcd",
+		.addr = CMI_LCD_I2C_ADDR,
+	};
+
+	pr_debug("%s\n", __func__);
+
+	adapter = i2c_get_adapter(CMI_LCD_I2C_ADAPTER);
+	if (!adapter) {
+		pr_err("%s: i2c_get_adapter(%d) failed\n", __func__,
+			CMI_LCD_I2C_ADAPTER);
+		return -EINVAL;
+	}
+
+	client = i2c_new_device(adapter, &info);
+	if (!client) {
+		pr_err("%s: i2c_new_device() failed\n", __func__);
+		i2c_put_adapter(adapter);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct drm_encoder_helper_funcs tc35876x_encoder_helper_funcs = {
+	.dpms = mdfld_dsi_dpi_dpms,
+	.mode_fixup = mdfld_dsi_dpi_mode_fixup,
+	.prepare = mdfld_dsi_dpi_prepare,
+	.mode_set = mdfld_dsi_dpi_mode_set,
+	.commit = mdfld_dsi_dpi_commit,
+};
+
+static const struct drm_encoder_funcs tc35876x_encoder_funcs = {
+	.destroy = drm_encoder_cleanup,
+};
+
+const struct panel_funcs mdfld_tc35876x_funcs = {
+	.encoder_funcs = &tc35876x_encoder_funcs,
+	.encoder_helper_funcs = &tc35876x_encoder_helper_funcs,
+	.get_config_mode = tc35876x_get_config_mode,
+	.get_panel_info = tc35876x_get_panel_info,
+};
+
+void tc35876x_init(struct drm_device *dev)
+{
+	int r;
+
+	dev_dbg(&dev->pdev->dev, "%s\n", __func__);
+
+	cmi_lcd_hack_create_device();
+
+	r = i2c_add_driver(&cmi_lcd_i2c_driver);
+	if (r < 0)
+		dev_err(&dev->pdev->dev,
+			"%s: i2c_add_driver() for %s failed (%d)\n",
+			__func__, cmi_lcd_i2c_driver.driver.name, r);
+
+	r = i2c_add_driver(&tc35876x_bridge_i2c_driver);
+	if (r < 0)
+		dev_err(&dev->pdev->dev,
+			"%s: i2c_add_driver() for %s failed (%d)\n",
+			__func__, tc35876x_bridge_i2c_driver.driver.name, r);
+
+	tc35876x_brightness_init(dev);
+}
+
+void tc35876x_exit(void)
+{
+	pr_debug("%s\n", __func__);
+
+	i2c_del_driver(&tc35876x_bridge_i2c_driver);
+
+	if (cmi_lcd_i2c_client)
+		i2c_del_driver(&cmi_lcd_i2c_driver);
+}
diff --git a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h
new file mode 100644
index 000000000000..b14b7f9e7d1e
--- /dev/null
+++ b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __MDFLD_DSI_LVDS_BRIDGE_H__
+#define __MDFLD_DSI_LVDS_BRIDGE_H__
+
+void tc35876x_set_bridge_reset_state(struct drm_device *dev, int state);
+void tc35876x_configure_lvds_bridge(struct drm_device *dev);
+void tc35876x_brightness_control(struct drm_device *dev, int level);
+void tc35876x_toshiba_bridge_panel_off(struct drm_device *dev);
+void tc35876x_toshiba_bridge_panel_on(struct drm_device *dev);
+void tc35876x_init(struct drm_device *dev);
+void tc35876x_exit(void);
+
+extern const struct panel_funcs mdfld_tc35876x_funcs;
+
+#endif /*__MDFLD_DSI_LVDS_BRIDGE_H__*/
diff --git a/include/linux/i2c/tc35876x.h b/include/linux/i2c/tc35876x.h
new file mode 100644
index 000000000000..cd6a51c71e7e
--- /dev/null
+++ b/include/linux/i2c/tc35876x.h
@@ -0,0 +1,11 @@
+
+#ifndef _TC35876X_H
+#define _TC35876X_H
+
+struct tc35876x_platform_data {
+	int gpio_bridge_reset;
+	int gpio_panel_bl_en;
+	int gpio_panel_vadd;
+};
+
+#endif /* _TC35876X_H */
-- 
cgit v1.2.3


From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Dec 2011 17:09:22 +0100
Subject: sched: Cleanup cpu_active madness

Stepan found:

CPU0		CPUn

_cpu_up()
  __cpu_up()

		boostrap()
		  notify_cpu_starting()
		  set_cpu_online()
		  while (!cpu_active())
		    cpu_relax()

<PREEMPT-out>

smp_call_function(.wait=1)
  /* we find cpu_online() is true */
  arch_send_call_function_ipi_mask()

  /* wait-forever-more */

<PREEMPT-in>
		  local_irq_enable()

  cpu_notify(CPU_ONLINE)
    sched_cpu_active()
      set_cpu_active()

Now the purpose of cpu_active is mostly with bringing down a cpu, where
we mark it !active to avoid the load-balancer from moving tasks to it
while we tear down the cpu. This is required because we only update the
sched_domain tree after we brought the cpu-down. And this is needed so
that some tasks can still run while we bring it down, we just don't want
new tasks to appear.

On cpu-up however the sched_domain tree doesn't yet include the new cpu,
so its invisible to the load-balancer, regardless of the active state.
So instead of setting the active state after we boot the new cpu (and
consequently having to wait for it before enabling interrupts) set the
cpu active before we set it online and avoid the whole mess.

Reported-by: Stepan Moskovchenko <stepanm@codeaurora.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/smp.c     |  7 -------
 arch/hexagon/kernel/smp.c |  2 --
 arch/s390/kernel/smp.c    |  6 ------
 arch/x86/kernel/smpboot.c | 13 -------------
 kernel/sched/core.c       |  2 +-
 5 files changed, 1 insertion(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index cdeb727527d3..d616ed51e7a7 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
 	 */
 	percpu_timer_setup();
 
-	while (!cpu_active(cpu))
-		cpu_relax();
-
-	/*
-	 * cpu_active bit is set, so it's safe to enalbe interrupts
-	 * now.
-	 */
 	local_irq_enable();
 	local_fiq_enable();
 
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index c871a2cffaef..0123c63e9a3a 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -179,8 +179,6 @@ void __cpuinit start_secondary(void)
 	printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu);
 
 	set_cpu_online(cpu, true);
-	while (!cpumask_test_cpu(cpu, cpu_active_mask))
-		cpu_relax();
 	local_irq_enable();
 
 	cpu_idle();
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2398ce6b15ae..b0e28c47ab83 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid)
 	S390_lowcore.restart_psw.addr =
 		PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler;
 	__ctl_set_bit(0, 28); /* Enable lowcore protection */
-	/*
-	 * Wait until the cpu which brought this one up marked it
-	 * active before enabling interrupts.
-	 */
-	while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
-		cpu_relax();
 	local_irq_enable();
 	/* cpu_idle will call schedule for us */
 	cpu_idle();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..58f78165d308 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	x86_platform.nmi_init();
 
-	/*
-	 * Wait until the cpu which brought this one up marked it
-	 * online before enabling interrupts. If we don't do that then
-	 * we can end up waking up the softirq thread before this cpu
-	 * reached the active state, which makes the scheduler unhappy
-	 * and schedule the softirq thread on the wrong cpu. This is
-	 * only observable with forced threaded interrupts, but in
-	 * theory it could also happen w/o them. It's just way harder
-	 * to achieve.
-	 */
-	while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
-		cpu_relax();
-
 	/* enable local interrupts */
 	local_irq_enable();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95545126be1c..b1ccce819ce2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
+	case CPU_STARTING:
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
-- 
cgit v1.2.3


From 87e24f4b67e68d9fd8df16e0bf9c66d1ad2a2533 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 5 Mar 2012 23:59:25 +0100
Subject: perf/x86: Fix local vs remote memory events for NHM/WSM

Verified using the below proglet.. before:

[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write

 Performance counter stats for './numa 0':

         2,101,554 node-stores
         2,096,931 node-store-misses

       5.021546079 seconds time elapsed

[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write

 Performance counter stats for './numa 1':

           501,137 node-stores
               199 node-store-misses

       5.124451068 seconds time elapsed

After:

[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0
remote write

 Performance counter stats for './numa 0':

         2,107,516 node-stores
         2,097,187 node-store-misses

       5.012755149 seconds time elapsed

[root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1
local write

 Performance counter stats for './numa 1':

         2,063,355 node-stores
               165 node-store-misses

       5.082091494 seconds time elapsed

#define _GNU_SOURCE

#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <unistd.h>
#include <numaif.h>
#include <stdlib.h>

#define SIZE (32*1024*1024)

volatile int done;

void sig_done(int sig)
{
	done = 1;
}

int main(int argc, char **argv)
{
	cpu_set_t *mask, *mask2;
	size_t size;
	int i, err, t;
	int nrcpus = 1024;
	char *mem;
	unsigned long nodemask = 0x01; /* node 0 */
	DIR *node;
	struct dirent *de;
	int read = 0;
	int local = 0;

	if (argc < 2) {
		printf("usage: %s [0-3]\n", argv[0]);
		printf("  bit0 - local/remote\n");
		printf("  bit1 - read/write\n");
		exit(0);
	}

	switch (atoi(argv[1])) {
	case 0:
		printf("remote write\n");
		break;
	case 1:
		printf("local write\n");
		local = 1;
		break;
	case 2:
		printf("remote read\n");
		read = 1;
		break;
	case 3:
		printf("local read\n");
		local = 1;
		read = 1;
		break;
	}

	mask = CPU_ALLOC(nrcpus);
	size = CPU_ALLOC_SIZE(nrcpus);
	CPU_ZERO_S(size, mask);

	node = opendir("/sys/devices/system/node/node0/");
	if (!node)
		perror("opendir");
	while ((de = readdir(node))) {
		int cpu;

		if (sscanf(de->d_name, "cpu%d", &cpu) == 1)
			CPU_SET_S(cpu, size, mask);
	}
	closedir(node);

	mask2 = CPU_ALLOC(nrcpus);
	CPU_ZERO_S(size, mask2);
	for (i = 0; i < size; i++)
		CPU_SET_S(i, size, mask2);
	CPU_XOR_S(size, mask2, mask2, mask); // invert

	if (!local)
		mask = mask2;

	err = sched_setaffinity(0, size, mask);
	if (err)
		perror("sched_setaffinity");

	mem = mmap(0, SIZE, PROT_READ|PROT_WRITE,
			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE);
	if (err)
		perror("mbind");

	signal(SIGALRM, sig_done);
	alarm(5);

	if (!read) {
		while (!done) {
			for (i = 0; i < SIZE; i++)
				mem[i] = 0x01;
		}
	} else {
		while (!done) {
			for (i = 0; i < SIZE; i++)
				t += *(volatile char *)(mem + i);
		}
	}

	return 0;
}

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..61d4f79a550e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
 #define NHM_LOCAL_DRAM		(1 << 14)
 #define NHM_NON_DRAM		(1 << 15)
 
-#define NHM_ALL_DRAM		(NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+#define NHM_LOCAL		(NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE		(NHM_REMOTE_DRAM)
 
 #define NHM_DMND_READ		(NHM_DMND_DATA_RD)
 #define NHM_DMND_WRITE		(NHM_DMND_RFO|NHM_DMND_WB)
 #define NHM_DMND_PREFETCH	(NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
 
 #define NHM_L3_HIT	(NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
 #define NHM_L3_ACCESS	(NHM_L3_HIT|NHM_L3_MISS)
 
 static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
  },
  [ C(NODE) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
-		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
-		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
-		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
 	},
  },
 };
-- 
cgit v1.2.3


From f9b4eeb809c6d031cc9561cc34dd691701cb2c2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 Mar 2012 12:44:35 +0100
Subject: perf/x86: Prettify pmu config literals

I got somewhat tired of having to decode hex numbers..

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Link: http://lkml.kernel.org/n/tip-0vsy1sgywc4uar3mu1szm0rg@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.h       | 23 +++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 21 ++++++++++++++-------
 2 files changed, 37 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 82db83b5c3bc..66fda0c26402 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -268,6 +268,29 @@ struct x86_pmu_quirk {
 	void (*func)(void);
 };
 
+union x86_pmu_config {
+	struct {
+		u64 event:8,
+		    umask:8,
+		    usr:1,
+		    os:1,
+		    edge:1,
+		    pc:1,
+		    interrupt:1,
+		    __reserved1:1,
+		    en:1,
+		    inv:1,
+		    cmask:8,
+		    event2:4,
+		    __reserved2:4,
+		    go:1,
+		    ho:1;
+	} bits;
+	u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 61d4f79a550e..4bd9c9ef9d42 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1288,7 +1288,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		 *
 		 * Thereby we gain a PEBS capable cycle counter.
 		 */
-		u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+		u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
 
 		alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
 		event->hw.config = alt_config;
@@ -1690,9 +1691,11 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_nehalem_extra_regs;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		x86_add_quirk(intel_nehalem_quirk);
 
@@ -1727,9 +1730,11 @@ __init int intel_pmu_init(void)
 		x86_pmu.er_flags |= ERF_HAS_RSP_1;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		pr_cont("Westmere events, ");
 		break;
@@ -1750,9 +1755,11 @@ __init int intel_pmu_init(void)
 		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
 		pr_cont("SandyBridge events, ");
 		break;
-- 
cgit v1.2.3


From 73d63d038ee9f769f5e5b46792d227fe20e442c5 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 12 Mar 2012 11:36:33 -0700
Subject: x86/ioapic: Add register level checks to detect bogus io-apic entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the recent changes to clear_IO_APIC_pin() which tries to
clear remoteIRR bit explicitly, some of the users started to see
"Unable to reset IRR for apic .." messages.

Close look shows that these are related to bogus IO-APIC entries
which return's all 1's for their io-apic registers. And the
above mentioned error messages are benign. But kernel should
have ignored such io-apic's in the first place.

Check if register 0, 1, 2 of the listed io-apic are all 1's and
ignore such io-apic.

Reported-by: Álvaro Castillo <midgoon@gmail.com>
Tested-by: Jon Dufresne <jon@jondufresne.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: yinghai@kernel.org
Cc: kernel-team@fedoraproject.org
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/1331577393.31585.94.camel@sbsiddha-desk.sc.intel.com
[ Performed minor cleanup of affected code. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1d..6d10a66fc5a9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 static __init int bad_ioapic(unsigned long address)
 {
 	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
-		       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+		pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+			MAX_IO_APICS, nr_ioapics);
 		return 1;
 	}
 	if (!address) {
-		printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
+		pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
 		return 1;
 	}
 	return 0;
 }
 
+static __init int bad_ioapic_register(int idx)
+{
+	union IO_APIC_reg_00 reg_00;
+	union IO_APIC_reg_01 reg_01;
+	union IO_APIC_reg_02 reg_02;
+
+	reg_00.raw = io_apic_read(idx, 0);
+	reg_01.raw = io_apic_read(idx, 1);
+	reg_02.raw = io_apic_read(idx, 2);
+
+	if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+		pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+			mpc_ioapic_addr(idx));
+		return 1;
+	}
+
+	return 0;
+}
+
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
@@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	ioapics[idx].mp_config.apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+
+	if (bad_ioapic_register(idx)) {
+		clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+		return;
+	}
+
 	ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
 	ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
 
@@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	if (gsi_cfg->gsi_end >= gsi_top)
 		gsi_top = gsi_cfg->gsi_end + 1;
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
-	       mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
-	       gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+	pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+		idx, mpc_ioapic_id(idx),
+		mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+		gsi_cfg->gsi_base, gsi_cfg->gsi_end);
 
 	nr_ioapics++;
 }
-- 
cgit v1.2.3


From 51e7dc7011c99e1e5294658c7b551b92ca069985 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Mon, 12 Mar 2012 14:55:55 +0530
Subject: x86: Rename trap_no to trap_nr in thread_struct

There are precedences of trap number being referred to as
trap_nr. However thread struct refers trap number as trap_no.
Change it to trap_nr.

Also use enum instead of left-over literals for trap values.

This is pure cleanup, no functional change intended.

Suggested-by: Ingo Molnar <mingo@eltu.hu>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120312092555.5379.942.sendpatchset@srdronam.in.ibm.com
[ Fixed the math-emu build ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32_signal.c      |  2 +-
 arch/x86/include/asm/processor.h |  2 +-
 arch/x86/kernel/dumpstack.c      |  2 +-
 arch/x86/kernel/ptrace.c         |  3 ++-
 arch/x86/kernel/signal.c         |  2 +-
 arch/x86/kernel/traps.c          | 16 ++++++++--------
 arch/x86/kernel/vm86_32.c        |  2 +-
 arch/x86/kernel/vsyscall_64.c    |  2 +-
 arch/x86/math-emu/fpu_entry.c    |  5 +++--
 arch/x86/mm/fault.c              | 10 +++++-----
 10 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index bc09ed2a8b97..45b4fdd4e1da 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -345,7 +345,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
 		put_user_ex(regs->dx, &sc->dx);
 		put_user_ex(regs->cx, &sc->cx);
 		put_user_ex(regs->ax, &sc->ax);
-		put_user_ex(current->thread.trap_no, &sc->trapno);
+		put_user_ex(current->thread.trap_nr, &sc->trapno);
 		put_user_ex(current->thread.error_code, &sc->err);
 		put_user_ex(regs->ip, &sc->ip);
 		put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 02ce0b379647..f6d0d2eb0832 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -453,7 +453,7 @@ struct thread_struct {
 	unsigned long           ptrace_dr7;
 	/* Fault info: */
 	unsigned long		cr2;
-	unsigned long		trap_no;
+	unsigned long		trap_nr;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
 	struct fpu		fpu;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 4025fe4f928f..28f98706b08b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 #endif
 	printk("\n");
 	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
 	show_registers(regs);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 93e7877a19c4..6fb330adc7c7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -33,6 +33,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
 
 #include "tls.h"
 
@@ -1425,7 +1426,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
 				int error_code, int si_code,
 				struct siginfo *info)
 {
-	tsk->thread.trap_no = 1;
+	tsk->thread.trap_nr = X86_TRAP_DB;
 	tsk->thread.error_code = error_code;
 
 	memset(info, 0, sizeof(*info));
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index c3846b6fb726..9c73acc1c860 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -150,7 +150,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		put_user_ex(regs->r15, &sc->r15);
 #endif /* CONFIG_X86_64 */
 
-		put_user_ex(current->thread.trap_no, &sc->trapno);
+		put_user_ex(current->thread.trap_nr, &sc->trapno);
 		put_user_ex(current->thread.error_code, &sc->err);
 		put_user_ex(regs->ip, &sc->ip);
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 037fc2bc5316..c6d17ad59b8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -132,7 +132,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 trap_signal:
 #endif
 	/*
-	 * We want error_code and trap_no set for userspace faults and
+	 * We want error_code and trap_nr set for userspace faults and
 	 * kernelspace faults which result in die(), but not
 	 * kernelspace faults which are fixed up.  die() gives the
 	 * process no chance to handle the signal and notice the
@@ -141,7 +141,7 @@ trap_signal:
 	 * delivered, faults.  See also do_general_protection below.
 	 */
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
+	tsk->thread.trap_nr = trapnr;
 
 #ifdef CONFIG_X86_64
 	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
@@ -164,7 +164,7 @@ trap_signal:
 kernel_trap:
 	if (!fixup_exception(regs)) {
 		tsk->thread.error_code = error_code;
-		tsk->thread.trap_no = trapnr;
+		tsk->thread.trap_nr = trapnr;
 		die(str, regs, error_code);
 	}
 	return;
@@ -240,7 +240,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = X86_TRAP_DF;
+	tsk->thread.trap_nr = X86_TRAP_DF;
 
 	/*
 	 * This is always a kernel trap and never fixable (and thus must
@@ -268,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		goto gp_in_kernel;
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = X86_TRAP_GP;
+	tsk->thread.trap_nr = X86_TRAP_GP;
 
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 			printk_ratelimit()) {
@@ -295,7 +295,7 @@ gp_in_kernel:
 		return;
 
 	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = X86_TRAP_GP;
+	tsk->thread.trap_nr = X86_TRAP_GP;
 	if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
 		return;
@@ -475,7 +475,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	{
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
-			task->thread.trap_no = trapnr;
+			task->thread.trap_nr = trapnr;
 			die(str, regs, error_code);
 		}
 		return;
@@ -485,7 +485,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	 * Save the info for the exception handler and clear the error.
 	 */
 	save_init_fpu(task);
-	task->thread.trap_no = trapnr;
+	task->thread.trap_nr = trapnr;
 	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..a1315ab2d6b9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -567,7 +567,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 	}
 	if (trapno != 1)
 		return 1; /* we let this handle by the calling routine */
-	current->thread.trap_no = trapno;
+	current->thread.trap_nr = trapno;
 	current->thread.error_code = error_code;
 	force_sig(SIGTRAP, current);
 	return 0;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b07ba9393564..327509b95e0e 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -153,7 +153,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 
 		thread->error_code	= 6;  /* user fault, no page, write */
 		thread->cr2		= ptr;
-		thread->trap_no		= 14;
+		thread->trap_nr		= X86_TRAP_PF;
 
 		memset(&info, 0, sizeof(info));
 		info.si_signo		= SIGSEGV;
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 7718541541d4..9b868124128d 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -28,6 +28,7 @@
 #include <linux/regset.h>
 
 #include <asm/uaccess.h>
+#include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/user.h>
 #include <asm/i387.h>
@@ -269,7 +270,7 @@ void math_emulate(struct math_emu_info *info)
 			FPU_EIP = FPU_ORIG_EIP;	/* Point to current FPU instruction. */
 
 			RE_ENTRANT_CHECK_OFF;
-			current->thread.trap_no = 16;
+			current->thread.trap_nr = X86_TRAP_MF;
 			current->thread.error_code = 0;
 			send_sig(SIGFPE, current, 1);
 			return;
@@ -662,7 +663,7 @@ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
 void math_abort(struct math_emu_info *info, unsigned int signal)
 {
 	FPU_EIP = FPU_ORIG_EIP;
-	current->thread.trap_no = 16;
+	current->thread.trap_nr = X86_TRAP_MF;
 	current->thread.error_code = 0;
 	send_sig(signal, current, 1);
 	RE_ENTRANT_CHECK_OFF;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0b4caf85c1a..3ecfd1aaf214 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -615,7 +615,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 	dump_pagetable(address);
 
 	tsk->thread.cr2		= address;
-	tsk->thread.trap_no	= 14;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
 	tsk->thread.error_code	= error_code;
 
 	if (__die("Bad pagetable", regs, error_code))
@@ -636,7 +636,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs)) {
 		if (current_thread_info()->sig_on_uaccess_error && signal) {
-			tsk->thread.trap_no = 14;
+			tsk->thread.trap_nr = X86_TRAP_PF;
 			tsk->thread.error_code = error_code | PF_USER;
 			tsk->thread.cr2 = address;
 
@@ -676,7 +676,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
 	tsk->thread.cr2		= address;
-	tsk->thread.trap_no	= 14;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
 	tsk->thread.error_code	= error_code;
 
 	sig = SIGKILL;
@@ -754,7 +754,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		/* Kernel addresses are always protection faults: */
 		tsk->thread.cr2		= address;
 		tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no	= 14;
+		tsk->thread.trap_nr	= X86_TRAP_PF;
 
 		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
@@ -838,7 +838,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 
 	tsk->thread.cr2		= address;
 	tsk->thread.error_code	= error_code;
-	tsk->thread.trap_no	= 14;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
-- 
cgit v1.2.3


From 9993bc635d01a6ee7f6b833b4ee65ce7c06350b1 Mon Sep 17 00:00:00 2001
From: Salman Qazi <sqazi@google.com>
Date: Fri, 9 Mar 2012 16:41:01 -0800
Subject: sched/x86: Fix overflow in cyc2ns_offset

When a machine boots up, the TSC generally gets reset.  However,
when kexec is used to boot into a kernel, the TSC value would be
carried over from the previous kernel.  The computation of
cycns_offset in set_cyc2ns_scale is prone to an overflow, if the
machine has been up more than 208 days prior to the kexec.  The
overflow happens when we multiply *scale, even though there is
enough room to store the final answer.

We fix this issue by decomposing tsc_now into the quotient and
remainder of division by CYC2NS_SCALE_FACTOR and then performing
the multiplication separately on the two components.

Refactor code to share the calculation with the previous
fix in __cycles_2_ns().

Signed-off-by: Salman Qazi <sqazi@google.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20120310004027.19291.88460.stgit@dungbeetle.mtv.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/timer.h |  8 ++------
 arch/x86/kernel/tsc.c        |  3 ++-
 include/linux/kernel.h       | 13 +++++++++++++
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 431793e5d484..34baa0eb5d0c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -57,14 +57,10 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
 
 static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
-	unsigned long long quot;
-	unsigned long long rem;
 	int cpu = smp_processor_id();
 	unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
-	quot = (cyc >> CYC2NS_SCALE_FACTOR);
-	rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
-	ns += quot * per_cpu(cyc2ns, cpu) +
-		((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
+	ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
+			(1UL << CYC2NS_SCALE_FACTOR));
 	return ns;
 }
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..183c5925a9fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	if (cpu_khz) {
 		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-		*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+		*offset = ns_now - mult_frac(tsc_now, *scale,
+					     (1UL << CYC2NS_SCALE_FACTOR));
 	}
 
 	sched_clock_idle_wakeup_event(0);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e8343422240a..d801acb5e680 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -85,6 +85,19 @@
 }							\
 )
 
+/*
+ * Multiplies an integer by a fraction, while avoiding unnecessary
+ * overflow or loss of precision.
+ */
+#define mult_frac(x, numer, denom)(			\
+{							\
+	typeof(x) quot = (x) / (denom);			\
+	typeof(x) rem  = (x) % (denom);			\
+	(quot * (numer)) + ((rem * (numer)) / (denom));	\
+}							\
+)
+
+
 #define _RET_IP_		(unsigned long)__builtin_return_address(0)
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
 
-- 
cgit v1.2.3


From ef334a20d84f52407a8a2afd02ddeaecbef0ad3d Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 13 Mar 2012 19:33:03 +0530
Subject: x86: Move is_ia32_task to asm/thread_info.h from asm/compat.h

is_ia32_task() is useful even in !CONFIG_COMPAT cases - utrace will
use it for example. Hence move it to a more generic file: asm/thread_info.h

Also now is_ia32_task() returns true if CONFIG_X86_32 is defined.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120313140303.17134.1401.sendpatchset@srdronam.in.ibm.com
[ Performed minor cleanup ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/compat.h      |  9 ---------
 arch/x86/include/asm/thread_info.h | 12 ++++++++++++
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 355edc091604..d6805798d6fc 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -235,15 +235,6 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	return (void __user *)round_down(sp - len, 16);
 }
 
-static inline bool is_ia32_task(void)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (current_thread_info()->status & TS_COMPAT)
-		return true;
-#endif
-	return false;
-}
-
 static inline bool is_x32_task(void)
 {
 #ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index af1db7e722f4..ad6df8ccd715 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -266,6 +266,18 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
+
+static inline bool is_ia32_task(void)
+{
+#ifdef CONFIG_X86_32
+	return true;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->status & TS_COMPAT)
+		return true;
+#endif
+	return false;
+}
 #endif	/* !__ASSEMBLY__ */
 
 #ifndef __ASSEMBLY__
-- 
cgit v1.2.3


From 09f98a825a821f7a3f1b162f9ed023f37213a63b Mon Sep 17 00:00:00 2001
From: Tang Liang <liang.tang@oracle.com>
Date: Fri, 9 Dec 2011 10:05:54 +0800
Subject: x86, acpi, tboot: Have a ACPI os prepare sleep instead of calling
 tboot_sleep.

The ACPI suspend path makes a call to tboot_sleep right before
it writes the PM1A, PM1B values. We replace the direct call to
tboot via an registration callback similar to __acpi_register_gsi.

CC: Len Brown <len.brown@intel.com>
Acked-by: Joseph Cihula <joseph.cihula@intel.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
[v1: Added __attribute__ ((unused))]
[v2: Introduced a wrapper instead of changing tboot_sleep return values]
[v3: Added return value AE_CTRL_SKIP for acpi_os_sleep_prepare]
Signed-off-by: Tang Liang <liang.tang@oracle.com>
[v1: Fix compile issues on IA64 and PPC64]
[v2: Fix where __acpi_os_prepare_sleep==NULL and did not go in sleep properly]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/kernel/tboot.c       |  8 ++++++++
 drivers/acpi/acpica/hwsleep.c | 10 +++++++---
 drivers/acpi/osl.c            | 24 ++++++++++++++++++++++++
 include/acpi/acexcep.h        |  1 +
 include/linux/acpi.h          | 10 ++++++++++
 include/linux/tboot.h         |  1 -
 6 files changed, 50 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e2410e27f97e..1a4ab7df5b63 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -297,6 +297,12 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 
 	tboot_shutdown(acpi_shutdown_map[sleep_state]);
 }
+static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control,
+			       u32 pm1b_control)
+{
+	tboot_sleep(sleep_state, pm1a_control, pm1b_control);
+	return 0;
+}
 
 static atomic_t ap_wfs_count;
 
@@ -345,6 +351,8 @@ static __init int tboot_late_init(void)
 
 	atomic_set(&ap_wfs_count, 0);
 	register_hotcpu_notifier(&tboot_cpu_notifier);
+
+	acpi_os_set_prepare_sleep(&tboot_sleep_wrapper);
 	return 0;
 }
 
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index d52da3073650..992359af7e2f 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -43,9 +43,9 @@
  */
 
 #include <acpi/acpi.h>
+#include <linux/acpi.h>
 #include "accommon.h"
 #include "actables.h"
-#include <linux/tboot.h>
 #include <linux/module.h>
 
 #define _COMPONENT          ACPI_HARDWARE
@@ -344,8 +344,12 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
 
 	ACPI_FLUSH_CPU_CACHE();
 
-	tboot_sleep(sleep_state, pm1a_control, pm1b_control);
-
+	status = acpi_os_prepare_sleep(sleep_state, pm1a_control,
+				       pm1b_control);
+	if (ACPI_SKIP(status))
+		return_ACPI_STATUS(AE_OK);
+	if (ACPI_FAILURE(status))
+		return_ACPI_STATUS(status);
 	/* Write #2: Write both SLP_TYP + SLP_EN */
 
 	status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index f31c5c5f1b7e..f3aae4ba507e 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -76,6 +76,9 @@ EXPORT_SYMBOL(acpi_in_debugger);
 extern char line_buf[80];
 #endif				/*ENABLE_DEBUGGER */
 
+static int (*__acpi_os_prepare_sleep)(u8 sleep_state, u32 pm1a_ctrl,
+				      u32 pm1b_ctrl);
+
 static acpi_osd_handler acpi_irq_handler;
 static void *acpi_irq_context;
 static struct workqueue_struct *kacpid_wq;
@@ -1659,3 +1662,24 @@ acpi_status acpi_os_terminate(void)
 
 	return AE_OK;
 }
+
+acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control,
+				  u32 pm1b_control)
+{
+	int rc = 0;
+	if (__acpi_os_prepare_sleep)
+		rc = __acpi_os_prepare_sleep(sleep_state,
+					     pm1a_control, pm1b_control);
+	if (rc < 0)
+		return AE_ERROR;
+	else if (rc > 0)
+		return AE_CTRL_SKIP;
+
+	return AE_OK;
+}
+
+void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
+			       u32 pm1a_ctrl, u32 pm1b_ctrl))
+{
+	__acpi_os_prepare_sleep = func;
+}
diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h
index 5b6c391efc8e..fa0d22ce089e 100644
--- a/include/acpi/acexcep.h
+++ b/include/acpi/acexcep.h
@@ -57,6 +57,7 @@
 #define ACPI_SUCCESS(a)                 (!(a))
 #define ACPI_FAILURE(a)                 (a)
 
+#define ACPI_SKIP(a)                    (a == AE_CTRL_SKIP)
 #define AE_OK                           (acpi_status) 0x0000
 
 /*
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6001b4da39dd..fccd017b8b6e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -359,4 +359,14 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b)
 }
 #endif
 
+#ifdef CONFIG_ACPI
+void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
+			       u32 pm1a_ctrl,  u32 pm1b_ctrl));
+
+acpi_status acpi_os_prepare_sleep(u8 sleep_state,
+				  u32 pm1a_control, u32 pm1b_control);
+#else
+#define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0)
+#endif
+
 #endif	/*_LINUX_ACPI_H*/
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index 1dba6ee55203..c75128bed5fa 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -143,7 +143,6 @@ static inline int tboot_enabled(void)
 
 extern void tboot_probe(void);
 extern void tboot_shutdown(u32 shutdown_type);
-extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control);
 extern struct acpi_table_header *tboot_get_dmar_table(
 				      struct acpi_table_header *dmar_tbl);
 extern int tboot_force_iommu(void);
-- 
cgit v1.2.3


From a1f37788a6d8c037e7d92fe4a0fe9ec0d713b21e Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 8 Dec 2011 17:14:08 +0800
Subject: tboot: Add return values for tboot_sleep

.. as appropiately. As tboot_sleep now returns values.
remove tboot_sleep_wrapper.

Suggested-and-Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Joseph Cihula <joseph.cihula@intel.com>
[v1: Return -1/0/+1 instead of ACPI_xx values]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/kernel/tboot.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 1a4ab7df5b63..6410744ac5cb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
 		offsetof(struct acpi_table_facs, firmware_waking_vector);
 }
 
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 {
 	static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
 		/* S0,1,2: */ -1, -1, -1,
@@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 		/* S5: */ TB_SHUTDOWN_S5 };
 
 	if (!tboot_enabled())
-		return;
+		return 0;
 
 	tboot_copy_fadt(&acpi_gbl_FADT);
 	tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -292,15 +292,10 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
 	if (sleep_state >= ACPI_S_STATE_COUNT ||
 	    acpi_shutdown_map[sleep_state] == -1) {
 		pr_warning("unsupported sleep state 0x%x\n", sleep_state);
-		return;
+		return -1;
 	}
 
 	tboot_shutdown(acpi_shutdown_map[sleep_state]);
-}
-static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control,
-			       u32 pm1b_control)
-{
-	tboot_sleep(sleep_state, pm1a_control, pm1b_control);
 	return 0;
 }
 
@@ -352,7 +347,7 @@ static __init int tboot_late_init(void)
 	atomic_set(&ap_wfs_count, 0);
 	register_hotcpu_notifier(&tboot_cpu_notifier);
 
-	acpi_os_set_prepare_sleep(&tboot_sleep_wrapper);
+	acpi_os_set_prepare_sleep(&tboot_sleep);
 	return 0;
 }
 
-- 
cgit v1.2.3


From bb6fa8b275e132b1e9319dbab94211543a0b7bd3 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 13 Mar 2012 22:44:41 -0700
Subject: x32: Fix stupid ia32/x32 inversion in the siginfo format

Fix a stray ! which flipped the sense if we were generating a signal
frame for ia32 vs. x32.

Introduced in:

e7084fd5 x32: Switch to a 64-bit clock_t

Reported-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Gregory M. Lueck <gregory.m.lueck@intel.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/ia32/ia32_signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index bc09ed2a8b97..ef026aa19d63 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -37,7 +37,7 @@
 int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 {
 	int err = 0;
-	bool ia32 = !is_ia32_task();
+	bool ia32 = is_ia32_task();
 
 	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
 		return -EFAULT;
-- 
cgit v1.2.3


From fa63030e9c79e37b4d4e63b39ffb09cfb7aa0fe4 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@numascale-asia.com>
Date: Wed, 14 Mar 2012 15:17:34 +0800
Subject: x86/platform: Move APIC ID validity check into platform APIC code

Move APIC ID validity check into platform APIC code, so it can
be overridden when needed. For NumaChip systems, always trust
MADT, as it's constructed with high APIC IDs.

Behaviour verifies on standard x86 systems and on NumaChip
systems with this, and compile-tested with allyesconfig.

Signed-off-by: Daniel J Blueman <daniel@numascale-asia.com>
Reviewed-by: Steffen Persvold <sp@numascale.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1331709454-27966-1-git-send-email-daniel@numascale-asia.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h           |  6 ++++++
 arch/x86/kernel/apic/apic_flat_64.c   |  2 ++
 arch/x86/kernel/apic/apic_noop.c      |  1 +
 arch/x86/kernel/apic/apic_numachip.c  | 10 +++++++++-
 arch/x86/kernel/apic/bigsmp_32.c      |  1 +
 arch/x86/kernel/apic/es7000_32.c      |  2 ++
 arch/x86/kernel/apic/numaq_32.c       |  1 +
 arch/x86/kernel/apic/probe_32.c       |  1 +
 arch/x86/kernel/apic/summit_32.c      |  1 +
 arch/x86/kernel/apic/x2apic_cluster.c |  1 +
 arch/x86/kernel/apic/x2apic_phys.c    |  1 +
 arch/x86/kernel/apic/x2apic_uv_x.c    |  1 +
 arch/x86/kernel/smpboot.c             |  2 +-
 13 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3ab9bdd87e79..a9371c91718c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -288,6 +288,7 @@ struct apic {
 
 	int (*probe)(void);
 	int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+	int (*apic_id_valid)(int apicid);
 	int (*apic_id_registered)(void);
 
 	u32 irq_delivery_mode;
@@ -532,6 +533,11 @@ static inline unsigned int read_apic_id(void)
 	return apic->get_apic_id(reg);
 }
 
+static inline int default_apic_id_valid(int apicid)
+{
+	return x2apic_mode || (apicid < 255);
+}
+
 extern void default_setup_apic_routing(void);
 
 extern struct apic apic_noop;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdded6f2b..359b6899a36c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -180,6 +180,7 @@ static struct apic apic_flat =  {
 	.name				= "flat",
 	.probe				= flat_probe,
 	.acpi_madt_oem_check		= flat_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
@@ -337,6 +338,7 @@ static struct apic apic_physflat =  {
 	.name				= "physical flat",
 	.probe				= physflat_probe,
 	.acpi_madt_oem_check		= physflat_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= flat_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82bc655c..634ae6cdd5c9 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -124,6 +124,7 @@ struct apic apic_noop = {
 	.probe				= noop_probe,
 	.acpi_madt_oem_check		= NULL,
 
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= noop_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c1cd99..d9ea5f331ac5 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void)
 	return get_apic_id(apic_read(APIC_ID));
 }
 
+static int numachip_apic_id_valid(int apicid)
+{
+	/* Trust what bootloader passes in MADT */
+	return 1;
+}
+
 static int numachip_apic_id_registered(void)
 {
 	return physid_isset(read_xapic_id(), phys_cpu_present_map);
@@ -223,10 +229,11 @@ static int __init numachip_system_init(void)
 }
 early_initcall(numachip_system_init);
 
-static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strncmp(oem_id, "NUMASC", 6)) {
 		numachip_system = 1;
+		setup_force_cpu_cap(X86_FEATURE_X2APIC);
 		return 1;
 	}
 
@@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = {
 	.name				= "NumaConnect system",
 	.probe				= numachip_probe,
 	.acpi_madt_oem_check		= numachip_acpi_madt_oem_check,
+	.apic_id_valid			= numachip_apic_id_valid,
 	.apic_id_registered		= numachip_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead01137..0cdec7065aff 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -198,6 +198,7 @@ static struct apic apic_bigsmp = {
 	.name				= "bigsmp",
 	.probe				= probe_bigsmp,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= bigsmp_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5d513bc47b6b..e42d1d3b9134 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = {
 	.name				= "es7000",
 	.probe				= probe_es7000,
 	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= es7000_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
@@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = {
 	.name				= "es7000",
 	.probe				= probe_es7000,
 	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= es7000_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index c4a61ca1349a..00d2422ca7c9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = {
 	.name				= "NUMAQ",
 	.probe				= probe_numaq,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= numaq_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3412f4..ff2c1b9aac4d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -92,6 +92,7 @@ static struct apic apic_default = {
 	.name				= "default",
 	.probe				= probe_default,
 	.acpi_madt_oem_check		= NULL,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= default_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 19114423c58c..fea000b27f07 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -496,6 +496,7 @@ static struct apic apic_summit = {
 	.name				= "summit",
 	.probe				= probe_summit,
 	.acpi_madt_oem_check		= summit_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= summit_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 500795875827..9193713060a9 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = {
 	.name				= "cluster x2apic",
 	.probe				= x2apic_cluster_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21e..bcd1db6eaca9 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = {
 	.name				= "physical x2apic",
 	.probe				= x2apic_phys_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b88aa19..fc4771425852 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.name				= "UV large system",
 	.probe				= uv_probe,
 	.acpi_madt_oem_check		= uv_acpi_madt_oem_check,
+	.apic_id_valid			= default_apic_id_valid,
 	.apic_id_registered		= uv_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..d279e6e1d1b7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -847,7 +847,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
 	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
 	    !physid_isset(apicid, phys_cpu_present_map) ||
-	    (!x2apic_mode && apicid >= 255)) {
+	    !apic->apic_id_valid(apicid)) {
 		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 0b95ec56ae19f61ca664e83766a2180057f0e351 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Mon, 5 Mar 2012 20:26:47 +0200
Subject: crypto: camellia - add assembler implementation for x86_64

Patch adds x86_64 assembler implementation of Camellia block cipher. Two set of
functions are provided. First set is regular 'one-block at time' encrypt/decrypt
functions. Second is 'two-block at time' functions that gain performance increase
on out-of-order CPUs. Performance of 2-way functions should be equal to 1-way
functions with in-order CPUs.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

AMD Phenom II 1055T (fam:16, model:10):

camellia-asm vs camellia_generic:
128bit key:                                             (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     1.27x   1.22x   1.30x   1.42x   1.30x   1.34x   1.19x   1.05x   1.23x   1.24x
64B     1.74x   1.79x   1.43x   1.87x   1.81x   1.87x   1.48x   1.38x   1.55x   1.62x
256B    1.90x   1.87x   1.43x   1.94x   1.94x   1.95x   1.63x   1.62x   1.67x   1.70x
1024B   1.96x   1.93x   1.43x   1.95x   1.98x   2.01x   1.67x   1.69x   1.74x   1.80x
8192B   1.96x   1.96x   1.39x   1.93x   2.01x   2.03x   1.72x   1.64x   1.71x   1.76x

256bit key:                                             (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     1.23x   1.23x   1.33x   1.39x   1.34x   1.38x   1.04x   1.18x   1.21x   1.29x
64B     1.72x   1.69x   1.42x   1.78x   1.81x   1.89x   1.57x   1.52x   1.56x   1.65x
256B    1.85x   1.88x   1.42x   1.86x   1.93x   1.96x   1.69x   1.65x   1.70x   1.75x
1024B   1.88x   1.86x   1.45x   1.95x   1.96x   1.95x   1.77x   1.71x   1.77x   1.78x
8192B   1.91x   1.86x   1.42x   1.91x   2.03x   1.98x   1.73x   1.71x   1.78x   1.76x

camellia-asm vs aes-asm (8kB block):
         128bit  256bit
ecb-enc  1.15x   1.22x
ecb-dec  1.16x   1.16x
cbc-enc  0.85x   0.90x
cbc-dec  1.20x   1.23x
ctr-enc  1.28x   1.30x
ctr-dec  1.27x   1.28x
lrw-enc  1.12x   1.16x
lrw-dec  1.08x   1.10x
xts-enc  1.11x   1.15x
xts-dec  1.14x   1.15x

Intel Core2 T8100 (fam:6, model:23, step:6):

camellia-asm vs camellia_generic:
128bit key:                                             (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     1.10x   1.12x   1.14x   1.16x   1.16x   1.15x   1.02x   1.02x   1.08x   1.08x
64B     1.61x   1.60x   1.17x   1.68x   1.67x   1.66x   1.43x   1.42x   1.44x   1.42x
256B    1.65x   1.73x   1.17x   1.77x   1.81x   1.80x   1.54x   1.53x   1.58x   1.54x
1024B   1.76x   1.74x   1.18x   1.80x   1.85x   1.85x   1.60x   1.59x   1.65x   1.60x
8192B   1.77x   1.75x   1.19x   1.81x   1.85x   1.86x   1.63x   1.61x   1.66x   1.62x

256bit key:                                             (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     1.10x   1.07x   1.13x   1.16x   1.11x   1.16x   1.03x   1.02x   1.08x   1.07x
64B     1.61x   1.62x   1.15x   1.66x   1.63x   1.68x   1.47x   1.46x   1.47x   1.44x
256B    1.71x   1.70x   1.16x   1.75x   1.69x   1.79x   1.58x   1.57x   1.59x   1.55x
1024B   1.78x   1.72x   1.17x   1.75x   1.80x   1.80x   1.63x   1.62x   1.65x   1.62x
8192B   1.76x   1.73x   1.17x   1.78x   1.80x   1.81x   1.64x   1.62x   1.68x   1.64x

camellia-asm vs aes-asm (8kB block):
         128bit  256bit
ecb-enc  1.17x   1.21x
ecb-dec  1.17x   1.20x
cbc-enc  0.80x   0.82x
cbc-dec  1.22x   1.24x
ctr-enc  1.25x   1.26x
ctr-dec  1.25x   1.26x
lrw-enc  1.14x   1.18x
lrw-dec  1.13x   1.17x
xts-enc  1.14x   1.18x
xts-dec  1.14x   1.17x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                 |    2 +
 arch/x86/crypto/camellia-x86_64-asm_64.S |  520 ++++++++
 arch/x86/crypto/camellia_glue.c          | 1952 ++++++++++++++++++++++++++++++
 crypto/Kconfig                           |   18 +
 4 files changed, 2492 insertions(+)
 create mode 100644 arch/x86/crypto/camellia-x86_64-asm_64.S
 create mode 100644 arch/x86/crypto/camellia_glue.c

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2b0b9631474b..e191ac048b59 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
@@ -25,6 +26,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
new file mode 100644
index 000000000000..0b3374335fdc
--- /dev/null
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -0,0 +1,520 @@
+/*
+ * Camellia Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "camellia-x86_64-asm_64.S"
+.text
+
+.extern camellia_sp10011110;
+.extern camellia_sp22000222;
+.extern camellia_sp03303033;
+.extern camellia_sp00444404;
+.extern camellia_sp02220222;
+.extern camellia_sp30333033;
+.extern camellia_sp44044404;
+.extern camellia_sp11101110;
+
+#define sp10011110 camellia_sp10011110
+#define sp22000222 camellia_sp22000222
+#define sp03303033 camellia_sp03303033
+#define sp00444404 camellia_sp00444404
+#define sp02220222 camellia_sp02220222
+#define sp30333033 camellia_sp30333033
+#define sp44044404 camellia_sp44044404
+#define sp11101110 camellia_sp11101110
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RIOd %esi
+
+#define RAB0 %rax
+#define RCD0 %rcx
+#define RAB1 %rbx
+#define RCD1 %rdx
+
+#define RAB0d %eax
+#define RCD0d %ecx
+#define RAB1d %ebx
+#define RCD1d %edx
+
+#define RAB0bl %al
+#define RCD0bl %cl
+#define RAB1bl %bl
+#define RCD1bl %dl
+
+#define RAB0bh %ah
+#define RCD0bh %ch
+#define RAB1bh %bh
+#define RCD1bh %dh
+
+#define RT0 %rsi
+#define RT1 %rbp
+#define RT2 %r8
+
+#define RT0d %esi
+#define RT1d %ebp
+#define RT2d %r8d
+
+#define RT2bl %r8b
+
+#define RXOR %r9
+#define RRBP %r10
+#define RDST %r11
+
+#define RXORd %r9d
+#define RXORbl %r9b
+
+#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+	movzbl ab ## bl,		tmp2 ## d; \
+	movzbl ab ## bh,		tmp1 ## d; \
+	rorq $16,			ab; \
+	xorq T0(, tmp2, 8),		dst; \
+	xorq T1(, tmp1, 8),		dst;
+
+/**********************************************************************
+  1-way camellia
+ **********************************************************************/
+#define roundsm(ab, subkey, cd) \
+	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
+	\
+	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+	\
+	xorq RT2,					cd ## 0;
+
+#define fls(l, r, kl, kr) \
+	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
+	andl l ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					l ## 0; \
+	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
+	orq r ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					r ## 0; \
+	\
+	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
+	orq l ## 0,					RT2; \
+	shrq $32,					RT2; \
+	xorq RT2,					l ## 0; \
+	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
+	andl r ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					r ## 0;
+
+#define enc_rounds(i) \
+	roundsm(RAB, i + 2, RCD); \
+	roundsm(RCD, i + 3, RAB); \
+	roundsm(RAB, i + 4, RCD); \
+	roundsm(RCD, i + 5, RAB); \
+	roundsm(RAB, i + 6, RCD); \
+	roundsm(RCD, i + 7, RAB);
+
+#define enc_fls(i) \
+	fls(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack() \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; \
+	rolq $32,			RAB0; \
+	movq 4*2(RIO),			RCD0; \
+	bswapq				RCD0; \
+	rorq $32,			RCD0; \
+	xorq key_table(CTX),		RAB0;
+
+#define enc_outunpack(op, max) \
+	xorq key_table(CTX, max, 8),	RCD0; \
+	rorq $32,			RCD0; \
+	bswapq				RCD0; \
+	op ## q RCD0,			(RIO); \
+	rolq $32,			RAB0; \
+	bswapq				RAB0; \
+	op ## q RAB0,			4*2(RIO);
+
+#define dec_rounds(i) \
+	roundsm(RAB, i + 7, RCD); \
+	roundsm(RCD, i + 6, RAB); \
+	roundsm(RAB, i + 5, RCD); \
+	roundsm(RCD, i + 4, RAB); \
+	roundsm(RAB, i + 3, RCD); \
+	roundsm(RCD, i + 2, RAB);
+
+#define dec_fls(i) \
+	fls(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack(max) \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; \
+	rolq $32,			RAB0; \
+	movq 4*2(RIO),			RCD0; \
+	bswapq				RCD0; \
+	rorq $32,			RCD0; \
+	xorq key_table(CTX, max, 8),	RAB0;
+
+#define dec_outunpack() \
+	xorq key_table(CTX),		RCD0; \
+	rorq $32,			RCD0; \
+	bswapq				RCD0; \
+	movq RCD0,			(RIO); \
+	rolq $32,			RAB0; \
+	bswapq				RAB0; \
+	movq RAB0,			4*2(RIO);
+
+.global __camellia_enc_blk;
+.type   __camellia_enc_blk,@function;
+
+__camellia_enc_blk:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool xor
+	 */
+	movq %rbp, RRBP;
+
+	movq %rcx, RXOR;
+	movq %rsi, RDST;
+	movq %rdx, RIO;
+
+	enc_inpack();
+
+	enc_rounds(0);
+	enc_fls(8);
+	enc_rounds(8);
+	enc_fls(16);
+	enc_rounds(16);
+	movl $24, RT1d; /* max */
+
+	cmpb $16, key_length(CTX);
+	je __enc_done;
+
+	enc_fls(24);
+	enc_rounds(24);
+	movl $32, RT1d; /* max */
+
+__enc_done:
+	testb RXORbl, RXORbl;
+	movq RDST, RIO;
+
+	jnz __enc_xor;
+
+	enc_outunpack(mov, RT1);
+
+	movq RRBP, %rbp;
+	ret;
+
+__enc_xor:
+	enc_outunpack(xor, RT1);
+
+	movq RRBP, %rbp;
+	ret;
+
+.global camellia_dec_blk;
+.type   camellia_dec_blk,@function;
+
+camellia_dec_blk:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	cmpl $16, key_length(CTX);
+	movl $32, RT2d;
+	movl $24, RXORd;
+	cmovel RXORd, RT2d; /* max */
+
+	movq %rbp, RRBP;
+	movq %rsi, RDST;
+	movq %rdx, RIO;
+
+	dec_inpack(RT2);
+
+	cmpb $24, RT2bl;
+	je __dec_rounds16;
+
+	dec_rounds(24);
+	dec_fls(24);
+
+__dec_rounds16:
+	dec_rounds(16);
+	dec_fls(16);
+	dec_rounds(8);
+	dec_fls(8);
+	dec_rounds(0);
+
+	movq RDST, RIO;
+
+	dec_outunpack();
+
+	movq RRBP, %rbp;
+	ret;
+
+/**********************************************************************
+  2-way camellia
+ **********************************************************************/
+#define roundsm2(ab, subkey, cd) \
+	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
+	xorq RT2,					cd ## 1; \
+	\
+	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+	\
+		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
+		xorq RT2,					cd ## 0; \
+		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
+		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
+		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
+
+#define fls2(l, r, kl, kr) \
+	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
+	andl l ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					l ## 0; \
+	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
+	orq r ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					r ## 0; \
+	\
+		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
+		andl l ## 1d,					RT2d; \
+		roll $1,					RT2d; \
+		shlq $32,					RT2; \
+		xorq RT2,					l ## 1; \
+		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
+		orq r ## 1,					RT0; \
+		shrq $32,					RT0; \
+		xorq RT0,					r ## 1; \
+	\
+	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
+	orq l ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					l ## 0; \
+	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
+	andl r ## 0d,					RT2d; \
+	roll $1,					RT2d; \
+	shlq $32,					RT2; \
+	xorq RT2,					r ## 0; \
+	\
+		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
+		orq l ## 1,					RT0; \
+		shrq $32,					RT0; \
+		xorq RT0,					l ## 1; \
+		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
+		andl r ## 1d,					RT1d; \
+		roll $1,					RT1d; \
+		shlq $32,					RT1; \
+		xorq RT1,					r ## 1;
+
+#define enc_rounds2(i) \
+	roundsm2(RAB, i + 2, RCD); \
+	roundsm2(RCD, i + 3, RAB); \
+	roundsm2(RAB, i + 4, RCD); \
+	roundsm2(RCD, i + 5, RAB); \
+	roundsm2(RAB, i + 6, RCD); \
+	roundsm2(RCD, i + 7, RAB);
+
+#define enc_fls2(i) \
+	fls2(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack2() \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; \
+	rorq $32,			RAB0; \
+	movq 4*2(RIO),			RCD0; \
+	bswapq				RCD0; \
+	rolq $32,			RCD0; \
+	xorq key_table(CTX),		RAB0; \
+	\
+		movq 8*2(RIO),			RAB1; \
+		bswapq				RAB1; \
+		rorq $32,			RAB1; \
+		movq 12*2(RIO),			RCD1; \
+		bswapq				RCD1; \
+		rolq $32,			RCD1; \
+		xorq key_table(CTX),		RAB1;
+
+#define enc_outunpack2(op, max) \
+	xorq key_table(CTX, max, 8),	RCD0; \
+	rolq $32,			RCD0; \
+	bswapq				RCD0; \
+	op ## q RCD0,			(RIO); \
+	rorq $32,			RAB0; \
+	bswapq				RAB0; \
+	op ## q RAB0,			4*2(RIO); \
+	\
+		xorq key_table(CTX, max, 8),	RCD1; \
+		rolq $32,			RCD1; \
+		bswapq				RCD1; \
+		op ## q RCD1,			8*2(RIO); \
+		rorq $32,			RAB1; \
+		bswapq				RAB1; \
+		op ## q RAB1,			12*2(RIO);
+
+#define dec_rounds2(i) \
+	roundsm2(RAB, i + 7, RCD); \
+	roundsm2(RCD, i + 6, RAB); \
+	roundsm2(RAB, i + 5, RCD); \
+	roundsm2(RCD, i + 4, RAB); \
+	roundsm2(RAB, i + 3, RCD); \
+	roundsm2(RCD, i + 2, RAB);
+
+#define dec_fls2(i) \
+	fls2(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack2(max) \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; \
+	rorq $32,			RAB0; \
+	movq 4*2(RIO),			RCD0; \
+	bswapq				RCD0; \
+	rolq $32,			RCD0; \
+	xorq key_table(CTX, max, 8),	RAB0; \
+	\
+		movq 8*2(RIO),			RAB1; \
+		bswapq				RAB1; \
+		rorq $32,			RAB1; \
+		movq 12*2(RIO),			RCD1; \
+		bswapq				RCD1; \
+		rolq $32,			RCD1; \
+		xorq key_table(CTX, max, 8),	RAB1;
+
+#define dec_outunpack2() \
+	xorq key_table(CTX),		RCD0; \
+	rolq $32,			RCD0; \
+	bswapq				RCD0; \
+	movq RCD0,			(RIO); \
+	rorq $32,			RAB0; \
+	bswapq				RAB0; \
+	movq RAB0,			4*2(RIO); \
+	\
+		xorq key_table(CTX),		RCD1; \
+		rolq $32,			RCD1; \
+		bswapq				RCD1; \
+		movq RCD1,			8*2(RIO); \
+		rorq $32,			RAB1; \
+		bswapq				RAB1; \
+		movq RAB1,			12*2(RIO);
+
+.global __camellia_enc_blk_2way;
+.type   __camellia_enc_blk_2way,@function;
+
+__camellia_enc_blk_2way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool xor
+	 */
+	pushq %rbx;
+
+	movq %rbp, RRBP;
+	movq %rcx, RXOR;
+	movq %rsi, RDST;
+	movq %rdx, RIO;
+
+	enc_inpack2();
+
+	enc_rounds2(0);
+	enc_fls2(8);
+	enc_rounds2(8);
+	enc_fls2(16);
+	enc_rounds2(16);
+	movl $24, RT2d; /* max */
+
+	cmpb $16, key_length(CTX);
+	je __enc2_done;
+
+	enc_fls2(24);
+	enc_rounds2(24);
+	movl $32, RT2d; /* max */
+
+__enc2_done:
+	test RXORbl, RXORbl;
+	movq RDST, RIO;
+	jnz __enc2_xor;
+
+	enc_outunpack2(mov, RT2);
+
+	movq RRBP, %rbp;
+	popq %rbx;
+	ret;
+
+__enc2_xor:
+	enc_outunpack2(xor, RT2);
+
+	movq RRBP, %rbp;
+	popq %rbx;
+	ret;
+
+.global camellia_dec_blk_2way;
+.type   camellia_dec_blk_2way,@function;
+
+camellia_dec_blk_2way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	cmpl $16, key_length(CTX);
+	movl $32, RT2d;
+	movl $24, RXORd;
+	cmovel RXORd, RT2d; /* max */
+
+	movq %rbx, RXOR;
+	movq %rbp, RRBP;
+	movq %rsi, RDST;
+	movq %rdx, RIO;
+
+	dec_inpack2(RT2);
+
+	cmpb $24, RT2bl;
+	je __dec2_rounds16;
+
+	dec_rounds2(24);
+	dec_fls2(24);
+
+__dec2_rounds16:
+	dec_rounds2(16);
+	dec_fls2(16);
+	dec_rounds2(8);
+	dec_fls2(8);
+	dec_rounds2(0);
+
+	movq RDST, RIO;
+
+	dec_outunpack2();
+
+	movq RRBP, %rbp;
+	movq RXOR, %rbx;
+	ret;
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
new file mode 100644
index 000000000000..1ca36a93fd2f
--- /dev/null
+++ b/arch/x86/crypto/camellia_glue.c
@@ -0,0 +1,1952 @@
+/*
+ * Glue Code for assembler optimized version of Camellia
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Camellia parts based on code by:
+ *  Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/unaligned.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+
+#define CAMELLIA_MIN_KEY_SIZE	16
+#define CAMELLIA_MAX_KEY_SIZE	32
+#define CAMELLIA_BLOCK_SIZE	16
+#define CAMELLIA_TABLE_BYTE_LEN	272
+
+struct camellia_ctx {
+	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	u32 key_length;
+};
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
+				 const u8 *src);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+					const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+				      const u8 *src);
+
+static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+				    const u8 *src)
+{
+	__camellia_enc_blk(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__camellia_enc_blk(ctx, dst, src, true);
+}
+
+static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__camellia_enc_blk_2way(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
+					     const u8 *src)
+{
+	__camellia_enc_blk_2way(ctx, dst, src, true);
+}
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+/* camellia sboxes */
+const u64 camellia_sp10011110[256] = {
+	0x7000007070707000, 0x8200008282828200, 0x2c00002c2c2c2c00,
+	0xec0000ecececec00, 0xb30000b3b3b3b300, 0x2700002727272700,
+	0xc00000c0c0c0c000, 0xe50000e5e5e5e500, 0xe40000e4e4e4e400,
+	0x8500008585858500, 0x5700005757575700, 0x3500003535353500,
+	0xea0000eaeaeaea00, 0x0c00000c0c0c0c00, 0xae0000aeaeaeae00,
+	0x4100004141414100, 0x2300002323232300, 0xef0000efefefef00,
+	0x6b00006b6b6b6b00, 0x9300009393939300, 0x4500004545454500,
+	0x1900001919191900, 0xa50000a5a5a5a500, 0x2100002121212100,
+	0xed0000edededed00, 0x0e00000e0e0e0e00, 0x4f00004f4f4f4f00,
+	0x4e00004e4e4e4e00, 0x1d00001d1d1d1d00, 0x6500006565656500,
+	0x9200009292929200, 0xbd0000bdbdbdbd00, 0x8600008686868600,
+	0xb80000b8b8b8b800, 0xaf0000afafafaf00, 0x8f00008f8f8f8f00,
+	0x7c00007c7c7c7c00, 0xeb0000ebebebeb00, 0x1f00001f1f1f1f00,
+	0xce0000cececece00, 0x3e00003e3e3e3e00, 0x3000003030303000,
+	0xdc0000dcdcdcdc00, 0x5f00005f5f5f5f00, 0x5e00005e5e5e5e00,
+	0xc50000c5c5c5c500, 0x0b00000b0b0b0b00, 0x1a00001a1a1a1a00,
+	0xa60000a6a6a6a600, 0xe10000e1e1e1e100, 0x3900003939393900,
+	0xca0000cacacaca00, 0xd50000d5d5d5d500, 0x4700004747474700,
+	0x5d00005d5d5d5d00, 0x3d00003d3d3d3d00, 0xd90000d9d9d9d900,
+	0x0100000101010100, 0x5a00005a5a5a5a00, 0xd60000d6d6d6d600,
+	0x5100005151515100, 0x5600005656565600, 0x6c00006c6c6c6c00,
+	0x4d00004d4d4d4d00, 0x8b00008b8b8b8b00, 0x0d00000d0d0d0d00,
+	0x9a00009a9a9a9a00, 0x6600006666666600, 0xfb0000fbfbfbfb00,
+	0xcc0000cccccccc00, 0xb00000b0b0b0b000, 0x2d00002d2d2d2d00,
+	0x7400007474747400, 0x1200001212121200, 0x2b00002b2b2b2b00,
+	0x2000002020202000, 0xf00000f0f0f0f000, 0xb10000b1b1b1b100,
+	0x8400008484848400, 0x9900009999999900, 0xdf0000dfdfdfdf00,
+	0x4c00004c4c4c4c00, 0xcb0000cbcbcbcb00, 0xc20000c2c2c2c200,
+	0x3400003434343400, 0x7e00007e7e7e7e00, 0x7600007676767600,
+	0x0500000505050500, 0x6d00006d6d6d6d00, 0xb70000b7b7b7b700,
+	0xa90000a9a9a9a900, 0x3100003131313100, 0xd10000d1d1d1d100,
+	0x1700001717171700, 0x0400000404040400, 0xd70000d7d7d7d700,
+	0x1400001414141400, 0x5800005858585800, 0x3a00003a3a3a3a00,
+	0x6100006161616100, 0xde0000dededede00, 0x1b00001b1b1b1b00,
+	0x1100001111111100, 0x1c00001c1c1c1c00, 0x3200003232323200,
+	0x0f00000f0f0f0f00, 0x9c00009c9c9c9c00, 0x1600001616161600,
+	0x5300005353535300, 0x1800001818181800, 0xf20000f2f2f2f200,
+	0x2200002222222200, 0xfe0000fefefefe00, 0x4400004444444400,
+	0xcf0000cfcfcfcf00, 0xb20000b2b2b2b200, 0xc30000c3c3c3c300,
+	0xb50000b5b5b5b500, 0x7a00007a7a7a7a00, 0x9100009191919100,
+	0x2400002424242400, 0x0800000808080800, 0xe80000e8e8e8e800,
+	0xa80000a8a8a8a800, 0x6000006060606000, 0xfc0000fcfcfcfc00,
+	0x6900006969696900, 0x5000005050505000, 0xaa0000aaaaaaaa00,
+	0xd00000d0d0d0d000, 0xa00000a0a0a0a000, 0x7d00007d7d7d7d00,
+	0xa10000a1a1a1a100, 0x8900008989898900, 0x6200006262626200,
+	0x9700009797979700, 0x5400005454545400, 0x5b00005b5b5b5b00,
+	0x1e00001e1e1e1e00, 0x9500009595959500, 0xe00000e0e0e0e000,
+	0xff0000ffffffff00, 0x6400006464646400, 0xd20000d2d2d2d200,
+	0x1000001010101000, 0xc40000c4c4c4c400, 0x0000000000000000,
+	0x4800004848484800, 0xa30000a3a3a3a300, 0xf70000f7f7f7f700,
+	0x7500007575757500, 0xdb0000dbdbdbdb00, 0x8a00008a8a8a8a00,
+	0x0300000303030300, 0xe60000e6e6e6e600, 0xda0000dadadada00,
+	0x0900000909090900, 0x3f00003f3f3f3f00, 0xdd0000dddddddd00,
+	0x9400009494949400, 0x8700008787878700, 0x5c00005c5c5c5c00,
+	0x8300008383838300, 0x0200000202020200, 0xcd0000cdcdcdcd00,
+	0x4a00004a4a4a4a00, 0x9000009090909000, 0x3300003333333300,
+	0x7300007373737300, 0x6700006767676700, 0xf60000f6f6f6f600,
+	0xf30000f3f3f3f300, 0x9d00009d9d9d9d00, 0x7f00007f7f7f7f00,
+	0xbf0000bfbfbfbf00, 0xe20000e2e2e2e200, 0x5200005252525200,
+	0x9b00009b9b9b9b00, 0xd80000d8d8d8d800, 0x2600002626262600,
+	0xc80000c8c8c8c800, 0x3700003737373700, 0xc60000c6c6c6c600,
+	0x3b00003b3b3b3b00, 0x8100008181818100, 0x9600009696969600,
+	0x6f00006f6f6f6f00, 0x4b00004b4b4b4b00, 0x1300001313131300,
+	0xbe0000bebebebe00, 0x6300006363636300, 0x2e00002e2e2e2e00,
+	0xe90000e9e9e9e900, 0x7900007979797900, 0xa70000a7a7a7a700,
+	0x8c00008c8c8c8c00, 0x9f00009f9f9f9f00, 0x6e00006e6e6e6e00,
+	0xbc0000bcbcbcbc00, 0x8e00008e8e8e8e00, 0x2900002929292900,
+	0xf50000f5f5f5f500, 0xf90000f9f9f9f900, 0xb60000b6b6b6b600,
+	0x2f00002f2f2f2f00, 0xfd0000fdfdfdfd00, 0xb40000b4b4b4b400,
+	0x5900005959595900, 0x7800007878787800, 0x9800009898989800,
+	0x0600000606060600, 0x6a00006a6a6a6a00, 0xe70000e7e7e7e700,
+	0x4600004646464600, 0x7100007171717100, 0xba0000babababa00,
+	0xd40000d4d4d4d400, 0x2500002525252500, 0xab0000abababab00,
+	0x4200004242424200, 0x8800008888888800, 0xa20000a2a2a2a200,
+	0x8d00008d8d8d8d00, 0xfa0000fafafafa00, 0x7200007272727200,
+	0x0700000707070700, 0xb90000b9b9b9b900, 0x5500005555555500,
+	0xf80000f8f8f8f800, 0xee0000eeeeeeee00, 0xac0000acacacac00,
+	0x0a00000a0a0a0a00, 0x3600003636363600, 0x4900004949494900,
+	0x2a00002a2a2a2a00, 0x6800006868686800, 0x3c00003c3c3c3c00,
+	0x3800003838383800, 0xf10000f1f1f1f100, 0xa40000a4a4a4a400,
+	0x4000004040404000, 0x2800002828282800, 0xd30000d3d3d3d300,
+	0x7b00007b7b7b7b00, 0xbb0000bbbbbbbb00, 0xc90000c9c9c9c900,
+	0x4300004343434300, 0xc10000c1c1c1c100, 0x1500001515151500,
+	0xe30000e3e3e3e300, 0xad0000adadadad00, 0xf40000f4f4f4f400,
+	0x7700007777777700, 0xc70000c7c7c7c700, 0x8000008080808000,
+	0x9e00009e9e9e9e00,
+};
+
+const u64 camellia_sp22000222[256] = {
+	0xe0e0000000e0e0e0, 0x0505000000050505, 0x5858000000585858,
+	0xd9d9000000d9d9d9, 0x6767000000676767, 0x4e4e0000004e4e4e,
+	0x8181000000818181, 0xcbcb000000cbcbcb, 0xc9c9000000c9c9c9,
+	0x0b0b0000000b0b0b, 0xaeae000000aeaeae, 0x6a6a0000006a6a6a,
+	0xd5d5000000d5d5d5, 0x1818000000181818, 0x5d5d0000005d5d5d,
+	0x8282000000828282, 0x4646000000464646, 0xdfdf000000dfdfdf,
+	0xd6d6000000d6d6d6, 0x2727000000272727, 0x8a8a0000008a8a8a,
+	0x3232000000323232, 0x4b4b0000004b4b4b, 0x4242000000424242,
+	0xdbdb000000dbdbdb, 0x1c1c0000001c1c1c, 0x9e9e0000009e9e9e,
+	0x9c9c0000009c9c9c, 0x3a3a0000003a3a3a, 0xcaca000000cacaca,
+	0x2525000000252525, 0x7b7b0000007b7b7b, 0x0d0d0000000d0d0d,
+	0x7171000000717171, 0x5f5f0000005f5f5f, 0x1f1f0000001f1f1f,
+	0xf8f8000000f8f8f8, 0xd7d7000000d7d7d7, 0x3e3e0000003e3e3e,
+	0x9d9d0000009d9d9d, 0x7c7c0000007c7c7c, 0x6060000000606060,
+	0xb9b9000000b9b9b9, 0xbebe000000bebebe, 0xbcbc000000bcbcbc,
+	0x8b8b0000008b8b8b, 0x1616000000161616, 0x3434000000343434,
+	0x4d4d0000004d4d4d, 0xc3c3000000c3c3c3, 0x7272000000727272,
+	0x9595000000959595, 0xabab000000ababab, 0x8e8e0000008e8e8e,
+	0xbaba000000bababa, 0x7a7a0000007a7a7a, 0xb3b3000000b3b3b3,
+	0x0202000000020202, 0xb4b4000000b4b4b4, 0xadad000000adadad,
+	0xa2a2000000a2a2a2, 0xacac000000acacac, 0xd8d8000000d8d8d8,
+	0x9a9a0000009a9a9a, 0x1717000000171717, 0x1a1a0000001a1a1a,
+	0x3535000000353535, 0xcccc000000cccccc, 0xf7f7000000f7f7f7,
+	0x9999000000999999, 0x6161000000616161, 0x5a5a0000005a5a5a,
+	0xe8e8000000e8e8e8, 0x2424000000242424, 0x5656000000565656,
+	0x4040000000404040, 0xe1e1000000e1e1e1, 0x6363000000636363,
+	0x0909000000090909, 0x3333000000333333, 0xbfbf000000bfbfbf,
+	0x9898000000989898, 0x9797000000979797, 0x8585000000858585,
+	0x6868000000686868, 0xfcfc000000fcfcfc, 0xecec000000ececec,
+	0x0a0a0000000a0a0a, 0xdada000000dadada, 0x6f6f0000006f6f6f,
+	0x5353000000535353, 0x6262000000626262, 0xa3a3000000a3a3a3,
+	0x2e2e0000002e2e2e, 0x0808000000080808, 0xafaf000000afafaf,
+	0x2828000000282828, 0xb0b0000000b0b0b0, 0x7474000000747474,
+	0xc2c2000000c2c2c2, 0xbdbd000000bdbdbd, 0x3636000000363636,
+	0x2222000000222222, 0x3838000000383838, 0x6464000000646464,
+	0x1e1e0000001e1e1e, 0x3939000000393939, 0x2c2c0000002c2c2c,
+	0xa6a6000000a6a6a6, 0x3030000000303030, 0xe5e5000000e5e5e5,
+	0x4444000000444444, 0xfdfd000000fdfdfd, 0x8888000000888888,
+	0x9f9f0000009f9f9f, 0x6565000000656565, 0x8787000000878787,
+	0x6b6b0000006b6b6b, 0xf4f4000000f4f4f4, 0x2323000000232323,
+	0x4848000000484848, 0x1010000000101010, 0xd1d1000000d1d1d1,
+	0x5151000000515151, 0xc0c0000000c0c0c0, 0xf9f9000000f9f9f9,
+	0xd2d2000000d2d2d2, 0xa0a0000000a0a0a0, 0x5555000000555555,
+	0xa1a1000000a1a1a1, 0x4141000000414141, 0xfafa000000fafafa,
+	0x4343000000434343, 0x1313000000131313, 0xc4c4000000c4c4c4,
+	0x2f2f0000002f2f2f, 0xa8a8000000a8a8a8, 0xb6b6000000b6b6b6,
+	0x3c3c0000003c3c3c, 0x2b2b0000002b2b2b, 0xc1c1000000c1c1c1,
+	0xffff000000ffffff, 0xc8c8000000c8c8c8, 0xa5a5000000a5a5a5,
+	0x2020000000202020, 0x8989000000898989, 0x0000000000000000,
+	0x9090000000909090, 0x4747000000474747, 0xefef000000efefef,
+	0xeaea000000eaeaea, 0xb7b7000000b7b7b7, 0x1515000000151515,
+	0x0606000000060606, 0xcdcd000000cdcdcd, 0xb5b5000000b5b5b5,
+	0x1212000000121212, 0x7e7e0000007e7e7e, 0xbbbb000000bbbbbb,
+	0x2929000000292929, 0x0f0f0000000f0f0f, 0xb8b8000000b8b8b8,
+	0x0707000000070707, 0x0404000000040404, 0x9b9b0000009b9b9b,
+	0x9494000000949494, 0x2121000000212121, 0x6666000000666666,
+	0xe6e6000000e6e6e6, 0xcece000000cecece, 0xeded000000ededed,
+	0xe7e7000000e7e7e7, 0x3b3b0000003b3b3b, 0xfefe000000fefefe,
+	0x7f7f0000007f7f7f, 0xc5c5000000c5c5c5, 0xa4a4000000a4a4a4,
+	0x3737000000373737, 0xb1b1000000b1b1b1, 0x4c4c0000004c4c4c,
+	0x9191000000919191, 0x6e6e0000006e6e6e, 0x8d8d0000008d8d8d,
+	0x7676000000767676, 0x0303000000030303, 0x2d2d0000002d2d2d,
+	0xdede000000dedede, 0x9696000000969696, 0x2626000000262626,
+	0x7d7d0000007d7d7d, 0xc6c6000000c6c6c6, 0x5c5c0000005c5c5c,
+	0xd3d3000000d3d3d3, 0xf2f2000000f2f2f2, 0x4f4f0000004f4f4f,
+	0x1919000000191919, 0x3f3f0000003f3f3f, 0xdcdc000000dcdcdc,
+	0x7979000000797979, 0x1d1d0000001d1d1d, 0x5252000000525252,
+	0xebeb000000ebebeb, 0xf3f3000000f3f3f3, 0x6d6d0000006d6d6d,
+	0x5e5e0000005e5e5e, 0xfbfb000000fbfbfb, 0x6969000000696969,
+	0xb2b2000000b2b2b2, 0xf0f0000000f0f0f0, 0x3131000000313131,
+	0x0c0c0000000c0c0c, 0xd4d4000000d4d4d4, 0xcfcf000000cfcfcf,
+	0x8c8c0000008c8c8c, 0xe2e2000000e2e2e2, 0x7575000000757575,
+	0xa9a9000000a9a9a9, 0x4a4a0000004a4a4a, 0x5757000000575757,
+	0x8484000000848484, 0x1111000000111111, 0x4545000000454545,
+	0x1b1b0000001b1b1b, 0xf5f5000000f5f5f5, 0xe4e4000000e4e4e4,
+	0x0e0e0000000e0e0e, 0x7373000000737373, 0xaaaa000000aaaaaa,
+	0xf1f1000000f1f1f1, 0xdddd000000dddddd, 0x5959000000595959,
+	0x1414000000141414, 0x6c6c0000006c6c6c, 0x9292000000929292,
+	0x5454000000545454, 0xd0d0000000d0d0d0, 0x7878000000787878,
+	0x7070000000707070, 0xe3e3000000e3e3e3, 0x4949000000494949,
+	0x8080000000808080, 0x5050000000505050, 0xa7a7000000a7a7a7,
+	0xf6f6000000f6f6f6, 0x7777000000777777, 0x9393000000939393,
+	0x8686000000868686, 0x8383000000838383, 0x2a2a0000002a2a2a,
+	0xc7c7000000c7c7c7, 0x5b5b0000005b5b5b, 0xe9e9000000e9e9e9,
+	0xeeee000000eeeeee, 0x8f8f0000008f8f8f, 0x0101000000010101,
+	0x3d3d0000003d3d3d,
+};
+
+const u64 camellia_sp03303033[256] = {
+	0x0038380038003838, 0x0041410041004141, 0x0016160016001616,
+	0x0076760076007676, 0x00d9d900d900d9d9, 0x0093930093009393,
+	0x0060600060006060, 0x00f2f200f200f2f2, 0x0072720072007272,
+	0x00c2c200c200c2c2, 0x00abab00ab00abab, 0x009a9a009a009a9a,
+	0x0075750075007575, 0x0006060006000606, 0x0057570057005757,
+	0x00a0a000a000a0a0, 0x0091910091009191, 0x00f7f700f700f7f7,
+	0x00b5b500b500b5b5, 0x00c9c900c900c9c9, 0x00a2a200a200a2a2,
+	0x008c8c008c008c8c, 0x00d2d200d200d2d2, 0x0090900090009090,
+	0x00f6f600f600f6f6, 0x0007070007000707, 0x00a7a700a700a7a7,
+	0x0027270027002727, 0x008e8e008e008e8e, 0x00b2b200b200b2b2,
+	0x0049490049004949, 0x00dede00de00dede, 0x0043430043004343,
+	0x005c5c005c005c5c, 0x00d7d700d700d7d7, 0x00c7c700c700c7c7,
+	0x003e3e003e003e3e, 0x00f5f500f500f5f5, 0x008f8f008f008f8f,
+	0x0067670067006767, 0x001f1f001f001f1f, 0x0018180018001818,
+	0x006e6e006e006e6e, 0x00afaf00af00afaf, 0x002f2f002f002f2f,
+	0x00e2e200e200e2e2, 0x0085850085008585, 0x000d0d000d000d0d,
+	0x0053530053005353, 0x00f0f000f000f0f0, 0x009c9c009c009c9c,
+	0x0065650065006565, 0x00eaea00ea00eaea, 0x00a3a300a300a3a3,
+	0x00aeae00ae00aeae, 0x009e9e009e009e9e, 0x00ecec00ec00ecec,
+	0x0080800080008080, 0x002d2d002d002d2d, 0x006b6b006b006b6b,
+	0x00a8a800a800a8a8, 0x002b2b002b002b2b, 0x0036360036003636,
+	0x00a6a600a600a6a6, 0x00c5c500c500c5c5, 0x0086860086008686,
+	0x004d4d004d004d4d, 0x0033330033003333, 0x00fdfd00fd00fdfd,
+	0x0066660066006666, 0x0058580058005858, 0x0096960096009696,
+	0x003a3a003a003a3a, 0x0009090009000909, 0x0095950095009595,
+	0x0010100010001010, 0x0078780078007878, 0x00d8d800d800d8d8,
+	0x0042420042004242, 0x00cccc00cc00cccc, 0x00efef00ef00efef,
+	0x0026260026002626, 0x00e5e500e500e5e5, 0x0061610061006161,
+	0x001a1a001a001a1a, 0x003f3f003f003f3f, 0x003b3b003b003b3b,
+	0x0082820082008282, 0x00b6b600b600b6b6, 0x00dbdb00db00dbdb,
+	0x00d4d400d400d4d4, 0x0098980098009898, 0x00e8e800e800e8e8,
+	0x008b8b008b008b8b, 0x0002020002000202, 0x00ebeb00eb00ebeb,
+	0x000a0a000a000a0a, 0x002c2c002c002c2c, 0x001d1d001d001d1d,
+	0x00b0b000b000b0b0, 0x006f6f006f006f6f, 0x008d8d008d008d8d,
+	0x0088880088008888, 0x000e0e000e000e0e, 0x0019190019001919,
+	0x0087870087008787, 0x004e4e004e004e4e, 0x000b0b000b000b0b,
+	0x00a9a900a900a9a9, 0x000c0c000c000c0c, 0x0079790079007979,
+	0x0011110011001111, 0x007f7f007f007f7f, 0x0022220022002222,
+	0x00e7e700e700e7e7, 0x0059590059005959, 0x00e1e100e100e1e1,
+	0x00dada00da00dada, 0x003d3d003d003d3d, 0x00c8c800c800c8c8,
+	0x0012120012001212, 0x0004040004000404, 0x0074740074007474,
+	0x0054540054005454, 0x0030300030003030, 0x007e7e007e007e7e,
+	0x00b4b400b400b4b4, 0x0028280028002828, 0x0055550055005555,
+	0x0068680068006868, 0x0050500050005050, 0x00bebe00be00bebe,
+	0x00d0d000d000d0d0, 0x00c4c400c400c4c4, 0x0031310031003131,
+	0x00cbcb00cb00cbcb, 0x002a2a002a002a2a, 0x00adad00ad00adad,
+	0x000f0f000f000f0f, 0x00caca00ca00caca, 0x0070700070007070,
+	0x00ffff00ff00ffff, 0x0032320032003232, 0x0069690069006969,
+	0x0008080008000808, 0x0062620062006262, 0x0000000000000000,
+	0x0024240024002424, 0x00d1d100d100d1d1, 0x00fbfb00fb00fbfb,
+	0x00baba00ba00baba, 0x00eded00ed00eded, 0x0045450045004545,
+	0x0081810081008181, 0x0073730073007373, 0x006d6d006d006d6d,
+	0x0084840084008484, 0x009f9f009f009f9f, 0x00eeee00ee00eeee,
+	0x004a4a004a004a4a, 0x00c3c300c300c3c3, 0x002e2e002e002e2e,
+	0x00c1c100c100c1c1, 0x0001010001000101, 0x00e6e600e600e6e6,
+	0x0025250025002525, 0x0048480048004848, 0x0099990099009999,
+	0x00b9b900b900b9b9, 0x00b3b300b300b3b3, 0x007b7b007b007b7b,
+	0x00f9f900f900f9f9, 0x00cece00ce00cece, 0x00bfbf00bf00bfbf,
+	0x00dfdf00df00dfdf, 0x0071710071007171, 0x0029290029002929,
+	0x00cdcd00cd00cdcd, 0x006c6c006c006c6c, 0x0013130013001313,
+	0x0064640064006464, 0x009b9b009b009b9b, 0x0063630063006363,
+	0x009d9d009d009d9d, 0x00c0c000c000c0c0, 0x004b4b004b004b4b,
+	0x00b7b700b700b7b7, 0x00a5a500a500a5a5, 0x0089890089008989,
+	0x005f5f005f005f5f, 0x00b1b100b100b1b1, 0x0017170017001717,
+	0x00f4f400f400f4f4, 0x00bcbc00bc00bcbc, 0x00d3d300d300d3d3,
+	0x0046460046004646, 0x00cfcf00cf00cfcf, 0x0037370037003737,
+	0x005e5e005e005e5e, 0x0047470047004747, 0x0094940094009494,
+	0x00fafa00fa00fafa, 0x00fcfc00fc00fcfc, 0x005b5b005b005b5b,
+	0x0097970097009797, 0x00fefe00fe00fefe, 0x005a5a005a005a5a,
+	0x00acac00ac00acac, 0x003c3c003c003c3c, 0x004c4c004c004c4c,
+	0x0003030003000303, 0x0035350035003535, 0x00f3f300f300f3f3,
+	0x0023230023002323, 0x00b8b800b800b8b8, 0x005d5d005d005d5d,
+	0x006a6a006a006a6a, 0x0092920092009292, 0x00d5d500d500d5d5,
+	0x0021210021002121, 0x0044440044004444, 0x0051510051005151,
+	0x00c6c600c600c6c6, 0x007d7d007d007d7d, 0x0039390039003939,
+	0x0083830083008383, 0x00dcdc00dc00dcdc, 0x00aaaa00aa00aaaa,
+	0x007c7c007c007c7c, 0x0077770077007777, 0x0056560056005656,
+	0x0005050005000505, 0x001b1b001b001b1b, 0x00a4a400a400a4a4,
+	0x0015150015001515, 0x0034340034003434, 0x001e1e001e001e1e,
+	0x001c1c001c001c1c, 0x00f8f800f800f8f8, 0x0052520052005252,
+	0x0020200020002020, 0x0014140014001414, 0x00e9e900e900e9e9,
+	0x00bdbd00bd00bdbd, 0x00dddd00dd00dddd, 0x00e4e400e400e4e4,
+	0x00a1a100a100a1a1, 0x00e0e000e000e0e0, 0x008a8a008a008a8a,
+	0x00f1f100f100f1f1, 0x00d6d600d600d6d6, 0x007a7a007a007a7a,
+	0x00bbbb00bb00bbbb, 0x00e3e300e300e3e3, 0x0040400040004040,
+	0x004f4f004f004f4f,
+};
+
+const u64 camellia_sp00444404[256] = {
+	0x0000707070700070, 0x00002c2c2c2c002c, 0x0000b3b3b3b300b3,
+	0x0000c0c0c0c000c0, 0x0000e4e4e4e400e4, 0x0000575757570057,
+	0x0000eaeaeaea00ea, 0x0000aeaeaeae00ae, 0x0000232323230023,
+	0x00006b6b6b6b006b, 0x0000454545450045, 0x0000a5a5a5a500a5,
+	0x0000edededed00ed, 0x00004f4f4f4f004f, 0x00001d1d1d1d001d,
+	0x0000929292920092, 0x0000868686860086, 0x0000afafafaf00af,
+	0x00007c7c7c7c007c, 0x00001f1f1f1f001f, 0x00003e3e3e3e003e,
+	0x0000dcdcdcdc00dc, 0x00005e5e5e5e005e, 0x00000b0b0b0b000b,
+	0x0000a6a6a6a600a6, 0x0000393939390039, 0x0000d5d5d5d500d5,
+	0x00005d5d5d5d005d, 0x0000d9d9d9d900d9, 0x00005a5a5a5a005a,
+	0x0000515151510051, 0x00006c6c6c6c006c, 0x00008b8b8b8b008b,
+	0x00009a9a9a9a009a, 0x0000fbfbfbfb00fb, 0x0000b0b0b0b000b0,
+	0x0000747474740074, 0x00002b2b2b2b002b, 0x0000f0f0f0f000f0,
+	0x0000848484840084, 0x0000dfdfdfdf00df, 0x0000cbcbcbcb00cb,
+	0x0000343434340034, 0x0000767676760076, 0x00006d6d6d6d006d,
+	0x0000a9a9a9a900a9, 0x0000d1d1d1d100d1, 0x0000040404040004,
+	0x0000141414140014, 0x00003a3a3a3a003a, 0x0000dededede00de,
+	0x0000111111110011, 0x0000323232320032, 0x00009c9c9c9c009c,
+	0x0000535353530053, 0x0000f2f2f2f200f2, 0x0000fefefefe00fe,
+	0x0000cfcfcfcf00cf, 0x0000c3c3c3c300c3, 0x00007a7a7a7a007a,
+	0x0000242424240024, 0x0000e8e8e8e800e8, 0x0000606060600060,
+	0x0000696969690069, 0x0000aaaaaaaa00aa, 0x0000a0a0a0a000a0,
+	0x0000a1a1a1a100a1, 0x0000626262620062, 0x0000545454540054,
+	0x00001e1e1e1e001e, 0x0000e0e0e0e000e0, 0x0000646464640064,
+	0x0000101010100010, 0x0000000000000000, 0x0000a3a3a3a300a3,
+	0x0000757575750075, 0x00008a8a8a8a008a, 0x0000e6e6e6e600e6,
+	0x0000090909090009, 0x0000dddddddd00dd, 0x0000878787870087,
+	0x0000838383830083, 0x0000cdcdcdcd00cd, 0x0000909090900090,
+	0x0000737373730073, 0x0000f6f6f6f600f6, 0x00009d9d9d9d009d,
+	0x0000bfbfbfbf00bf, 0x0000525252520052, 0x0000d8d8d8d800d8,
+	0x0000c8c8c8c800c8, 0x0000c6c6c6c600c6, 0x0000818181810081,
+	0x00006f6f6f6f006f, 0x0000131313130013, 0x0000636363630063,
+	0x0000e9e9e9e900e9, 0x0000a7a7a7a700a7, 0x00009f9f9f9f009f,
+	0x0000bcbcbcbc00bc, 0x0000292929290029, 0x0000f9f9f9f900f9,
+	0x00002f2f2f2f002f, 0x0000b4b4b4b400b4, 0x0000787878780078,
+	0x0000060606060006, 0x0000e7e7e7e700e7, 0x0000717171710071,
+	0x0000d4d4d4d400d4, 0x0000abababab00ab, 0x0000888888880088,
+	0x00008d8d8d8d008d, 0x0000727272720072, 0x0000b9b9b9b900b9,
+	0x0000f8f8f8f800f8, 0x0000acacacac00ac, 0x0000363636360036,
+	0x00002a2a2a2a002a, 0x00003c3c3c3c003c, 0x0000f1f1f1f100f1,
+	0x0000404040400040, 0x0000d3d3d3d300d3, 0x0000bbbbbbbb00bb,
+	0x0000434343430043, 0x0000151515150015, 0x0000adadadad00ad,
+	0x0000777777770077, 0x0000808080800080, 0x0000828282820082,
+	0x0000ecececec00ec, 0x0000272727270027, 0x0000e5e5e5e500e5,
+	0x0000858585850085, 0x0000353535350035, 0x00000c0c0c0c000c,
+	0x0000414141410041, 0x0000efefefef00ef, 0x0000939393930093,
+	0x0000191919190019, 0x0000212121210021, 0x00000e0e0e0e000e,
+	0x00004e4e4e4e004e, 0x0000656565650065, 0x0000bdbdbdbd00bd,
+	0x0000b8b8b8b800b8, 0x00008f8f8f8f008f, 0x0000ebebebeb00eb,
+	0x0000cececece00ce, 0x0000303030300030, 0x00005f5f5f5f005f,
+	0x0000c5c5c5c500c5, 0x00001a1a1a1a001a, 0x0000e1e1e1e100e1,
+	0x0000cacacaca00ca, 0x0000474747470047, 0x00003d3d3d3d003d,
+	0x0000010101010001, 0x0000d6d6d6d600d6, 0x0000565656560056,
+	0x00004d4d4d4d004d, 0x00000d0d0d0d000d, 0x0000666666660066,
+	0x0000cccccccc00cc, 0x00002d2d2d2d002d, 0x0000121212120012,
+	0x0000202020200020, 0x0000b1b1b1b100b1, 0x0000999999990099,
+	0x00004c4c4c4c004c, 0x0000c2c2c2c200c2, 0x00007e7e7e7e007e,
+	0x0000050505050005, 0x0000b7b7b7b700b7, 0x0000313131310031,
+	0x0000171717170017, 0x0000d7d7d7d700d7, 0x0000585858580058,
+	0x0000616161610061, 0x00001b1b1b1b001b, 0x00001c1c1c1c001c,
+	0x00000f0f0f0f000f, 0x0000161616160016, 0x0000181818180018,
+	0x0000222222220022, 0x0000444444440044, 0x0000b2b2b2b200b2,
+	0x0000b5b5b5b500b5, 0x0000919191910091, 0x0000080808080008,
+	0x0000a8a8a8a800a8, 0x0000fcfcfcfc00fc, 0x0000505050500050,
+	0x0000d0d0d0d000d0, 0x00007d7d7d7d007d, 0x0000898989890089,
+	0x0000979797970097, 0x00005b5b5b5b005b, 0x0000959595950095,
+	0x0000ffffffff00ff, 0x0000d2d2d2d200d2, 0x0000c4c4c4c400c4,
+	0x0000484848480048, 0x0000f7f7f7f700f7, 0x0000dbdbdbdb00db,
+	0x0000030303030003, 0x0000dadadada00da, 0x00003f3f3f3f003f,
+	0x0000949494940094, 0x00005c5c5c5c005c, 0x0000020202020002,
+	0x00004a4a4a4a004a, 0x0000333333330033, 0x0000676767670067,
+	0x0000f3f3f3f300f3, 0x00007f7f7f7f007f, 0x0000e2e2e2e200e2,
+	0x00009b9b9b9b009b, 0x0000262626260026, 0x0000373737370037,
+	0x00003b3b3b3b003b, 0x0000969696960096, 0x00004b4b4b4b004b,
+	0x0000bebebebe00be, 0x00002e2e2e2e002e, 0x0000797979790079,
+	0x00008c8c8c8c008c, 0x00006e6e6e6e006e, 0x00008e8e8e8e008e,
+	0x0000f5f5f5f500f5, 0x0000b6b6b6b600b6, 0x0000fdfdfdfd00fd,
+	0x0000595959590059, 0x0000989898980098, 0x00006a6a6a6a006a,
+	0x0000464646460046, 0x0000babababa00ba, 0x0000252525250025,
+	0x0000424242420042, 0x0000a2a2a2a200a2, 0x0000fafafafa00fa,
+	0x0000070707070007, 0x0000555555550055, 0x0000eeeeeeee00ee,
+	0x00000a0a0a0a000a, 0x0000494949490049, 0x0000686868680068,
+	0x0000383838380038, 0x0000a4a4a4a400a4, 0x0000282828280028,
+	0x00007b7b7b7b007b, 0x0000c9c9c9c900c9, 0x0000c1c1c1c100c1,
+	0x0000e3e3e3e300e3, 0x0000f4f4f4f400f4, 0x0000c7c7c7c700c7,
+	0x00009e9e9e9e009e,
+};
+
+const u64 camellia_sp02220222[256] = {
+	0x00e0e0e000e0e0e0, 0x0005050500050505, 0x0058585800585858,
+	0x00d9d9d900d9d9d9, 0x0067676700676767, 0x004e4e4e004e4e4e,
+	0x0081818100818181, 0x00cbcbcb00cbcbcb, 0x00c9c9c900c9c9c9,
+	0x000b0b0b000b0b0b, 0x00aeaeae00aeaeae, 0x006a6a6a006a6a6a,
+	0x00d5d5d500d5d5d5, 0x0018181800181818, 0x005d5d5d005d5d5d,
+	0x0082828200828282, 0x0046464600464646, 0x00dfdfdf00dfdfdf,
+	0x00d6d6d600d6d6d6, 0x0027272700272727, 0x008a8a8a008a8a8a,
+	0x0032323200323232, 0x004b4b4b004b4b4b, 0x0042424200424242,
+	0x00dbdbdb00dbdbdb, 0x001c1c1c001c1c1c, 0x009e9e9e009e9e9e,
+	0x009c9c9c009c9c9c, 0x003a3a3a003a3a3a, 0x00cacaca00cacaca,
+	0x0025252500252525, 0x007b7b7b007b7b7b, 0x000d0d0d000d0d0d,
+	0x0071717100717171, 0x005f5f5f005f5f5f, 0x001f1f1f001f1f1f,
+	0x00f8f8f800f8f8f8, 0x00d7d7d700d7d7d7, 0x003e3e3e003e3e3e,
+	0x009d9d9d009d9d9d, 0x007c7c7c007c7c7c, 0x0060606000606060,
+	0x00b9b9b900b9b9b9, 0x00bebebe00bebebe, 0x00bcbcbc00bcbcbc,
+	0x008b8b8b008b8b8b, 0x0016161600161616, 0x0034343400343434,
+	0x004d4d4d004d4d4d, 0x00c3c3c300c3c3c3, 0x0072727200727272,
+	0x0095959500959595, 0x00ababab00ababab, 0x008e8e8e008e8e8e,
+	0x00bababa00bababa, 0x007a7a7a007a7a7a, 0x00b3b3b300b3b3b3,
+	0x0002020200020202, 0x00b4b4b400b4b4b4, 0x00adadad00adadad,
+	0x00a2a2a200a2a2a2, 0x00acacac00acacac, 0x00d8d8d800d8d8d8,
+	0x009a9a9a009a9a9a, 0x0017171700171717, 0x001a1a1a001a1a1a,
+	0x0035353500353535, 0x00cccccc00cccccc, 0x00f7f7f700f7f7f7,
+	0x0099999900999999, 0x0061616100616161, 0x005a5a5a005a5a5a,
+	0x00e8e8e800e8e8e8, 0x0024242400242424, 0x0056565600565656,
+	0x0040404000404040, 0x00e1e1e100e1e1e1, 0x0063636300636363,
+	0x0009090900090909, 0x0033333300333333, 0x00bfbfbf00bfbfbf,
+	0x0098989800989898, 0x0097979700979797, 0x0085858500858585,
+	0x0068686800686868, 0x00fcfcfc00fcfcfc, 0x00ececec00ececec,
+	0x000a0a0a000a0a0a, 0x00dadada00dadada, 0x006f6f6f006f6f6f,
+	0x0053535300535353, 0x0062626200626262, 0x00a3a3a300a3a3a3,
+	0x002e2e2e002e2e2e, 0x0008080800080808, 0x00afafaf00afafaf,
+	0x0028282800282828, 0x00b0b0b000b0b0b0, 0x0074747400747474,
+	0x00c2c2c200c2c2c2, 0x00bdbdbd00bdbdbd, 0x0036363600363636,
+	0x0022222200222222, 0x0038383800383838, 0x0064646400646464,
+	0x001e1e1e001e1e1e, 0x0039393900393939, 0x002c2c2c002c2c2c,
+	0x00a6a6a600a6a6a6, 0x0030303000303030, 0x00e5e5e500e5e5e5,
+	0x0044444400444444, 0x00fdfdfd00fdfdfd, 0x0088888800888888,
+	0x009f9f9f009f9f9f, 0x0065656500656565, 0x0087878700878787,
+	0x006b6b6b006b6b6b, 0x00f4f4f400f4f4f4, 0x0023232300232323,
+	0x0048484800484848, 0x0010101000101010, 0x00d1d1d100d1d1d1,
+	0x0051515100515151, 0x00c0c0c000c0c0c0, 0x00f9f9f900f9f9f9,
+	0x00d2d2d200d2d2d2, 0x00a0a0a000a0a0a0, 0x0055555500555555,
+	0x00a1a1a100a1a1a1, 0x0041414100414141, 0x00fafafa00fafafa,
+	0x0043434300434343, 0x0013131300131313, 0x00c4c4c400c4c4c4,
+	0x002f2f2f002f2f2f, 0x00a8a8a800a8a8a8, 0x00b6b6b600b6b6b6,
+	0x003c3c3c003c3c3c, 0x002b2b2b002b2b2b, 0x00c1c1c100c1c1c1,
+	0x00ffffff00ffffff, 0x00c8c8c800c8c8c8, 0x00a5a5a500a5a5a5,
+	0x0020202000202020, 0x0089898900898989, 0x0000000000000000,
+	0x0090909000909090, 0x0047474700474747, 0x00efefef00efefef,
+	0x00eaeaea00eaeaea, 0x00b7b7b700b7b7b7, 0x0015151500151515,
+	0x0006060600060606, 0x00cdcdcd00cdcdcd, 0x00b5b5b500b5b5b5,
+	0x0012121200121212, 0x007e7e7e007e7e7e, 0x00bbbbbb00bbbbbb,
+	0x0029292900292929, 0x000f0f0f000f0f0f, 0x00b8b8b800b8b8b8,
+	0x0007070700070707, 0x0004040400040404, 0x009b9b9b009b9b9b,
+	0x0094949400949494, 0x0021212100212121, 0x0066666600666666,
+	0x00e6e6e600e6e6e6, 0x00cecece00cecece, 0x00ededed00ededed,
+	0x00e7e7e700e7e7e7, 0x003b3b3b003b3b3b, 0x00fefefe00fefefe,
+	0x007f7f7f007f7f7f, 0x00c5c5c500c5c5c5, 0x00a4a4a400a4a4a4,
+	0x0037373700373737, 0x00b1b1b100b1b1b1, 0x004c4c4c004c4c4c,
+	0x0091919100919191, 0x006e6e6e006e6e6e, 0x008d8d8d008d8d8d,
+	0x0076767600767676, 0x0003030300030303, 0x002d2d2d002d2d2d,
+	0x00dedede00dedede, 0x0096969600969696, 0x0026262600262626,
+	0x007d7d7d007d7d7d, 0x00c6c6c600c6c6c6, 0x005c5c5c005c5c5c,
+	0x00d3d3d300d3d3d3, 0x00f2f2f200f2f2f2, 0x004f4f4f004f4f4f,
+	0x0019191900191919, 0x003f3f3f003f3f3f, 0x00dcdcdc00dcdcdc,
+	0x0079797900797979, 0x001d1d1d001d1d1d, 0x0052525200525252,
+	0x00ebebeb00ebebeb, 0x00f3f3f300f3f3f3, 0x006d6d6d006d6d6d,
+	0x005e5e5e005e5e5e, 0x00fbfbfb00fbfbfb, 0x0069696900696969,
+	0x00b2b2b200b2b2b2, 0x00f0f0f000f0f0f0, 0x0031313100313131,
+	0x000c0c0c000c0c0c, 0x00d4d4d400d4d4d4, 0x00cfcfcf00cfcfcf,
+	0x008c8c8c008c8c8c, 0x00e2e2e200e2e2e2, 0x0075757500757575,
+	0x00a9a9a900a9a9a9, 0x004a4a4a004a4a4a, 0x0057575700575757,
+	0x0084848400848484, 0x0011111100111111, 0x0045454500454545,
+	0x001b1b1b001b1b1b, 0x00f5f5f500f5f5f5, 0x00e4e4e400e4e4e4,
+	0x000e0e0e000e0e0e, 0x0073737300737373, 0x00aaaaaa00aaaaaa,
+	0x00f1f1f100f1f1f1, 0x00dddddd00dddddd, 0x0059595900595959,
+	0x0014141400141414, 0x006c6c6c006c6c6c, 0x0092929200929292,
+	0x0054545400545454, 0x00d0d0d000d0d0d0, 0x0078787800787878,
+	0x0070707000707070, 0x00e3e3e300e3e3e3, 0x0049494900494949,
+	0x0080808000808080, 0x0050505000505050, 0x00a7a7a700a7a7a7,
+	0x00f6f6f600f6f6f6, 0x0077777700777777, 0x0093939300939393,
+	0x0086868600868686, 0x0083838300838383, 0x002a2a2a002a2a2a,
+	0x00c7c7c700c7c7c7, 0x005b5b5b005b5b5b, 0x00e9e9e900e9e9e9,
+	0x00eeeeee00eeeeee, 0x008f8f8f008f8f8f, 0x0001010100010101,
+	0x003d3d3d003d3d3d,
+};
+
+const u64 camellia_sp30333033[256] = {
+	0x3800383838003838, 0x4100414141004141, 0x1600161616001616,
+	0x7600767676007676, 0xd900d9d9d900d9d9, 0x9300939393009393,
+	0x6000606060006060, 0xf200f2f2f200f2f2, 0x7200727272007272,
+	0xc200c2c2c200c2c2, 0xab00ababab00abab, 0x9a009a9a9a009a9a,
+	0x7500757575007575, 0x0600060606000606, 0x5700575757005757,
+	0xa000a0a0a000a0a0, 0x9100919191009191, 0xf700f7f7f700f7f7,
+	0xb500b5b5b500b5b5, 0xc900c9c9c900c9c9, 0xa200a2a2a200a2a2,
+	0x8c008c8c8c008c8c, 0xd200d2d2d200d2d2, 0x9000909090009090,
+	0xf600f6f6f600f6f6, 0x0700070707000707, 0xa700a7a7a700a7a7,
+	0x2700272727002727, 0x8e008e8e8e008e8e, 0xb200b2b2b200b2b2,
+	0x4900494949004949, 0xde00dedede00dede, 0x4300434343004343,
+	0x5c005c5c5c005c5c, 0xd700d7d7d700d7d7, 0xc700c7c7c700c7c7,
+	0x3e003e3e3e003e3e, 0xf500f5f5f500f5f5, 0x8f008f8f8f008f8f,
+	0x6700676767006767, 0x1f001f1f1f001f1f, 0x1800181818001818,
+	0x6e006e6e6e006e6e, 0xaf00afafaf00afaf, 0x2f002f2f2f002f2f,
+	0xe200e2e2e200e2e2, 0x8500858585008585, 0x0d000d0d0d000d0d,
+	0x5300535353005353, 0xf000f0f0f000f0f0, 0x9c009c9c9c009c9c,
+	0x6500656565006565, 0xea00eaeaea00eaea, 0xa300a3a3a300a3a3,
+	0xae00aeaeae00aeae, 0x9e009e9e9e009e9e, 0xec00ececec00ecec,
+	0x8000808080008080, 0x2d002d2d2d002d2d, 0x6b006b6b6b006b6b,
+	0xa800a8a8a800a8a8, 0x2b002b2b2b002b2b, 0x3600363636003636,
+	0xa600a6a6a600a6a6, 0xc500c5c5c500c5c5, 0x8600868686008686,
+	0x4d004d4d4d004d4d, 0x3300333333003333, 0xfd00fdfdfd00fdfd,
+	0x6600666666006666, 0x5800585858005858, 0x9600969696009696,
+	0x3a003a3a3a003a3a, 0x0900090909000909, 0x9500959595009595,
+	0x1000101010001010, 0x7800787878007878, 0xd800d8d8d800d8d8,
+	0x4200424242004242, 0xcc00cccccc00cccc, 0xef00efefef00efef,
+	0x2600262626002626, 0xe500e5e5e500e5e5, 0x6100616161006161,
+	0x1a001a1a1a001a1a, 0x3f003f3f3f003f3f, 0x3b003b3b3b003b3b,
+	0x8200828282008282, 0xb600b6b6b600b6b6, 0xdb00dbdbdb00dbdb,
+	0xd400d4d4d400d4d4, 0x9800989898009898, 0xe800e8e8e800e8e8,
+	0x8b008b8b8b008b8b, 0x0200020202000202, 0xeb00ebebeb00ebeb,
+	0x0a000a0a0a000a0a, 0x2c002c2c2c002c2c, 0x1d001d1d1d001d1d,
+	0xb000b0b0b000b0b0, 0x6f006f6f6f006f6f, 0x8d008d8d8d008d8d,
+	0x8800888888008888, 0x0e000e0e0e000e0e, 0x1900191919001919,
+	0x8700878787008787, 0x4e004e4e4e004e4e, 0x0b000b0b0b000b0b,
+	0xa900a9a9a900a9a9, 0x0c000c0c0c000c0c, 0x7900797979007979,
+	0x1100111111001111, 0x7f007f7f7f007f7f, 0x2200222222002222,
+	0xe700e7e7e700e7e7, 0x5900595959005959, 0xe100e1e1e100e1e1,
+	0xda00dadada00dada, 0x3d003d3d3d003d3d, 0xc800c8c8c800c8c8,
+	0x1200121212001212, 0x0400040404000404, 0x7400747474007474,
+	0x5400545454005454, 0x3000303030003030, 0x7e007e7e7e007e7e,
+	0xb400b4b4b400b4b4, 0x2800282828002828, 0x5500555555005555,
+	0x6800686868006868, 0x5000505050005050, 0xbe00bebebe00bebe,
+	0xd000d0d0d000d0d0, 0xc400c4c4c400c4c4, 0x3100313131003131,
+	0xcb00cbcbcb00cbcb, 0x2a002a2a2a002a2a, 0xad00adadad00adad,
+	0x0f000f0f0f000f0f, 0xca00cacaca00caca, 0x7000707070007070,
+	0xff00ffffff00ffff, 0x3200323232003232, 0x6900696969006969,
+	0x0800080808000808, 0x6200626262006262, 0x0000000000000000,
+	0x2400242424002424, 0xd100d1d1d100d1d1, 0xfb00fbfbfb00fbfb,
+	0xba00bababa00baba, 0xed00ededed00eded, 0x4500454545004545,
+	0x8100818181008181, 0x7300737373007373, 0x6d006d6d6d006d6d,
+	0x8400848484008484, 0x9f009f9f9f009f9f, 0xee00eeeeee00eeee,
+	0x4a004a4a4a004a4a, 0xc300c3c3c300c3c3, 0x2e002e2e2e002e2e,
+	0xc100c1c1c100c1c1, 0x0100010101000101, 0xe600e6e6e600e6e6,
+	0x2500252525002525, 0x4800484848004848, 0x9900999999009999,
+	0xb900b9b9b900b9b9, 0xb300b3b3b300b3b3, 0x7b007b7b7b007b7b,
+	0xf900f9f9f900f9f9, 0xce00cecece00cece, 0xbf00bfbfbf00bfbf,
+	0xdf00dfdfdf00dfdf, 0x7100717171007171, 0x2900292929002929,
+	0xcd00cdcdcd00cdcd, 0x6c006c6c6c006c6c, 0x1300131313001313,
+	0x6400646464006464, 0x9b009b9b9b009b9b, 0x6300636363006363,
+	0x9d009d9d9d009d9d, 0xc000c0c0c000c0c0, 0x4b004b4b4b004b4b,
+	0xb700b7b7b700b7b7, 0xa500a5a5a500a5a5, 0x8900898989008989,
+	0x5f005f5f5f005f5f, 0xb100b1b1b100b1b1, 0x1700171717001717,
+	0xf400f4f4f400f4f4, 0xbc00bcbcbc00bcbc, 0xd300d3d3d300d3d3,
+	0x4600464646004646, 0xcf00cfcfcf00cfcf, 0x3700373737003737,
+	0x5e005e5e5e005e5e, 0x4700474747004747, 0x9400949494009494,
+	0xfa00fafafa00fafa, 0xfc00fcfcfc00fcfc, 0x5b005b5b5b005b5b,
+	0x9700979797009797, 0xfe00fefefe00fefe, 0x5a005a5a5a005a5a,
+	0xac00acacac00acac, 0x3c003c3c3c003c3c, 0x4c004c4c4c004c4c,
+	0x0300030303000303, 0x3500353535003535, 0xf300f3f3f300f3f3,
+	0x2300232323002323, 0xb800b8b8b800b8b8, 0x5d005d5d5d005d5d,
+	0x6a006a6a6a006a6a, 0x9200929292009292, 0xd500d5d5d500d5d5,
+	0x2100212121002121, 0x4400444444004444, 0x5100515151005151,
+	0xc600c6c6c600c6c6, 0x7d007d7d7d007d7d, 0x3900393939003939,
+	0x8300838383008383, 0xdc00dcdcdc00dcdc, 0xaa00aaaaaa00aaaa,
+	0x7c007c7c7c007c7c, 0x7700777777007777, 0x5600565656005656,
+	0x0500050505000505, 0x1b001b1b1b001b1b, 0xa400a4a4a400a4a4,
+	0x1500151515001515, 0x3400343434003434, 0x1e001e1e1e001e1e,
+	0x1c001c1c1c001c1c, 0xf800f8f8f800f8f8, 0x5200525252005252,
+	0x2000202020002020, 0x1400141414001414, 0xe900e9e9e900e9e9,
+	0xbd00bdbdbd00bdbd, 0xdd00dddddd00dddd, 0xe400e4e4e400e4e4,
+	0xa100a1a1a100a1a1, 0xe000e0e0e000e0e0, 0x8a008a8a8a008a8a,
+	0xf100f1f1f100f1f1, 0xd600d6d6d600d6d6, 0x7a007a7a7a007a7a,
+	0xbb00bbbbbb00bbbb, 0xe300e3e3e300e3e3, 0x4000404040004040,
+	0x4f004f4f4f004f4f,
+};
+
+const u64 camellia_sp44044404[256] = {
+	0x7070007070700070, 0x2c2c002c2c2c002c, 0xb3b300b3b3b300b3,
+	0xc0c000c0c0c000c0, 0xe4e400e4e4e400e4, 0x5757005757570057,
+	0xeaea00eaeaea00ea, 0xaeae00aeaeae00ae, 0x2323002323230023,
+	0x6b6b006b6b6b006b, 0x4545004545450045, 0xa5a500a5a5a500a5,
+	0xeded00ededed00ed, 0x4f4f004f4f4f004f, 0x1d1d001d1d1d001d,
+	0x9292009292920092, 0x8686008686860086, 0xafaf00afafaf00af,
+	0x7c7c007c7c7c007c, 0x1f1f001f1f1f001f, 0x3e3e003e3e3e003e,
+	0xdcdc00dcdcdc00dc, 0x5e5e005e5e5e005e, 0x0b0b000b0b0b000b,
+	0xa6a600a6a6a600a6, 0x3939003939390039, 0xd5d500d5d5d500d5,
+	0x5d5d005d5d5d005d, 0xd9d900d9d9d900d9, 0x5a5a005a5a5a005a,
+	0x5151005151510051, 0x6c6c006c6c6c006c, 0x8b8b008b8b8b008b,
+	0x9a9a009a9a9a009a, 0xfbfb00fbfbfb00fb, 0xb0b000b0b0b000b0,
+	0x7474007474740074, 0x2b2b002b2b2b002b, 0xf0f000f0f0f000f0,
+	0x8484008484840084, 0xdfdf00dfdfdf00df, 0xcbcb00cbcbcb00cb,
+	0x3434003434340034, 0x7676007676760076, 0x6d6d006d6d6d006d,
+	0xa9a900a9a9a900a9, 0xd1d100d1d1d100d1, 0x0404000404040004,
+	0x1414001414140014, 0x3a3a003a3a3a003a, 0xdede00dedede00de,
+	0x1111001111110011, 0x3232003232320032, 0x9c9c009c9c9c009c,
+	0x5353005353530053, 0xf2f200f2f2f200f2, 0xfefe00fefefe00fe,
+	0xcfcf00cfcfcf00cf, 0xc3c300c3c3c300c3, 0x7a7a007a7a7a007a,
+	0x2424002424240024, 0xe8e800e8e8e800e8, 0x6060006060600060,
+	0x6969006969690069, 0xaaaa00aaaaaa00aa, 0xa0a000a0a0a000a0,
+	0xa1a100a1a1a100a1, 0x6262006262620062, 0x5454005454540054,
+	0x1e1e001e1e1e001e, 0xe0e000e0e0e000e0, 0x6464006464640064,
+	0x1010001010100010, 0x0000000000000000, 0xa3a300a3a3a300a3,
+	0x7575007575750075, 0x8a8a008a8a8a008a, 0xe6e600e6e6e600e6,
+	0x0909000909090009, 0xdddd00dddddd00dd, 0x8787008787870087,
+	0x8383008383830083, 0xcdcd00cdcdcd00cd, 0x9090009090900090,
+	0x7373007373730073, 0xf6f600f6f6f600f6, 0x9d9d009d9d9d009d,
+	0xbfbf00bfbfbf00bf, 0x5252005252520052, 0xd8d800d8d8d800d8,
+	0xc8c800c8c8c800c8, 0xc6c600c6c6c600c6, 0x8181008181810081,
+	0x6f6f006f6f6f006f, 0x1313001313130013, 0x6363006363630063,
+	0xe9e900e9e9e900e9, 0xa7a700a7a7a700a7, 0x9f9f009f9f9f009f,
+	0xbcbc00bcbcbc00bc, 0x2929002929290029, 0xf9f900f9f9f900f9,
+	0x2f2f002f2f2f002f, 0xb4b400b4b4b400b4, 0x7878007878780078,
+	0x0606000606060006, 0xe7e700e7e7e700e7, 0x7171007171710071,
+	0xd4d400d4d4d400d4, 0xabab00ababab00ab, 0x8888008888880088,
+	0x8d8d008d8d8d008d, 0x7272007272720072, 0xb9b900b9b9b900b9,
+	0xf8f800f8f8f800f8, 0xacac00acacac00ac, 0x3636003636360036,
+	0x2a2a002a2a2a002a, 0x3c3c003c3c3c003c, 0xf1f100f1f1f100f1,
+	0x4040004040400040, 0xd3d300d3d3d300d3, 0xbbbb00bbbbbb00bb,
+	0x4343004343430043, 0x1515001515150015, 0xadad00adadad00ad,
+	0x7777007777770077, 0x8080008080800080, 0x8282008282820082,
+	0xecec00ececec00ec, 0x2727002727270027, 0xe5e500e5e5e500e5,
+	0x8585008585850085, 0x3535003535350035, 0x0c0c000c0c0c000c,
+	0x4141004141410041, 0xefef00efefef00ef, 0x9393009393930093,
+	0x1919001919190019, 0x2121002121210021, 0x0e0e000e0e0e000e,
+	0x4e4e004e4e4e004e, 0x6565006565650065, 0xbdbd00bdbdbd00bd,
+	0xb8b800b8b8b800b8, 0x8f8f008f8f8f008f, 0xebeb00ebebeb00eb,
+	0xcece00cecece00ce, 0x3030003030300030, 0x5f5f005f5f5f005f,
+	0xc5c500c5c5c500c5, 0x1a1a001a1a1a001a, 0xe1e100e1e1e100e1,
+	0xcaca00cacaca00ca, 0x4747004747470047, 0x3d3d003d3d3d003d,
+	0x0101000101010001, 0xd6d600d6d6d600d6, 0x5656005656560056,
+	0x4d4d004d4d4d004d, 0x0d0d000d0d0d000d, 0x6666006666660066,
+	0xcccc00cccccc00cc, 0x2d2d002d2d2d002d, 0x1212001212120012,
+	0x2020002020200020, 0xb1b100b1b1b100b1, 0x9999009999990099,
+	0x4c4c004c4c4c004c, 0xc2c200c2c2c200c2, 0x7e7e007e7e7e007e,
+	0x0505000505050005, 0xb7b700b7b7b700b7, 0x3131003131310031,
+	0x1717001717170017, 0xd7d700d7d7d700d7, 0x5858005858580058,
+	0x6161006161610061, 0x1b1b001b1b1b001b, 0x1c1c001c1c1c001c,
+	0x0f0f000f0f0f000f, 0x1616001616160016, 0x1818001818180018,
+	0x2222002222220022, 0x4444004444440044, 0xb2b200b2b2b200b2,
+	0xb5b500b5b5b500b5, 0x9191009191910091, 0x0808000808080008,
+	0xa8a800a8a8a800a8, 0xfcfc00fcfcfc00fc, 0x5050005050500050,
+	0xd0d000d0d0d000d0, 0x7d7d007d7d7d007d, 0x8989008989890089,
+	0x9797009797970097, 0x5b5b005b5b5b005b, 0x9595009595950095,
+	0xffff00ffffff00ff, 0xd2d200d2d2d200d2, 0xc4c400c4c4c400c4,
+	0x4848004848480048, 0xf7f700f7f7f700f7, 0xdbdb00dbdbdb00db,
+	0x0303000303030003, 0xdada00dadada00da, 0x3f3f003f3f3f003f,
+	0x9494009494940094, 0x5c5c005c5c5c005c, 0x0202000202020002,
+	0x4a4a004a4a4a004a, 0x3333003333330033, 0x6767006767670067,
+	0xf3f300f3f3f300f3, 0x7f7f007f7f7f007f, 0xe2e200e2e2e200e2,
+	0x9b9b009b9b9b009b, 0x2626002626260026, 0x3737003737370037,
+	0x3b3b003b3b3b003b, 0x9696009696960096, 0x4b4b004b4b4b004b,
+	0xbebe00bebebe00be, 0x2e2e002e2e2e002e, 0x7979007979790079,
+	0x8c8c008c8c8c008c, 0x6e6e006e6e6e006e, 0x8e8e008e8e8e008e,
+	0xf5f500f5f5f500f5, 0xb6b600b6b6b600b6, 0xfdfd00fdfdfd00fd,
+	0x5959005959590059, 0x9898009898980098, 0x6a6a006a6a6a006a,
+	0x4646004646460046, 0xbaba00bababa00ba, 0x2525002525250025,
+	0x4242004242420042, 0xa2a200a2a2a200a2, 0xfafa00fafafa00fa,
+	0x0707000707070007, 0x5555005555550055, 0xeeee00eeeeee00ee,
+	0x0a0a000a0a0a000a, 0x4949004949490049, 0x6868006868680068,
+	0x3838003838380038, 0xa4a400a4a4a400a4, 0x2828002828280028,
+	0x7b7b007b7b7b007b, 0xc9c900c9c9c900c9, 0xc1c100c1c1c100c1,
+	0xe3e300e3e3e300e3, 0xf4f400f4f4f400f4, 0xc7c700c7c7c700c7,
+	0x9e9e009e9e9e009e,
+};
+
+const u64 camellia_sp11101110[256] = {
+	0x7070700070707000, 0x8282820082828200, 0x2c2c2c002c2c2c00,
+	0xececec00ececec00, 0xb3b3b300b3b3b300, 0x2727270027272700,
+	0xc0c0c000c0c0c000, 0xe5e5e500e5e5e500, 0xe4e4e400e4e4e400,
+	0x8585850085858500, 0x5757570057575700, 0x3535350035353500,
+	0xeaeaea00eaeaea00, 0x0c0c0c000c0c0c00, 0xaeaeae00aeaeae00,
+	0x4141410041414100, 0x2323230023232300, 0xefefef00efefef00,
+	0x6b6b6b006b6b6b00, 0x9393930093939300, 0x4545450045454500,
+	0x1919190019191900, 0xa5a5a500a5a5a500, 0x2121210021212100,
+	0xededed00ededed00, 0x0e0e0e000e0e0e00, 0x4f4f4f004f4f4f00,
+	0x4e4e4e004e4e4e00, 0x1d1d1d001d1d1d00, 0x6565650065656500,
+	0x9292920092929200, 0xbdbdbd00bdbdbd00, 0x8686860086868600,
+	0xb8b8b800b8b8b800, 0xafafaf00afafaf00, 0x8f8f8f008f8f8f00,
+	0x7c7c7c007c7c7c00, 0xebebeb00ebebeb00, 0x1f1f1f001f1f1f00,
+	0xcecece00cecece00, 0x3e3e3e003e3e3e00, 0x3030300030303000,
+	0xdcdcdc00dcdcdc00, 0x5f5f5f005f5f5f00, 0x5e5e5e005e5e5e00,
+	0xc5c5c500c5c5c500, 0x0b0b0b000b0b0b00, 0x1a1a1a001a1a1a00,
+	0xa6a6a600a6a6a600, 0xe1e1e100e1e1e100, 0x3939390039393900,
+	0xcacaca00cacaca00, 0xd5d5d500d5d5d500, 0x4747470047474700,
+	0x5d5d5d005d5d5d00, 0x3d3d3d003d3d3d00, 0xd9d9d900d9d9d900,
+	0x0101010001010100, 0x5a5a5a005a5a5a00, 0xd6d6d600d6d6d600,
+	0x5151510051515100, 0x5656560056565600, 0x6c6c6c006c6c6c00,
+	0x4d4d4d004d4d4d00, 0x8b8b8b008b8b8b00, 0x0d0d0d000d0d0d00,
+	0x9a9a9a009a9a9a00, 0x6666660066666600, 0xfbfbfb00fbfbfb00,
+	0xcccccc00cccccc00, 0xb0b0b000b0b0b000, 0x2d2d2d002d2d2d00,
+	0x7474740074747400, 0x1212120012121200, 0x2b2b2b002b2b2b00,
+	0x2020200020202000, 0xf0f0f000f0f0f000, 0xb1b1b100b1b1b100,
+	0x8484840084848400, 0x9999990099999900, 0xdfdfdf00dfdfdf00,
+	0x4c4c4c004c4c4c00, 0xcbcbcb00cbcbcb00, 0xc2c2c200c2c2c200,
+	0x3434340034343400, 0x7e7e7e007e7e7e00, 0x7676760076767600,
+	0x0505050005050500, 0x6d6d6d006d6d6d00, 0xb7b7b700b7b7b700,
+	0xa9a9a900a9a9a900, 0x3131310031313100, 0xd1d1d100d1d1d100,
+	0x1717170017171700, 0x0404040004040400, 0xd7d7d700d7d7d700,
+	0x1414140014141400, 0x5858580058585800, 0x3a3a3a003a3a3a00,
+	0x6161610061616100, 0xdedede00dedede00, 0x1b1b1b001b1b1b00,
+	0x1111110011111100, 0x1c1c1c001c1c1c00, 0x3232320032323200,
+	0x0f0f0f000f0f0f00, 0x9c9c9c009c9c9c00, 0x1616160016161600,
+	0x5353530053535300, 0x1818180018181800, 0xf2f2f200f2f2f200,
+	0x2222220022222200, 0xfefefe00fefefe00, 0x4444440044444400,
+	0xcfcfcf00cfcfcf00, 0xb2b2b200b2b2b200, 0xc3c3c300c3c3c300,
+	0xb5b5b500b5b5b500, 0x7a7a7a007a7a7a00, 0x9191910091919100,
+	0x2424240024242400, 0x0808080008080800, 0xe8e8e800e8e8e800,
+	0xa8a8a800a8a8a800, 0x6060600060606000, 0xfcfcfc00fcfcfc00,
+	0x6969690069696900, 0x5050500050505000, 0xaaaaaa00aaaaaa00,
+	0xd0d0d000d0d0d000, 0xa0a0a000a0a0a000, 0x7d7d7d007d7d7d00,
+	0xa1a1a100a1a1a100, 0x8989890089898900, 0x6262620062626200,
+	0x9797970097979700, 0x5454540054545400, 0x5b5b5b005b5b5b00,
+	0x1e1e1e001e1e1e00, 0x9595950095959500, 0xe0e0e000e0e0e000,
+	0xffffff00ffffff00, 0x6464640064646400, 0xd2d2d200d2d2d200,
+	0x1010100010101000, 0xc4c4c400c4c4c400, 0x0000000000000000,
+	0x4848480048484800, 0xa3a3a300a3a3a300, 0xf7f7f700f7f7f700,
+	0x7575750075757500, 0xdbdbdb00dbdbdb00, 0x8a8a8a008a8a8a00,
+	0x0303030003030300, 0xe6e6e600e6e6e600, 0xdadada00dadada00,
+	0x0909090009090900, 0x3f3f3f003f3f3f00, 0xdddddd00dddddd00,
+	0x9494940094949400, 0x8787870087878700, 0x5c5c5c005c5c5c00,
+	0x8383830083838300, 0x0202020002020200, 0xcdcdcd00cdcdcd00,
+	0x4a4a4a004a4a4a00, 0x9090900090909000, 0x3333330033333300,
+	0x7373730073737300, 0x6767670067676700, 0xf6f6f600f6f6f600,
+	0xf3f3f300f3f3f300, 0x9d9d9d009d9d9d00, 0x7f7f7f007f7f7f00,
+	0xbfbfbf00bfbfbf00, 0xe2e2e200e2e2e200, 0x5252520052525200,
+	0x9b9b9b009b9b9b00, 0xd8d8d800d8d8d800, 0x2626260026262600,
+	0xc8c8c800c8c8c800, 0x3737370037373700, 0xc6c6c600c6c6c600,
+	0x3b3b3b003b3b3b00, 0x8181810081818100, 0x9696960096969600,
+	0x6f6f6f006f6f6f00, 0x4b4b4b004b4b4b00, 0x1313130013131300,
+	0xbebebe00bebebe00, 0x6363630063636300, 0x2e2e2e002e2e2e00,
+	0xe9e9e900e9e9e900, 0x7979790079797900, 0xa7a7a700a7a7a700,
+	0x8c8c8c008c8c8c00, 0x9f9f9f009f9f9f00, 0x6e6e6e006e6e6e00,
+	0xbcbcbc00bcbcbc00, 0x8e8e8e008e8e8e00, 0x2929290029292900,
+	0xf5f5f500f5f5f500, 0xf9f9f900f9f9f900, 0xb6b6b600b6b6b600,
+	0x2f2f2f002f2f2f00, 0xfdfdfd00fdfdfd00, 0xb4b4b400b4b4b400,
+	0x5959590059595900, 0x7878780078787800, 0x9898980098989800,
+	0x0606060006060600, 0x6a6a6a006a6a6a00, 0xe7e7e700e7e7e700,
+	0x4646460046464600, 0x7171710071717100, 0xbababa00bababa00,
+	0xd4d4d400d4d4d400, 0x2525250025252500, 0xababab00ababab00,
+	0x4242420042424200, 0x8888880088888800, 0xa2a2a200a2a2a200,
+	0x8d8d8d008d8d8d00, 0xfafafa00fafafa00, 0x7272720072727200,
+	0x0707070007070700, 0xb9b9b900b9b9b900, 0x5555550055555500,
+	0xf8f8f800f8f8f800, 0xeeeeee00eeeeee00, 0xacacac00acacac00,
+	0x0a0a0a000a0a0a00, 0x3636360036363600, 0x4949490049494900,
+	0x2a2a2a002a2a2a00, 0x6868680068686800, 0x3c3c3c003c3c3c00,
+	0x3838380038383800, 0xf1f1f100f1f1f100, 0xa4a4a400a4a4a400,
+	0x4040400040404000, 0x2828280028282800, 0xd3d3d300d3d3d300,
+	0x7b7b7b007b7b7b00, 0xbbbbbb00bbbbbb00, 0xc9c9c900c9c9c900,
+	0x4343430043434300, 0xc1c1c100c1c1c100, 0x1515150015151500,
+	0xe3e3e300e3e3e300, 0xadadad00adadad00, 0xf4f4f400f4f4f400,
+	0x7777770077777700, 0xc7c7c700c7c7c700, 0x8080800080808000,
+	0x9e9e9e009e9e9e00,
+};
+
+/* key constants */
+#define CAMELLIA_SIGMA1L (0xA09E667FL)
+#define CAMELLIA_SIGMA1R (0x3BCC908BL)
+#define CAMELLIA_SIGMA2L (0xB67AE858L)
+#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
+#define CAMELLIA_SIGMA3L (0xC6EF372FL)
+#define CAMELLIA_SIGMA3R (0xE94F82BEL)
+#define CAMELLIA_SIGMA4L (0x54FF53A5L)
+#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
+#define CAMELLIA_SIGMA5L (0x10E527FAL)
+#define CAMELLIA_SIGMA5R (0xDE682D1DL)
+#define CAMELLIA_SIGMA6L (0xB05688C2L)
+#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
+
+/* macros */
+#define ROLDQ(l, r, bits) ({ \
+	u64 t = l;					\
+	l = (l << bits) | (r >> (64 - bits));		\
+	r = (r << bits) | (t >> (64 - bits));		\
+})
+
+#define CAMELLIA_F(x, kl, kr, y) ({ \
+	u64 ii = x ^ (((u64)kl << 32) | kr);				\
+	y = camellia_sp11101110[(uint8_t)ii];				\
+	y ^= camellia_sp44044404[(uint8_t)(ii >> 8)];			\
+	ii >>= 16;							\
+	y ^= camellia_sp30333033[(uint8_t)ii];				\
+	y ^= camellia_sp02220222[(uint8_t)(ii >> 8)];			\
+	ii >>= 16;							\
+	y ^= camellia_sp00444404[(uint8_t)ii];				\
+	y ^= camellia_sp03303033[(uint8_t)(ii >> 8)];			\
+	ii >>= 16;							\
+	y ^= camellia_sp22000222[(uint8_t)ii];				\
+	y ^= camellia_sp10011110[(uint8_t)(ii >> 8)];			\
+	y = ror64(y, 32);						\
+})
+
+#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32))
+
+static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
+{
+	u64 kw4, tt;
+	u32 dw, tl, tr;
+
+	/* absorb kw2 to other subkeys */
+	/* round 2 */
+	subRL[3] ^= subRL[1];
+	/* round 4 */
+	subRL[5] ^= subRL[1];
+	/* round 6 */
+	subRL[7] ^= subRL[1];
+
+	subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
+	/* modified for FLinv(kl2) */
+	dw = (subRL[1] & subRL[9]) >> 32,
+		subRL[1] ^= rol32(dw, 1);
+
+	/* round 8 */
+	subRL[11] ^= subRL[1];
+	/* round 10 */
+	subRL[13] ^= subRL[1];
+	/* round 12 */
+	subRL[15] ^= subRL[1];
+
+	subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
+	/* modified for FLinv(kl4) */
+	dw = (subRL[1] & subRL[17]) >> 32,
+		subRL[1] ^= rol32(dw, 1);
+
+	/* round 14 */
+	subRL[19] ^= subRL[1];
+	/* round 16 */
+	subRL[21] ^= subRL[1];
+	/* round 18 */
+	subRL[23] ^= subRL[1];
+
+	if (max == 24) {
+		/* kw3 */
+		subRL[24] ^= subRL[1];
+
+		/* absorb kw4 to other subkeys */
+		kw4 = subRL[25];
+	} else {
+		subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
+		/* modified for FLinv(kl6) */
+		dw = (subRL[1] & subRL[25]) >> 32,
+			subRL[1] ^= rol32(dw, 1);
+
+		/* round 20 */
+		subRL[27] ^= subRL[1];
+		/* round 22 */
+		subRL[29] ^= subRL[1];
+		/* round 24 */
+		subRL[31] ^= subRL[1];
+		/* kw3 */
+		subRL[32] ^= subRL[1];
+
+		/* absorb kw4 to other subkeys */
+		kw4 = subRL[33];
+		/* round 23 */
+		subRL[30] ^= kw4;
+		/* round 21 */
+		subRL[28] ^= kw4;
+		/* round 19 */
+		subRL[26] ^= kw4;
+
+		kw4 ^= (kw4 & ~subRL[24]) << 32;
+		/* modified for FL(kl5) */
+		dw = (kw4 & subRL[24]) >> 32,
+			kw4 ^= rol32(dw, 1);
+	}
+
+	/* round 17 */
+	subRL[22] ^= kw4;
+	/* round 15 */
+	subRL[20] ^= kw4;
+	/* round 13 */
+	subRL[18] ^= kw4;
+
+	kw4 ^= (kw4 & ~subRL[16]) << 32;
+	/* modified for FL(kl3) */
+	dw = (kw4 & subRL[16]) >> 32,
+		kw4 ^= rol32(dw, 1);
+
+	/* round 11 */
+	subRL[14] ^= kw4;
+	/* round 9 */
+	subRL[12] ^= kw4;
+	/* round 7 */
+	subRL[10] ^= kw4;
+
+	kw4 ^= (kw4 & ~subRL[8]) << 32;
+	/* modified for FL(kl1) */
+	dw = (kw4 & subRL[8]) >> 32,
+		kw4 ^= rol32(dw, 1);
+
+	/* round 5 */
+	subRL[6] ^= kw4;
+	/* round 3 */
+	subRL[4] ^= kw4;
+	/* round 1 */
+	subRL[2] ^= kw4;
+	/* kw1 */
+	subRL[0] ^= kw4;
+
+	/* key XOR is end of F-function */
+	SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]);			/* kw1 */
+	SET_SUBKEY_LR(2, subRL[3]);				/* round 1 */
+	SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]);			/* round 2 */
+	SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]);			/* round 3 */
+	SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]);			/* round 4 */
+	SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]);			/* round 5 */
+
+	tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
+	dw = tl & (subRL[8] >> 32),				/* FL(kl1) */
+		tr = subRL[10] ^ rol32(dw, 1);
+	tt = (tr | ((u64)tl << 32));
+
+	SET_SUBKEY_LR(7, subRL[6] ^ tt);			/* round 6 */
+	SET_SUBKEY_LR(8, subRL[8]);				/* FL(kl1) */
+	SET_SUBKEY_LR(9, subRL[9]);				/* FLinv(kl2) */
+
+	tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
+	dw = tl & (subRL[9] >> 32),				/* FLinv(kl2) */
+		tr = subRL[7] ^ rol32(dw, 1);
+	tt = (tr | ((u64)tl << 32));
+
+	SET_SUBKEY_LR(10, subRL[11] ^ tt);			/* round 7 */
+	SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]);		/* round 8 */
+	SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]);		/* round 9 */
+	SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]);		/* round 10 */
+	SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]);		/* round 11 */
+
+	tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
+	dw = tl & (subRL[16] >> 32),				/* FL(kl3) */
+		tr = subRL[18] ^ rol32(dw, 1);
+	tt = (tr | ((u64)tl << 32));
+
+	SET_SUBKEY_LR(15, subRL[14] ^ tt);			/* round 12 */
+	SET_SUBKEY_LR(16, subRL[16]);				/* FL(kl3) */
+	SET_SUBKEY_LR(17, subRL[17]);				/* FLinv(kl4) */
+
+	tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
+	dw = tl & (subRL[17] >> 32),				/* FLinv(kl4) */
+		tr = subRL[15] ^ rol32(dw, 1);
+	tt = (tr | ((u64)tl << 32));
+
+	SET_SUBKEY_LR(18, subRL[19] ^ tt);			/* round 13 */
+	SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]);		/* round 14 */
+	SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]);		/* round 15 */
+	SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]);		/* round 16 */
+	SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]);		/* round 17 */
+
+	if (max == 24) {
+		SET_SUBKEY_LR(23, subRL[22]);			/* round 18 */
+		SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]);	/* kw3 */
+	} else {
+		tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
+		dw = tl & (subRL[24] >> 32),			/* FL(kl5) */
+			tr = subRL[26] ^ rol32(dw, 1);
+		tt = (tr | ((u64)tl << 32));
+
+		SET_SUBKEY_LR(23, subRL[22] ^ tt);		/* round 18 */
+		SET_SUBKEY_LR(24, subRL[24]);			/* FL(kl5) */
+		SET_SUBKEY_LR(25, subRL[25]);			/* FLinv(kl6) */
+
+		tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
+		dw = tl & (subRL[25] >> 32),			/* FLinv(kl6) */
+			tr = subRL[23] ^ rol32(dw, 1);
+		tt = (tr | ((u64)tl << 32));
+
+		SET_SUBKEY_LR(26, subRL[27] ^ tt);		/* round 19 */
+		SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]);	/* round 20 */
+		SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]);	/* round 21 */
+		SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]);	/* round 22 */
+		SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]);	/* round 23 */
+		SET_SUBKEY_LR(31, subRL[30]);			/* round 24 */
+		SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]);	/* kw3 */
+	}
+}
+
+static void camellia_setup128(const unsigned char *key, u64 *subkey)
+{
+	u64 kl, kr, ww;
+	u64 subRL[26];
+
+	/**
+	 *  k == kl || kr (|| is concatenation)
+	 */
+	kl = get_unaligned_be64(key);
+	kr = get_unaligned_be64(key + 8);
+
+	/* generate KL dependent subkeys */
+	/* kw1 */
+	subRL[0] = kl;
+	/* kw2 */
+	subRL[1] = kr;
+
+	/* rotation left shift 15bit */
+	ROLDQ(kl, kr, 15);
+
+	/* k3 */
+	subRL[4] = kl;
+	/* k4 */
+	subRL[5] = kr;
+
+	/* rotation left shift 15+30bit */
+	ROLDQ(kl, kr, 30);
+
+	/* k7 */
+	subRL[10] = kl;
+	/* k8 */
+	subRL[11] = kr;
+
+	/* rotation left shift 15+30+15bit */
+	ROLDQ(kl, kr, 15);
+
+	/* k10 */
+	subRL[13] = kr;
+	/* rotation left shift 15+30+15+17 bit */
+	ROLDQ(kl, kr, 17);
+
+	/* kl3 */
+	subRL[16] = kl;
+	/* kl4 */
+	subRL[17] = kr;
+
+	/* rotation left shift 15+30+15+17+17 bit */
+	ROLDQ(kl, kr, 17);
+
+	/* k13 */
+	subRL[18] = kl;
+	/* k14 */
+	subRL[19] = kr;
+
+	/* rotation left shift 15+30+15+17+17+17 bit */
+	ROLDQ(kl, kr, 17);
+
+	/* k17 */
+	subRL[22] = kl;
+	/* k18 */
+	subRL[23] = kr;
+
+	/* generate KA */
+	kl = subRL[0];
+	kr = subRL[1];
+	CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+	kr ^= ww;
+	CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+
+	/* current status == (kll, klr, w0, w1) */
+	CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+	kr ^= ww;
+	CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+	kl ^= ww;
+
+	/* generate KA dependent subkeys */
+	/* k1, k2 */
+	subRL[2] = kl;
+	subRL[3] = kr;
+	ROLDQ(kl, kr, 15);
+	/* k5,k6 */
+	subRL[6] = kl;
+	subRL[7] = kr;
+	ROLDQ(kl, kr, 15);
+	/* kl1, kl2 */
+	subRL[8] = kl;
+	subRL[9] = kr;
+	ROLDQ(kl, kr, 15);
+	/* k9 */
+	subRL[12] = kl;
+	ROLDQ(kl, kr, 15);
+	/* k11, k12 */
+	subRL[14] = kl;
+	subRL[15] = kr;
+	ROLDQ(kl, kr, 34);
+	/* k15, k16 */
+	subRL[20] = kl;
+	subRL[21] = kr;
+	ROLDQ(kl, kr, 17);
+	/* kw3, kw4 */
+	subRL[24] = kl;
+	subRL[25] = kr;
+
+	camellia_setup_tail(subkey, subRL, 24);
+}
+
+static void camellia_setup256(const unsigned char *key, u64 *subkey)
+{
+	u64 kl, kr;			/* left half of key */
+	u64 krl, krr;			/* right half of key */
+	u64 ww;				/* temporary variables */
+	u64 subRL[34];
+
+	/**
+	 *  key = (kl || kr || krl || krr) (|| is concatenation)
+	 */
+	kl = get_unaligned_be64(key);
+	kr = get_unaligned_be64(key + 8);
+	krl = get_unaligned_be64(key + 16);
+	krr = get_unaligned_be64(key + 24);
+
+	/* generate KL dependent subkeys */
+	/* kw1 */
+	subRL[0] = kl;
+	/* kw2 */
+	subRL[1] = kr;
+	ROLDQ(kl, kr, 45);
+	/* k9 */
+	subRL[12] = kl;
+	/* k10 */
+	subRL[13] = kr;
+	ROLDQ(kl, kr, 15);
+	/* kl3 */
+	subRL[16] = kl;
+	/* kl4 */
+	subRL[17] = kr;
+	ROLDQ(kl, kr, 17);
+	/* k17 */
+	subRL[22] = kl;
+	/* k18 */
+	subRL[23] = kr;
+	ROLDQ(kl, kr, 34);
+	/* k23 */
+	subRL[30] = kl;
+	/* k24 */
+	subRL[31] = kr;
+
+	/* generate KR dependent subkeys */
+	ROLDQ(krl, krr, 15);
+	/* k3 */
+	subRL[4] = krl;
+	/* k4 */
+	subRL[5] = krr;
+	ROLDQ(krl, krr, 15);
+	/* kl1 */
+	subRL[8] = krl;
+	/* kl2 */
+	subRL[9] = krr;
+	ROLDQ(krl, krr, 30);
+	/* k13 */
+	subRL[18] = krl;
+	/* k14 */
+	subRL[19] = krr;
+	ROLDQ(krl, krr, 34);
+	/* k19 */
+	subRL[26] = krl;
+	/* k20 */
+	subRL[27] = krr;
+	ROLDQ(krl, krr, 34);
+
+	/* generate KA */
+	kl = subRL[0] ^ krl;
+	kr = subRL[1] ^ krr;
+
+	CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+	kr ^= ww;
+	CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+	kl ^= krl;
+	CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+	kr ^= ww ^ krr;
+	CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+	kl ^= ww;
+
+	/* generate KB */
+	krl ^= kl;
+	krr ^= kr;
+	CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww);
+	krr ^= ww;
+	CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww);
+	krl ^= ww;
+
+	/* generate KA dependent subkeys */
+	ROLDQ(kl, kr, 15);
+	/* k5 */
+	subRL[6] = kl;
+	/* k6 */
+	subRL[7] = kr;
+	ROLDQ(kl, kr, 30);
+	/* k11 */
+	subRL[14] = kl;
+	/* k12 */
+	subRL[15] = kr;
+	/* rotation left shift 32bit */
+	ROLDQ(kl, kr, 32);
+	/* kl5 */
+	subRL[24] = kl;
+	/* kl6 */
+	subRL[25] = kr;
+	/* rotation left shift 17 from k11,k12 -> k21,k22 */
+	ROLDQ(kl, kr, 17);
+	/* k21 */
+	subRL[28] = kl;
+	/* k22 */
+	subRL[29] = kr;
+
+	/* generate KB dependent subkeys */
+	/* k1 */
+	subRL[2] = krl;
+	/* k2 */
+	subRL[3] = krr;
+	ROLDQ(krl, krr, 30);
+	/* k7 */
+	subRL[10] = krl;
+	/* k8 */
+	subRL[11] = krr;
+	ROLDQ(krl, krr, 30);
+	/* k15 */
+	subRL[20] = krl;
+	/* k16 */
+	subRL[21] = krr;
+	ROLDQ(krl, krr, 51);
+	/* kw3 */
+	subRL[32] = krl;
+	/* kw4 */
+	subRL[33] = krr;
+
+	camellia_setup_tail(subkey, subRL, 32);
+}
+
+static void camellia_setup192(const unsigned char *key, u64 *subkey)
+{
+	unsigned char kk[32];
+	u64 krl, krr;
+
+	memcpy(kk, key, 24);
+	memcpy((unsigned char *)&krl, key+16, 8);
+	krr = ~krl;
+	memcpy(kk+24, (unsigned char *)&krr, 8);
+	camellia_setup256(kk, subkey);
+}
+
+static int __camellia_setkey(struct camellia_ctx *cctx,
+			     const unsigned char *key,
+			     unsigned int key_len, u32 *flags)
+{
+	if (key_len != 16 && key_len != 24 && key_len != 32) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	cctx->key_length = key_len;
+
+	switch (key_len) {
+	case 16:
+		camellia_setup128(key, cctx->key_table);
+		break;
+	case 24:
+		camellia_setup192(key, cctx->key_table);
+		break;
+	case 32:
+		camellia_setup256(key, cctx->key_table);
+		break;
+	}
+
+	return 0;
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
+		     void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
+{
+	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		/* Process two block batch */
+		if (nbytes >= bsize * 2) {
+			do {
+				fn_2way(ctx, wdst, wsrc);
+
+				wsrc += bsize * 2;
+				wdst += bsize * 2;
+				nbytes -= bsize * 2;
+			} while (nbytes >= bsize * 2);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			fn(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 *iv = (u128 *)walk->iv;
+
+	do {
+		u128_xor(dst, src, iv);
+		camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ivs[2 - 1];
+	u128 last_iv;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process two block batch */
+	if (nbytes >= bsize * 2) {
+		do {
+			nbytes -= bsize * (2 - 1);
+			src -= 2 - 1;
+			dst -= 2 - 1;
+
+			ivs[0] = src[0];
+
+			camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
+
+			u128_xor(dst + 1, dst + 1, ivs + 0);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			u128_xor(dst, dst, src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * 2);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		u128_xor(dst, dst, src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	u128_xor(dst, dst, (u128 *)walk->iv);
+	*(u128 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 keystream[CAMELLIA_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+	u128 ctrblk;
+
+	memcpy(keystream, src, nbytes);
+	camellia_enc_blk_xor(ctx, keystream, walk->iv);
+	memcpy(dst, keystream, nbytes);
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+	u128_inc(&ctrblk);
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ctrblk;
+	be128 ctrblocks[2];
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	/* Process two block batch */
+	if (nbytes >= bsize * 2) {
+		do {
+			if (dst != src) {
+				dst[0] = src[0];
+				dst[1] = src[1];
+			}
+
+			/* create ctrblks for parallel encrypt */
+			u128_to_be128(&ctrblocks[0], &ctrblk);
+			u128_inc(&ctrblk);
+			u128_to_be128(&ctrblocks[1], &ctrblk);
+			u128_inc(&ctrblk);
+
+			camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
+						 (u8 *)ctrblocks);
+
+			src += 2;
+			dst += 2;
+			nbytes -= bsize * 2;
+		} while (nbytes >= bsize * 2);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		if (dst != src)
+			*dst = *src;
+
+		u128_to_be128(&ctrblocks[0], &ctrblk);
+		u128_inc(&ctrblk);
+
+		camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+done:
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);
+
+	while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct camellia_ctx *ctx = priv;
+	int i;
+
+	while (nbytes >= 2 * bsize) {
+		camellia_enc_blk_2way(ctx, srcdst, srcdst);
+		srcdst += bsize * 2;
+		nbytes -= bsize * 2;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct camellia_ctx *ctx = priv;
+	int i;
+
+	while (nbytes >= 2 * bsize) {
+		camellia_dec_blk_2way(ctx, srcdst, srcdst);
+		srcdst += bsize * 2;
+		nbytes -= bsize * 2;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx, srcdst, srcdst);
+}
+
+struct camellia_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct camellia_ctx camellia_ctx;
+};
+
+static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __camellia_setkey(&ctx->camellia_ctx, key,
+				keylen - CAMELLIA_BLOCK_SIZE,
+				&tfm->crt_flags);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table,
+			      key + keylen - CAMELLIA_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[2 * 4];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->camellia_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[2 * 4];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->camellia_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+struct camellia_xts_ctx {
+	struct camellia_ctx tweak_ctx;
+	struct camellia_ctx crypt_ctx;
+};
+
+static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+				flags);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[2 * 4];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[2 * 4];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static struct crypto_alg camellia_algs[6] = { {
+	.cra_name		= "camellia",
+	.cra_driver_name	= "camellia-asm",
+	.cra_priority		= 200,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[0].cra_list),
+	.cra_u			= {
+		.cipher = {
+			.cia_min_keysize = CAMELLIA_MIN_KEY_SIZE,
+			.cia_max_keysize = CAMELLIA_MAX_KEY_SIZE,
+			.cia_setkey	 = camellia_setkey,
+			.cia_encrypt	 = camellia_encrypt,
+			.cia_decrypt	 = camellia_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[3].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[4].cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+						CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+						CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(camellia_algs[5].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+} };
+
+static bool is_blacklisted_cpu(void)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return false;
+
+	if (boot_cpu_data.x86 == 0x0f) {
+		/*
+		 * On Pentium 4, camellia-asm is slower than original assembler
+		 * implementation because excessive uses of 64bit rotate and
+		 * left-shifts (which are really slow on P4) needed to store and
+		 * handle 128bit block in two 64bit registers.
+		 */
+		return true;
+	}
+
+	return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+int __init init(void)
+{
+	if (!force && is_blacklisted_cpu()) {
+		printk(KERN_INFO
+			"camellia-x86_64: performance on this CPU "
+			"would be suboptimal: disabling "
+			"camellia-x86_64.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
+}
+
+void __exit fini(void)
+{
+	crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e6cfe1a25137..6318edd6a457 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -654,6 +654,24 @@ config CRYPTO_CAMELLIA
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAMELLIA_X86_64
+	tristate "Camellia cipher algorithm (x86_64)"
+	depends on (X86 || UML_X86) && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAST5
 	tristate "CAST5 (CAST-128) cipher algorithm"
 	select CRYPTO_ALGAPI
-- 
cgit v1.2.3


From 31796ac4e8f0e88f5c10f1ad6dab8f19bebe44a4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 14 Mar 2012 14:27:52 -0700
Subject: x32: Fix alignment fail in struct compat_siginfo

Adding struct _sigchld_x32 caused a misalignment cascade in struct
siginfo, because union _sifields is located on an 4-byte boundary
(8-byte misaligned.)

Adding new fields that are 8-byte aligned caused the intermediate
structures to also be aligned to 8 bytes, thereby adding padding in
unexpected places.

Thus, change s64 to compat_s64 here, which makes it "misaligned on
paper".  In reality these fields *are* actually aligned (there are 3
preceeding ints outside the union and 3 inside struct _sigchld_x32),
but because of the intervening union and struct it is not possible for
gcc to avoid padding without breaking the ABI.

Reported-and-tested-by: H. J. Lu <hjl.tools@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com
---
 arch/x86/include/asm/ia32.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 7d0c18587709..ee52760549f0 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -130,8 +130,8 @@ typedef struct compat_siginfo {
 			unsigned int _pid;	/* which child */
 			unsigned int _uid;	/* sender's uid */
 			int _status;		/* exit code */
-			s64 _utime;
-			s64 _stime;
+			compat_s64 _utime;
+			compat_s64 _stime;
 		} _sigchld_x32;
 
 		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
-- 
cgit v1.2.3


From 48b25c43e6eebb6c0edf72935e8720385beca76b Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 15 Mar 2012 13:13:38 -0400
Subject: [PATCH v3] ipc: provide generic compat versions of IPC syscalls

When using the "compat" APIs, architectures will generally want to
be able to make direct syscalls to msgsnd(), shmctl(), etc., and
in the kernel we would want them to be handled directly by
compat_sys_xxx() functions, as is true for other compat syscalls.

However, for historical reasons, several of the existing compat IPC
syscalls do not do this.  semctl() expects a pointer to the fourth
argument, instead of the fourth argument itself.  msgsnd(), msgrcv()
and shmat() expect arguments in different order.

This change adds an ARCH_WANT_OLD_COMPAT_IPC config option that can be
set to preserve this behavior for ports that use it (x86, sparc, powerpc,
s390, and mips).  No actual semantics are changed for those architectures,
and there is only a minimal amount of code refactoring in ipc/compat.c.

Newer architectures like tile (and perhaps future architectures such
as arm64 and unicore64) should not select this option, and thus can
avoid having any IPC-specific code at all in their architecture-specific
compat layer.  In the same vein, if this option is not selected, IPC_64
mode is assumed, since that's what the <asm-generic> headers expect.

The workaround code in "tile" for msgsnd() and msgrcv() is removed
with this change; it also fixes the bug that shmat() and semctl() were
not being properly handled.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/Kconfig                   |  3 ++
 arch/mips/Kconfig              |  1 +
 arch/powerpc/Kconfig           |  1 +
 arch/s390/Kconfig              |  1 +
 arch/sparc/Kconfig             |  1 +
 arch/tile/include/asm/compat.h | 11 -------
 arch/tile/kernel/compat.c      | 43 --------------------------
 arch/x86/Kconfig               |  1 +
 include/linux/compat.h         | 12 +++++++-
 ipc/compat.c                   | 70 ++++++++++++++++++++++++++++++++++++++----
 10 files changed, 83 insertions(+), 61 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 4f55c736be11..b37f8f3ffa54 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -199,4 +199,7 @@ config HAVE_CMPXCHG_LOCAL
 config HAVE_CMPXCHG_DOUBLE
 	bool
 
+config ARCH_WANT_OLD_COMPAT_IPC
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 5ab6e89603c5..4bbbb40f352a 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2456,6 +2456,7 @@ config MIPS32_COMPAT
 config COMPAT
 	bool
 	depends on MIPS32_COMPAT
+	select ARCH_WANT_OLD_COMPAT_IPC
 	default y
 
 config SYSVIPC_COMPAT
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1919634a9b32..48ab0bb38924 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -152,6 +152,7 @@ config COMPAT
 	bool
 	default y if PPC64
 	select COMPAT_BINFMT_ELF
+	select ARCH_WANT_OLD_COMPAT_IPC
 
 config SYSVIPC_COMPAT
 	bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6d99a5fcc090..0ff53e350092 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -218,6 +218,7 @@ config COMPAT
 	prompt "Kernel support for 31 bit emulation"
 	depends on 64BIT
 	select COMPAT_BINFMT_ELF
+	select ARCH_WANT_OLD_COMPAT_IPC
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index ca5580e4d813..64e1a8e7cab3 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -576,6 +576,7 @@ config COMPAT
 	depends on SPARC64
 	default y
 	select COMPAT_BINFMT_ELF
+	select ARCH_WANT_OLD_COMPAT_IPC
 
 config SYSVIPC_COMPAT
 	bool
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index bf95f55b82b0..4b4b28969a65 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -242,17 +242,6 @@ long compat_sys_fallocate(int fd, int mode,
 long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 				      struct compat_timespec __user *interval);
 
-/* Versions of compat functions that differ from generic Linux. */
-struct compat_msgbuf;
-long tile_compat_sys_msgsnd(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, int msgflg);
-long tile_compat_sys_msgrcv(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, long msgtyp, int msgflg);
-long tile_compat_sys_ptrace(compat_long_t request, compat_long_t pid,
-			    compat_long_t addr, compat_long_t data);
-
 /* Tilera Linux syscalls that don't have "compat" versions. */
 #define compat_sys_flush_cache sys_flush_cache
 
diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c
index bf5e9d70266c..d67459b9ac2a 100644
--- a/arch/tile/kernel/compat.c
+++ b/arch/tile/kernel/compat.c
@@ -16,7 +16,6 @@
 #define __SYSCALL_COMPAT
 
 #include <linux/compat.h>
-#include <linux/msg.h>
 #include <linux/syscalls.h>
 #include <linux/kdev_t.h>
 #include <linux/fs.h>
@@ -95,52 +94,10 @@ long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 	return ret;
 }
 
-/*
- * The usual compat_sys_msgsnd() and _msgrcv() seem to be assuming
- * some different calling convention than our normal 32-bit tile code.
- */
-
-/* Already defined in ipc/compat.c, but we need it here. */
-struct compat_msgbuf {
-	compat_long_t mtype;
-	char mtext[1];
-};
-
-long tile_compat_sys_msgsnd(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, int msgflg)
-{
-	compat_long_t mtype;
-
-	if (get_user(mtype, &msgp->mtype))
-		return -EFAULT;
-	return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg);
-}
-
-long tile_compat_sys_msgrcv(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, long msgtyp, int msgflg)
-{
-	long err, mtype;
-
-	err =  do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg);
-	if (err < 0)
-		goto out;
-
-	if (put_user(mtype, &msgp->mtype))
-		err = -EFAULT;
- out:
-	return err;
-}
-
 /* Provide the compat syscall number to call mapping. */
 #undef __SYSCALL
 #define __SYSCALL(nr, call) [nr] = (call),
 
-/* The generic versions of these don't work for Tile. */
-#define compat_sys_msgrcv tile_compat_sys_msgrcv
-#define compat_sys_msgsnd tile_compat_sys_msgsnd
-
 /* See comments in sys.c */
 #define compat_sys_fadvise64_64 sys32_fadvise64_64
 #define compat_sys_readahead sys32_readahead
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..cde163dc6058 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2178,6 +2178,7 @@ config IA32_AOUT
 config COMPAT
 	def_bool y
 	depends on IA32_EMULATION
+	select ARCH_WANT_OLD_COMPAT_IPC
 
 config COMPAT_FOR_U64_ALIGNMENT
 	def_bool COMPAT
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 7e05fcee75a1..35c2dbf2448a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -224,6 +224,7 @@ struct compat_sysinfo;
 struct compat_sysctl_args;
 struct compat_kexec_segment;
 struct compat_mq_attr;
+struct compat_msgbuf;
 
 extern void compat_exit_robust_list(struct task_struct *curr);
 
@@ -234,13 +235,22 @@ asmlinkage long
 compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			   compat_size_t __user *len_ptr);
 
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
 long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
 		int version, void __user *uptr);
-long compat_sys_msgctl(int first, int second, void __user *uptr);
 long compat_sys_shmat(int first, int second, compat_uptr_t third, int version,
 		void __user *uptr);
+#else
+long compat_sys_semctl(int semid, int semnum, int cmd, int arg);
+long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp,
+		size_t msgsz, int msgflg);
+long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp,
+		size_t msgsz, long msgtyp, int msgflg);
+long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg);
+#endif
+long compat_sys_msgctl(int first, int second, void __user *uptr);
 long compat_sys_shmctl(int first, int second, void __user *uptr);
 long compat_sys_semtimedop(int semid, struct sembuf __user *tsems,
 		unsigned nsems, const struct compat_timespec __user *timeout);
diff --git a/ipc/compat.c b/ipc/compat.c
index 845a28738d3a..a6df704f521e 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -27,6 +27,7 @@
 #include <linux/msg.h>
 #include <linux/shm.h>
 #include <linux/syscalls.h>
+#include <linux/ptrace.h>
 
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
@@ -117,6 +118,7 @@ extern int sem_ctls[];
 
 static inline int compat_ipc_parse_version(int *cmd)
 {
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
 	int version = *cmd & IPC_64;
 
 	/* this is tricky: architectures that have support for the old
@@ -128,6 +130,10 @@ static inline int compat_ipc_parse_version(int *cmd)
 	*cmd &= ~IPC_64;
 #endif
 	return version;
+#else
+	/* With the asm-generic APIs, we always use the 64-bit versions. */
+	return IPC_64;
+#endif
 }
 
 static inline int __get_compat_ipc64_perm(struct ipc64_perm *p64,
@@ -232,10 +238,9 @@ static inline int put_compat_semid_ds(struct semid64_ds *s,
 	return err;
 }
 
-long compat_sys_semctl(int first, int second, int third, void __user *uptr)
+static long do_compat_semctl(int first, int second, int third, u32 pad)
 {
 	union semun fourth;
-	u32 pad;
 	int err, err2;
 	struct semid64_ds s64;
 	struct semid64_ds __user *up64;
@@ -243,10 +248,6 @@ long compat_sys_semctl(int first, int second, int third, void __user *uptr)
 
 	memset(&s64, 0, sizeof(s64));
 
-	if (!uptr)
-		return -EINVAL;
-	if (get_user(pad, (u32 __user *) uptr))
-		return -EFAULT;
 	if ((third & (~IPC_64)) == SETVAL)
 		fourth.val = (int) pad;
 	else
@@ -305,6 +306,18 @@ long compat_sys_semctl(int first, int second, int third, void __user *uptr)
 	return err;
 }
 
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
+long compat_sys_semctl(int first, int second, int third, void __user *uptr)
+{
+	u32 pad;
+
+	if (!uptr)
+		return -EINVAL;
+	if (get_user(pad, (u32 __user *) uptr))
+		return -EFAULT;
+	return do_compat_semctl(first, second, third, pad);
+}
+
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr)
 {
 	struct compat_msgbuf __user *up = uptr;
@@ -353,6 +366,37 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
 out:
 	return err;
 }
+#else
+long compat_sys_semctl(int semid, int semnum, int cmd, int arg)
+{
+	return do_compat_semctl(semid, semnum, cmd, arg);
+}
+
+long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp,
+		       size_t msgsz, int msgflg)
+{
+	compat_long_t mtype;
+
+	if (get_user(mtype, &msgp->mtype))
+		return -EFAULT;
+	return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg);
+}
+
+long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp,
+		       size_t msgsz, long msgtyp, int msgflg)
+{
+	long err, mtype;
+
+	err =  do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg);
+	if (err < 0)
+		goto out;
+
+	if (put_user(mtype, &msgp->mtype))
+		err = -EFAULT;
+ out:
+	return err;
+}
+#endif
 
 static inline int get_compat_msqid64(struct msqid64_ds *m64,
 				     struct compat_msqid64_ds __user *up64)
@@ -470,6 +514,7 @@ long compat_sys_msgctl(int first, int second, void __user *uptr)
 	return err;
 }
 
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
 long compat_sys_shmat(int first, int second, compat_uptr_t third, int version,
 			void __user *uptr)
 {
@@ -485,6 +530,19 @@ long compat_sys_shmat(int first, int second, compat_uptr_t third, int version,
 	uaddr = compat_ptr(third);
 	return put_user(raddr, uaddr);
 }
+#else
+long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg)
+{
+	unsigned long ret;
+	long err;
+
+	err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret);
+	if (err)
+		return err;
+	force_successful_syscall_return();
+	return (long)ret;
+}
+#endif
 
 static inline int get_compat_shmid64_ds(struct shmid64_ds *s64,
 					struct compat_shmid64_ds __user *up64)
-- 
cgit v1.2.3


From a939e817aa7e199d2fff05a67cb745be32dd5c2d Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 1 Mar 2012 22:11:09 -0800
Subject: time: x86: Fix race switching from vsyscall to non-vsyscall clock

When switching from a vsyscall capable to a non-vsyscall capable
clocksource, there was a small race, where the last vsyscall
gettimeofday before the switch might return a invalid time value
using the new non-vsyscall enabled clocksource values after the
switch is complete.

This is due to the vsyscall code checking the vclock_mode once
outside of the seqcount protected section. After it reads the
vclock mode, it doesn't re-check that the sampled clock data
that is obtained in the seqcount critical section still matches.

The fix is to sample vclock_mode inside the protected section,
and as long as it isn't VCLOCK_NONE, return the calculated
value. If it has changed and is now VCLOCK_NONE, fall back
to the syscall gettime calculation.

v2:
  * Cleanup checks as suggested by tglx
  * Also fix same issue present in gettimeofday path

CC: Andy Lutomirski <luto@amacapital.net>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/vdso/vclock_gettime.c | 72 +++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6bc0e723b6e8..7eeb1f6188ee 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -70,14 +70,26 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 	return ret;
 }
 
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+	long ret;
+
+	asm("syscall" : "=a" (ret) :
+	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+	return ret;
+}
+
+
 notrace static inline long vgetns(void)
 {
 	long v;
 	cycles_t cycles;
 	if (gtod->clock.vclock_mode == VCLOCK_TSC)
 		cycles = vread_tsc();
-	else
+	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+	else
+		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
@@ -85,21 +97,28 @@ notrace static inline long vgetns(void)
 notrace static noinline int do_realtime(struct timespec *ts)
 {
 	unsigned long seq, ns;
+	int mode;
+
 	do {
 		seq = read_seqbegin(&gtod->lock);
+		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ts->tv_nsec = gtod->wall_time_nsec;
 		ns = vgetns();
 	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+
 	timespec_add_ns(ts, ns);
-	return 0;
+	return mode;
 }
 
 notrace static noinline int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq, ns, secs;
+	int mode;
+
 	do {
 		seq = read_seqbegin(&gtod->lock);
+		mode = gtod->clock.vclock_mode;
 		secs = gtod->wall_time_sec;
 		ns = gtod->wall_time_nsec + vgetns();
 		secs += gtod->wall_to_monotonic.tv_sec;
@@ -116,7 +135,7 @@ notrace static noinline int do_monotonic(struct timespec *ts)
 	ts->tv_sec = secs;
 	ts->tv_nsec = ns;
 
-	return 0;
+	return mode;
 }
 
 notrace static noinline int do_realtime_coarse(struct timespec *ts)
@@ -156,14 +175,14 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
+	int ret = VCLOCK_NONE;
+
 	switch (clock) {
 	case CLOCK_REALTIME:
-		if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
-			return do_realtime(ts);
+		ret = do_realtime(ts);
 		break;
 	case CLOCK_MONOTONIC:
-		if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
-			return do_monotonic(ts);
+		ret = do_monotonic(ts);
 		break;
 	case CLOCK_REALTIME_COARSE:
 		return do_realtime_coarse(ts);
@@ -171,32 +190,33 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		return do_monotonic_coarse(ts);
 	}
 
-	return vdso_fallback_gettime(clock, ts);
+	if (ret == VCLOCK_NONE)
+		return vdso_fallback_gettime(clock, ts);
+	return 0;
 }
 int clock_gettime(clockid_t, struct timespec *)
 	__attribute__((weak, alias("__vdso_clock_gettime")));
 
 notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-	long ret;
-	if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
-		if (likely(tv != NULL)) {
-			BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
-				     offsetof(struct timespec, tv_nsec) ||
-				     sizeof(*tv) != sizeof(struct timespec));
-			do_realtime((struct timespec *)tv);
-			tv->tv_usec /= 1000;
-		}
-		if (unlikely(tz != NULL)) {
-			/* Avoid memcpy. Some old compilers fail to inline it */
-			tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
-			tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
-		}
-		return 0;
+	long ret = VCLOCK_NONE;
+
+	if (likely(tv != NULL)) {
+		BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+			     offsetof(struct timespec, tv_nsec) ||
+			     sizeof(*tv) != sizeof(struct timespec));
+		ret = do_realtime((struct timespec *)tv);
+		tv->tv_usec /= 1000;
 	}
-	asm("syscall" : "=a" (ret) :
-	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
-	return ret;
+	if (unlikely(tz != NULL)) {
+		/* Avoid memcpy. Some old compilers fail to inline it */
+		tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
+		tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
+	}
+
+	if (ret == VCLOCK_NONE)
+		return vdso_fallback_gtod(tv, tz);
+	return 0;
 }
 int gettimeofday(struct timeval *, struct timezone *)
 	__attribute__((weak, alias("__vdso_gettimeofday")));
-- 
cgit v1.2.3


From 6c260d586343f7f78239d90aa9e2cfed02f74ff3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 28 Feb 2012 19:46:04 +0000
Subject: x86: vdso: Remove bogus locking in update_vsyscall_tz()

Changing the sequence count in update_vsyscall_tz() is completely
pointless.

The vdso code copies the data unprotected. There is no point to change
this as sys_tz is nowhere protected at all. See sys_gettimeofday().

Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/kernel/vsyscall_64.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b07ba9393564..33385c18e5d3 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -80,12 +80,7 @@ early_param("vsyscall", vsyscall_setup);
 
 void update_vsyscall_tz(void)
 {
-	unsigned long flags;
-
-	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-	/* sys_tz has changed */
 	vsyscall_gtod_data.sys_tz = sys_tz;
-	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
-- 
cgit v1.2.3


From 2ab516575f2f273b19d95140d02c54612201e80a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 28 Feb 2012 19:46:04 +0000
Subject: x86: vdso: Use seqcount instead of seqlock

The update of the vdso data happens under xtime_lock, so adding a
nested lock is pointless. Just use a seqcount to sync the readers.

Reviewed-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/include/asm/vgtod.h   |  2 +-
 arch/x86/kernel/vsyscall_64.c  | 11 +++--------
 arch/x86/vdso/vclock_gettime.c | 16 ++++++++--------
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 815285bcaceb..1f007178c813 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -5,7 +5,7 @@
 #include <linux/clocksource.h>
 
 struct vsyscall_gtod_data {
-	seqlock_t	lock;
+	seqcount_t	seq;
 
 	/* open coded 'struct timespec' */
 	time_t		wall_time_sec;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 33385c18e5d3..cdc95a707cd1 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -52,10 +52,7 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
-{
-	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -86,9 +83,7 @@ void update_vsyscall_tz(void)
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 			struct clocksource *clock, u32 mult)
 {
-	unsigned long flags;
-
-	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+	write_seqcount_begin(&vsyscall_gtod_data.seq);
 
 	/* copy vsyscall data */
 	vsyscall_gtod_data.clock.vclock_mode	= clock->archdata.vclock_mode;
@@ -101,7 +96,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	vsyscall_gtod_data.wall_to_monotonic	= *wtm;
 	vsyscall_gtod_data.wall_time_coarse	= __current_kernel_time();
 
-	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+	write_seqcount_end(&vsyscall_gtod_data.seq);
 }
 
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 7eeb1f6188ee..944c5e5d6b6a 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -100,12 +100,12 @@ notrace static noinline int do_realtime(struct timespec *ts)
 	int mode;
 
 	do {
-		seq = read_seqbegin(&gtod->lock);
+		seq = read_seqcount_begin(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ts->tv_nsec = gtod->wall_time_nsec;
 		ns = vgetns();
-	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
 	timespec_add_ns(ts, ns);
 	return mode;
@@ -117,13 +117,13 @@ notrace static noinline int do_monotonic(struct timespec *ts)
 	int mode;
 
 	do {
-		seq = read_seqbegin(&gtod->lock);
+		seq = read_seqcount_begin(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		secs = gtod->wall_time_sec;
 		ns = gtod->wall_time_nsec + vgetns();
 		secs += gtod->wall_to_monotonic.tv_sec;
 		ns += gtod->wall_to_monotonic.tv_nsec;
-	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
 	/* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
 	 * are all guaranteed to be nonnegative.
@@ -142,10 +142,10 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-		seq = read_seqbegin(&gtod->lock);
+		seq = read_seqcount_begin(&gtod->seq);
 		ts->tv_sec = gtod->wall_time_coarse.tv_sec;
 		ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
-	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	return 0;
 }
 
@@ -153,12 +153,12 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 {
 	unsigned long seq, ns, secs;
 	do {
-		seq = read_seqbegin(&gtod->lock);
+		seq = read_seqcount_begin(&gtod->seq);
 		secs = gtod->wall_time_coarse.tv_sec;
 		ns = gtod->wall_time_coarse.tv_nsec;
 		secs += gtod->wall_to_monotonic.tv_sec;
 		ns += gtod->wall_to_monotonic.tv_nsec;
-	} while (unlikely(read_seqretry(&gtod->lock, seq)));
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
 	/* wall_time_nsec and wall_to_monotonic.tv_nsec are
 	 * guaranteed to be between 0 and NSEC_PER_SEC.
-- 
cgit v1.2.3


From 57779dc2b3b75bee05ef5d1ada47f615f7a13932 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 21 Feb 2012 18:19:55 -0800
Subject: x86, tsc: Skip refined tsc calibration on systems with reliable TSC

While running the latest Linux as guest under VMware in highly
over-committed situations, we have seen cases when the refined TSC
algorithm fails to get a valid tsc_start value in
tsc_refine_calibration_work from multiple attempts. As a result the
kernel keeps on scheduling the tsc_irqwork task for later. Subsequently
after several attempts when it gets a valid start value it goes through
the refined calibration and either bails out or uses the new results.
Given that the kernel originally read the TSC frequency from the
platform, which is the best it can get, I don't think there is much
value in refining it.

So  for systems which get the TSC frequency from the platform we
should skip the refined tsc algorithm.

We can use the TSC_RELIABLE cpu cap flag to detect this, right now it is
set only on VMware and for Moorestown Penwell both of which have there
own TSC calibration methods.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Dirk Brandewie <dirk.brandewie@gmail.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: stable@kernel.org
[jstultz: Reworked to simply not schedule the refining work,
rather then scheduling the work and bombing out later]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/kernel/tsc.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..6fcfcb3865c2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -932,6 +932,16 @@ static int __init init_tsc_clocksource(void)
 		clocksource_tsc.rating = 0;
 		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
 	}
+
+	/*
+	 * Trust the results of the earlier calibration on systems
+	 * exporting a reliable TSC.
+	 */
+	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+		clocksource_register_khz(&clocksource_tsc, tsc_khz);
+		return 0;
+	}
+
 	schedule_delayed_work(&tsc_irqwork, 0);
 	return 0;
 }
-- 
cgit v1.2.3


From 641cc938815dfd09f8fa1ec72deb814f0938ac33 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 15 Mar 2012 20:09:14 +0100
Subject: perf: Adding sysfs group format attribute for pmu device

Adding sysfs group 'format' attribute for pmu device that
contains a syntax description on how to construct raw events.

The event configuration is described in following
struct pefr_event_attr attributes:

  config
  config1
  config2

Each sysfs attribute within the format attribute group,
describes mapping of name and bitfield definition within
one of above attributes.

eg:
  "/sys/...<dev>/format/event" contains "config:0-7"
  "/sys/...<dev>/format/umask" contains "config:8-15"
  "/sys/...<dev>/format/usr"   contains "config:16"

the attribute value syntax is:

  line:      config ':' bits
  config:    'config' | 'config1' | 'config2"
  bits:      bits ',' bit_term | bit_term
  bit_term:  VALUE '-' VALUE | VALUE

Adding format attribute definitions for x86 cpu pmus.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/n/tip-vhdk5y2hyype9j63prymty36@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../testing/sysfs-bus-event_source-devices-format  | 14 +++++++++
 arch/x86/kernel/cpu/perf_event.c                   |  7 +++++
 arch/x86/kernel/cpu/perf_event.h                   |  1 +
 arch/x86/kernel/cpu/perf_event_amd.c               | 18 +++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c             | 36 ++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_p6.c                | 19 ++++++++++++
 include/linux/perf_event.h                         | 14 +++++++++
 7 files changed, 109 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-format

(limited to 'arch/x86')

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
new file mode 100644
index 000000000000..079afc71363d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
@@ -0,0 +1,14 @@
+Where:		/sys/bus/event_source/devices/<dev>/format
+Date:		January 2012
+Kernel Version: 3.3
+Contact:	Jiri Olsa <jolsa@redhat.com>
+Description:
+		Attribute group to describe the magic bits that go into
+		perf_event_attr::config[012] for a particular pmu.
+		Each attribute of this group defines the 'hardware' bitmask
+		we want to export, so that userspace can deal with sane
+		name/value pairs.
+
+		Example: 'config1:1,6-10,44'
+		Defines contents of attribute that occupies bits 1,6-10,44 of
+		perf_event_attr::config1.
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0a18d16cb58d..453ac9497574 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1314,6 +1314,11 @@ static void __init pmu_check_apic(void)
 	pr_info("no hardware sampling interrupt available.\n");
 }
 
+static struct attribute_group x86_pmu_format_group = {
+	.name = "format",
+	.attrs = NULL,
+};
+
 static int __init init_hw_perf_events(void)
 {
 	struct x86_pmu_quirk *quirk;
@@ -1388,6 +1393,7 @@ static int __init init_hw_perf_events(void)
 	}
 
 	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
 
 	pr_info("... version:                %d\n",     x86_pmu.version);
 	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
@@ -1668,6 +1674,7 @@ static struct attribute_group x86_pmu_attr_group = {
 
 static const struct attribute_group *x86_pmu_attr_groups[] = {
 	&x86_pmu_attr_group,
+	&x86_pmu_format_group,
 	NULL,
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8484e77c211e..6638aaf54493 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -339,6 +339,7 @@ struct x86_pmu {
 	 * sysfs attrs
 	 */
 	int		attr_rdpmc;
+	struct attribute **format_attrs;
 
 	/*
 	 * CPU Hotplug hooks
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index dd002faff7a6..95e7fe1c5f0b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -404,6 +404,21 @@ static void amd_pmu_cpu_dead(int cpu)
 	}
 }
 
+PMU_FORMAT_ATTR(event,	"config:0-7,32-35");
+PMU_FORMAT_ATTR(umask,	"config:8-15"	);
+PMU_FORMAT_ATTR(edge,	"config:18"	);
+PMU_FORMAT_ATTR(inv,	"config:23"	);
+PMU_FORMAT_ATTR(cmask,	"config:24-31"	);
+
+static struct attribute *amd_format_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
 static __initconst const struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -426,6 +441,8 @@ static __initconst const struct x86_pmu amd_pmu = {
 	.get_event_constraints	= amd_get_event_constraints,
 	.put_event_constraints	= amd_put_event_constraints,
 
+	.format_attrs		= amd_format_attr,
+
 	.cpu_prepare		= amd_pmu_cpu_prepare,
 	.cpu_starting		= amd_pmu_cpu_starting,
 	.cpu_dead		= amd_pmu_cpu_dead,
@@ -596,6 +613,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
 	.cpu_dead		= amd_pmu_cpu_dead,
 #endif
 	.cpu_starting		= amd_pmu_cpu_starting,
+	.format_attrs		= amd_format_attr,
 };
 
 __init int amd_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 6a84e7f28f05..26b3e2fef104 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1431,6 +1431,24 @@ static void core_pmu_enable_all(int added)
 	}
 }
 
+PMU_FORMAT_ATTR(event,	"config:0-7"	);
+PMU_FORMAT_ATTR(umask,	"config:8-15"	);
+PMU_FORMAT_ATTR(edge,	"config:18"	);
+PMU_FORMAT_ATTR(pc,	"config:19"	);
+PMU_FORMAT_ATTR(any,	"config:21"	); /* v3 + */
+PMU_FORMAT_ATTR(inv,	"config:23"	);
+PMU_FORMAT_ATTR(cmask,	"config:24-31"	);
+
+static struct attribute *intel_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_pc.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
 static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -1455,6 +1473,7 @@ static __initconst const struct x86_pmu core_pmu = {
 	.put_event_constraints	= intel_put_event_constraints,
 	.event_constraints	= intel_core_event_constraints,
 	.guest_get_msrs		= core_guest_get_msrs,
+	.format_attrs		= intel_arch_formats_attr,
 };
 
 struct intel_shared_regs *allocate_shared_regs(int cpu)
@@ -1553,6 +1572,21 @@ static void intel_pmu_flush_branch_stack(void)
 		intel_pmu_lbr_reset();
 }
 
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_pc.attr,
+	&format_attr_any.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+
+	&format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+	NULL,
+};
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -1576,6 +1610,8 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.get_event_constraints	= intel_get_event_constraints,
 	.put_event_constraints	= intel_put_event_constraints,
 
+	.format_attrs		= intel_arch3_formats_attr,
+
 	.cpu_prepare		= intel_pmu_cpu_prepare,
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index c7181befecde..32bcfc7dd230 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	(void)checking_wrmsrl(hwc->config_base, val);
 }
 
+PMU_FORMAT_ATTR(event,	"config:0-7"	);
+PMU_FORMAT_ATTR(umask,	"config:8-15"	);
+PMU_FORMAT_ATTR(edge,	"config:18"	);
+PMU_FORMAT_ATTR(pc,	"config:19"	);
+PMU_FORMAT_ATTR(inv,	"config:23"	);
+PMU_FORMAT_ATTR(cmask,	"config:24-31"	);
+
+static struct attribute *intel_p6_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_pc.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
 static __initconst const struct x86_pmu p6_pmu = {
 	.name			= "p6",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = {
 	.cntval_mask		= (1ULL << 32) - 1,
 	.get_event_constraints	= x86_get_event_constraints,
 	.event_constraints	= p6_event_constraints,
+
+	.format_attrs		= intel_p6_formats_attr,
 };
 
 __init int p6_pmu_init(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bd9f55a5958d..57ae485e80fc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -550,6 +550,7 @@ struct perf_guest_info_callbacks {
 #include <linux/irq_work.h>
 #include <linux/static_key.h>
 #include <linux/atomic.h>
+#include <linux/sysfs.h>
 #include <asm/local.h>
 
 #define PERF_MAX_STACK_DEPTH		255
@@ -1291,5 +1292,18 @@ do {									\
 	register_cpu_notifier(&fn##_nb);				\
 } while (0)
 
+
+#define PMU_FORMAT_ATTR(_name, _format)					\
+static ssize_t								\
+_name##_show(struct device *dev,					\
+			       struct device_attribute *attr,		\
+			       char *page)				\
+{									\
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
+	return sprintf(page, _format "\n");				\
+}									\
+									\
+static struct device_attribute format_attr_##_name = __ATTR_RO(_name)
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_EVENT_H */
-- 
cgit v1.2.3


From c7b738351ba92f48b943ac59aff6b5b0f17f37c9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 5 Mar 2012 21:06:14 +0300
Subject: x86, efi: Fix pointer math issue in handle_ramdisks()

"filename" is a efi_char16_t string so this check for reaching the end
of the array doesn't work.  We need to cast the pointer to (u8 *) before
doing the math.

This patch changes the "filename" to "filename_16" to avoid confusion in
the future.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: http://lkml.kernel.org/r/20120305180614.GA26880@elgon.mountain
Acked-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/eboot.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index fec216f4fbc3..0cdfc0d2315e 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -539,7 +539,7 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 		struct initrd *initrd;
 		efi_file_handle_t *h;
 		efi_file_info_t *info;
-		efi_char16_t filename[256];
+		efi_char16_t filename_16[256];
 		unsigned long info_sz;
 		efi_guid_t info_guid = EFI_FILE_INFO_ID;
 		efi_char16_t *p;
@@ -552,14 +552,14 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 		str += 7;
 
 		initrd = &initrds[i];
-		p = filename;
+		p = filename_16;
 
 		/* Skip any leading slashes */
 		while (*str == '/' || *str == '\\')
 			str++;
 
 		while (*str && *str != ' ' && *str != '\n') {
-			if (p >= filename + sizeof(filename))
+			if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16))
 				break;
 
 			*p++ = *str++;
@@ -583,7 +583,7 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
 				goto free_initrds;
 		}
 
-		status = efi_call_phys5(fh->open, fh, &h, filename,
+		status = efi_call_phys5(fh->open, fh, &h, filename_16,
 					EFI_FILE_MODE_READ, (u64)0);
 		if (status != EFI_SUCCESS)
 			goto close_handles;
-- 
cgit v1.2.3


From 943bc7e110f269f88dc92bbf249adbd384d35f1c Mon Sep 17 00:00:00 2001
From: Steffen Persvold <sp@numascale.com>
Date: Thu, 15 Mar 2012 12:16:28 +0100
Subject: x86: Fix section warnings

Fix the following section warnings :

WARNING: vmlinux.o(.text+0x49dbc): Section mismatch in reference
from the function acpi_map_cpu2node() to the variable
.cpuinit.data:__apicid_to_node The function acpi_map_cpu2node()
references the variable __cpuinitdata __apicid_to_node. This is
often because acpi_map_cpu2node lacks a __cpuinitdata
annotation or the annotation of __apicid_to_node is wrong.

WARNING: vmlinux.o(.text+0x49dc1): Section mismatch in reference
from the function acpi_map_cpu2node() to the function
.cpuinit.text:numa_set_node() The function acpi_map_cpu2node()
references the function __cpuinit numa_set_node(). This is often
because acpi_map_cpu2node lacks a __cpuinit  annotation or the
annotation of numa_set_node is wrong.

WARNING: vmlinux.o(.text+0x526e77): Section mismatch in
reference from the function prealloc_protection_domains() to the
function .init.text:alloc_passthrough_domain() The function
prealloc_protection_domains() references the function __init
alloc_passthrough_domain(). This is often because
prealloc_protection_domains lacks a __init  annotation or the annotation of alloc_passthrough_domain is wrong.

Signed-off-by: Steffen Persvold <sp@numascale.com>
Link: http://lkml.kernel.org/r/1331810188-24785-1-git-send-email-sp@numascale.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/acpi/boot.c | 2 +-
 drivers/iommu/amd_iommu.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8e..406ed77216d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f75e0608be5b..ae2ec929e52f 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2804,7 +2804,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
  */
-static void prealloc_protection_domains(void)
+static void __init prealloc_protection_domains(void)
 {
 	struct iommu_dev_data *dev_data;
 	struct dma_ops_domain *dma_dom;
-- 
cgit v1.2.3


From dc72d99dabb870ca5bd6d9fff674be853bb4a88d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 18 Mar 2012 02:40:48 +0000
Subject: net: bpf_jit: fix BPF_S_LDX_B_MSH compilation

Matt Evans spotted that x86 bpf_jit was incorrectly handling negative
constant offsets in BPF_S_LDX_B_MSH instruction.

We need to abort JIT compilation like we do in common_load so that
filter uses the interpreter code and can call __load_pointer()

Reference: http://lists.openwall.net/netdev/2011/07/19/11

Thanks to Indan Zupancic to bring back this issue.

Reported-by: Matt Evans <matt@ozlabs.org>
Reported-by: Indan Zupancic <indan@nul.nu>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7c1b765ecc59..5671752f8d9c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -475,8 +475,10 @@ void bpf_jit_compile(struct sk_filter *fp)
 			case BPF_S_LD_W_ABS:
 				func = sk_load_word;
 common_load:			seen |= SEEN_DATAREF;
-				if ((int)K < 0)
+				if ((int)K < 0) {
+					/* Abort the JIT because __load_pointer() is needed. */
 					goto out;
+				}
 				t_offset = func - (image + addrs[i]);
 				EMIT1_off32(0xbe, K); /* mov imm32,%esi */
 				EMIT1_off32(0xe8, t_offset); /* call */
@@ -489,14 +491,8 @@ common_load:			seen |= SEEN_DATAREF;
 				goto common_load;
 			case BPF_S_LDX_B_MSH:
 				if ((int)K < 0) {
-					if (pc_ret0 > 0) {
-						/* addrs[pc_ret0 - 1] is the start address */
-						EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]);
-						break;
-					}
-					CLEAR_A();
-					EMIT_JMP(cleanup_addr - addrs[i]);
-					break;
+					/* Abort the JIT because __load_pointer() is needed. */
+					goto out;
 				}
 				seen |= SEEN_DATAREF | SEEN_XREG;
 				t_offset = sk_load_byte_msh - (image + addrs[i]);
-- 
cgit v1.2.3


From b74f05d61b73af584d0c39121980171389ecfaaa Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 13 Feb 2012 11:07:27 -0200
Subject: x86: kvmclock: abstract save/restore sched_clock_state

Upon resume from hibernation, CPU 0's hvclock area contains the old
values for system_time and tsc_timestamp. It is necessary for the
hypervisor to update these values with uptodate ones before the CPU uses
them.

Abstract TSC's save/restore sched_clock_state functions and use
restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update.

Also move restore_sched_clock_state before __restore_processor_state,
since the later calls CONFIG_LOCK_STAT's lockstat_clock (also for TSC).
Thanks to Igor Mammedov for tracking it down.

Fixes suspend-to-disk with kvmclock.

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/tsc.h      |  4 ++--
 arch/x86/include/asm/x86_init.h |  4 ++++
 arch/x86/kernel/kvmclock.c      | 11 +++++++++++
 arch/x86/kernel/tsc.c           |  4 ++--
 arch/x86/kernel/x86_init.c      |  4 +++-
 arch/x86/power/cpu.c            |  4 ++--
 6 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 15d99153a96d..c91e8b9d588b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
 
 extern int notsc_setup(char *);
-extern void save_sched_clock_state(void);
-extern void restore_sched_clock_state(void);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
 
 #endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5d0afac2962c..baaca8defec8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -162,6 +162,8 @@ struct x86_cpuinit_ops {
  * @is_untracked_pat_range	exclude from PAT logic
  * @nmi_init			enable NMI on cpus
  * @i8042_detect		pre-detect if i8042 controller exists
+ * @save_sched_clock_state:	save state for sched_clock() on suspend
+ * @restore_sched_clock_state:	restore state for sched_clock() on resume
  */
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
@@ -173,6 +175,8 @@ struct x86_platform_ops {
 	void (*nmi_init)(void);
 	unsigned char (*get_nmi_reason)(void);
 	int (*i8042_detect)(void);
+	void (*save_sched_clock_state)(void);
+	void (*restore_sched_clock_state)(void);
 };
 
 struct pci_dev;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca4e735adc54..f8492da65bfc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
 	return ret;
 }
 
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+	kvm_register_clock("primary cpu clock, resume");
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 static void __cpuinit kvm_setup_secondary_clock(void)
 {
@@ -195,6 +204,8 @@ void __init kvmclock_init(void)
 	x86_cpuinit.early_percpu_clock_init =
 		kvm_setup_secondary_clock;
 #endif
+	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
 	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..aed2aa1088f1 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -629,7 +629,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 static unsigned long long cyc2ns_suspend;
 
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
 {
 	if (!sched_clock_stable)
 		return;
@@ -645,7 +645,7 @@ void save_sched_clock_state(void)
  * that sched_clock() continues from the point where it was left off during
  * suspend.
  */
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
 {
 	unsigned long long offset;
 	unsigned long flags;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 6f2ec53deed0..e9f265fd79ae 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -108,7 +108,9 @@ struct x86_platform_ops x86_platform = {
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
 	.get_nmi_reason			= default_get_nmi_reason,
-	.i8042_detect			= default_i8042_detect
+	.i8042_detect			= default_i8042_detect,
+	.save_sched_clock_state 	= tsc_save_sched_clock_state,
+	.restore_sched_clock_state 	= tsc_restore_sched_clock_state,
 };
 
 EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index f10c0afa1cb4..0e76a2814127 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -114,7 +114,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 void save_processor_state(void)
 {
 	__save_processor_state(&saved_context);
-	save_sched_clock_state();
+	x86_platform.save_sched_clock_state();
 }
 #ifdef CONFIG_X86_32
 EXPORT_SYMBOL(save_processor_state);
@@ -230,8 +230,8 @@ static void __restore_processor_state(struct saved_context *ctxt)
 /* Needed by apm.c */
 void restore_processor_state(void)
 {
+	x86_platform.restore_sched_clock_state();
 	__restore_processor_state(&saved_context);
-	restore_sched_clock_state();
 }
 #ifdef CONFIG_X86_32
 EXPORT_SYMBOL(restore_processor_state);
-- 
cgit v1.2.3


From 02626b6af5d2bc62db3bb85fc2891b2725535d44 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 8 Mar 2012 18:46:57 -0300
Subject: KVM: x86: fix kvm_write_tsc() TSC matching thinko

kvm_write_tsc() converts from guest TSC to microseconds, not nanoseconds
as intended. The result is that the window for matching is 1000 seconds,
not 1 second.

Microsecond precision is enough for checking whether the TSC write delta
is within the heuristic values, so use it instead of nanoseconds.

Noted by Avi Kivity.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32096cf6c6c9..7287812eeb72 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1025,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 nsdiff;
+	s64 usdiff;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1033,18 +1033,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	/* n.b - signed multiplication and division required */
-	nsdiff = data - kvm->arch.last_tsc_write;
+	usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
-	nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
 	/* do_div() only does unsigned */
 	asm("idivl %2; xor %%edx, %%edx"
-	    : "=A"(nsdiff)
-	    : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+	    : "=A"(usdiff)
+	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
 #endif
-	nsdiff -= elapsed;
-	if (nsdiff < 0)
-		nsdiff = -nsdiff;
+	do_div(elapsed, 1000);
+	usdiff -= elapsed;
+	if (usdiff < 0)
+		usdiff = -usdiff;
 
 	/*
 	 * Special case: TSC write with a small delta (1 second) of virtual
@@ -1056,7 +1057,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	 * compensation code attempt to catch up if we fall behind, but
 	 * it's better to try to match offsets from the beginning.
          */
-	if (nsdiff < NSEC_PER_SEC &&
+	if (usdiff < USEC_PER_SEC &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.cur_tsc_offset;
-- 
cgit v1.2.3


From 8fd75e1216e0ba601a746177e6c102d5593b572f Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Fri, 25 Nov 2011 23:14:17 +0800
Subject: x86: remove the second argument of k[un]map_atomic()

Acked-by: Avi Kivity <avi@redhat.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 arch/x86/crypto/aesni-intel_glue.c | 24 ++++++++++++------------
 arch/x86/kernel/crash_dump_32.c    |  6 +++---
 arch/x86/kvm/lapic.c               |  8 ++++----
 arch/x86/kvm/paging_tmpl.h         |  4 ++--
 arch/x86/kvm/x86.c                 |  8 ++++----
 arch/x86/lib/usercopy_32.c         |  4 ++--
 6 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 545d0ce59818..152232d2dc6a 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1107,12 +1107,12 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		scatterwalk_start(&assoc_sg_walk, req->assoc);
-		src = scatterwalk_map(&src_sg_walk, 0);
-		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		src = scatterwalk_map(&src_sg_walk);
+		assoc = scatterwalk_map(&assoc_sg_walk);
 		dst = src;
 		if (unlikely(req->src != req->dst)) {
 			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk, 0);
+			dst = scatterwalk_map(&dst_sg_walk);
 		}
 
 	} else {
@@ -1136,11 +1136,11 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 	 * back to the packet. */
 	if (one_entry_in_sg) {
 		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst, 0);
+			scatterwalk_unmap(dst);
 			scatterwalk_done(&dst_sg_walk, 0, 0);
 		}
-		scatterwalk_unmap(src, 0);
-		scatterwalk_unmap(assoc, 0);
+		scatterwalk_unmap(src);
+		scatterwalk_unmap(assoc);
 		scatterwalk_done(&src_sg_walk, 0, 0);
 		scatterwalk_done(&assoc_sg_walk, 0, 0);
 	} else {
@@ -1189,12 +1189,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		scatterwalk_start(&assoc_sg_walk, req->assoc);
-		src = scatterwalk_map(&src_sg_walk, 0);
-		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		src = scatterwalk_map(&src_sg_walk);
+		assoc = scatterwalk_map(&assoc_sg_walk);
 		dst = src;
 		if (unlikely(req->src != req->dst)) {
 			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk, 0);
+			dst = scatterwalk_map(&dst_sg_walk);
 		}
 
 	} else {
@@ -1219,11 +1219,11 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 
 	if (one_entry_in_sg) {
 		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst, 0);
+			scatterwalk_unmap(dst);
 			scatterwalk_done(&dst_sg_walk, 0, 0);
 		}
-		scatterwalk_unmap(src, 0);
-		scatterwalk_unmap(assoc, 0);
+		scatterwalk_unmap(src);
+		scatterwalk_unmap(assoc);
 		scatterwalk_done(&src_sg_walk, 0, 0);
 		scatterwalk_done(&assoc_sg_walk, 0, 0);
 	} else {
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 642f75a68cd5..11891ca7b716 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 
 	if (!userbuf) {
 		memcpy(buf, (vaddr + offset), csize);
-		kunmap_atomic(vaddr, KM_PTE0);
+		kunmap_atomic(vaddr);
 	} else {
 		if (!kdump_buf_page) {
 			printk(KERN_WARNING "Kdump: Kdump buffer page not"
 				" allocated\n");
-			kunmap_atomic(vaddr, KM_PTE0);
+			kunmap_atomic(vaddr);
 			return -EFAULT;
 		}
 		copy_page(kdump_buf_page, vaddr);
-		kunmap_atomic(vaddr, KM_PTE0);
+		kunmap_atomic(vaddr);
 		if (copy_to_user(buf, (kdump_buf_page + offset), csize))
 			return -EFAULT;
 	}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cfdc6e0ef002..31bfc6927bc0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1283,9 +1283,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
 		return;
 
-	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
 	data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
-	kunmap_atomic(vapic, KM_USER0);
+	kunmap_atomic(vapic);
 
 	apic_set_tpr(vcpu->arch.apic, data & 0xff);
 }
@@ -1310,9 +1310,9 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 		max_isr = 0;
 	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
 
-	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
 	*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
-	kunmap_atomic(vapic, KM_USER0);
+	kunmap_atomic(vapic);
 }
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 15610285ebb6..df5a70311be8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -92,9 +92,9 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	if (unlikely(npages != 1))
 		return -EFAULT;
 
-	table = kmap_atomic(page, KM_USER0);
+	table = kmap_atomic(page);
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
-	kunmap_atomic(table, KM_USER0);
+	kunmap_atomic(table);
 
 	kvm_release_page_dirty(page);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cbfc0698118..bb4fd2636bc2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1162,12 +1162,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 */
 	vcpu->hv_clock.version += 2;
 
-	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+	shared_kaddr = kmap_atomic(vcpu->time_page);
 
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 	       sizeof(vcpu->hv_clock));
 
-	kunmap_atomic(shared_kaddr, KM_USER0);
+	kunmap_atomic(shared_kaddr);
 
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 	return 0;
@@ -3848,7 +3848,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 		goto emul_write;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr = kmap_atomic(page);
 	kaddr += offset_in_page(gpa);
 	switch (bytes) {
 	case 1:
@@ -3866,7 +3866,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	default:
 		BUG();
 	}
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(kaddr);
 	kvm_release_page_dirty(page);
 
 	if (!exchanged)
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e218d5df85ff..d9b094ca7aaa 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -760,9 +760,9 @@ survive:
 				break;
 			}
 
-			maddr = kmap_atomic(pg, KM_USER0);
+			maddr = kmap_atomic(pg);
 			memcpy(maddr + offset, from, len);
-			kunmap_atomic(maddr, KM_USER0);
+			kunmap_atomic(maddr);
 			set_page_dirty_lock(pg);
 			put_page(pg);
 			up_read(&current->mm->mmap_sem);
-- 
cgit v1.2.3


From a24401bcf4a67c8fe17e649e74eeb09b08b79ef5 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 26 Nov 2011 10:53:39 +0800
Subject: highmem: kill all __kmap_atomic() [swarren@nvidia.com: highmem: Fix
 ARM build break due to __kmap_atomic rename]

Signed-off-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 arch/arm/include/asm/highmem.h       |  2 +-
 arch/arm/mm/highmem.c                |  4 ++--
 arch/frv/include/asm/highmem.h       |  2 +-
 arch/frv/mm/highmem.c                |  4 ++--
 arch/mips/include/asm/highmem.h      |  2 +-
 arch/mips/mm/highmem.c               |  4 ++--
 arch/mn10300/include/asm/highmem.h   |  2 +-
 arch/parisc/include/asm/cacheflush.h |  2 +-
 arch/powerpc/include/asm/highmem.h   |  2 +-
 arch/sparc/include/asm/highmem.h     |  2 +-
 arch/sparc/mm/highmem.c              |  4 ++--
 arch/tile/include/asm/highmem.h      |  2 +-
 arch/tile/mm/highmem.c               |  4 ++--
 arch/x86/include/asm/highmem.h       |  2 +-
 arch/x86/mm/highmem_32.c             |  4 ++--
 include/linux/highmem.h              | 11 +++--------
 16 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h
index a4edd19dd3d6..8c5e828f484d 100644
--- a/arch/arm/include/asm/highmem.h
+++ b/arch/arm/include/asm/highmem.h
@@ -57,7 +57,7 @@ static inline void *kmap_high_get(struct page *page)
 #ifdef CONFIG_HIGHMEM
 extern void *kmap(struct page *page);
 extern void kunmap(struct page *page);
-extern void *__kmap_atomic(struct page *page);
+extern void *kmap_atomic(struct page *page);
 extern void __kunmap_atomic(void *kvaddr);
 extern void *kmap_atomic_pfn(unsigned long pfn);
 extern struct page *kmap_atomic_to_page(const void *ptr);
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index 807c0573abbe..5a21505d7550 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -36,7 +36,7 @@ void kunmap(struct page *page)
 }
 EXPORT_SYMBOL(kunmap);
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	unsigned int idx;
 	unsigned long vaddr;
@@ -81,7 +81,7 @@ void *__kmap_atomic(struct page *page)
 
 	return (void *)vaddr;
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h
index a8d6565d415d..716956a5317b 100644
--- a/arch/frv/include/asm/highmem.h
+++ b/arch/frv/include/asm/highmem.h
@@ -157,7 +157,7 @@ static inline void kunmap_atomic_primary(void *kvaddr, enum km_type type)
 	pagefault_enable();
 }
 
-void *__kmap_atomic(struct page *page);
+void *kmap_atomic(struct page *page);
 void __kunmap_atomic(void *kvaddr);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index fd7fcd4c2e33..31902c9d5be5 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -37,7 +37,7 @@ struct page *kmap_atomic_to_page(void *ptr)
 	return virt_to_page(ptr);
 }
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	unsigned long paddr;
 	int type;
@@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page)
 		return NULL;
 	}
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h
index 77e644082a3b..2d91888c9b74 100644
--- a/arch/mips/include/asm/highmem.h
+++ b/arch/mips/include/asm/highmem.h
@@ -47,7 +47,7 @@ extern void kunmap_high(struct page *page);
 
 extern void *kmap(struct page *page);
 extern void kunmap(struct page *page);
-extern void *__kmap_atomic(struct page *page);
+extern void *kmap_atomic(struct page *page);
 extern void __kunmap_atomic(void *kvaddr);
 extern void *kmap_atomic_pfn(unsigned long pfn);
 extern struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index 3634c7ea06ac..aff57057a949 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -41,7 +41,7 @@ EXPORT_SYMBOL(kunmap);
  * kmaps are appropriate for short, tight code paths only.
  */
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	unsigned long vaddr;
 	int idx, type;
@@ -62,7 +62,7 @@ void *__kmap_atomic(struct page *page)
 
 	return (void*) vaddr;
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index bfe2d88604d9..7c137cd8aa37 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -70,7 +70,7 @@ static inline void kunmap(struct page *page)
  * be used in IRQ contexts, so in some (very limited) cases we need
  * it.
  */
-static inline unsigned long __kmap_atomic(struct page *page)
+static inline unsigned long kmap_atomic(struct page *page)
 {
 	unsigned long vaddr;
 	int idx, type;
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index da601dd34c05..9f21ab0c02e3 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -140,7 +140,7 @@ static inline void *kmap(struct page *page)
 
 #define kunmap(page)			kunmap_parisc(page_address(page))
 
-static inline void *__kmap_atomic(struct page *page)
+static inline void *kmap_atomic(struct page *page)
 {
 	pagefault_disable();
 	return page_address(page);
diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index dbc264010d0b..caaf6e00630d 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -79,7 +79,7 @@ static inline void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
-static inline void *__kmap_atomic(struct page *page)
+static inline void *kmap_atomic(struct page *page)
 {
 	return kmap_atomic_prot(page, kmap_prot);
 }
diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h
index 3d7afbb7f4bb..3b6e00dd96e5 100644
--- a/arch/sparc/include/asm/highmem.h
+++ b/arch/sparc/include/asm/highmem.h
@@ -70,7 +70,7 @@ static inline void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
-extern void *__kmap_atomic(struct page *page);
+extern void *kmap_atomic(struct page *page);
 extern void __kunmap_atomic(void *kvaddr);
 extern struct page *kmap_atomic_to_page(void *vaddr);
 
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 77140a02c86a..055c66cf1bf4 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -30,7 +30,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	unsigned long vaddr;
 	long idx, type;
@@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page)
 
 	return (void*) vaddr;
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h
index b2a6c5de79ab..fc8429a31c85 100644
--- a/arch/tile/include/asm/highmem.h
+++ b/arch/tile/include/asm/highmem.h
@@ -59,7 +59,7 @@ void *kmap_fix_kpte(struct page *page, int finished);
 /* This macro is used only in map_new_virtual() to map "page". */
 #define kmap_prot page_to_kpgprot(page)
 
-void *__kmap_atomic(struct page *page);
+void *kmap_atomic(struct page *page);
 void __kunmap_atomic(void *kvaddr);
 void *kmap_atomic_pfn(unsigned long pfn);
 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 31dbbd9afe47..ef8e5a62b6e3 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -224,12 +224,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 }
 EXPORT_SYMBOL(kmap_atomic_prot);
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	/* PAGE_NONE is a magic value that tells us to check immutability. */
 	return kmap_atomic_prot(page, PAGE_NONE);
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 3bd04022fd0c..302a323b3f67 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -61,7 +61,7 @@ void *kmap(struct page *page);
 void kunmap(struct page *page);
 
 void *kmap_atomic_prot(struct page *page, pgprot_t prot);
-void *__kmap_atomic(struct page *page);
+void *kmap_atomic(struct page *page);
 void __kunmap_atomic(void *kvaddr);
 void *kmap_atomic_pfn(unsigned long pfn);
 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index f4f29b19fac5..6f31ee56c008 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -51,11 +51,11 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 }
 EXPORT_SYMBOL(kmap_atomic_prot);
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	return kmap_atomic_prot(page, kmap_prot);
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 /*
  * This is the same as kmap_atomic() but can map memory that doesn't
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 284ec5535f3d..6549ed75e0a7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -55,12 +55,12 @@ static inline void kunmap(struct page *page)
 {
 }
 
-static inline void *__kmap_atomic(struct page *page)
+static inline void *kmap_atomic(struct page *page)
 {
 	pagefault_disable();
 	return page_address(page);
 }
-#define kmap_atomic_prot(page, prot)	__kmap_atomic(page)
+#define kmap_atomic_prot(page, prot)	kmap_atomic(page)
 
 static inline void __kunmap_atomic(void *addr)
 {
@@ -121,15 +121,10 @@ static inline void kmap_atomic_idx_pop(void)
 #define NARG_(_2, _1, n, ...) n
 #define NARG(...) NARG_(__VA_ARGS__, 2, 1, :)
 
-static inline void *kmap_atomic(struct page *page)
-{
-	return __kmap_atomic(page);
-}
-
 static inline void __deprecated *kmap_atomic_deprecated(struct page *page,
 							enum km_type km)
 {
-	return __kmap_atomic(page);
+	return kmap_atomic(page);
 }
 
 #define kmap_atomic1(...) kmap_atomic(__VA_ARGS__)
-- 
cgit v1.2.3


From 984165a37ca65d990419566d9af5dd247d03d2a0 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 15 Dec 2011 22:28:37 +0000
Subject: x86, mrst: add msic_thermal platform support

This will let the MSIC driver to create platform device for the thermal
driver.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 arch/x86/platform/mrst/mrst.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 475e2cd0f3c3..229b8bf42cd9 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -686,6 +686,11 @@ static void *msic_ocd_platform_data(void *info)
 	return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
 }
 
+static void *msic_thermal_platform_data(void *info)
+{
+	return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
+}
+
 static const struct devs_id __initconst device_ids[] = {
 	{"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
 	{"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
@@ -705,6 +710,7 @@ static const struct devs_id __initconst device_ids[] = {
 	{"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data},
 	{"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data},
 	{"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data},
+	{"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data},
 
 	{},
 };
-- 
cgit v1.2.3


From 3197059af0762c191af23c0ce3fd6f8311c564e7 Mon Sep 17 00:00:00 2001
From: "Philip A. Prindeville" <philipp@redfish-solutions.com>
Date: Sat, 14 Jan 2012 01:45:39 -0700
Subject: geos: Platform driver for Geos and Geos2 single-board computers.

Trivial platform driver for Traverse Technologies Geos and Geos2
single-board computers. Uses SMBIOS to identify platform.
Based on progressive revisions of the leds-net5501 driver that
was rewritten by Ed Wildgoose as a platform driver.

Supports GPIO-based LEDs (3) and 1 polled button which is
typically used for a soft reset.

Signed-off-by: Philip Prindeville <philipp@redfish-solutions.com>
Reviewed-by: Ed Wildgoose <ed@wildgooses.com>
Acked-by: Andres Salomon <dilinger@queued.net>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 arch/x86/Kconfig                 |   7 +++
 arch/x86/platform/geode/Makefile |   1 +
 arch/x86/platform/geode/geos.c   | 128 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+)
 create mode 100644 arch/x86/platform/geode/geos.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..3a38c4c1d359 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2133,6 +2133,13 @@ config ALIX
 
 	  Note: You have to set alix.force=1 for boards with Award BIOS.
 
+config GEOS
+	bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
+	select GPIOLIB
+	depends on DMI
+	---help---
+	  This option enables system support for the Traverse Technologies GEOS.
+
 endif # X86_32
 
 config AMD_NB
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
index 07c9cd05021a..d8ba5644f2f6 100644
--- a/arch/x86/platform/geode/Makefile
+++ b/arch/x86/platform/geode/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_ALIX)		+= alix.o
+obj-$(CONFIG_GEOS)		+= geos.o
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
new file mode 100644
index 000000000000..c2e6d53558be
--- /dev/null
+++ b/arch/x86/platform/geode/geos.c
@@ -0,0 +1,128 @@
+/*
+ * System Specific setup for Traverse Technologies GEOS.
+ * At the moment this means setup of GPIO control of LEDs.
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ *                and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * TODO: There are large similarities with leds-net5501.c
+ * by Alessandro Zummo <a.zummo@towertech.it>
+ * In the future leds-net5501.c should be migrated over to platform
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/dmi.h>
+
+#include <asm/geode.h>
+
+static struct gpio_keys_button geos_gpio_buttons[] = {
+	{
+		.code = KEY_RESTART,
+		.gpio = 3,
+		.active_low = 1,
+		.desc = "Reset button",
+		.type = EV_KEY,
+		.wakeup = 0,
+		.debounce_interval = 100,
+		.can_disable = 0,
+	}
+};
+static struct gpio_keys_platform_data geos_buttons_data = {
+	.buttons = geos_gpio_buttons,
+	.nbuttons = ARRAY_SIZE(geos_gpio_buttons),
+	.poll_interval = 20,
+};
+
+static struct platform_device geos_buttons_dev = {
+	.name = "gpio-keys-polled",
+	.id = 1,
+	.dev = {
+		.platform_data = &geos_buttons_data,
+	}
+};
+
+static struct gpio_led geos_leds[] = {
+	{
+		.name = "geos:1",
+		.gpio = 6,
+		.default_trigger = "default-on",
+		.active_low = 1,
+	},
+	{
+		.name = "geos:2",
+		.gpio = 25,
+		.default_trigger = "default-off",
+		.active_low = 1,
+	},
+	{
+		.name = "geos:3",
+		.gpio = 27,
+		.default_trigger = "default-off",
+		.active_low = 1,
+	},
+};
+
+static struct gpio_led_platform_data geos_leds_data = {
+	.num_leds = ARRAY_SIZE(geos_leds),
+	.leds = geos_leds,
+};
+
+static struct platform_device geos_leds_dev = {
+	.name = "leds-gpio",
+	.id = -1,
+	.dev.platform_data = &geos_leds_data,
+};
+
+static struct __initdata platform_device *geos_devs[] = {
+	&geos_buttons_dev,
+	&geos_leds_dev,
+};
+
+static void __init register_geos(void)
+{
+	/* Setup LED control through leds-gpio driver */
+	platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
+}
+
+static int __init geos_init(void)
+{
+	const char *vendor, *product;
+
+	if (!is_geode())
+		return 0;
+
+	vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+	if (!vendor || strcmp(vendor, "Traverse Technologies"))
+		return 0;
+
+	product = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (!product || strcmp(product, "Geos"))
+		return 0;
+
+	printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
+	       KBUILD_MODNAME, vendor, product);
+
+	register_geos();
+
+	return 0;
+}
+
+module_init(geos_init);
+
+MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
+MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 8fc3dc5a3a17aa2b353886422bd89420619af211 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 17 Mar 2012 03:05:16 -0400
Subject: __register_binfmt() made void

Just don't pass NULL to it - nobody does, anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/binfmt_loader.c |  3 ++-
 arch/x86/ia32/ia32_aout.c         |  3 ++-
 fs/binfmt_aout.c                  |  3 ++-
 fs/binfmt_elf.c                   |  3 ++-
 fs/binfmt_elf_fdpic.c             |  3 ++-
 fs/binfmt_em86.c                  |  3 ++-
 fs/binfmt_flat.c                  |  3 ++-
 fs/binfmt_misc.c                  |  7 ++-----
 fs/binfmt_script.c                |  3 ++-
 fs/binfmt_som.c                   |  3 ++-
 fs/exec.c                         |  6 ++----
 include/linux/binfmts.h           | 10 +++++-----
 12 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c
index 3fcfad410130..d1f474d1d44d 100644
--- a/arch/alpha/kernel/binfmt_loader.c
+++ b/arch/alpha/kernel/binfmt_loader.c
@@ -46,6 +46,7 @@ static struct linux_binfmt loader_format = {
 
 static int __init init_loader_binfmt(void)
 {
-	return insert_binfmt(&loader_format);
+	insert_binfmt(&loader_format);
+	return 0;
 }
 arch_initcall(init_loader_binfmt);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 39e49091f648..cdfc8dc43670 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -519,7 +519,8 @@ out:
 
 static int __init init_aout_binfmt(void)
 {
-	return register_binfmt(&aout_format);
+	register_binfmt(&aout_format);
+	return 0;
 }
 
 static void __exit exit_aout_binfmt(void)
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 1ff94054d35a..a543364ba29b 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -454,7 +454,8 @@ out:
 
 static int __init init_aout_binfmt(void)
 {
-	return register_binfmt(&aout_format);
+	register_binfmt(&aout_format);
+	return 0;
 }
 
 static void __exit exit_aout_binfmt(void)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 07d096c49920..f8ac4251877e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2077,7 +2077,8 @@ out:
 
 static int __init init_elf_binfmt(void)
 {
-	return register_binfmt(&elf_format);
+	register_binfmt(&elf_format);
+	return 0;
 }
 
 static void __exit exit_elf_binfmt(void)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 30745f459faf..e7afcb67a2d3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -91,7 +91,8 @@ static struct linux_binfmt elf_fdpic_format = {
 
 static int __init init_elf_fdpic_binfmt(void)
 {
-	return register_binfmt(&elf_fdpic_format);
+	register_binfmt(&elf_fdpic_format);
+	return 0;
 }
 
 static void __exit exit_elf_fdpic_binfmt(void)
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index b8e8b0acf9bd..2790c7e1912e 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = {
 
 static int __init init_em86_binfmt(void)
 {
-	return register_binfmt(&em86_format);
+	register_binfmt(&em86_format);
+	return 0;
 }
 
 static void __exit exit_em86_binfmt(void)
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 1bffbe0ed778..68affab88146 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -950,7 +950,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 static int __init init_flat_binfmt(void)
 {
-	return register_binfmt(&flat_format);
+	register_binfmt(&flat_format);
+	return 0;
 }
 
 /****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a9198dfd5f85..1ffb60355cae 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -726,11 +726,8 @@ static struct file_system_type bm_fs_type = {
 static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
-	if (!err) {
-		err = insert_binfmt(&misc_format);
-		if (err)
-			unregister_filesystem(&bm_fs_type);
-	}
+	if (!err)
+		insert_binfmt(&misc_format);
 	return err;
 }
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a9884591f..d3b8c1f63155 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -105,7 +105,8 @@ static struct linux_binfmt script_format = {
 
 static int __init init_script_binfmt(void)
 {
-	return register_binfmt(&script_format);
+	register_binfmt(&script_format);
+	return 0;
 }
 
 static void __exit exit_script_binfmt(void)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index cc8560f6c9b0..ec15972dd98a 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -289,7 +289,8 @@ static int load_som_library(struct file *f)
 
 static int __init init_som_binfmt(void)
 {
-	return register_binfmt(&som_format);
+	register_binfmt(&som_format);
+	return 0;
 }
 
 static void __exit exit_som_binfmt(void)
diff --git a/fs/exec.c b/fs/exec.c
index 153dee14fe55..2c5ae338773c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -79,15 +79,13 @@ static atomic_t call_count = ATOMIC_INIT(1);
 static LIST_HEAD(formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
 {
-	if (!fmt)
-		return -EINVAL;
+	BUG_ON(!fmt);
 	write_lock(&binfmt_lock);
 	insert ? list_add(&fmt->lh, &formats) :
 		 list_add_tail(&fmt->lh, &formats);
 	write_unlock(&binfmt_lock);
-	return 0;	
 }
 
 EXPORT_SYMBOL(__register_binfmt);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 0092102db2de..366422bc1633 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -92,17 +92,17 @@ struct linux_binfmt {
 	unsigned long min_coredump;	/* minimal dump size */
 };
 
-extern int __register_binfmt(struct linux_binfmt *fmt, int insert);
+extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
 /* Registration of default binfmt handlers */
-static inline int register_binfmt(struct linux_binfmt *fmt)
+static inline void register_binfmt(struct linux_binfmt *fmt)
 {
-	return __register_binfmt(fmt, 0);
+	__register_binfmt(fmt, 0);
 }
 /* Same as above, but adds a new binfmt at the top of the list */
-static inline int insert_binfmt(struct linux_binfmt *fmt)
+static inline void insert_binfmt(struct linux_binfmt *fmt)
 {
-	return __register_binfmt(fmt, 1);
+	__register_binfmt(fmt, 1);
 }
 
 extern void unregister_binfmt(struct linux_binfmt *);
-- 
cgit v1.2.3


From 19e5109fef2c368ab3f8a5157270f87f4a7c0326 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 23 Feb 2012 22:29:17 -0500
Subject: take removal of PF_FORKNOEXEC to flush_old_exec()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/ia32/ia32_aout.c | 1 -
 fs/binfmt_aout.c          | 1 -
 fs/binfmt_elf.c           | 2 --
 fs/binfmt_elf_fdpic.c     | 3 ---
 fs/binfmt_flat.c          | 1 -
 fs/binfmt_som.c           | 1 -
 fs/exec.c                 | 2 +-
 7 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index cdfc8dc43670..4c2e59a420b9 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -323,7 +323,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	}
 
 	install_exec_creds(bprm);
-	current->flags &= ~PF_FORKNOEXEC;
 
 	if (N_MAGIC(ex) == OMAGIC) {
 		unsigned long text_addr, map_size;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a543364ba29b..4d5e6d26578c 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -267,7 +267,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	}
 
 	install_exec_creds(bprm);
- 	current->flags &= ~PF_FORKNOEXEC;
 
 	if (N_MAGIC(ex) == OMAGIC) {
 		unsigned long text_addr, map_size;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8ac4251877e..81878b78c9d4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -712,7 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_dentry;
 
 	/* OK, This is the point of no return */
-	current->flags &= ~PF_FORKNOEXEC;
 	current->mm->def_flags = def_flags;
 
 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
@@ -934,7 +933,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
 
 	install_exec_creds(bprm);
-	current->flags &= ~PF_FORKNOEXEC;
 	retval = create_elf_tables(bprm, &loc->elf_ex,
 			  load_addr, interp_load_addr);
 	if (retval < 0) {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index e7afcb67a2d3..c64bf5ee2df4 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -335,8 +335,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	current->mm->context.exec_fdpic_loadmap = 0;
 	current->mm->context.interp_fdpic_loadmap = 0;
 
-	current->flags &= ~PF_FORKNOEXEC;
-
 #ifdef CONFIG_MMU
 	elf_fdpic_arch_lay_out_mm(&exec_params,
 				  &interp_params,
@@ -414,7 +412,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 #endif
 
 	install_exec_creds(bprm);
-	current->flags &= ~PF_FORKNOEXEC;
 	if (create_elf_fdpic_tables(bprm, current->mm,
 				    &exec_params, &interp_params) < 0)
 		goto error_kill;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 68affab88146..04f61f0bdfde 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -902,7 +902,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 						libinfo.lib_list[j].start_data:UNLOADED_LIB;
 
 	install_exec_creds(bprm);
- 	current->flags &= ~PF_FORKNOEXEC;
 
 	set_binfmt(&flat_format);
 
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index ec15972dd98a..e4fc746629a7 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -225,7 +225,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 
 	/* OK, This is the point of no return */
-	current->flags &= ~PF_FORKNOEXEC;
 	current->personality = PER_HPUX;
 	setup_new_exec(bprm);
 
diff --git a/fs/exec.c b/fs/exec.c
index 2c5ae338773c..60478a0e7a37 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1110,7 +1110,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;		/* We're using it now */
 
 	set_fs(USER_DS);
-	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
 	flush_thread();
 	current->personality &= ~bprm->per_clear;
 
-- 
cgit v1.2.3


From 1a5a9906d4e8d1976b701f889d8f35d54b928f25 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 21 Mar 2012 16:33:42 -0700
Subject: mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read
 mode

In some cases it may happen that pmd_none_or_clear_bad() is called with
the mmap_sem hold in read mode.  In those cases the huge page faults can
allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a
false positive from pmd_bad() that will not like to see a pmd
materializing as trans huge.

It's not khugepaged causing the problem, khugepaged holds the mmap_sem
in write mode (and all those sites must hold the mmap_sem in read mode
to prevent pagetables to go away from under them, during code review it
seems vm86 mode on 32bit kernels requires that too unless it's
restricted to 1 thread per process or UP builds).  The race is only with
the huge pagefaults that can convert a pmd_none() into a
pmd_trans_huge().

Effectively all these pmd_none_or_clear_bad() sites running with
mmap_sem in read mode are somewhat speculative with the page faults, and
the result is always undefined when they run simultaneously.  This is
probably why it wasn't common to run into this.  For example if the
madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page
fault, the hugepage will not be zapped, if the page fault runs first it
will be zapped.

Altering pmd_bad() not to error out if it finds hugepmds won't be enough
to fix this, because zap_pmd_range would then proceed to call
zap_pte_range (which would be incorrect if the pmd become a
pmd_trans_huge()).

The simplest way to fix this is to read the pmd in the local stack
(regardless of what we read, no need of actual CPU barriers, only
compiler barrier needed), and be sure it is not changing under the code
that computes its value.  Even if the real pmd is changing under the
value we hold on the stack, we don't care.  If we actually end up in
zap_pte_range it means the pmd was not none already and it was not huge,
and it can't become huge from under us (khugepaged locking explained
above).

All we need is to enforce that there is no way anymore that in a code
path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad
can run into a hugepmd.  The overhead of a barrier() is just a compiler
tweak and should not be measurable (I only added it for THP builds).  I
don't exclude different compiler versions may have prevented the race
too by caching the value of *pmd on the stack (that hasn't been
verified, but it wouldn't be impossible considering
pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines
and there's no external function called in between pmd_trans_huge and
pmd_none_or_clear_bad).

		if (pmd_trans_huge(*pmd)) {
			if (next-addr != HPAGE_PMD_SIZE) {
				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
				split_huge_page_pmd(vma->vm_mm, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(pmd))

Because this race condition could be exercised without special
privileges this was reported in CVE-2012-1179.

The race was identified and fully explained by Ulrich who debugged it.
I'm quoting his accurate explanation below, for reference.

====== start quote =======
      mapcount 0 page_mapcount 1
      kernel BUG at mm/huge_memory.c:1384!

    At some point prior to the panic, a "bad pmd ..." message similar to the
    following is logged on the console:

      mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7).

    The "bad pmd ..." message is logged by pmd_clear_bad() before it clears
    the page's PMD table entry.

        143 void pmd_clear_bad(pmd_t *pmd)
        144 {
    ->  145         pmd_ERROR(*pmd);
        146         pmd_clear(pmd);
        147 }

    After the PMD table entry has been cleared, there is an inconsistency
    between the actual number of PMD table entries that are mapping the page
    and the page's map count (_mapcount field in struct page). When the page
    is subsequently reclaimed, __split_huge_page() detects this inconsistency.

       1381         if (mapcount != page_mapcount(page))
       1382                 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
       1383                        mapcount, page_mapcount(page));
    -> 1384         BUG_ON(mapcount != page_mapcount(page));

    The root cause of the problem is a race of two threads in a multithreaded
    process. Thread B incurs a page fault on a virtual address that has never
    been accessed (PMD entry is zero) while Thread A is executing an madvise()
    system call on a virtual address within the same 2 MB (huge page) range.

               virtual address space
              .---------------------.
              |                     |
              |                     |
            .-|---------------------|
            | |                     |
            | |                     |<-- B(fault)
            | |                     |
      2 MB  | |/////////////////////|-.
      huge <  |/////////////////////|  > A(range)
      page  | |/////////////////////|-'
            | |                     |
            | |                     |
            '-|---------------------|
              |                     |
              |                     |
              '---------------------'

    - Thread A is executing an madvise(..., MADV_DONTNEED) system call
      on the virtual address range "A(range)" shown in the picture.

    sys_madvise
      // Acquire the semaphore in shared mode.
      down_read(&current->mm->mmap_sem)
      ...
      madvise_vma
        switch (behavior)
        case MADV_DONTNEED:
             madvise_dontneed
               zap_page_range
                 unmap_vmas
                   unmap_page_range
                     zap_pud_range
                       zap_pmd_range
                         //
                         // Assume that this huge page has never been accessed.
                         // I.e. content of the PMD entry is zero (not mapped).
                         //
                         if (pmd_trans_huge(*pmd)) {
                             // We don't get here due to the above assumption.
                         }
                         //
                         // Assume that Thread B incurred a page fault and
             .---------> // sneaks in here as shown below.
             |           //
             |           if (pmd_none_or_clear_bad(pmd))
             |               {
             |                 if (unlikely(pmd_bad(*pmd)))
             |                     pmd_clear_bad
             |                     {
             |                       pmd_ERROR
             |                         // Log "bad pmd ..." message here.
             |                       pmd_clear
             |                         // Clear the page's PMD entry.
             |                         // Thread B incremented the map count
             |                         // in page_add_new_anon_rmap(), but
             |                         // now the page is no longer mapped
             |                         // by a PMD entry (-> inconsistency).
             |                     }
             |               }
             |
             v
    - Thread B is handling a page fault on virtual address "B(fault)" shown
      in the picture.

    ...
    do_page_fault
      __do_page_fault
        // Acquire the semaphore in shared mode.
        down_read_trylock(&mm->mmap_sem)
        ...
        handle_mm_fault
          if (pmd_none(*pmd) && transparent_hugepage_enabled(vma))
              // We get here due to the above assumption (PMD entry is zero).
              do_huge_pmd_anonymous_page
                alloc_hugepage_vma
                  // Allocate a new transparent huge page here.
                ...
                __do_huge_pmd_anonymous_page
                  ...
                  spin_lock(&mm->page_table_lock)
                  ...
                  page_add_new_anon_rmap
                    // Here we increment the page's map count (starts at -1).
                    atomic_set(&page->_mapcount, 0)
                  set_pmd_at
                    // Here we set the page's PMD entry which will be cleared
                    // when Thread A calls pmd_clear_bad().
                  ...
                  spin_unlock(&mm->page_table_lock)

    The mmap_sem does not prevent the race because both threads are acquiring
    it in shared mode (down_read).  Thread B holds the page_table_lock while
    the page's map count and PMD table entry are updated.  However, Thread A
    does not synchronize on that lock.

====== end quote =======

[akpm@linux-foundation.org: checkpatch fixes]
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Jones <davej@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>		[2.6.38+]
Cc: Mark Salter <msalter@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/vm86_32.c     |  2 ++
 fs/proc/task_mmu.c            |  9 +++++++
 include/asm-generic/pgtable.h | 61 +++++++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c               |  4 +++
 mm/memory.c                   | 16 +++++++++---
 mm/mempolicy.c                |  2 +-
 mm/mincore.c                  |  2 +-
 mm/pagewalk.c                 |  2 +-
 mm/swapfile.c                 |  4 +--
 9 files changed, 92 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..328cb37bb827 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	spinlock_t *ptl;
 	int i;
 
+	down_write(&mm->mmap_sem);
 	pgd = pgd_offset(mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	}
 	pte_unmap_unlock(pte, ptl);
 out:
+	up_write(&mm->mmap_sem);
 	flush_tlb();
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a250495..3efa7253523e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	} else {
 		spin_unlock(&walk->mm->page_table_lock);
 	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	/*
 	 * The mmap_sem held all the way back in m_start() is what
 	 * keeps khugepaged out of here and from collapsing things
@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	int err = 0;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		spin_unlock(&walk->mm->page_table_lock);
 	}
 
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 76bff2bff15e..a03c098b0cce 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 				unsigned long size);
 #endif
 
+#ifdef CONFIG_MMU
+
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pmd_trans_huge(pmd_t pmd)
 {
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
 	return 0;
 }
 #endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * This function is meant to be used by sites walking pagetables with
+ * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
+ * into a null pmd and the transhuge page fault can convert a null pmd
+ * into an hugepmd or into a regular pmd (if the hugepage allocation
+ * fails). While holding the mmap_sem in read mode the pmd becomes
+ * stable and stops changing under us only if it's not null and not a
+ * transhuge pmd. When those races occurs and this function makes a
+ * difference vs the standard pmd_none_or_clear_bad, the result is
+ * undefined so behaving like if the pmd was none is safe (because it
+ * can return none anyway). The compiler level barrier() is critically
+ * important to compute the two checks atomically on the same pmdval.
+ */
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+{
+	/* depend on compiler for an atomic pmd read */
+	pmd_t pmdval = *pmd;
+	/*
+	 * The barrier will stabilize the pmdval in a register or on
+	 * the stack so that it will stop changing under the code.
+	 */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	barrier();
+#endif
+	if (pmd_none(pmdval))
+		return 1;
+	if (unlikely(pmd_bad(pmdval))) {
+		if (!pmd_trans_huge(pmdval))
+			pmd_clear_bad(pmd);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * This is a noop if Transparent Hugepage Support is not built into
+ * the kernel. Otherwise it is equivalent to
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
+ * places that already verified the pmd is not none and they want to
+ * walk ptes while holding the mmap sem in read mode (write mode don't
+ * need this). If THP is not enabled, the pmd can't go away under the
+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
+ * run a pmd_trans_unstable before walking the ptes after
+ * split_huge_page_pmd returns (because it may have run when the pmd
+ * become null, but then a page fault can map in a THP and not a
+ * regular page).
+ */
+static inline int pmd_trans_unstable(pmd_t *pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return pmd_none_or_trans_huge_or_clear_bad(pmd);
+#else
+	return 0;
 #endif
+}
+
+#endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..37281816ff67 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5230,6 +5230,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5390,6 +5392,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 347e5fad1cfa..e01abb908b6b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
-			if (next-addr != HPAGE_PMD_SIZE) {
+			if (next - addr != HPAGE_PMD_SIZE) {
 				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
-				continue;
+				goto next;
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
+		/*
+		 * Here there can be other concurrent MADV_DONTNEED or
+		 * trans huge page faults running, and if the pmd is
+		 * none or trans huge it can change under us. This is
+		 * because MADV_DONTNEED holds the mmap_sem in read
+		 * mode.
+		 */
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+			goto next;
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..0a3757067631 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		next = pmd_addr_end(addr, end);
 		split_huge_page_pmd(vma->vm_mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		if (check_pte_range(vma, pmd, addr, next, nodes,
 				    flags, private))
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 			}
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else
 			mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
 			continue;
 
 		split_huge_page_pmd(walk->mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 00a962caab1a..44595a373e42 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (unlikely(pmd_trans_huge(*pmd)))
-			continue;
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
 		if (ret)
-- 
cgit v1.2.3


From cbde83e21c4fd50bfc4240408355c1e5d393063d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Wed, 21 Mar 2012 16:33:55 -0700
Subject: hugetlb: try to search again if it is really needed

Search again only if some holes may be skipped in the first pass.

[akpm@linux-foundation.org: clean up crazy compound definition]
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8ecbb4bba4b3..c20e81c3425d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -309,9 +309,10 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	struct hstate *h = hstate_file(file);
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
-	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long base = mm->mmap_base;
+	unsigned long addr = addr0;
 	unsigned long largest_hole = mm->cached_hole_size;
-	int first_time = 1;
+	unsigned long start_addr;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
@@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 		mm->free_area_cache  = base;
 	}
 try_again:
+	start_addr = mm->free_area_cache;
+
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
 		goto fail;
@@ -368,10 +371,9 @@ fail:
 	 * if hint left us with no space for the requested
 	 * mapping then try again:
 	 */
-	if (first_time) {
+	if (start_addr != base) {
 		mm->free_area_cache = base;
 		largest_hole = 0;
-		first_time = 0;
 		goto try_again;
 	}
 	/*
-- 
cgit v1.2.3


From b716ad953a2bc4a543143c1d9836b7007a4b182f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Wed, 21 Mar 2012 16:33:56 -0700
Subject: mm: search from free_area_cache for the bigger size

If the required size is bigger than cached_hole_size it is better to
search from free_area_cache - it is easier to get a free region,
specifically for the 64 bit process whose address space is large enough

Do it just as hugetlb_get_unmapped_area_topdown() in arch/x86/mm/hugetlbpage.c

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/sys_x86_64.c | 34 +++++++++++++++++-----------------
 mm/mmap.c                    | 36 +++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 051489082d59..ef59642ff1bf 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
-	unsigned long addr = addr0;
+	unsigned long addr = addr0, start_addr;
 
 	/* requested length too big for entire address space */
 	if (len > TASK_SIZE)
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		mm->free_area_cache = mm->mmap_base;
 	}
 
+try_again:
 	/* either no address requested or can't fit in requested address hole */
-	addr = mm->free_area_cache;
-
-	/* make sure it can fit in the remaining address space */
-	if (addr > len) {
-		unsigned long tmp_addr = align_addr(addr - len, filp,
-						    ALIGN_TOPDOWN);
-
-		vma = find_vma(mm, tmp_addr);
-		if (!vma || tmp_addr + len <= vma->vm_start)
-			/* remember the address as a hint for next time */
-			return mm->free_area_cache = tmp_addr;
-	}
-
-	if (mm->mmap_base < len)
-		goto bottomup;
+	start_addr = addr = mm->free_area_cache;
 
-	addr = mm->mmap_base-len;
+	if (addr < len)
+		goto fail;
 
+	addr -= len;
 	do {
 		addr = align_addr(addr, filp, ALIGN_TOPDOWN);
 
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		addr = vma->vm_start-len;
 	} while (len < vma->vm_start);
 
+fail:
+	/*
+	 * if hint left us with no space for the requested
+	 * mapping then try again:
+	 */
+	if (start_addr != mm->mmap_base) {
+		mm->free_area_cache = mm->mmap_base;
+		mm->cached_hole_size = 0;
+		goto try_again;
+	}
+
 bottomup:
 	/*
 	 * A failed mmap() very likely causes application failure,
diff --git a/mm/mmap.c b/mm/mmap.c
index 4f31764d838f..9e0c0de2e7e3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1442,7 +1442,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
-	unsigned long addr = addr0;
+	unsigned long addr = addr0, start_addr;
 
 	/* requested length too big for entire address space */
 	if (len > TASK_SIZE)
@@ -1466,22 +1466,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
  		mm->free_area_cache = mm->mmap_base;
  	}
 
+try_again:
 	/* either no address requested or can't fit in requested address hole */
-	addr = mm->free_area_cache;
+	start_addr = addr = mm->free_area_cache;
 
-	/* make sure it can fit in the remaining address space */
-	if (addr > len) {
-		vma = find_vma(mm, addr-len);
-		if (!vma || addr <= vma->vm_start)
-			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr-len);
-	}
-
-	if (mm->mmap_base < len)
-		goto bottomup;
-
-	addr = mm->mmap_base-len;
+	if (addr < len)
+		goto fail;
 
+	addr -= len;
 	do {
 		/*
 		 * Lookup failure means no vma is above this address,
@@ -1501,7 +1493,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		addr = vma->vm_start-len;
 	} while (len < vma->vm_start);
 
-bottomup:
+fail:
+	/*
+	 * if hint left us with no space for the requested
+	 * mapping then try again:
+	 *
+	 * Note: this is different with the case of bottomup
+	 * which does the fully line-search, but we use find_vma
+	 * here that causes some holes skipped.
+	 */
+	if (start_addr != mm->mmap_base) {
+		mm->free_area_cache = mm->mmap_base;
+		mm->cached_hole_size = 0;
+		goto try_again;
+	}
+
 	/*
 	 * A failed mmap() very likely causes application failure,
 	 * so fall back to the bottom-up function here. This scenario
-- 
cgit v1.2.3


From b69add218d32450d6604bc9080f6e33e19b06f5e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Wed, 21 Mar 2012 16:34:14 -0700
Subject: hugetlb: remove prev_vma from hugetlb_get_unmapped_area_topdown()

After looking up the vma which covers or follows the cached search
address, the following condition is always true:

	!prev_vma || (addr >= prev_vma->vm_end)

so we can stop checking the previous VMA altogether.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/hugetlbpage.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index c20e81c3425d..f6679a7fb8ca 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -308,7 +308,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 {
 	struct hstate *h = hstate_file(file);
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *prev_vma;
+	struct vm_area_struct *vma;
 	unsigned long base = mm->mmap_base;
 	unsigned long addr = addr0;
 	unsigned long largest_hole = mm->cached_hole_size;
@@ -340,22 +340,14 @@ try_again:
 		if (!vma)
 			return addr;
 
-		/*
-		 * new region fits between prev_vma->vm_end and
-		 * vma->vm_start, use it:
-		 */
-		prev_vma = vma->vm_prev;
-		if (addr + len <= vma->vm_start &&
-		            (!prev_vma || (addr >= prev_vma->vm_end))) {
+		if (addr + len <= vma->vm_start) {
 			/* remember the address as a hint for next time */
 		        mm->cached_hole_size = largest_hole;
 		        return (mm->free_area_cache = addr);
-		} else {
+		} else if (mm->free_area_cache == vma->vm_end) {
 			/* pull free_area_cache down to the first hole */
-		        if (mm->free_area_cache == vma->vm_end) {
-				mm->free_area_cache = vma->vm_start;
-				mm->cached_hole_size = largest_hole;
-			}
+			mm->free_area_cache = vma->vm_start;
+			mm->cached_hole_size = largest_hole;
 		}
 
 		/* remember the largest hole we saw so far */
-- 
cgit v1.2.3


From d71b5a73fe9af42752c4329b087f7911b35f8f79 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 21 Mar 2012 16:34:16 -0700
Subject: numa_emulation: fix cpumask_of_node()

Without this fix the cpumask_of_node() for a fake=numa=2 is:

    cpumask 0 ff
    cpumask 1 ff

with the fix it's correct and it's set to:

    cpumask 0 55
    cpumask 1 aa

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa_emulation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 46db56845f18..740b0a355431 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 	eb->nid = nid;
 
 	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
-		emu_nid_to_phys[nid] = pb->nid;
+		emu_nid_to_phys[nid] = nid;
 
 	pb->start += size;
 	if (pb->start >= pb->end) {
-- 
cgit v1.2.3


From 676a38046f4fba4e7418756c6f6fc25cf5976312 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Thu, 15 Mar 2012 22:11:51 +0200
Subject: crypto: camellia-x86_64 - module init/exit functions should be static

This caused conflict with twofish-x86_64-3way when compiled into kernel,
same function names and not static.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_glue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 1ca36a93fd2f..3306dc0b139e 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1925,7 +1925,7 @@ static int force;
 module_param(force, int, 0);
 MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
 
-int __init init(void)
+static int __init init(void)
 {
 	if (!force && is_blacklisted_cpu()) {
 		printk(KERN_INFO
@@ -1938,7 +1938,7 @@ int __init init(void)
 	return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
 }
 
-void __exit fini(void)
+static void __exit fini(void)
 {
 	crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
 }
-- 
cgit v1.2.3


From ff0a70fe053614e763eb3ac88bfea9c5615fce3b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Thu, 15 Mar 2012 22:11:57 +0200
Subject: crypto: twofish-x86_64-3way - module init/exit functions should be
 static

This caused conflict with camellia-x86_64 when compiled into kernel, same
function names and not static.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 408fc0c5814e..922ab24cce31 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -668,7 +668,7 @@ static int force;
 module_param(force, int, 0);
 MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
 
-int __init init(void)
+static int __init init(void)
 {
 	if (!force && is_blacklisted_cpu()) {
 		printk(KERN_INFO
@@ -681,7 +681,7 @@ int __init init(void)
 	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
 }
 
-void __exit fini(void)
+static void __exit fini(void)
 {
 	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
 }
-- 
cgit v1.2.3


From 13354dc412c36fe554f9904a92f1268c74af7e87 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@avionic-design.de>
Date: Wed, 21 Mar 2012 22:50:08 +0100
Subject: x86-32: Fix typo for mq_getsetattr in syscall table

Syscall 282 was mistakenly named mq_getsetaddr instead of mq_getsetattr.
When building uClibc against the Linux kernel this would result in a
shared library that doesn't provide the mq_getattr() and mq_setattr()
functions.

Signed-off-by: Thierry Reding <thierry.reding@avionic-design.de>
Link: http://lkml.kernel.org/r/1332366608-2695-2-git-send-email-thierry.reding@avionic-design.de
Cc: <stable@vger.kernel.org> v3.3
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/syscalls/syscall_32.tbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index ce98e287c066..e7e67cc3c14b 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -288,7 +288,7 @@
 279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
 280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
 281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify
-282	i386	mq_getsetaddr		sys_mq_getsetattr		compat_sys_mq_getsetattr
+282	i386	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
 283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load
 284	i386	waitid			sys_waitid			compat_sys_waitid
 # 285 sys_setaltroot
-- 
cgit v1.2.3


From 446e1c86d51d0823e003a43a2b85c430efce2733 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 22 Mar 2012 11:08:18 -0700
Subject: x86, boot: Correct CFLAGS for hostprogs

This is a partial revert of commit:
    d40f833 "Restrict CFLAGS for hostprogs"

The endian-manipulation macros in tools/include need <linux/types.h>,
but the hostprogs in arch/x86/boot need several headers from the
kernel build tree, which means we have to add the kernel headers to
the include path.  This picks up <linux/types.h> from the kernel tree,
which gives a warning.

Since this use of <linux/types.h> is intentional, add
-D__EXPORTED_HEADERS__ to the command line to silence the warning.

A better way to fix this would be to always install the exported
kernel headers into $(objtree)/usr/include as a standard part of the
kernel build, but that is a lot more involved.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1330436245-24875-5-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3e02148bb774..5a747dd884db 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -37,9 +37,9 @@ setup-y		+= video-bios.o
 targets		+= $(setup-y)
 hostprogs-y	:= mkcpustr tools/build
 
-HOSTCFLAGS_mkcpustr.o := -I$(srctree)/arch/$(SRCARCH)/include
-HOST_EXTRACFLAGS += -I$(objtree)/include -I$(srctree)/tools/include \
-                   -include $(srctree)/include/linux/kconfig.h
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(LINUXINCLUDE) \
+	            -D__EXPORTED_HEADERS__
+
 $(obj)/cpu.o: $(obj)/cpustr.h
 
 quiet_cmd_cpustr = CPUSTR  $@
-- 
cgit v1.2.3


From 639077fb69aec8112e5427210a83d0fb192969f0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 19 Mar 2012 15:16:48 -0500
Subject: kgdb: x86: Return all segment registers also in 64-bit mode

Even if the content is always 0, gdb expects us to return also ds,
es, fs, and gs while in x86-64 mode. Do this to avoid ugly errors on
"info registers".

[jason.wessel@windriver.com: adjust NUMREGBYTES for two new regs]
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/kgdb.h | 10 +++++++---
 arch/x86/kernel/kgdb.c      |  6 ++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 77e95f54570a..332f98c9111f 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -64,11 +64,15 @@ enum regnames {
 	GDB_PS,			/* 17 */
 	GDB_CS,			/* 18 */
 	GDB_SS,			/* 19 */
+	GDB_DS,			/* 20 */
+	GDB_ES,			/* 21 */
+	GDB_FS,			/* 22 */
+	GDB_GS,			/* 23 */
 };
 #define GDB_ORIG_AX		57
-#define DBG_MAX_REG_NUM		20
-/* 17 64 bit regs and 3 32 bit regs */
-#define NUMREGBYTES		((17 * 8) + (3 * 4))
+#define DBG_MAX_REG_NUM		24
+/* 17 64 bit regs and 5 32 bit regs */
+#define NUMREGBYTES		((17 * 8) + (5 * 4))
 #endif /* ! CONFIG_X86_32 */
 
 static inline void arch_kgdb_breakpoint(void)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba5771acad..fdc37b3d0ce3 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 	{ "ss", 4, offsetof(struct pt_regs, ss) },
 	{ "ds", 4, offsetof(struct pt_regs, ds) },
 	{ "es", 4, offsetof(struct pt_regs, es) },
-	{ "fs", 4, -1 },
-	{ "gs", 4, -1 },
 #else
 	{ "ax", 8, offsetof(struct pt_regs, ax) },
 	{ "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 	{ "flags", 4, offsetof(struct pt_regs, flags) },
 	{ "cs", 4, offsetof(struct pt_regs, cs) },
 	{ "ss", 4, offsetof(struct pt_regs, ss) },
+	{ "ds", 4, -1 },
+	{ "es", 4, -1 },
 #endif
+	{ "fs", 4, -1 },
+	{ "gs", 4, -1 },
 };
 
 int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
-- 
cgit v1.2.3


From 29a2e2836ff9ea65a603c89df217f4198973a74f Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Thu, 22 Mar 2012 21:39:25 +0100
Subject: x86-32: Fix endless loop when processing signals for kernel tasks

The problem occurs on !CONFIG_VM86 kernels [1] when a kernel-mode task
returns from a system call with a pending signal.

A real-life scenario is a child of 'khelper' returning from a failed
kernel_execve() in ____call_usermodehelper() [ kernel/kmod.c ].
kernel_execve() fails due to a pending SIGKILL, which is the result of
"kill -9 -1" (at least, busybox's init does it upon reboot).

The loop is as follows:

* syscall_exit_work:
 - work_pending:            // start_of_the_loop
 - work_notify_sig:
   - do_notify_resume()
     - do_signal()
       - if (!user_mode(regs)) return;
 - resume_userspace         // TIF_SIGPENDING is still set
 - work_pending             // so we call work_pending => goto
                            // start_of_the_loop

More information can be found in another LKML thread:
http://www.serverphorums.com/read.php?12,457826

[1] the problem was also seen on MIPS.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Link: http://lkml.kernel.org/r/1332448765.2299.68.camel@dimm
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/entry_32.S | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e68f042..7b784f4ef1e4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -98,12 +98,6 @@
 #endif
 .endm
 
-#ifdef CONFIG_VM86
-#define resume_userspace_sig	check_userspace
-#else
-#define resume_userspace_sig	resume_userspace
-#endif
-
 /*
  * User gs save/restore
  *
@@ -327,10 +321,19 @@ ret_from_exception:
 	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
-check_userspace:
+resume_userspace_sig:
+#ifdef CONFIG_VM86
 	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
 	movb PT_CS(%esp), %al
 	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+	/*
+	 * We can be coming here from a syscall done in the kernel space,
+	 * e.g. a failed kernel_execve().
+	 */
+	movl PT_CS(%esp), %eax
+	andl $SEGMENT_RPL_MASK, %eax
+#endif
 	cmpl $USER_RPL, %eax
 	jb resume_kernel		# not returning to v8086 or userspace
 
-- 
cgit v1.2.3


From c7206205d00ab375839bd6c7ddb247d600693c09 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 22 Mar 2012 17:26:36 +0100
Subject: perf: Fix mmap_page capabilities and docs

Complete the syscall-less self-profiling feature and address
all complaints, namely:

 - capabilities, so we can detect what is actually available at runtime

     Add a capabilities field to perf_event_mmap_page to indicate
     what is actually available for use.

 - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending
   properly.

 - ABI documentation as to how all this stuff works.

Also improve the documentation for the new features.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vweaver1@eecs.utk.edu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 10 ++++-
 include/linux/perf_event.h       | 83 +++++++++++++++++++++++++++++++++++-----
 kernel/events/core.c             |  4 +-
 3 files changed, 84 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 453ac9497574..4ef8104958ee 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1622,6 +1622,9 @@ static int x86_pmu_event_idx(struct perf_event *event)
 {
 	int idx = event->hw.idx;
 
+	if (!x86_pmu.attr_rdpmc)
+		return 0;
+
 	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
 		idx -= X86_PMC_IDX_FIXED;
 		idx |= 1 << 30;
@@ -1706,14 +1709,19 @@ static struct pmu pmu = {
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
-void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+	userpg->cap_usr_time = 0;
+	userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
+	userpg->pmc_width = x86_pmu.cntval_bits;
+
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		return;
 
 	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 		return;
 
+	userpg->cap_usr_time = 1;
 	userpg->time_mult = this_cpu_read(cyc2ns);
 	userpg->time_shift = CYC2NS_SCALE_FACTOR;
 	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 57ae485e80fc..ca9ed4e6a286 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -299,18 +299,31 @@ struct perf_event_mmap_page {
 	/*
 	 * Bits needed to read the hw events in user-space.
 	 *
-	 *   u32 seq;
-	 *   s64 count;
+	 *   u32 seq, time_mult, time_shift, idx, width;
+	 *   u64 count, enabled, running;
+	 *   u64 cyc, time_offset;
+	 *   s64 pmc = 0;
 	 *
 	 *   do {
 	 *     seq = pc->lock;
-	 *
 	 *     barrier()
-	 *     if (pc->index) {
-	 *       count = pmc_read(pc->index - 1);
-	 *       count += pc->offset;
-	 *     } else
-	 *       goto regular_read;
+	 *
+	 *     enabled = pc->time_enabled;
+	 *     running = pc->time_running;
+	 *
+	 *     if (pc->cap_usr_time && enabled != running) {
+	 *       cyc = rdtsc();
+	 *       time_offset = pc->time_offset;
+	 *       time_mult   = pc->time_mult;
+	 *       time_shift  = pc->time_shift;
+	 *     }
+	 *
+	 *     idx = pc->index;
+	 *     count = pc->offset;
+	 *     if (pc->cap_usr_rdpmc && idx) {
+	 *       width = pc->pmc_width;
+	 *       pmc = rdpmc(idx - 1);
+	 *     }
 	 *
 	 *     barrier();
 	 *   } while (pc->lock != seq);
@@ -323,14 +336,57 @@ struct perf_event_mmap_page {
 	__s64	offset;			/* add to hardware event value */
 	__u64	time_enabled;		/* time event active */
 	__u64	time_running;		/* time event on cpu */
-	__u32	time_mult, time_shift;
+	union {
+		__u64	capabilities;
+		__u64	cap_usr_time  : 1,
+			cap_usr_rdpmc : 1,
+			cap_____res   : 62;
+	};
+
+	/*
+	 * If cap_usr_rdpmc this field provides the bit-width of the value
+	 * read using the rdpmc() or equivalent instruction. This can be used
+	 * to sign extend the result like:
+	 *
+	 *   pmc <<= 64 - width;
+	 *   pmc >>= 64 - width; // signed shift right
+	 *   count += pmc;
+	 */
+	__u16	pmc_width;
+
+	/*
+	 * If cap_usr_time the below fields can be used to compute the time
+	 * delta since time_enabled (in ns) using rdtsc or similar.
+	 *
+	 *   u64 quot, rem;
+	 *   u64 delta;
+	 *
+	 *   quot = (cyc >> time_shift);
+	 *   rem = cyc & ((1 << time_shift) - 1);
+	 *   delta = time_offset + quot * time_mult +
+	 *              ((rem * time_mult) >> time_shift);
+	 *
+	 * Where time_offset,time_mult,time_shift and cyc are read in the
+	 * seqcount loop described above. This delta can then be added to
+	 * enabled and possible running (if idx), improving the scaling:
+	 *
+	 *   enabled += delta;
+	 *   if (idx)
+	 *     running += delta;
+	 *
+	 *   quot = count / running;
+	 *   rem  = count % running;
+	 *   count = quot * enabled + (rem * enabled) / running;
+	 */
+	__u16	time_shift;
+	__u32	time_mult;
 	__u64	time_offset;
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u64	__reserved[121];	/* align to 1k */
+	__u64	__reserved[120];	/* align to 1k */
 
 	/*
 	 * Control data for the mmap() data buffer.
@@ -347,6 +403,13 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 };
 
+/*
+ * Build time assertion that we keep the data_head at the intended location.
+ * IOW, validation we got the __reserved[] size right.
+ */
+extern char __assert_mmap_data_head_offset
+	[1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)];
+
 #define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_RECORD_MISC_KERNEL			(1 << 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c61234b1a988..dc3b05272511 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
 	*running = ctx_time - event->tstamp_running;
 }
 
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
 }
 
@@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->time_running = running +
 			atomic64_read(&event->child_total_time_running);
 
-	perf_update_user_clock(userpg, now);
+	arch_perf_update_userpage(userpg, now);
 
 	barrier();
 	++userpg->lock;
-- 
cgit v1.2.3


From 0b8b8078cb4db2ebe9a20f2b2eaeb58988be32bd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 22 Mar 2012 21:31:43 -0700
Subject: x86: Fix excessive MSR print out when show_msr is not specified

Dave found:

| During bootup, I now have 162 messages like this..
| [    0.227346]  MSR0000001b: 00000000fee00900
| [    0.227465]  MSR00000021: 0000000000000001
| [    0.227584]  MSR0000002a: 00000000c1c81400
|
| commit 21c3fcf3e39353d4f21d50e257cc74f3204b1988 looks suspect.
| It claims that it will only print these out if show_msr= is
| passed, but that doesn't seem to be the case.

Fix it by changing to the version that checks the index.

Reported-and-tested-by: Dave Jones <davej@redhat.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1332477103-4595-1-git-send-email-yinghai@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ade9c794ed98..b24032355a76 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -998,7 +998,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 	else
 		printk(KERN_CONT "\n");
 
-	__print_cpu_msr();
+	print_cpu_msr(c);
 }
 
 void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
-- 
cgit v1.2.3


From 280fb016bfb098f33df96016cfaa840db77ba2d0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 23 Mar 2012 13:12:38 +0100
Subject: x86/kconfig: Update defconfigs

Link: http://lkml.kernel.org/n/tip-ahz3d8i1vxwj0379gv4tqcru@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/configs/i386_defconfig   | 65 +++++++++++++------------------------
 arch/x86/configs/x86_64_defconfig | 68 ++++++++++++++-------------------------
 2 files changed, 47 insertions(+), 86 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 2bf18059fbea..2d562821a88f 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -15,23 +15,28 @@ CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
 CONFIG_CGROUP_SCHED=y
-CONFIG_UTS_NS=y
-CONFIG_IPC_NS=y
-CONFIG_USER_NS=y
-CONFIG_PID_NS=y
-CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_EXTRA_PASS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_SMP=y
-CONFIG_SPARSE_IRQ=y
 CONFIG_X86_GENERIC=y
 CONFIG_HPET_TIMER=y
 CONFIG_SCHED_SMT=y
@@ -51,14 +56,12 @@ CONFIG_HZ_1000=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_COMPAT_VDSO is not set
-CONFIG_PM=y
+CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
 CONFIG_PM_TRACE_RTC=y
-CONFIG_HIBERNATION=y
 CONFIG_ACPI_PROCFS=y
 CONFIG_ACPI_DOCK=y
 CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_DEBUG=y
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
@@ -69,7 +72,6 @@ CONFIG_PCI_MSI=y
 CONFIG_PCCARD=y
 CONFIG_YENTA=y
 CONFIG_HOTPLUG_PCI=y
-CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_BINFMT_MISC=y
 CONFIG_NET=y
 CONFIG_PACKET=y
@@ -120,7 +122,6 @@ CONFIG_NF_CONNTRACK_IPV4=y
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=y
 CONFIG_IP_NF_TARGET_ULOG=y
 CONFIG_NF_NAT=y
 CONFIG_IP_NF_TARGET_MASQUERADE=y
@@ -128,7 +129,6 @@ CONFIG_IP_NF_MANGLE=y
 CONFIG_NF_CONNTRACK_IPV6=y
 CONFIG_IP6_NF_IPTABLES=y
 CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_TARGET_LOG=y
 CONFIG_IP6_NF_FILTER=y
 CONFIG_IP6_NF_TARGET_REJECT=y
 CONFIG_IP6_NF_MANGLE=y
@@ -169,25 +169,21 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_NET_VENDOR_3COM=y
+CONFIG_NETCONSOLE=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
 CONFIG_NET_TULIP=y
-CONFIG_NET_PCI=y
-CONFIG_FORCEDETH=y
 CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
 CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
 CONFIG_8139TOO=y
 # CONFIG_8139TOO_PIO is not set
-CONFIG_E1000=y
-CONFIG_E1000E=y
 CONFIG_R8169=y
-CONFIG_SKY2=y
-CONFIG_TIGON3=y
-CONFIG_BNX2=y
-CONFIG_TR=y
-CONFIG_NET_PCMCIA=y
 CONFIG_FDDI=y
-CONFIG_NETCONSOLE=y
+CONFIG_TR=y
 CONFIG_INPUT_POLLDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
@@ -196,6 +192,7 @@ CONFIG_INPUT_TABLET=y
 CONFIG_INPUT_TOUCHSCREEN=y
 CONFIG_INPUT_MISC=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
@@ -205,7 +202,6 @@ CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_SERIAL_8250_DETECT_IRQ=y
 CONFIG_SERIAL_8250_RSA=y
-# CONFIG_LEGACY_PTYS is not set
 CONFIG_HW_RANDOM=y
 CONFIG_NVRAM=y
 CONFIG_HPET=y
@@ -220,7 +216,6 @@ CONFIG_DRM_I915=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 CONFIG_FB_EFI=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_VGACON_SOFT_SCROLLBACK=y
 CONFIG_LOGO=y
@@ -283,7 +278,6 @@ CONFIG_ZISOFS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
 CONFIG_NFS_FS=y
@@ -291,18 +285,6 @@ CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
 CONFIG_ROOT_NFS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_OSF_PARTITION=y
-CONFIG_AMIGA_PARTITION=y
-CONFIG_MAC_PARTITION=y
-CONFIG_BSD_DISKLABEL=y
-CONFIG_MINIX_SUBPARTITION=y
-CONFIG_SOLARIS_X86_PARTITION=y
-CONFIG_UNIXWARE_DISKLABEL=y
-CONFIG_SGI_PARTITION=y
-CONFIG_SUN_PARTITION=y
-CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_NLS_DEFAULT="utf8"
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ASCII=y
@@ -317,13 +299,12 @@ CONFIG_DEBUG_KERNEL=y
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_DEBUG_STACK_USAGE=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
 CONFIG_DEBUG_BOOT_PARAMS=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 058a35b8286c..3cf137ad2789 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,4 +1,3 @@
-CONFIG_64BIT=y
 CONFIG_EXPERIMENTAL=y
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
@@ -16,26 +15,29 @@ CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
 CONFIG_CGROUP_SCHED=y
-CONFIG_UTS_NS=y
-CONFIG_IPC_NS=y
-CONFIG_USER_NS=y
-CONFIG_PID_NS=y
-CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_EXTRA_PASS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_SMP=y
-CONFIG_SPARSE_IRQ=y
 CONFIG_CALGARY_IOMMU=y
-CONFIG_AMD_IOMMU=y
-CONFIG_AMD_IOMMU_STATS=y
 CONFIG_NR_CPUS=64
 CONFIG_SCHED_SMT=y
 CONFIG_PREEMPT_VOLUNTARY=y
@@ -53,27 +55,22 @@ CONFIG_HZ_1000=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_COMPAT_VDSO is not set
-CONFIG_PM=y
+CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
 CONFIG_PM_TRACE_RTC=y
-CONFIG_HIBERNATION=y
 CONFIG_ACPI_PROCFS=y
 CONFIG_ACPI_DOCK=y
 CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_DEBUG=y
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_ONDEMAND=y
 CONFIG_X86_ACPI_CPUFREQ=y
 CONFIG_PCI_MMCONFIG=y
-CONFIG_INTEL_IOMMU=y
-# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCCARD=y
 CONFIG_YENTA=y
 CONFIG_HOTPLUG_PCI=y
-CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
 CONFIG_BINFMT_MISC=y
 CONFIG_IA32_EMULATION=y
 CONFIG_NET=y
@@ -125,7 +122,6 @@ CONFIG_NF_CONNTRACK_IPV4=y
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=y
 CONFIG_IP_NF_TARGET_ULOG=y
 CONFIG_NF_NAT=y
 CONFIG_IP_NF_TARGET_MASQUERADE=y
@@ -133,7 +129,6 @@ CONFIG_IP_NF_MANGLE=y
 CONFIG_NF_CONNTRACK_IPV6=y
 CONFIG_IP6_NF_IPTABLES=y
 CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_TARGET_LOG=y
 CONFIG_IP6_NF_FILTER=y
 CONFIG_IP6_NF_TARGET_REJECT=y
 CONFIG_IP6_NF_MANGLE=y
@@ -172,20 +167,16 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_NET_VENDOR_3COM=y
+CONFIG_NETCONSOLE=y
+CONFIG_TIGON3=y
 CONFIG_NET_TULIP=y
-CONFIG_NET_PCI=y
-CONFIG_FORCEDETH=y
 CONFIG_E100=y
-CONFIG_8139TOO=y
 CONFIG_E1000=y
 CONFIG_SKY2=y
-CONFIG_TIGON3=y
-CONFIG_TR=y
-CONFIG_NET_PCMCIA=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
 CONFIG_FDDI=y
-CONFIG_NETCONSOLE=y
+CONFIG_TR=y
 CONFIG_INPUT_POLLDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
@@ -194,6 +185,7 @@ CONFIG_INPUT_TABLET=y
 CONFIG_INPUT_TOUCHSCREEN=y
 CONFIG_INPUT_MISC=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
@@ -203,7 +195,6 @@ CONFIG_SERIAL_8250_MANY_PORTS=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_SERIAL_8250_DETECT_IRQ=y
 CONFIG_SERIAL_8250_RSA=y
-# CONFIG_LEGACY_PTYS is not set
 CONFIG_HW_RANDOM=y
 # CONFIG_HW_RANDOM_INTEL is not set
 # CONFIG_HW_RANDOM_AMD is not set
@@ -221,7 +212,6 @@ CONFIG_DRM_I915_KMS=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
 CONFIG_FB_EFI=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_VGACON_SOFT_SCROLLBACK=y
 CONFIG_LOGO=y
@@ -268,6 +258,10 @@ CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
 CONFIG_DMADEVICES=y
 CONFIG_EEEPC_LAPTOP=y
+CONFIG_AMD_IOMMU=y
+CONFIG_AMD_IOMMU_STATS=y
+CONFIG_INTEL_IOMMU=y
+# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
 CONFIG_EFI_VARS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
@@ -284,7 +278,6 @@ CONFIG_ZISOFS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
 CONFIG_NFS_FS=y
@@ -292,18 +285,6 @@ CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
 CONFIG_ROOT_NFS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_OSF_PARTITION=y
-CONFIG_AMIGA_PARTITION=y
-CONFIG_MAC_PARTITION=y
-CONFIG_BSD_DISKLABEL=y
-CONFIG_MINIX_SUBPARTITION=y
-CONFIG_SOLARIS_X86_PARTITION=y
-CONFIG_UNIXWARE_DISKLABEL=y
-CONFIG_SGI_PARTITION=y
-CONFIG_SUN_PARTITION=y
-CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_NLS_DEFAULT="utf8"
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ASCII=y
@@ -317,13 +298,12 @@ CONFIG_DEBUG_KERNEL=y
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_DEBUG_STACK_USAGE=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
 CONFIG_DEBUG_BOOT_PARAMS=y
-- 
cgit v1.2.3


From b7157acf429e6aef690646ba964b9ebd25049ec2 Mon Sep 17 00:00:00 2001
From: Steffen Persvold <sp@numascale.com>
Date: Fri, 16 Mar 2012 20:25:35 +0100
Subject: x86/apic: Add separate apic_id_valid() functions for selected apic
 drivers

As suggested by Suresh Siddha and Yinghai Lu:

For x2apic pre-enabled systems, apic driver is set already early
through early_acpi_boot_init()/early_acpi_process_madt()/
acpi_parse_madt()/default_acpi_madt_oem_check() path so that
apic_id_valid() checking will be sufficient during MADT and SRAT
parsing.

For non-x2apic pre-enabled systems, all apic ids should be less
than 255.

This allows us to substitute the checks in
arch/x86/kernel/acpi/boot.c::acpi_parse_x2apic() and
arch/x86/mm/srat.c::acpi_numa_x2apic_affinity_init() with
apic->apic_id_valid().

In addition we can avoid feigning the x2apic cpu feature in the
NumaChip apic code.

The following apic drivers have separate apic_id_valid()
functions which will accept x2apic type IDs :

 x2apic_phys
 x2apic_cluster
 x2apic_uv_x
 apic_numachip

Signed-off-by: Steffen Persvold <sp@numascale.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Daniel J Blueman <daniel@numascale-asia.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jack Steiner <steiner@sgi.com>
Link: http://lkml.kernel.org/r/1331925935-13372-1-git-send-email-sp@numascale.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h           | 2 +-
 arch/x86/include/asm/x2apic.h         | 5 +++++
 arch/x86/kernel/acpi/boot.c           | 2 +-
 arch/x86/kernel/apic/apic_numachip.c  | 3 +--
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 arch/x86/kernel/apic/x2apic_phys.c    | 2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c    | 7 ++++++-
 arch/x86/mm/srat.c                    | 2 +-
 8 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a9371c91718c..d3eaac44860a 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -535,7 +535,7 @@ static inline unsigned int read_apic_id(void)
 
 static inline int default_apic_id_valid(int apicid)
 {
-	return x2apic_mode || (apicid < 255);
+	return (apicid < 255);
 }
 
 extern void default_setup_apic_routing(void);
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 6bf5b8e478c0..92e54abf89e0 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -18,6 +18,11 @@ static const struct cpumask *x2apic_target_cpus(void)
 	return cpu_online_mask;
 }
 
+static int x2apic_apic_id_valid(int apicid)
+{
+	return 1;
+}
+
 static int x2apic_apic_id_registered(void)
 {
 	return 1;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 406ed77216d0..0f42c2f44311 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 	 * to not preallocating memory for all NR_CPUS
 	 * when we use CPU hotplug.
 	 */
-	if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+	if (!apic->apic_id_valid(apic_id) && enabled)
 		printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 	else
 		acpi_register_lapic(apic_id, enabled);
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index d9ea5f331ac5..899803e03214 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -229,11 +229,10 @@ static int __init numachip_system_init(void)
 }
 early_initcall(numachip_system_init);
 
-static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strncmp(oem_id, "NUMASC", 6)) {
 		numachip_system = 1;
-		setup_force_cpu_cap(X86_FEATURE_X2APIC);
 		return 1;
 	}
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 9193713060a9..48f3103b3c93 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,7 +213,7 @@ static struct apic apic_x2apic_cluster = {
 	.name				= "cluster x2apic",
 	.probe				= x2apic_cluster_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
-	.apic_id_valid			= default_apic_id_valid,
+	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index bcd1db6eaca9..8a778db45e3a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -119,7 +119,7 @@ static struct apic apic_x2apic_phys = {
 	.name				= "physical x2apic",
 	.probe				= x2apic_phys_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
-	.apic_id_valid			= default_apic_id_valid,
+	.apic_id_valid			= x2apic_apic_id_valid,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index fc4771425852..87bfa69e216e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector)
 	uv_send_IPI_mask(cpu_online_mask, vector);
 }
 
+static int uv_apic_id_valid(int apicid)
+{
+	return 1;
+}
+
 static int uv_apic_id_registered(void)
 {
 	return 1;
@@ -351,7 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.name				= "UV large system",
 	.probe				= uv_probe,
 	.acpi_madt_oem_check		= uv_acpi_madt_oem_check,
-	.apic_id_valid			= default_apic_id_valid,
+	.apic_id_valid			= uv_apic_id_valid,
 	.apic_id_registered		= uv_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1c1c4f46a7c1..efb5b4b93711 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -70,7 +70,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 		return;
 	pxm = pa->proximity_domain;
 	apic_id = pa->apic_id;
-	if (!cpu_has_x2apic && (apic_id >= 0xff)) {
+	if (!apic->apic_id_valid(apic_id)) {
 		printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
 			 pxm, apic_id);
 		return;
-- 
cgit v1.2.3


From 4da7072ad6831a35a11341097ce477e18651bedd Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Tue, 20 Mar 2012 15:19:36 +0100
Subject: x86/io_apic: Move and reenable irq only when
 CONFIG_GENERIC_PENDING_IRQ=y

This patch removes dead code from certain .config variations.

When CONFIG_GENERIC_PENDING_IRQ=n irq move and reenable code is
never get executed, nor do_unmask_irq variable updates its init
value. Move the code under CONFIG_GENERIC_PENDING_IRQ macro.

Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/20120320141935.GA24806@dhcp-26-207.brq.redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/io_apic.c | 101 +++++++++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d10a66fc5a9..2c428c5d7ca3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2512,21 +2512,73 @@ static void ack_apic_edge(struct irq_data *data)
 
 atomic_t irq_mis_count;
 
-static void ack_apic_level(struct irq_data *data)
-{
-	struct irq_cfg *cfg = data->chip_data;
-	int i, do_unmask_irq = 0, irq = data->irq;
-	unsigned long v;
-
-	irq_complete_move(cfg);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		do_unmask_irq = 1;
 		mask_ioapic(cfg);
+		return true;
 	}
+	return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+				      struct irq_cfg *cfg, bool masked)
+{
+	if (unlikely(masked)) {
+		/* Only migrate the irq if the ack has been received.
+		 *
+		 * On rare occasions the broadcast level triggered ack gets
+		 * delayed going to ioapics, and if we reprogram the
+		 * vector while Remote IRR is still set the irq will never
+		 * fire again.
+		 *
+		 * To prevent this scenario we read the Remote IRR bit
+		 * of the ioapic.  This has two effects.
+		 * - On any sane system the read of the ioapic will
+		 *   flush writes (and acks) going to the ioapic from
+		 *   this cpu.
+		 * - We get to see if the ACK has actually been delivered.
+		 *
+		 * Based on failed experiments of reprogramming the
+		 * ioapic entry from outside of irq context starting
+		 * with masking the ioapic entry and then polling until
+		 * Remote IRR was clear before reprogramming the
+		 * ioapic I don't trust the Remote IRR bit to be
+		 * completey accurate.
+		 *
+		 * However there appears to be no other way to plug
+		 * this race, so if the Remote IRR bit is not
+		 * accurate and is causing problems then it is a hardware bug
+		 * and you can go talk to the chipset vendor about it.
+		 */
+		if (!io_apic_level_ack_pending(cfg))
+			irq_move_masked_irq(data);
+		unmask_ioapic(cfg);
+	}
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+	return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+				      struct irq_cfg *cfg, bool masked)
+{
+}
 #endif
 
+static void ack_apic_level(struct irq_data *data)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	int i, irq = data->irq;
+	unsigned long v;
+	bool masked;
+
+	irq_complete_move(cfg);
+	masked = ioapic_irqd_mask(data, cfg);
+
 	/*
 	 * It appears there is an erratum which affects at least version 0x11
 	 * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -2581,38 +2633,7 @@ static void ack_apic_level(struct irq_data *data)
 		eoi_ioapic_irq(irq, cfg);
 	}
 
-	/* Now we can move and renable the irq */
-	if (unlikely(do_unmask_irq)) {
-		/* Only migrate the irq if the ack has been received.
-		 *
-		 * On rare occasions the broadcast level triggered ack gets
-		 * delayed going to ioapics, and if we reprogram the
-		 * vector while Remote IRR is still set the irq will never
-		 * fire again.
-		 *
-		 * To prevent this scenario we read the Remote IRR bit
-		 * of the ioapic.  This has two effects.
-		 * - On any sane system the read of the ioapic will
-		 *   flush writes (and acks) going to the ioapic from
-		 *   this cpu.
-		 * - We get to see if the ACK has actually been delivered.
-		 *
-		 * Based on failed experiments of reprogramming the
-		 * ioapic entry from outside of irq context starting
-		 * with masking the ioapic entry and then polling until
-		 * Remote IRR was clear before reprogramming the
-		 * ioapic I don't trust the Remote IRR bit to be
-		 * completey accurate.
-		 *
-		 * However there appears to be no other way to plug
-		 * this race, so if the Remote IRR bit is not
-		 * accurate and is causing problems then it is a hardware bug
-		 * and you can go talk to the chipset vendor about it.
-		 */
-		if (!io_apic_level_ack_pending(cfg))
-			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
-	}
+	ioapic_irqd_unmask(data, cfg, masked);
 }
 
 #ifdef CONFIG_IRQ_REMAP
-- 
cgit v1.2.3


From 91ec87d57fc38c529034e853687dfb7756de5406 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@mit.edu>
Date: Thu, 22 Mar 2012 21:15:51 -0700
Subject: x86-64: Simplify and optimize vdso clock_gettime monotonic variants

We used to store the wall-to-monotonic offset and the realtime base.
It's faster to precompute the monotonic base.

This is about a 3% speedup on Sandy Bridge for CLOCK_MONOTONIC.
It's much more impressive for CLOCK_MONOTONIC_COARSE.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/include/asm/vgtod.h   | 15 +++++++++------
 arch/x86/kernel/vsyscall_64.c  | 10 +++++++++-
 arch/x86/vdso/vclock_gettime.c | 38 ++++++++------------------------------
 3 files changed, 26 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 1f007178c813..8b38be2de9e1 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -7,11 +7,6 @@
 struct vsyscall_gtod_data {
 	seqcount_t	seq;
 
-	/* open coded 'struct timespec' */
-	time_t		wall_time_sec;
-	u32		wall_time_nsec;
-
-	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
 		int vclock_mode;
 		cycle_t	cycle_last;
@@ -19,8 +14,16 @@ struct vsyscall_gtod_data {
 		u32	mult;
 		u32	shift;
 	} clock;
-	struct timespec wall_to_monotonic;
+
+	/* open coded 'struct timespec' */
+	time_t		wall_time_sec;
+	u32		wall_time_nsec;
+	u32		monotonic_time_nsec;
+	time_t		monotonic_time_sec;
+
+	struct timezone sys_tz;
 	struct timespec wall_time_coarse;
+	struct timespec monotonic_time_coarse;
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;
 
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index cdc95a707cd1..4285f1f404c2 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -84,6 +84,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 			struct clocksource *clock, u32 mult)
 {
 	write_seqcount_begin(&vsyscall_gtod_data.seq);
+	struct timespec monotonic;
 
 	/* copy vsyscall data */
 	vsyscall_gtod_data.clock.vclock_mode	= clock->archdata.vclock_mode;
@@ -91,10 +92,17 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	vsyscall_gtod_data.clock.mask		= clock->mask;
 	vsyscall_gtod_data.clock.mult		= mult;
 	vsyscall_gtod_data.clock.shift		= clock->shift;
+
 	vsyscall_gtod_data.wall_time_sec	= wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec	= wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic	= *wtm;
+
+	monotonic = timespec_add(*wall_time, *wtm);
+	vsyscall_gtod_data.monotonic_time_sec	= monotonic.tv_sec;
+	vsyscall_gtod_data.monotonic_time_nsec	= monotonic.tv_nsec;
+
 	vsyscall_gtod_data.wall_time_coarse	= __current_kernel_time();
+	vsyscall_gtod_data.monotonic_time_coarse =
+		timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
 
 	write_seqcount_end(&vsyscall_gtod_data.seq);
 }
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 944c5e5d6b6a..6eea70b8f384 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -113,27 +113,17 @@ notrace static noinline int do_realtime(struct timespec *ts)
 
 notrace static noinline int do_monotonic(struct timespec *ts)
 {
-	unsigned long seq, ns, secs;
+	unsigned long seq, ns;
 	int mode;
 
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
-		secs = gtod->wall_time_sec;
-		ns = gtod->wall_time_nsec + vgetns();
-		secs += gtod->wall_to_monotonic.tv_sec;
-		ns += gtod->wall_to_monotonic.tv_nsec;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ts->tv_nsec = gtod->monotonic_time_nsec;
+		ns = vgetns();
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-
-	/* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
-	 * are all guaranteed to be nonnegative.
-	 */
-	while (ns >= NSEC_PER_SEC) {
-		ns -= NSEC_PER_SEC;
-		++secs;
-	}
-	ts->tv_sec = secs;
-	ts->tv_nsec = ns;
+	timespec_add_ns(ts, ns);
 
 	return mode;
 }
@@ -151,25 +141,13 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts)
 
 notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 {
-	unsigned long seq, ns, secs;
+	unsigned long seq;
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
-		secs = gtod->wall_time_coarse.tv_sec;
-		ns = gtod->wall_time_coarse.tv_nsec;
-		secs += gtod->wall_to_monotonic.tv_sec;
-		ns += gtod->wall_to_monotonic.tv_nsec;
+		ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
+		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
-	/* wall_time_nsec and wall_to_monotonic.tv_nsec are
-	 * guaranteed to be between 0 and NSEC_PER_SEC.
-	 */
-	if (ns >= NSEC_PER_SEC) {
-		ns -= NSEC_PER_SEC;
-		++secs;
-	}
-	ts->tv_sec = secs;
-	ts->tv_nsec = ns;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5f293474c4c6c4dc2baaf2dfd486748b5986de76 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@mit.edu>
Date: Thu, 22 Mar 2012 21:15:52 -0700
Subject: x86-64: Inline vdso clock_gettime helpers

This is about a 3% speedup on Sandy Bridge.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/x86/vdso/vclock_gettime.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6eea70b8f384..885eff49d6ab 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -94,7 +94,8 @@ notrace static inline long vgetns(void)
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
-notrace static noinline int do_realtime(struct timespec *ts)
+/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
+notrace static int __always_inline do_realtime(struct timespec *ts)
 {
 	unsigned long seq, ns;
 	int mode;
@@ -111,7 +112,7 @@ notrace static noinline int do_realtime(struct timespec *ts)
 	return mode;
 }
 
-notrace static noinline int do_monotonic(struct timespec *ts)
+notrace static int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq, ns;
 	int mode;
@@ -128,7 +129,7 @@ notrace static noinline int do_monotonic(struct timespec *ts)
 	return mode;
 }
 
-notrace static noinline int do_realtime_coarse(struct timespec *ts)
+notrace static int do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
@@ -139,7 +140,7 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts)
 	return 0;
 }
 
-notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+notrace static int do_monotonic_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-- 
cgit v1.2.3


From 307b1cd7ecd7f3dc5ce3d3860957f034f0abe4df Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 23 Mar 2012 15:02:03 -0700
Subject: bitops: rename for_each_set_bit_cont() in favor of analogous list.h
 function

This renames for_each_set_bit_cont() to for_each_set_bit_from() because
it is analogous to list_for_each_entry_from() in list.h rather than
list_for_each_entry_continue().

This doesn't remove for_each_set_bit_cont() for now.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/perf_event.c       | 4 ++--
 include/linux/bitops.h                 | 5 ++++-
 tools/perf/util/include/linux/bitops.h | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0a18d16cb58d..fa2900c0e398 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	/* Prefer fixed purpose counters */
 	if (x86_pmu.num_counters_fixed) {
 		idx = X86_PMC_IDX_FIXED;
-		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 			if (!__test_and_set_bit(idx, sched->state.used))
 				goto done;
 		}
 	}
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
-	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
 		if (!__test_and_set_bit(idx, sched->state.used))
 			goto done;
 	}
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 94300fe46cce..a78e358f0c17 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -27,11 +27,14 @@ extern unsigned long __sw_hweight64(__u64 w);
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
 
 /* same as for_each_set_bit() but use bit as value to start with */
-#define for_each_set_bit_cont(bit, addr, size) \
+#define for_each_set_bit_from(bit, addr, size) \
 	for ((bit) = find_next_bit((addr), (size), (bit));	\
 	     (bit) < (size);					\
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
 
+#define for_each_set_bit_cont(bit, addr, size) \
+	for_each_set_bit_from(bit, addr, size)
+
 static __inline__ int get_bitmask_order(unsigned int count)
 {
 	int order;
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index 62cdee78db7b..f1584833bd22 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -15,7 +15,7 @@
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
 
 /* same as for_each_set_bit() but use bit as value to start with */
-#define for_each_set_bit_cont(bit, addr, size) \
+#define for_each_set_bit_from(bit, addr, size) \
 	for ((bit) = find_next_bit((addr), (size), (bit));	\
 	     (bit) < (size);					\
 	     (bit) = find_next_bit((addr), (size), (bit) + 1))
-- 
cgit v1.2.3


From 0b2f4d4d76a09f02fa37bfa57909483448fac771 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 23 Mar 2012 15:02:06 -0700
Subject: x86: use for_each_clear_bit_from()

Use for_each_clear_bit() to iterate over all the cleared bit in a
memory region.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/irqinit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5cddbce..43e2b1cff0a7 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -306,10 +306,10 @@ void __init native_init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+	i = FIRST_EXTERNAL_VECTOR;
+	for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
 		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-		if (!test_bit(i, used_vectors))
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+		set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
 	}
 
 	if (!acpi_ioapic && !of_ioapic)
-- 
cgit v1.2.3


From 909af768e88867016f427264ae39d27a57b6a8ed Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Fri, 23 Mar 2012 15:02:51 -0700
Subject: coredump: remove VM_ALWAYSDUMP flag

The motivation for this patchset was that I was looking at a way for a
qemu-kvm process, to exclude the guest memory from its core dump, which
can be quite large.  There are already a number of filter flags in
/proc/<pid>/coredump_filter, however, these allow one to specify 'types'
of kernel memory, not specific address ranges (which is needed in this
case).

Since there are no more vma flags available, the first patch eliminates
the need for the 'VM_ALWAYSDUMP' flag.  The flag is used internally by
the kernel to mark vdso and vsyscall pages.  However, it is simple
enough to check if a vma covers a vdso or vsyscall page without the need
for this flag.

The second patch then replaces the 'VM_ALWAYSDUMP' flag with a new
'VM_NODUMP' flag, which can be set by userspace using new madvise flags:
'MADV_DONTDUMP', and unset via 'MADV_DODUMP'.  The core dump filters
continue to work the same as before unless 'MADV_DONTDUMP' is set on the
region.

The qemu code which implements this features is at:

  http://people.redhat.com/~jbaron/qemu-dump/qemu-dump.patch

In my testing the qemu core dump shrunk from 383MB -> 13MB with this
patch.

I also believe that the 'MADV_DONTDUMP' flag might be useful for
security sensitive apps, which might want to select which areas are
dumped.

This patch:

The VM_ALWAYSDUMP flag is currently used by the coredump code to
indicate that a vma is part of a vsyscall or vdso section.  However, we
can determine if a vma is in one these sections by checking it against
the gate_vma and checking for a non-NULL return value from
arch_vma_name().  Thus, freeing a valuable vma bit.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Roland McGrath <roland@hack.frob.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/process.c          |  3 +--
 arch/hexagon/kernel/vdso.c         |  3 +--
 arch/mips/kernel/vdso.c            |  3 +--
 arch/powerpc/kernel/vdso.c         | 10 ++--------
 arch/s390/kernel/vdso.c            | 10 ++--------
 arch/sh/kernel/vsyscall/vsyscall.c |  3 +--
 arch/tile/mm/elf.c                 |  8 +-------
 arch/unicore32/kernel/process.c    |  2 +-
 arch/x86/um/mem_32.c               |  8 --------
 arch/x86/um/vdso/vma.c             |  3 +--
 arch/x86/vdso/vdso32-setup.c       | 17 ++---------------
 arch/x86/vdso/vma.c                |  3 +--
 fs/binfmt_elf.c                    | 27 +++++++++++++++++++++++++--
 include/linux/mm.h                 |  1 -
 mm/memory.c                        |  8 +-------
 15 files changed, 40 insertions(+), 69 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index c2ae3cd331fe..219e4efee1a6 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -533,8 +533,7 @@ int vectors_user_mapping(void)
 	struct mm_struct *mm = current->mm;
 	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
 				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_ALWAYSDUMP | VM_RESERVED,
+				       VM_MAYREAD | VM_MAYEXEC | VM_RESERVED,
 				       NULL);
 }
 
diff --git a/arch/hexagon/kernel/vdso.c b/arch/hexagon/kernel/vdso.c
index 16277c33308a..f212a453b527 100644
--- a/arch/hexagon/kernel/vdso.c
+++ b/arch/hexagon/kernel/vdso.c
@@ -78,8 +78,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	/* MAYWRITE to allow gdb to COW and set breakpoints. */
 	ret = install_special_mapping(mm, vdso_base, PAGE_SIZE,
 				      VM_READ|VM_EXEC|
-				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				      VM_ALWAYSDUMP,
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 				      &vdso_page);
 
 	if (ret)
diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index e5cdfd603f8f..0f1af58b036a 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -88,8 +88,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	ret = install_special_mapping(mm, addr, PAGE_SIZE,
 				      VM_READ|VM_EXEC|
-				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				      VM_ALWAYSDUMP,
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 				      &vdso_page);
 
 	if (ret)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 7d14bb697d40..d36ee1055f88 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -263,17 +263,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	 * the "data" page of the vDSO or you'll stop getting kernel updates
 	 * and your nice userland gettimeofday will be totally dead.
 	 * It's fine to use that for setting breakpoints in the vDSO code
-	 * pages though
-	 *
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
+	 * pages though.
 	 */
 	rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
 				     VM_READ|VM_EXEC|
-				     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				     VM_ALWAYSDUMP,
+				     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 				     vdso_pagelist);
 	if (rc) {
 		current->mm->context.vdso_base = 0;
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index e704a9965f90..9c80138206b0 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -241,17 +241,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	 * on the "data" page of the vDSO or you'll stop getting kernel
 	 * updates and your nice userland gettimeofday will be totally dead.
 	 * It's fine to use that for setting breakpoints in the vDSO code
-	 * pages though
-	 *
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
+	 * pages though.
 	 */
 	rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
 				     VM_READ|VM_EXEC|
-				     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				     VM_ALWAYSDUMP,
+				     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 				     vdso_pagelist);
 	if (rc)
 		current->mm->context.vdso_base = 0;
diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
index 1d6d51a1ce79..5ca579720a09 100644
--- a/arch/sh/kernel/vsyscall/vsyscall.c
+++ b/arch/sh/kernel/vsyscall/vsyscall.c
@@ -73,8 +73,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	ret = install_special_mapping(mm, addr, PAGE_SIZE,
 				      VM_READ | VM_EXEC |
-				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
-				      VM_ALWAYSDUMP,
+				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
 				      syscall_pages);
 	if (unlikely(ret))
 		goto up_fail;
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 55e58e93bfc5..1a00fb64fc88 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -117,17 +117,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 
 	/*
 	 * MAYWRITE to allow gdb to COW and set breakpoints
-	 *
-	 * Make sure the vDSO gets into every core dump.  Dumping its
-	 * contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to
-	 * see what PC values meant.
 	 */
 	vdso_base = VDSO_BASE;
 	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
 					 VM_READ|VM_EXEC|
-					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-					 VM_ALWAYSDUMP,
+					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 					 vdso_pages);
 
 #ifndef __tilegx__
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index 52edc2b62873..432b4291f37b 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -381,7 +381,7 @@ int vectors_user_mapping(void)
 	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
 				       VM_READ | VM_EXEC |
 				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_ALWAYSDUMP | VM_RESERVED,
+				       VM_RESERVED,
 				       NULL);
 }
 
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index 639900a6fde9..f40281e5d6a2 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -23,14 +23,6 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
 	gate_vma.vm_page_prot = __P101;
 
-	/*
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
-	 */
-	gate_vma.vm_flags |= VM_ALWAYSDUMP;
-
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 91f4ec9a0a56..af91901babb8 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -64,8 +64,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
 		VM_READ|VM_EXEC|
-		VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-		VM_ALWAYSDUMP,
+		VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 		vdsop);
 
 	up_write(&mm->mmap_sem);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 468d591dde31..a944020fa859 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -250,13 +250,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
 	gate_vma.vm_page_prot = __P101;
-	/*
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
-	 */
-	gate_vma.vm_flags |= VM_ALWAYSDUMP;
+
 	return 0;
 }
 
@@ -343,17 +337,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
-		 *
-		 * Make sure the vDSO gets into every core dump.
-		 * Dumping its contents makes post-mortem fully
-		 * interpretable later without matching up the same
-		 * kernel and hardware config to see what PC values
-		 * meant.
 		 */
 		ret = install_special_mapping(mm, addr, PAGE_SIZE,
 					      VM_READ|VM_EXEC|
-					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-					      VM_ALWAYSDUMP,
+					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 					      vdso32_pages);
 
 		if (ret)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c35b75..17e18279649f 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -124,8 +124,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
-				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				      VM_ALWAYSDUMP,
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
 				      vdso_pages);
 	if (ret) {
 		current->mm->context.vdso = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 81878b78c9d4..b64be5b5ac21 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1092,6 +1092,29 @@ out:
  * Jeremy Fitzhardinge <jeremy@sw.oz.au>
  */
 
+/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+	/* Any vsyscall mappings? */
+	if (vma == get_gate_vma(vma->vm_mm))
+		return true;
+	/*
+	 * arch_vma_name() returns non-NULL for special architecture mappings,
+	 * such as vDSO sections.
+	 */
+	if (arch_vma_name(vma))
+		return true;
+
+	return false;
+}
+
 /*
  * Decide what to dump of a segment, part, all or none.
  */
@@ -1100,8 +1123,8 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 {
 #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
 
-	/* The vma can be set up to tell us the answer directly.  */
-	if (vma->vm_flags & VM_ALWAYSDUMP)
+	/* always dump the vdso and vsyscall sections */
+	if (always_dump_vma(vma))
 		goto whole;
 
 	/* Hugetlb memory check */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7330742e7973..2de2ddba51d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -111,7 +111,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HUGEPAGE	0x01000000	/* MADV_HUGEPAGE marked this vma */
 #endif
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
-#define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */
 
 #define VM_CAN_NONLINEAR 0x08000000	/* Has ->fault & does nonlinear pages */
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
diff --git a/mm/memory.c b/mm/memory.c
index 3416b6e018d6..6105f475fa86 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3623,13 +3623,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
 	gate_vma.vm_page_prot = __P101;
-	/*
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
-	 */
-	gate_vma.vm_flags |= VM_ALWAYSDUMP;
+
 	return 0;
 }
 __initcall(gate_vma_init);
-- 
cgit v1.2.3


From 65c0ff4079c011232e795e62c74a0a95512b7ac3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 23 Mar 2012 14:02:55 -0700
Subject: x86: Stop recursive fault in print_context_stack after stack overflow

After printing out the first line of a stack backtrace,
print_context_stack() calls print_ftrace_graph_addr() to check
if it's making a graph of function calls, usually not the case.

But unfortunate ordering of assignments causes this to oops if
an earlier stack overflow corrupted threadinfo->task.  Reorder
to avoid that irritation.

( The fact that there was a stack overflow may often be more
  interesting than the stack that can now be shown; but
  integrating that information with this stacktrace is awkward,
  so leave it to overflow reporting. )

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20120323225648.15DD5A033B@akpm.mtv.corp.google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/dumpstack.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 4025fe4f928f..90bf130f09bc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
 			const struct stacktrace_ops *ops,
 			struct thread_info *tinfo, int *graph)
 {
-	struct task_struct *task = tinfo->task;
+	struct task_struct *task;
 	unsigned long ret_addr;
-	int index = task->curr_ret_stack;
+	int index;
 
 	if (addr != (unsigned long)return_to_handler)
 		return;
 
+	task = tinfo->task;
+	index = task->curr_ret_stack;
+
 	if (!task->ret_stack || index < *graph)
 		return;
 
-- 
cgit v1.2.3


From f5243d6de7ae232e1d81e44ae9756bbd8c988fcd Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 23 Mar 2012 16:22:50 -0700
Subject: x86/kconfig: Remove CONFIG_TR=y from the defconfigs

Remove CONFIG_TR=y from the x86 defconfigs since
token ring support is antiquated and obsolete.

( I reviewed both x86 defconfigs - I didn't come up with
  anything else that obviously should be removed. )

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/4F6D05CA.2050801@xenotime.net
[ Twiddled the changelog a bit ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/configs/i386_defconfig   | 1 -
 arch/x86/configs/x86_64_defconfig | 1 -
 2 files changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 2d562821a88f..119db67dcb03 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -183,7 +183,6 @@ CONFIG_8139TOO=y
 # CONFIG_8139TOO_PIO is not set
 CONFIG_R8169=y
 CONFIG_FDDI=y
-CONFIG_TR=y
 CONFIG_INPUT_POLLDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 3cf137ad2789..76eb2903809f 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -176,7 +176,6 @@ CONFIG_SKY2=y
 CONFIG_FORCEDETH=y
 CONFIG_8139TOO=y
 CONFIG_FDDI=y
-CONFIG_TR=y
 CONFIG_INPUT_POLLDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
-- 
cgit v1.2.3


From 68fe7b23d559763a2e19e5fc1cf7036e4aaecb10 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 Mar 2012 09:29:22 +0100
Subject: x86: vdso: Put declaration before code

Sigh, warnings are there for a reason.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: John Stultz <john.stultz@linaro.org>
---
 arch/x86/kernel/vsyscall_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 4285f1f404c2..d5c69860b524 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -83,9 +83,10 @@ void update_vsyscall_tz(void)
 void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 			struct clocksource *clock, u32 mult)
 {
-	write_seqcount_begin(&vsyscall_gtod_data.seq);
 	struct timespec monotonic;
 
+	write_seqcount_begin(&vsyscall_gtod_data.seq);
+
 	/* copy vsyscall data */
 	vsyscall_gtod_data.clock.vclock_mode	= clock->archdata.vclock_mode;
 	vsyscall_gtod_data.clock.cycle_last	= clock->cycle_last;
-- 
cgit v1.2.3


From c56334dbf7e8772ed84390bc4664427f0a7f3b25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 20 Nov 2011 17:23:39 -0500
Subject: um: merge processor_{32,64}.h a bit...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/asm/processor.h    | 10 ++++++++++
 arch/x86/um/asm/processor_32.h | 10 ----------
 arch/x86/um/asm/processor_64.h | 10 ----------
 3 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 2c32df6fe231..04f82e020f2b 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -17,6 +17,16 @@
 #define ARCH_IS_STACKGROW(address) \
        (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs))
 
+#include <asm/user.h>
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+	__asm__ __volatile__("rep;nop": : :"memory");
+}
+
+#define cpu_relax()	rep_nop()
+
 #include <asm/processor-generic.h>
 
 #endif
diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h
index 018f732704dd..6c6689e574ce 100644
--- a/arch/x86/um/asm/processor_32.h
+++ b/arch/x86/um/asm/processor_32.h
@@ -45,16 +45,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
         memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array));
 }
 
-#include <asm/user.h>
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-	__asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax()	rep_nop()
-
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter"). Stolen
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
index 61de92d916c3..4b02a8455bd1 100644
--- a/arch/x86/um/asm/processor_64.h
+++ b/arch/x86/um/asm/processor_64.h
@@ -14,14 +14,6 @@ struct arch_thread {
         struct faultinfo faultinfo;
 };
 
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-	__asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax()   rep_nop()
-
 #define INIT_ARCH_THREAD { .debugregs  		= { [ 0 ... 7 ] = 0 }, \
 			   .debugregs_seq	= 0, \
 			   .fs			= 0, \
@@ -37,8 +29,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
 	to->fs = from->fs;
 }
 
-#include <asm/user.h>
-
 #define current_text_addr() \
 	({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; })
 
-- 
cgit v1.2.3


From c2220b2a124d2fe7b0074b23680177c8e905a76c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Jan 2012 16:30:48 -0500
Subject: um: kill HOST_TASK_PID

just provide get_current_pid() to the userland side of things
instead of get_current() + manual poking in its results

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/shared/common-offsets.h | 2 --
 arch/um/include/shared/kern_util.h      | 2 +-
 arch/um/kernel/process.c                | 4 ++--
 arch/x86/um/bugs_32.c                   | 4 +---
 4 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h
index d7fe563aa7e7..40db8f71deae 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -2,8 +2,6 @@
 
 DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE);
 
-OFFSET(HOST_TASK_PID, task_struct, pid);
-
 DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE);
 DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK);
 DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT);
diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h
index 0f1483852460..00965d06d2ca 100644
--- a/arch/um/include/shared/kern_util.h
+++ b/arch/um/include/shared/kern_util.h
@@ -48,7 +48,7 @@ extern void do_uml_exitcalls(void);
  * GFP_ATOMIC.
  */
 extern int __cant_sleep(void);
-extern void *get_current(void);
+extern int get_current_pid(void);
 extern int copy_from_user_proc(void *to, void *from, int size);
 extern int cpu(void);
 extern char *uml_strdup(const char *string);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 69f24905abdc..f386d04a84a5 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -126,9 +126,9 @@ void exit_thread(void)
 {
 }
 
-void *get_current(void)
+int get_current_pid(void)
 {
-	return current;
+	return task_pid_nr(current);
 }
 
 /*
diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c
index a1fba5fb9dbe..17d88cf2c6c4 100644
--- a/arch/x86/um/bugs_32.c
+++ b/arch/x86/um/bugs_32.c
@@ -13,8 +13,6 @@
 static int host_has_cmov = 1;
 static jmp_buf cmov_test_return;
 
-#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID]))
-
 static void cmov_sigill_test_handler(int sig)
 {
 	host_has_cmov = 0;
@@ -51,7 +49,7 @@ void arch_examine_signal(int sig, struct uml_pt_regs *regs)
 	 * This is testing for a cmov (0x0f 0x4x) instruction causing a
 	 * SIGILL in init.
 	 */
-	if ((sig != SIGILL) || (TASK_PID(get_current()) != 1))
+	if ((sig != SIGILL) || (get_current_pid() != 1))
 		return;
 
 	if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) {
-- 
cgit v1.2.3


From dc5be20a6454312d395dbf07eb2218090a03ae24 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Feb 2012 05:39:56 -0500
Subject: um: most of the SUBARCH uses can be killed

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
[richard@nod.at: Re-export SUBARCH in arch/um/Makefile]
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/Makefile        | 4 ++--
 arch/um/kernel/Makefile | 2 +-
 arch/x86/Makefile.um    | 4 ----
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/Makefile b/arch/um/Makefile
index 28688e6d96d7..4c993c89d0f0 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -50,7 +50,7 @@ KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/um
 #
 # These apply to USER_CFLAGS to.
 
-KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \
+KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \
 	$(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap	\
 	-Din6addr_loopback=kernel_in6addr_loopback \
 	-Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr
@@ -99,7 +99,7 @@ KBUILD_KCONFIG := $(HOST_DIR)/um/Kconfig
 
 archheaders:
 	$(Q)$(MAKE) -C '$(srctree)' KBUILD_SRC= \
-		ARCH=$(SUBARCH) O='$(objtree)' archheaders
+		ARCH=$(HEADER_ARCH) O='$(objtree)' archheaders
 
 archprepare: include/generated/user_constants.h
 
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index bc494741b1f3..492bc4c1b62b 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -3,7 +3,7 @@
 # Licensed under the GPL
 #
 
-CPPFLAGS_vmlinux.lds := -U$(SUBARCH) -DSTART=$(LDS_START) \
+CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \
                         -DELF_ARCH=$(LDS_ELF_ARCH)        \
                         -DELF_FORMAT=$(LDS_ELF_FORMAT)
 extra-y := vmlinux.lds
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 36ddec6a41c9..4be406abeefd 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -8,15 +8,11 @@ ELF_ARCH		:= i386
 ELF_FORMAT 		:= elf32-i386
 CHECKFLAGS	+= -D__i386__
 
-ifeq ("$(origin SUBARCH)", "command line")
-ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)")
 KBUILD_CFLAGS		+= $(call cc-option,-m32)
 KBUILD_AFLAGS		+= $(call cc-option,-m32)
 LINK-y			+= $(call cc-option,-m32)
 
 export LDFLAGS
-endif
-endif
 
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
 include $(srctree)/arch/x86/Makefile_32.cpu
-- 
cgit v1.2.3


From 4c3ff74742b481eaf32d010d072b421c97fd8f08 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Feb 2012 06:15:50 -0500
Subject: um: allow SUBARCH=x86

nicked from patch by dwmw2 back in July

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index b2b54d2edf53..9926e11a772d 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -15,8 +15,8 @@ config UML_X86
 	select GENERIC_FIND_FIRST_BIT
 
 config 64BIT
-	bool
-	default SUBARCH = "x86_64"
+	bool "64-bit kernel" if SUBARCH = "x86"
+	default SUBARCH != "i386"
 
 config X86_32
 	def_bool !64BIT
-- 
cgit v1.2.3


From 90e240142bd31ff10aeda5a280a53153f4eff004 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sun, 25 Mar 2012 23:00:04 +0200
Subject: x86: Merge the x86_32 and x86_64 cpu_idle() functions

Both functions are mostly identical.
The differences are:

- x86_32's cpu_idle() makes use of check_pgt_cache(), which is a
  nop on both x86_32 and x86_64.

- x86_64's cpu_idle() uses enter/__exit_idle/(), on x86_32 these
  function are a nop.

- In contrast to x86_32, x86_64 calls rcu_idle_enter/exit() in
  the innermost loop because idle notifications need RCU.
  Calling these function on x86_32 also in the innermost loop
  does not hurt.

So we can merge both functions.

Signed-off-by: Richard Weinberger <richard@nod.at>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: paulmck@linux.vnet.ibm.com
Cc: josh@joshtriplett.org
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1332709204-22496-1-git-send-email-richard@nod.at
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/idle.h  |   1 +
 arch/x86/kernel/process.c    | 114 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/process_32.c |  58 ----------------------
 arch/x86/kernel/process_64.c | 107 ----------------------------------------
 4 files changed, 115 insertions(+), 165 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index f49253d75710..c5d1785373ed 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -14,6 +14,7 @@ void exit_idle(void);
 #else /* !CONFIG_X86_64 */
 static inline void enter_idle(void) { }
 static inline void exit_idle(void) { }
+static inline void __exit_idle(void) { }
 #endif /* CONFIG_X86_64 */
 
 void amd_e400_remove_cpu(int cpu);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 14baf78d5a1f..29309c42b9e5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -12,6 +12,9 @@
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
 #include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/tick.h>
+#include <linux/cpuidle.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
 #include <asm/cpu.h>
@@ -23,6 +26,24 @@
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
+#include <asm/nmi.h>
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU(unsigned char, is_idle);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+#endif
 
 struct kmem_cache *task_xstate_cachep;
 EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -371,6 +392,99 @@ static inline int hlt_use_halt(void)
 }
 #endif
 
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
+{
+	BUG();
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void enter_idle(void)
+{
+	percpu_write(is_idle, 1);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
+		return;
+	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+	/* idle loop has pid 0 */
+	if (current->pid)
+		return;
+	__exit_idle();
+}
+#endif
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+	/*
+	 * If we're the non-boot CPU, nothing set the stack canary up
+	 * for us.  CPU0 already has it initialized but no harm in
+	 * doing it again.  This is a good place for updating it, as
+	 * we wont ever return from this function (so the invalid
+	 * canaries already on the stack wont ever trigger).
+	 */
+	boot_init_stack_canary();
+	current_thread_info()->status |= TS_POLLING;
+
+	while (1) {
+		tick_nohz_idle_enter();
+
+		while (!need_resched()) {
+			rmb();
+
+			if (cpu_is_offline(smp_processor_id()))
+				play_dead();
+
+			/*
+			 * Idle routines should keep interrupts disabled
+			 * from here on, until they go to idle.
+			 * Otherwise, idle callbacks can misfire.
+			 */
+			local_touch_nmi();
+			local_irq_disable();
+
+			enter_idle();
+
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
+
+			/* enter_idle() needs rcu for notifiers */
+			rcu_idle_enter();
+
+			if (cpuidle_idle_call())
+				pm_idle();
+
+			rcu_idle_exit();
+			start_critical_timings();
+
+			/* In many cases the interrupt that ended idle
+			   has already called exit_idle. But some idle
+			   loops can be woken up without interrupt. */
+			__exit_idle();
+		}
+
+		tick_nohz_idle_exit();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
+}
+
 /*
  * We use this if we don't have any better
  * idle routine..
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9d7d4842bfaf..ea207c245aa4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,7 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -31,14 +30,12 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/personality.h>
-#include <linux/tick.h>
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
-#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -58,7 +55,6 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
-#include <asm/nmi.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -70,60 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
-	BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-	int cpu = smp_processor_id();
-
-	/*
-	 * If we're the non-boot CPU, nothing set the stack canary up
-	 * for us.  CPU0 already has it initialized but no harm in
-	 * doing it again.  This is a good place for updating it, as
-	 * we wont ever return from this function (so the invalid
-	 * canaries already on the stack wont ever trigger).
-	 */
-	boot_init_stack_canary();
-
-	current_thread_info()->status |= TS_POLLING;
-
-	/* endless idle loop with no priority at all */
-	while (1) {
-		tick_nohz_idle_enter();
-		rcu_idle_enter();
-		while (!need_resched()) {
-
-			check_pgt_cache();
-			rmb();
-
-			if (cpu_is_offline(cpu))
-				play_dead();
-
-			local_touch_nmi();
-			local_irq_disable();
-			/* Don't trace irqs off for idle */
-			stop_critical_timings();
-			if (cpuidle_idle_call())
-				pm_idle();
-			start_critical_timings();
-		}
-		rcu_idle_exit();
-		tick_nohz_idle_exit();
-		schedule_preempt_disabled();
-	}
-}
-
 void __show_regs(struct pt_regs *regs, int all)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 292da13fc5aa..ce5e34f2beca 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,7 +14,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -32,12 +31,10 @@
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
-#include <linux/tick.h>
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
-#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -52,114 +49,10 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
-#include <asm/nmi.h>
 
 asmlinkage extern void ret_from_fork(void);
 
 DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
-
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-	atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-
-void enter_idle(void)
-{
-	percpu_write(is_idle, 1);
-	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
-	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
-		return;
-	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
-	/* idle loop has pid 0 */
-	if (current->pid)
-		return;
-	__exit_idle();
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
-	BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-	current_thread_info()->status |= TS_POLLING;
-
-	/*
-	 * If we're the non-boot CPU, nothing set the stack canary up
-	 * for us.  CPU0 already has it initialized but no harm in
-	 * doing it again.  This is a good place for updating it, as
-	 * we wont ever return from this function (so the invalid
-	 * canaries already on the stack wont ever trigger).
-	 */
-	boot_init_stack_canary();
-
-	/* endless idle loop with no priority at all */
-	while (1) {
-		tick_nohz_idle_enter();
-		while (!need_resched()) {
-
-			rmb();
-
-			if (cpu_is_offline(smp_processor_id()))
-				play_dead();
-			/*
-			 * Idle routines should keep interrupts disabled
-			 * from here on, until they go to idle.
-			 * Otherwise, idle callbacks can misfire.
-			 */
-			local_touch_nmi();
-			local_irq_disable();
-			enter_idle();
-			/* Don't trace irqs off for idle */
-			stop_critical_timings();
-
-			/* enter_idle() needs rcu for notifiers */
-			rcu_idle_enter();
-
-			if (cpuidle_idle_call())
-				pm_idle();
-
-			rcu_idle_exit();
-			start_critical_timings();
-
-			/* In many cases the interrupt that ended idle
-			   has already called exit_idle. But some idle
-			   loops can be woken up without interrupt. */
-			__exit_idle();
-		}
-
-		tick_nohz_idle_exit();
-		schedule_preempt_disabled();
-	}
-}
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
-- 
cgit v1.2.3


From bc758133ed73d4b06952bec21da23e28e62bf3ba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 26 Mar 2012 13:16:15 +0200
Subject: sched/x86/smp: Do not enable IRQs over calibrate_delay()

We should not ever enable IRQs until we're fully set up. This opens up
a window where interrupts can hit the cpu and interrupts can do
wakeups, wakeups need state that isn't set-up yet, in particular this
cpu isn't elegible to run tasks, so if any cpu-affine task that got
created in CPU_UP_PREPARE manages to get a wakeup, its affinity mask
will get broken and we'll run into lots of 'interesting' problems.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-yaezmlbriluh166tfkgni22m@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58f78165d308..89571a0c4a49 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -219,14 +219,9 @@ static void __cpuinit smp_callin(void)
 	 * Update loops_per_jiffy in cpu_data. Previous call to
 	 * smp_store_cpu_info() stored a value that is close but not as
 	 * accurate as the value just calculated.
-	 *
-	 * Need to enable IRQs because it can take longer and then
-	 * the NMI watchdog might kill us.
 	 */
-	local_irq_enable();
 	calibrate_delay();
 	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
-	local_irq_disable();
 	pr_debug("Stack at about %p\n", &cpuid);
 
 	/*
-- 
cgit v1.2.3


From a3c8121b8724c3d496dc00201ab40e8313edcf0d Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Tue, 27 Mar 2012 16:07:40 +0100
Subject: x86/olpc: Add debugfs interface for EC commands

Add a debugfs interface for sending commands to the OLPC
Embedded Controller (EC) and reading the responses.  The EC
provides functionality for machine identification, battery and
AC control, wakeup control, etc.

Having a debugfs interface available is useful for EC
development and debugging.

Based on code by Paul Fox (who also approves of the end result).

Signed-off-by: Daniel Drake <dsd@laptop.org>
Acked-by: Paul Fox <pgf@laptop.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andres Salomon <dilinger@queued.net>
Link: http://lkml.kernel.org/r/20120327150740.667D09D401E@zog.reactivated.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/ABI/testing/debugfs-olpc | 16 ++++++
 arch/x86/platform/olpc/olpc.c          | 97 ++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 Documentation/ABI/testing/debugfs-olpc

(limited to 'arch/x86')

diff --git a/Documentation/ABI/testing/debugfs-olpc b/Documentation/ABI/testing/debugfs-olpc
new file mode 100644
index 000000000000..bd76cc6d55f9
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-olpc
@@ -0,0 +1,16 @@
+What:		/sys/kernel/debug/olpc-ec/cmd
+Date:		Dec 2011
+KernelVersion:	3.4
+Contact:	devel@lists.laptop.org
+Description:
+
+A generic interface for executing OLPC Embedded Controller commands and
+reading their responses.
+
+To execute a command, write data with the format: CC:N A A A A
+CC is the (hex) command, N is the count of expected reply bytes, and A A A A
+are optional (hex) arguments.
+
+To read the response (if any), read from the generic node after executing
+a command. Hex reply bytes will be returned, *whether or not* they came from
+the immediately previous command.
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 7cce722667b8..a4bee53c2e54 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -20,6 +20,8 @@
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/syscore_ops.h>
+#include <linux/debugfs.h>
+#include <linux/mutex.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -31,6 +33,15 @@ EXPORT_SYMBOL_GPL(olpc_platform_info);
 
 static DEFINE_SPINLOCK(ec_lock);
 
+/* debugfs interface to EC commands */
+#define EC_MAX_CMD_ARGS (5 + 1)	/* cmd byte + 5 args */
+#define EC_MAX_CMD_REPLY (8)
+
+static struct dentry *ec_debugfs_dir;
+static DEFINE_MUTEX(ec_debugfs_cmd_lock);
+static unsigned char ec_debugfs_resp[EC_MAX_CMD_REPLY];
+static unsigned int ec_debugfs_resp_bytes;
+
 /* EC event mask to be applied during suspend (defining wakeup sources). */
 static u16 ec_wakeup_mask;
 
@@ -269,6 +280,91 @@ int olpc_ec_sci_query(u16 *sci_value)
 }
 EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
 
+static ssize_t ec_debugfs_cmd_write(struct file *file, const char __user *buf,
+				    size_t size, loff_t *ppos)
+{
+	int i, m;
+	unsigned char ec_cmd[EC_MAX_CMD_ARGS];
+	unsigned int ec_cmd_int[EC_MAX_CMD_ARGS];
+	char cmdbuf[64];
+	int ec_cmd_bytes;
+
+	mutex_lock(&ec_debugfs_cmd_lock);
+
+	size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size);
+
+	m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0],
+		   &ec_debugfs_resp_bytes,
+		   &ec_cmd_int[1], &ec_cmd_int[2], &ec_cmd_int[3],
+		   &ec_cmd_int[4], &ec_cmd_int[5]);
+	if (m < 2 || ec_debugfs_resp_bytes > EC_MAX_CMD_REPLY) {
+		/* reset to prevent overflow on read */
+		ec_debugfs_resp_bytes = 0;
+
+		printk(KERN_DEBUG "olpc-ec: bad ec cmd:  "
+		       "cmd:response-count [arg1 [arg2 ...]]\n");
+		size = -EINVAL;
+		goto out;
+	}
+
+	/* convert scanf'd ints to char */
+	ec_cmd_bytes = m - 2;
+	for (i = 0; i <= ec_cmd_bytes; i++)
+		ec_cmd[i] = ec_cmd_int[i];
+
+	printk(KERN_DEBUG "olpc-ec: debugfs cmd 0x%02x with %d args "
+	       "%02x %02x %02x %02x %02x, want %d returns\n",
+	       ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], ec_cmd[3],
+	       ec_cmd[4], ec_cmd[5], ec_debugfs_resp_bytes);
+
+	olpc_ec_cmd(ec_cmd[0], (ec_cmd_bytes == 0) ? NULL : &ec_cmd[1],
+		    ec_cmd_bytes, ec_debugfs_resp, ec_debugfs_resp_bytes);
+
+	printk(KERN_DEBUG "olpc-ec: response "
+	       "%02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n",
+	       ec_debugfs_resp[0], ec_debugfs_resp[1], ec_debugfs_resp[2],
+	       ec_debugfs_resp[3], ec_debugfs_resp[4], ec_debugfs_resp[5],
+	       ec_debugfs_resp[6], ec_debugfs_resp[7], ec_debugfs_resp_bytes);
+
+out:
+	mutex_unlock(&ec_debugfs_cmd_lock);
+	return size;
+}
+
+static ssize_t ec_debugfs_cmd_read(struct file *file, char __user *buf,
+				   size_t size, loff_t *ppos)
+{
+	unsigned int i, r;
+	char *rp;
+	char respbuf[64];
+
+	mutex_lock(&ec_debugfs_cmd_lock);
+	rp = respbuf;
+	rp += sprintf(rp, "%02x", ec_debugfs_resp[0]);
+	for (i = 1; i < ec_debugfs_resp_bytes; i++)
+		rp += sprintf(rp, ", %02x", ec_debugfs_resp[i]);
+	mutex_unlock(&ec_debugfs_cmd_lock);
+	rp += sprintf(rp, "\n");
+
+	r = rp - respbuf;
+	return simple_read_from_buffer(buf, size, ppos, respbuf, r);
+}
+
+static const struct file_operations ec_debugfs_genops = {
+	.write	 = ec_debugfs_cmd_write,
+	.read	 = ec_debugfs_cmd_read,
+};
+
+static void setup_debugfs(void)
+{
+	ec_debugfs_dir = debugfs_create_dir("olpc-ec", 0);
+	if (ec_debugfs_dir == ERR_PTR(-ENODEV))
+		return;
+
+	debugfs_create_file("cmd", 0600, ec_debugfs_dir, NULL,
+			    &ec_debugfs_genops);
+}
+
 static int olpc_ec_suspend(void)
 {
 	return olpc_ec_mask_write(ec_wakeup_mask);
@@ -372,6 +468,7 @@ static int __init olpc_init(void)
 	}
 
 	register_syscore_ops(&olpc_syscore_ops);
+	setup_debugfs();
 
 	return 0;
 }
-- 
cgit v1.2.3


From 136d249ef7dbf0fefa292082cc40be1ea864cbd6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 21 Mar 2012 22:58:08 -0400
Subject: x86/ioapic: Add io_apic_ops driver layer to allow interception

Xen dom0 needs to paravirtualize IO operations to the IO APIC,
so add a io_apic_ops for it to intercept.  Do this as ops
structure because there's at least some chance that another
paravirtualized environment may want to intercept these.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: jwboyer@redhat.com
Cc: yinghai@kernel.org
Link: http://lkml.kernel.org/r/1332385090-18056-2-git-send-email-konrad.wilk@oracle.com
[ Made all the affected code easier on the eyes ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/io_apic.h |  9 +++++++
 arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 690d1cc9a877..2c4943de5150 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -21,6 +21,15 @@
 #define IO_APIC_REDIR_LEVEL_TRIGGER	(1 << 15)
 #define IO_APIC_REDIR_MASKED		(1 << 16)
 
+struct io_apic_ops {
+	void		(*init)  (void);
+	unsigned int	(*read)  (unsigned int apic, unsigned int reg);
+	void		(*write) (unsigned int apic, unsigned int reg, unsigned int value);
+	void		(*modify)(unsigned int apic, unsigned int reg, unsigned int value);
+};
+
+void __init set_io_apic_ops(const struct io_apic_ops *);
+
 /*
  * The structure of the IO-APIC:
  */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2c428c5d7ca3..e88300d8e80a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -64,9 +64,28 @@
 #include <asm/apic.h>
 
 #define __apicdebuginit(type) static type __init
+
 #define for_each_irq_pin(entry, head) \
 	for (entry = head; entry; entry = entry->next)
 
+static void		__init __ioapic_init_mappings(void);
+
+static unsigned int	__io_apic_read  (unsigned int apic, unsigned int reg);
+static void		__io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
+static void		__io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
+
+static struct io_apic_ops io_apic_ops = {
+	.init	= __ioapic_init_mappings,
+	.read	= __io_apic_read,
+	.write	= __io_apic_write,
+	.modify = __io_apic_modify,
+};
+
+void __init set_io_apic_ops(const struct io_apic_ops *ops)
+{
+	io_apic_ops = *ops;
+}
+
 /*
  *      Is the SiS APIC rmw bug present ?
  *      -1 = don't know, 0 = no, 1 = yes
@@ -294,6 +313,22 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 	irq_free_desc(at);
 }
 
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+	return io_apic_ops.read(apic, reg);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	io_apic_ops.write(apic, reg, value);
+}
+
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	io_apic_ops.modify(apic, reg, value);
+}
+
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -314,16 +349,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 	writel(vector, &io_apic->eoi);
 }
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(reg, &io_apic->index);
 	return readl(&io_apic->data);
 }
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
+
 	writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
@@ -334,7 +370,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
  *
  * Older SiS APIC requires we rewrite the index register
  */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -377,6 +413,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
 
 	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
 	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
 	return eu.entry;
 }
 
@@ -384,9 +421,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
 {
 	union entry_union eu;
 	unsigned long flags;
+
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	eu.entry = __ioapic_read_entry(apic, pin);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
 	return eu.entry;
 }
 
@@ -396,8 +435,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
  * the interrupt, and we need to make sure the entry is fully populated
  * before that happens.
  */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
 	union entry_union eu = {{0, 0}};
 
@@ -409,6 +447,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
 	unsigned long flags;
+
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	__ioapic_write_entry(apic, pin, e);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -435,8 +474,7 @@ static void ioapic_mask_entry(int apic, int pin)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int
-__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
 	struct irq_pin_list **last, *entry;
 
@@ -521,6 +559,7 @@ static void io_apic_sync(struct irq_pin_list *entry)
 	 * a dummy read from the IO-APIC
 	 */
 	struct io_apic __iomem *io_apic;
+
 	io_apic = io_apic_base(entry->apic);
 	readl(&io_apic->data);
 }
@@ -3893,6 +3932,11 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 }
 
 void __init ioapic_and_gsi_init(void)
+{
+	io_apic_ops.init();
+}
+
+static void __init __ioapic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
-- 
cgit v1.2.3


From baa676fcf8d555269bd0a5a2496782beee55824d Mon Sep 17 00:00:00 2001
From: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
Date: Tue, 27 Mar 2012 14:28:18 +0200
Subject: X86 & IA64: adapt for dma_map_ops changes

Adapt core x86 and IA64 architecture code for dma_map_ops changes: replace
alloc/free_coherent with generic alloc/free methods.

Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
Acked-by: Kyungmin Park <kyungmin.park@samsung.com>
[removed swiotlb related changes and replaced it with wrappers,
 merged with IA64 patch to avoid inter-patch dependences in intel-iommu code]
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/hp/common/sba_iommu.c     | 11 ++++++-----
 arch/ia64/include/asm/dma-mapping.h | 18 ++++++++++++------
 arch/ia64/kernel/pci-swiotlb.c      | 14 +++++++++++---
 arch/ia64/sn/pci/pci_dma.c          |  9 +++++----
 arch/x86/include/asm/dma-mapping.h  | 26 ++++++++++++++++----------
 arch/x86/kernel/amd_gart_64.c       | 11 ++++++-----
 arch/x86/kernel/pci-calgary_64.c    |  9 +++++----
 arch/x86/kernel/pci-dma.c           |  3 ++-
 arch/x86/kernel/pci-nommu.c         |  6 +++---
 arch/x86/kernel/pci-swiotlb.c       | 17 +++++++++++++----
 arch/x86/xen/pci-swiotlb-xen.c      |  4 ++--
 drivers/iommu/amd_iommu.c           | 10 ++++++----
 drivers/iommu/intel-iommu.c         |  9 +++++----
 drivers/xen/swiotlb-xen.c           |  5 +++--
 include/xen/swiotlb-xen.h           |  6 ++++--
 15 files changed, 99 insertions(+), 59 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index f5f4ef149aac..e5eb9c4b2198 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1130,7 +1130,8 @@ void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size,
  * See Documentation/DMA-API-HOWTO.txt
  */
 static void *
-sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags)
+sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   gfp_t flags, struct dma_attrs *attrs)
 {
 	struct ioc *ioc;
 	void *addr;
@@ -1192,8 +1193,8 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp
  *
  * See Documentation/DMA-API-HOWTO.txt
  */
-static void sba_free_coherent (struct device *dev, size_t size, void *vaddr,
-			       dma_addr_t dma_handle)
+static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
+			      dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
 	sba_unmap_single_attrs(dev, dma_handle, size, 0, NULL);
 	free_pages((unsigned long) vaddr, get_order(size));
@@ -2213,8 +2214,8 @@ sba_page_override(char *str)
 __setup("sbapagesize=",sba_page_override);
 
 struct dma_map_ops sba_dma_ops = {
-	.alloc_coherent		= sba_alloc_coherent,
-	.free_coherent		= sba_free_coherent,
+	.alloc			= sba_alloc_coherent,
+	.free			= sba_free_coherent,
 	.map_page		= sba_map_page,
 	.unmap_page		= sba_unmap_page,
 	.map_sg			= sba_map_sg_attrs,
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 4336d080b241..4f5e8148440d 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -23,23 +23,29 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
 				enum dma_data_direction);
 
-static inline void *dma_alloc_coherent(struct device *dev, size_t size,
-				       dma_addr_t *daddr, gfp_t gfp)
+#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
+
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+				    dma_addr_t *daddr, gfp_t gfp,
+				    struct dma_attrs *attrs)
 {
 	struct dma_map_ops *ops = platform_dma_get_ops(dev);
 	void *caddr;
 
-	caddr = ops->alloc_coherent(dev, size, daddr, gfp);
+	caddr = ops->alloc(dev, size, daddr, gfp, attrs);
 	debug_dma_alloc_coherent(dev, size, *daddr, caddr);
 	return caddr;
 }
 
-static inline void dma_free_coherent(struct device *dev, size_t size,
-				     void *caddr, dma_addr_t daddr)
+#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+				  void *caddr, dma_addr_t daddr,
+				  struct dma_attrs *attrs)
 {
 	struct dma_map_ops *ops = platform_dma_get_ops(dev);
 	debug_dma_free_coherent(dev, size, caddr, daddr);
-	ops->free_coherent(dev, size, caddr, daddr);
+	ops->free(dev, size, caddr, daddr, attrs);
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index d9485d952ed0..939260aeac98 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -15,16 +15,24 @@ int swiotlb __read_mostly;
 EXPORT_SYMBOL(swiotlb);
 
 static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size,
-					 dma_addr_t *dma_handle, gfp_t gfp)
+					 dma_addr_t *dma_handle, gfp_t gfp,
+					 struct dma_attrs *attrs)
 {
 	if (dev->coherent_dma_mask != DMA_BIT_MASK(64))
 		gfp |= GFP_DMA;
 	return swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
 }
 
+static void ia64_swiotlb_free_coherent(struct device *dev, size_t size,
+				       void *vaddr, dma_addr_t dma_addr,
+				       struct dma_attrs *attrs)
+{
+	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+}
+
 struct dma_map_ops swiotlb_dma_ops = {
-	.alloc_coherent = ia64_swiotlb_alloc_coherent,
-	.free_coherent = swiotlb_free_coherent,
+	.alloc = ia64_swiotlb_alloc_coherent,
+	.free = ia64_swiotlb_free_coherent,
 	.map_page = swiotlb_map_page,
 	.unmap_page = swiotlb_unmap_page,
 	.map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index a9d310de57da..3290d6e00c31 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -76,7 +76,8 @@ EXPORT_SYMBOL(sn_dma_set_mask);
  * more information.
  */
 static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
-				   dma_addr_t * dma_handle, gfp_t flags)
+				   dma_addr_t * dma_handle, gfp_t flags,
+				   struct dma_attrs *attrs)
 {
 	void *cpuaddr;
 	unsigned long phys_addr;
@@ -137,7 +138,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
  * any associated IOMMU mappings.
  */
 static void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
-				 dma_addr_t dma_handle)
+				 dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev);
@@ -466,8 +467,8 @@ int sn_pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size)
 }
 
 static struct dma_map_ops sn_dma_ops = {
-	.alloc_coherent		= sn_dma_alloc_coherent,
-	.free_coherent		= sn_dma_free_coherent,
+	.alloc			= sn_dma_alloc_coherent,
+	.free			= sn_dma_free_coherent,
 	.map_page		= sn_dma_map_page,
 	.unmap_page		= sn_dma_unmap_page,
 	.map_sg			= sn_dma_map_sg,
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ed3065fd6314..4b4331d71935 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -59,7 +59,8 @@ extern int dma_supported(struct device *hwdev, u64 mask);
 extern int dma_set_mask(struct device *dev, u64 mask);
 
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
-					dma_addr_t *dma_addr, gfp_t flag);
+					dma_addr_t *dma_addr, gfp_t flag,
+					struct dma_attrs *attrs);
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
@@ -111,9 +112,11 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
+#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
+
 static inline void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp)
+dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t gfp, struct dma_attrs *attrs)
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
 	void *memory;
@@ -129,18 +132,21 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	if (!is_device_dma_capable(dev))
 		return NULL;
 
-	if (!ops->alloc_coherent)
+	if (!ops->alloc)
 		return NULL;
 
-	memory = ops->alloc_coherent(dev, size, dma_handle,
-				     dma_alloc_coherent_gfp_flags(dev, gfp));
+	memory = ops->alloc(dev, size, dma_handle,
+			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
 	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
 
 	return memory;
 }
 
-static inline void dma_free_coherent(struct device *dev, size_t size,
-				     void *vaddr, dma_addr_t bus)
+#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+				  void *vaddr, dma_addr_t bus,
+				  struct dma_attrs *attrs)
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
 
@@ -150,8 +156,8 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 		return;
 
 	debug_dma_free_coherent(dev, size, vaddr, bus);
-	if (ops->free_coherent)
-		ops->free_coherent(dev, size, vaddr, bus);
+	if (ops->free)
+		ops->free(dev, size, vaddr, bus, attrs);
 }
 
 #endif
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b1e7c7f7a0af..e66311200cbd 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -477,7 +477,7 @@ error:
 /* allocate and map a coherent mapping */
 static void *
 gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
-		    gfp_t flag)
+		    gfp_t flag, struct dma_attrs *attrs)
 {
 	dma_addr_t paddr;
 	unsigned long align_mask;
@@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
 		}
 		__free_pages(page, get_order(size));
 	} else
-		return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+		return dma_generic_alloc_coherent(dev, size, dma_addr, flag,
+						  attrs);
 
 	return NULL;
 }
@@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
 /* free a coherent mapping */
 static void
 gart_free_coherent(struct device *dev, size_t size, void *vaddr,
-		   dma_addr_t dma_addr)
+		   dma_addr_t dma_addr, struct dma_attrs *attrs)
 {
 	gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
 	free_pages((unsigned long)vaddr, get_order(size));
@@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = {
 	.unmap_sg			= gart_unmap_sg,
 	.map_page			= gart_map_page,
 	.unmap_page			= gart_unmap_page,
-	.alloc_coherent			= gart_alloc_coherent,
-	.free_coherent			= gart_free_coherent,
+	.alloc				= gart_alloc_coherent,
+	.free				= gart_free_coherent,
 	.mapping_error			= gart_mapping_error,
 };
 
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b58345..07b587c5a2d2 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -431,7 +431,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
 }
 
 static void* calgary_alloc_coherent(struct device *dev, size_t size,
-	dma_addr_t *dma_handle, gfp_t flag)
+	dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs)
 {
 	void *ret = NULL;
 	dma_addr_t mapping;
@@ -464,7 +464,8 @@ error:
 }
 
 static void calgary_free_coherent(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t dma_handle)
+				  void *vaddr, dma_addr_t dma_handle,
+				  struct dma_attrs *attrs)
 {
 	unsigned int npages;
 	struct iommu_table *tbl = find_iommu_table(dev);
@@ -477,8 +478,8 @@ static void calgary_free_coherent(struct device *dev, size_t size,
 }
 
 static struct dma_map_ops calgary_dma_ops = {
-	.alloc_coherent = calgary_alloc_coherent,
-	.free_coherent = calgary_free_coherent,
+	.alloc = calgary_alloc_coherent,
+	.free = calgary_free_coherent,
 	.map_sg = calgary_map_sg,
 	.unmap_sg = calgary_unmap_sg,
 	.map_page = calgary_map_page,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21ea..75e1cc19e630 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void)
 	}
 }
 void *dma_generic_alloc_coherent(struct device *dev, size_t size,
-				 dma_addr_t *dma_addr, gfp_t flag)
+				 dma_addr_t *dma_addr, gfp_t flag,
+				 struct dma_attrs *attrs)
 {
 	unsigned long dma_mask;
 	struct page *page;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3af4af810c07..f96050685b46 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 }
 
 static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
-				dma_addr_t dma_addr)
+				dma_addr_t dma_addr, struct dma_attrs *attrs)
 {
 	free_pages((unsigned long)vaddr, get_order(size));
 }
@@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev,
 }
 
 struct dma_map_ops nommu_dma_ops = {
-	.alloc_coherent		= dma_generic_alloc_coherent,
-	.free_coherent		= nommu_free_coherent,
+	.alloc			= dma_generic_alloc_coherent,
+	.free			= nommu_free_coherent,
 	.map_sg			= nommu_map_sg,
 	.map_page		= nommu_map_page,
 	.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 8f972cbddef0..6c483ba98b9c 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -15,21 +15,30 @@
 int swiotlb __read_mostly;
 
 static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
-					dma_addr_t *dma_handle, gfp_t flags)
+					dma_addr_t *dma_handle, gfp_t flags,
+					struct dma_attrs *attrs)
 {
 	void *vaddr;
 
-	vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
+	vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
+					   attrs);
 	if (vaddr)
 		return vaddr;
 
 	return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
 }
 
+static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+				      void *vaddr, dma_addr_t dma_addr,
+				      struct dma_attrs *attrs)
+{
+	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+}
+
 static struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
-	.alloc_coherent = x86_swiotlb_alloc_coherent,
-	.free_coherent = swiotlb_free_coherent,
+	.alloc = x86_swiotlb_alloc_coherent,
+	.free = x86_swiotlb_free_coherent,
 	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = swiotlb_sync_single_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index b480d4207a4c..967633ad98c4 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -12,8 +12,8 @@ int xen_swiotlb __read_mostly;
 
 static struct dma_map_ops xen_swiotlb_dma_ops = {
 	.mapping_error = xen_swiotlb_dma_mapping_error,
-	.alloc_coherent = xen_swiotlb_alloc_coherent,
-	.free_coherent = xen_swiotlb_free_coherent,
+	.alloc = xen_swiotlb_alloc_coherent,
+	.free = xen_swiotlb_free_coherent,
 	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
 	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f75e0608be5b..daa333f97b78 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2707,7 +2707,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
  * The exported alloc_coherent function for dma_ops.
  */
 static void *alloc_coherent(struct device *dev, size_t size,
-			    dma_addr_t *dma_addr, gfp_t flag)
+			    dma_addr_t *dma_addr, gfp_t flag,
+			    struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	void *virt_addr;
@@ -2765,7 +2766,8 @@ out_free:
  * The exported free_coherent function for dma_ops.
  */
 static void free_coherent(struct device *dev, size_t size,
-			  void *virt_addr, dma_addr_t dma_addr)
+			  void *virt_addr, dma_addr_t dma_addr,
+			  struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct protection_domain *domain;
@@ -2846,8 +2848,8 @@ static void prealloc_protection_domains(void)
 }
 
 static struct dma_map_ops amd_iommu_dma_ops = {
-	.alloc_coherent = alloc_coherent,
-	.free_coherent = free_coherent,
+	.alloc = alloc_coherent,
+	.free = free_coherent,
 	.map_page = map_page,
 	.unmap_page = unmap_page,
 	.map_sg = map_sg,
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c9c6053198d4..e39bfdc055c3 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2938,7 +2938,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 }
 
 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
-				  dma_addr_t *dma_handle, gfp_t flags)
+				  dma_addr_t *dma_handle, gfp_t flags,
+				  struct dma_attrs *attrs)
 {
 	void *vaddr;
 	int order;
@@ -2970,7 +2971,7 @@ static void *intel_alloc_coherent(struct device *hwdev, size_t size,
 }
 
 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
-				dma_addr_t dma_handle)
+				dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
 	int order;
 
@@ -3115,8 +3116,8 @@ static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
 }
 
 struct dma_map_ops intel_dma_ops = {
-	.alloc_coherent = intel_alloc_coherent,
-	.free_coherent = intel_free_coherent,
+	.alloc = intel_alloc_coherent,
+	.free = intel_free_coherent,
 	.map_sg = intel_map_sg,
 	.unmap_sg = intel_unmap_sg,
 	.map_page = intel_map_page,
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 19e6a2041371..1afb4fba11b4 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -204,7 +204,8 @@ error:
 
 void *
 xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flags)
+			   dma_addr_t *dma_handle, gfp_t flags,
+			   struct dma_attrs *attrs)
 {
 	void *ret;
 	int order = get_order(size);
@@ -253,7 +254,7 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
 
 void
 xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
-			  dma_addr_t dev_addr)
+			  dma_addr_t dev_addr, struct dma_attrs *attrs)
 {
 	int order = get_order(size);
 	phys_addr_t phys;
diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h
index 2ea2fdc79c16..4f4d449f00f6 100644
--- a/include/xen/swiotlb-xen.h
+++ b/include/xen/swiotlb-xen.h
@@ -7,11 +7,13 @@ extern void xen_swiotlb_init(int verbose);
 
 extern void
 *xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
-			    dma_addr_t *dma_handle, gfp_t flags);
+			    dma_addr_t *dma_handle, gfp_t flags,
+			    struct dma_attrs *attrs);
 
 extern void
 xen_swiotlb_free_coherent(struct device *hwdev, size_t size,
-			  void *vaddr, dma_addr_t dma_handle);
+			  void *vaddr, dma_addr_t dma_handle,
+			  struct dma_attrs *attrs);
 
 extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 				       unsigned long offset, size_t size,
-- 
cgit v1.2.3


From 8f0750f19789cf352d7e24a6cc50f2ab1b4f1372 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 24 Mar 2012 10:52:50 +0300
Subject: x86, tls: Off by one limit check

These are used as offsets into an array of GDT_ENTRY_TLS_ENTRIES members
so GDT_ENTRY_TLS_ENTRIES is one past the end of the array.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: http://lkml.kernel.org/r/20120324075250.GA28258@elgon.mountain
Cc: <stable@vger.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/tls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e70..bcfec2d23769 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -163,7 +163,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
 {
 	const struct desc_struct *tls;
 
-	if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+	if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
 	    (pos % sizeof(struct user_desc)) != 0 ||
 	    (count % sizeof(struct user_desc)) != 0)
 		return -EINVAL;
@@ -198,7 +198,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
 	struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
 	const struct user_desc *info;
 
-	if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+	if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
 	    (pos % sizeof(struct user_desc)) != 0 ||
 	    (count % sizeof(struct user_desc)) != 0)
 		return -EINVAL;
-- 
cgit v1.2.3


From f05e798ad4c09255f590f5b2c00a7ca6c172f983 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 28 Mar 2012 18:11:12 +0100
Subject: Disintegrate asm/system.h for X86

Disintegrate asm/system.h for X86.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
cc: x86@kernel.org
---
 arch/x86/ia32/ia32_aout.c                |   1 -
 arch/x86/include/asm/apic.h              |   1 -
 arch/x86/include/asm/auxvec.h            |   7 +
 arch/x86/include/asm/barrier.h           | 116 +++++++
 arch/x86/include/asm/bug.h               |   4 +
 arch/x86/include/asm/cacheflush.h        |   1 +
 arch/x86/include/asm/elf.h               |   1 -
 arch/x86/include/asm/exec.h              |   1 +
 arch/x86/include/asm/futex.h             |   1 -
 arch/x86/include/asm/i387.h              |   1 -
 arch/x86/include/asm/local.h             |   1 -
 arch/x86/include/asm/mc146818rtc.h       |   1 -
 arch/x86/include/asm/processor.h         |  31 +-
 arch/x86/include/asm/segment.h           |  58 +++-
 arch/x86/include/asm/special_insns.h     | 199 ++++++++++++
 arch/x86/include/asm/stackprotector.h    |   1 -
 arch/x86/include/asm/switch_to.h         | 129 ++++++++
 arch/x86/include/asm/system.h            | 527 +------------------------------
 arch/x86/include/asm/tlbflush.h          |   2 +-
 arch/x86/include/asm/virtext.h           |   1 -
 arch/x86/kernel/acpi/cstate.c            |   1 +
 arch/x86/kernel/apm_32.c                 |   1 -
 arch/x86/kernel/cpu/mcheck/p5.c          |   1 -
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   1 -
 arch/x86/kernel/cpu/mcheck/winchip.c     |   1 -
 arch/x86/kernel/cpu/mtrr/generic.c       |   1 -
 arch/x86/kernel/cpuid.c                  |   1 -
 arch/x86/kernel/i8259.c                  |   1 -
 arch/x86/kernel/irqinit.c                |   1 -
 arch/x86/kernel/kgdb.c                   |   1 -
 arch/x86/kernel/ldt.c                    |   1 -
 arch/x86/kernel/machine_kexec_32.c       |   1 -
 arch/x86/kernel/mca_32.c                 |   1 -
 arch/x86/kernel/module.c                 |   1 -
 arch/x86/kernel/msr.c                    |   1 -
 arch/x86/kernel/paravirt.c               |   1 +
 arch/x86/kernel/pci-calgary_64.c         |   1 -
 arch/x86/kernel/process.c                |   1 -
 arch/x86/kernel/process_32.c             |   2 +-
 arch/x86/kernel/process_64.c             |   2 +-
 arch/x86/kernel/ptrace.c                 |   1 -
 arch/x86/kernel/setup.c                  |   1 -
 arch/x86/kernel/tce_64.c                 |   1 +
 arch/x86/kernel/tls.c                    |   1 -
 arch/x86/kernel/traps.c                  |   1 -
 arch/x86/mm/init.c                       |   1 -
 arch/x86/mm/init_32.c                    |   1 -
 arch/x86/mm/init_64.c                    |   1 -
 arch/x86/mm/pgtable_32.c                 |   1 -
 arch/x86/power/hibernate_32.c            |   1 -
 50 files changed, 554 insertions(+), 562 deletions(-)
 create mode 100644 arch/x86/include/asm/barrier.h
 create mode 100644 arch/x86/include/asm/exec.h
 create mode 100644 arch/x86/include/asm/special_insns.h
 create mode 100644 arch/x86/include/asm/switch_to.h

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 4c2e59a420b9..d511d951a052 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/jiffies.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a9371c91718c..4b2caeefe1a2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -11,7 +11,6 @@
 #include <linux/atomic.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
-#include <asm/system.h>
 #include <asm/msr.h>
 
 #define ARCH_APICTIMER_STOPS_ON_C3	1
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
index 1316b4c35425..77203ac352de 100644
--- a/arch/x86/include/asm/auxvec.h
+++ b/arch/x86/include/asm/auxvec.h
@@ -9,4 +9,11 @@
 #endif
 #define AT_SYSINFO_EHDR		33
 
+/* entries in ARCH_DLINFO: */
+#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
+# define AT_VECTOR_SIZE_ARCH 2
+#else /* else it's non-compat x86-64 */
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
 #endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
new file mode 100644
index 000000000000..c6cd358a1eec
--- /dev/null
+++ b/arch/x86/include/asm/barrier.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_BARRIER_H
+#define _ASM_X86_BARRIER_H
+
+#include <asm/alternative.h>
+#include <asm/nops.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() 	asm volatile("mfence":::"memory")
+#define rmb()	asm volatile("lfence":::"memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data return by
+ * any of the preceding reads.  This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	b = 2;
+ *	memory_barrier();
+ *	p = &b;				q = p;
+ *					read_barrier_depends();
+ *					d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	a = 2;
+ *	memory_barrier();
+ *	b = 3;				y = b;
+ *					read_barrier_depends();
+ *					x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends()	do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb()	rmb()
+#else
+# define smp_rmb()	barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() 	wmb()
+#else
+# define smp_wmb()	barrier()
+#endif
+#define smp_read_barrier_depends()	read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#define smp_read_barrier_depends()	do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static __always_inline void rdtsc_barrier(void)
+{
+	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
+#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index f654d1bb17fb..11e1152222d0 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -36,4 +36,8 @@ do {								\
 #endif /* !CONFIG_BUG */
 
 #include <asm-generic/bug.h>
+
+
+extern void show_regs_common(void);
+
 #endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 4e12668711e5..9863ee3747da 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -3,6 +3,7 @@
 
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
+#include <asm/special_insns.h>
 
 #ifdef CONFIG_X86_PAT
 /*
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 5f962df30d0f..f27f79abe021 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -84,7 +84,6 @@ extern unsigned int vdso_enabled;
 	(((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
 
 #include <asm/processor.h>
-#include <asm/system.h>
 
 #ifdef CONFIG_X86_32
 #include <asm/desc.h>
diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h
new file mode 100644
index 000000000000..54c2e1db274a
--- /dev/null
+++ b/arch/x86/include/asm/exec.h
@@ -0,0 +1 @@
+/* define arch_align_stack() here */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index d09bb03653f0..71ecbcba1a4e 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -9,7 +9,6 @@
 #include <asm/asm.h>
 #include <asm/errno.h>
 #include <asm/processor.h>
-#include <asm/system.h>
 
 #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg)	\
 	asm volatile("1:\t" insn "\n"				\
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 7ce0798b1b26..257d9cca214f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -14,7 +14,6 @@
 
 #include <linux/sched.h>
 #include <linux/hardirq.h>
-#include <asm/system.h>
 
 struct pt_regs;
 struct user_i387_struct;
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 9cdae5d47e8f..c8bed0da434a 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -3,7 +3,6 @@
 
 #include <linux/percpu.h>
 
-#include <asm/system.h>
 #include <linux/atomic.h>
 #include <asm/asm.h>
 
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 0e8e85bb7c51..d354fb781c57 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -5,7 +5,6 @@
 #define _ASM_X86_MC146818RTC_H
 
 #include <asm/io.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <linux/mc146818rtc.h>
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 95da14f7ee85..78e30ea492b2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -14,13 +14,13 @@ struct mm_struct;
 #include <asm/sigcontext.h>
 #include <asm/current.h>
 #include <asm/cpufeature.h>
-#include <asm/system.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 #include <asm/percpu.h>
 #include <asm/msr.h>
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
+#include <asm/special_insns.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -29,6 +29,15 @@ struct mm_struct;
 #include <linux/math64.h>
 #include <linux/init.h>
 #include <linux/err.h>
+#include <linux/irqflags.h>
+
+/*
+ * We handle most unaligned accesses in hardware.  On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN	0
 
 #define HBP_NUM 4
 /*
@@ -1022,4 +1031,24 @@ extern bool cpu_has_amd_erratum(const int *);
 #define cpu_has_amd_erratum(x)	(false)
 #endif /* CONFIG_CPU_SUP_AMD */
 
+#ifdef CONFIG_X86_32
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#endif
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+bool set_pm_idle_to_default(void);
+
+void stop_this_cpu(void *dummy);
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5e641715c3fe..165466233ab0 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -212,7 +212,61 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
-#endif
-#endif
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value)						\
+do {									\
+	unsigned short __val = (value);					\
+									\
+	asm volatile("						\n"	\
+		     "1:	movl %k0,%%" #seg "		\n"	\
+									\
+		     ".section .fixup,\"ax\"			\n"	\
+		     "2:	xorl %k0,%k0			\n"	\
+		     "		jmp 1b				\n"	\
+		     ".previous					\n"	\
+									\
+		     _ASM_EXTABLE(1b, 2b)				\
+									\
+		     : "+r" (__val) : : "memory");			\
+} while (0)
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value)				\
+	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+/*
+ * x86_32 user gs accessors.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
+#define get_user_gs(regs)	(u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk)	((tsk)->thread.gs)
+#define lazy_save_gs(v)		savesegment(gs, (v))
+#define lazy_load_gs(v)		loadsegment(gs, (v))
+#else	/* X86_32_LAZY_GS */
+#define get_user_gs(regs)	(u16)((regs)->gs)
+#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v)		do { } while (0)
+#define lazy_load_gs(v)		do { } while (0)
+#endif	/* X86_32_LAZY_GS */
+#endif	/* X86_32 */
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+	unsigned long __limit;
+	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+	return __limit + 1;
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
new file mode 100644
index 000000000000..41fc93a2e225
--- /dev/null
+++ b/arch/x86/include/asm/special_insns.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+
+#ifdef __KERNEL__
+
+static inline void native_clts(void)
+{
+	asm volatile("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+	asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+	asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+	unsigned long val;
+	/* This could fault if %cr4 does not exist. In x86_64, a cr4 always
+	 * exists, so it will never fail. */
+#ifdef CONFIG_X86_32
+	asm volatile("1: mov %%cr4, %0\n"
+		     "2:\n"
+		     _ASM_EXTABLE(1b, 2b)
+		     : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+	val = native_read_cr4();
+#endif
+	return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+	asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+	unsigned long cr8;
+	asm volatile("movq %%cr8,%0" : "=r" (cr8));
+	return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+	asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+
+extern void native_load_gs_index(unsigned);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+
+static inline unsigned long read_cr0(void)
+{
+	return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+	native_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+	return native_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+	native_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+	return native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+	native_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+	return native_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+	return native_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+	native_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+	native_wbinvd();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long read_cr8(void)
+{
+	return native_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+	native_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+	native_load_gs_index(selector);
+}
+
+#endif
+
+/* Clear the 'TS' bit */
+static inline void clts(void)
+{
+	native_clts();
+}
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(read_cr0() | X86_CR0_TS)
+
+static inline void clflush(volatile void *__p)
+{
+	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 157517763565..b5d9533d2c38 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -38,7 +38,6 @@
 #include <asm/tsc.h>
 #include <asm/processor.h>
 #include <asm/percpu.h>
-#include <asm/system.h>
 #include <asm/desc.h>
 #include <linux/random.h>
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
new file mode 100644
index 000000000000..4ec45b3abba1
--- /dev/null
+++ b/arch/x86/include/asm/switch_to.h
@@ -0,0 +1,129 @@
+#ifndef _ASM_X86_SWITCH_TO_H
+#define _ASM_X86_SWITCH_TO_H
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+				struct task_struct *next);
+struct tss_struct;
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+		      struct tss_struct *tss);
+
+#ifdef CONFIG_X86_32
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary							\
+	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
+	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
+#define __switch_canary_oparam						\
+	, [stack_canary] "=m" (stack_canary.canary)
+#define __switch_canary_iparam						\
+	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else	/* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif	/* CC_STACKPROTECTOR */
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last)					\
+do {									\
+	/*								\
+	 * Context-switching clobbers all registers, so we clobber	\
+	 * them explicitly, via unused output variables.		\
+	 * (EAX and EBP is not listed because EBP is saved/restored	\
+	 * explicitly for wchan access and EAX is the return value of	\
+	 * __switch_to())						\
+	 */								\
+	unsigned long ebx, ecx, edx, esi, edi;				\
+									\
+	asm volatile("pushfl\n\t"		/* save    flags */	\
+		     "pushl %%ebp\n\t"		/* save    EBP   */	\
+		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
+		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
+		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
+		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
+		     __switch_canary					\
+		     "jmp __switch_to\n"	/* regparm call  */	\
+		     "1:\t"						\
+		     "popl %%ebp\n\t"		/* restore EBP   */	\
+		     "popfl\n"			/* restore flags */	\
+									\
+		     /* output parameters */				\
+		     : [prev_sp] "=m" (prev->thread.sp),		\
+		       [prev_ip] "=m" (prev->thread.ip),		\
+		       "=a" (last),					\
+									\
+		       /* clobbered output registers: */		\
+		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
+		       "=S" (esi), "=D" (edi)				\
+		       							\
+		       __switch_canary_oparam				\
+									\
+		       /* input parameters: */				\
+		     : [next_sp]  "m" (next->thread.sp),		\
+		       [next_ip]  "m" (next->thread.ip),		\
+		       							\
+		       /* regparm parameters for __switch_to(): */	\
+		       [prev]     "a" (prev),				\
+		       [next]     "d" (next)				\
+									\
+		       __switch_canary_iparam				\
+									\
+		     : /* reloaded segment registers */			\
+			"memory");					\
+} while (0)
+
+#else /* CONFIG_X86_32 */
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER  \
+	, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+	  "r12", "r13", "r14", "r15"
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary							  \
+	"movq %P[task_canary](%%rsi),%%r8\n\t"				  \
+	"movq %%r8,"__percpu_arg([gs_canary])"\n\t"
+#define __switch_canary_oparam						  \
+	, [gs_canary] "=m" (irq_stack_union.stack_canary)
+#define __switch_canary_iparam						  \
+	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else	/* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif	/* CC_STACKPROTECTOR */
+
+/* Save restore flags to clear handle leaking NT */
+#define switch_to(prev, next, last) \
+	asm volatile(SAVE_CONTEXT					  \
+	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
+	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
+	     "call __switch_to\n\t"					  \
+	     "movq "__percpu_arg([current_task])",%%rsi\n\t"		  \
+	     __switch_canary						  \
+	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
+	     "movq %%rax,%%rdi\n\t" 					  \
+	     "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"		  \
+	     "jnz   ret_from_fork\n\t"					  \
+	     RESTORE_CONTEXT						  \
+	     : "=a" (last)					  	  \
+	       __switch_canary_oparam					  \
+	     : [next] "S" (next), [prev] "D" (prev),			  \
+	       [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \
+	       [_tif_fork] "i" (_TIF_FORK),			  	  \
+	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
+	       [current_task] "m" (current_task)			  \
+	       __switch_canary_iparam					  \
+	     : "memory", "cc" __EXTRA_CLOBBER)
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 2d2f01ce6dcb..0d84f9e42fde 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -1,523 +1,6 @@
-#ifndef _ASM_X86_SYSTEM_H
-#define _ASM_X86_SYSTEM_H
-
-#include <asm/asm.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
+/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
+#include <asm/barrier.h>
 #include <asm/cmpxchg.h>
-#include <asm/nops.h>
-
-#include <linux/kernel.h>
-#include <linux/irqflags.h>
-
-/* entries in ARCH_DLINFO: */
-#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
-#else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-struct task_struct; /* one of the stranger aspects of C forward declarations */
-struct task_struct *__switch_to(struct task_struct *prev,
-				struct task_struct *next);
-struct tss_struct;
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
-		      struct tss_struct *tss);
-extern void show_regs_common(void);
-
-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary							\
-	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
-	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam						\
-	, [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam						\
-	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else	/* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif	/* CC_STACKPROTECTOR */
-
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
-#define switch_to(prev, next, last)					\
-do {									\
-	/*								\
-	 * Context-switching clobbers all registers, so we clobber	\
-	 * them explicitly, via unused output variables.		\
-	 * (EAX and EBP is not listed because EBP is saved/restored	\
-	 * explicitly for wchan access and EAX is the return value of	\
-	 * __switch_to())						\
-	 */								\
-	unsigned long ebx, ecx, edx, esi, edi;				\
-									\
-	asm volatile("pushfl\n\t"		/* save    flags */	\
-		     "pushl %%ebp\n\t"		/* save    EBP   */	\
-		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
-		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
-		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
-		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
-		     __switch_canary					\
-		     "jmp __switch_to\n"	/* regparm call  */	\
-		     "1:\t"						\
-		     "popl %%ebp\n\t"		/* restore EBP   */	\
-		     "popfl\n"			/* restore flags */	\
-									\
-		     /* output parameters */				\
-		     : [prev_sp] "=m" (prev->thread.sp),		\
-		       [prev_ip] "=m" (prev->thread.ip),		\
-		       "=a" (last),					\
-									\
-		       /* clobbered output registers: */		\
-		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
-		       "=S" (esi), "=D" (edi)				\
-		       							\
-		       __switch_canary_oparam				\
-									\
-		       /* input parameters: */				\
-		     : [next_sp]  "m" (next->thread.sp),		\
-		       [next_ip]  "m" (next->thread.ip),		\
-		       							\
-		       /* regparm parameters for __switch_to(): */	\
-		       [prev]     "a" (prev),				\
-		       [next]     "d" (next)				\
-									\
-		       __switch_canary_iparam				\
-									\
-		     : /* reloaded segment registers */			\
-			"memory");					\
-} while (0)
-
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-#else
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
-
-#define __EXTRA_CLOBBER  \
-	, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
-	  "r12", "r13", "r14", "r15"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary							  \
-	"movq %P[task_canary](%%rsi),%%r8\n\t"				  \
-	"movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam						  \
-	, [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam						  \
-	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else	/* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif	/* CC_STACKPROTECTOR */
-
-/* Save restore flags to clear handle leaking NT */
-#define switch_to(prev, next, last) \
-	asm volatile(SAVE_CONTEXT					  \
-	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
-	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
-	     "call __switch_to\n\t"					  \
-	     "movq "__percpu_arg([current_task])",%%rsi\n\t"		  \
-	     __switch_canary						  \
-	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
-	     "movq %%rax,%%rdi\n\t" 					  \
-	     "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"		  \
-	     "jnz   ret_from_fork\n\t"					  \
-	     RESTORE_CONTEXT						  \
-	     : "=a" (last)					  	  \
-	       __switch_canary_oparam					  \
-	     : [next] "S" (next), [prev] "D" (prev),			  \
-	       [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
-	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \
-	       [_tif_fork] "i" (_TIF_FORK),			  	  \
-	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-	       [current_task] "m" (current_task)			  \
-	       __switch_canary_iparam					  \
-	     : "memory", "cc" __EXTRA_CLOBBER)
-#endif
-
-#ifdef __KERNEL__
-
-extern void native_load_gs_index(unsigned);
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg, value)						\
-do {									\
-	unsigned short __val = (value);					\
-									\
-	asm volatile("						\n"	\
-		     "1:	movl %k0,%%" #seg "		\n"	\
-									\
-		     ".section .fixup,\"ax\"			\n"	\
-		     "2:	xorl %k0,%k0			\n"	\
-		     "		jmp 1b				\n"	\
-		     ".previous					\n"	\
-									\
-		     _ASM_EXTABLE(1b, 2b)				\
-									\
-		     : "+r" (__val) : : "memory");			\
-} while (0)
-
-/*
- * Save a segment register away
- */
-#define savesegment(seg, value)				\
-	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-
-/*
- * x86_32 user gs accessors.
- */
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs)	(u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk)	((tsk)->thread.gs)
-#define lazy_save_gs(v)		savesegment(gs, (v))
-#define lazy_load_gs(v)		loadsegment(gs, (v))
-#else	/* X86_32_LAZY_GS */
-#define get_user_gs(regs)	(u16)((regs)->gs)
-#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v)		do { } while (0)
-#define lazy_load_gs(v)		do { } while (0)
-#endif	/* X86_32_LAZY_GS */
-#endif	/* X86_32 */
-
-static inline unsigned long get_limit(unsigned long segment)
-{
-	unsigned long __limit;
-	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
-	return __limit + 1;
-}
-
-static inline void native_clts(void)
-{
-	asm volatile("clts");
-}
-
-/*
- * Volatile isn't enough to prevent the compiler from reordering the
- * read/write functions for the control registers and messing everything up.
- * A memory clobber would solve the problem, but would prevent reordering of
- * all loads stores around it, which can hurt performance. Solution is to
- * use a variable and mimic reads and writes to it to enforce serialization
- */
-static unsigned long __force_order;
-
-static inline unsigned long native_read_cr0(void)
-{
-	unsigned long val;
-	asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
-	return val;
-}
-
-static inline void native_write_cr0(unsigned long val)
-{
-	asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr2(void)
-{
-	unsigned long val;
-	asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
-	return val;
-}
-
-static inline void native_write_cr2(unsigned long val)
-{
-	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr3(void)
-{
-	unsigned long val;
-	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
-	return val;
-}
-
-static inline void native_write_cr3(unsigned long val)
-{
-	asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr4(void)
-{
-	unsigned long val;
-	asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
-	return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
-	unsigned long val;
-	/* This could fault if %cr4 does not exist. In x86_64, a cr4 always
-	 * exists, so it will never fail. */
-#ifdef CONFIG_X86_32
-	asm volatile("1: mov %%cr4, %0\n"
-		     "2:\n"
-		     _ASM_EXTABLE(1b, 2b)
-		     : "=r" (val), "=m" (__force_order) : "0" (0));
-#else
-	val = native_read_cr4();
-#endif
-	return val;
-}
-
-static inline void native_write_cr4(unsigned long val)
-{
-	asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long native_read_cr8(void)
-{
-	unsigned long cr8;
-	asm volatile("movq %%cr8,%0" : "=r" (cr8));
-	return cr8;
-}
-
-static inline void native_write_cr8(unsigned long val)
-{
-	asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-#endif
-
-static inline void native_wbinvd(void)
-{
-	asm volatile("wbinvd": : :"memory");
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-
-static inline unsigned long read_cr0(void)
-{
-	return native_read_cr0();
-}
-
-static inline void write_cr0(unsigned long x)
-{
-	native_write_cr0(x);
-}
-
-static inline unsigned long read_cr2(void)
-{
-	return native_read_cr2();
-}
-
-static inline void write_cr2(unsigned long x)
-{
-	native_write_cr2(x);
-}
-
-static inline unsigned long read_cr3(void)
-{
-	return native_read_cr3();
-}
-
-static inline void write_cr3(unsigned long x)
-{
-	native_write_cr3(x);
-}
-
-static inline unsigned long read_cr4(void)
-{
-	return native_read_cr4();
-}
-
-static inline unsigned long read_cr4_safe(void)
-{
-	return native_read_cr4_safe();
-}
-
-static inline void write_cr4(unsigned long x)
-{
-	native_write_cr4(x);
-}
-
-static inline void wbinvd(void)
-{
-	native_wbinvd();
-}
-
-#ifdef CONFIG_X86_64
-
-static inline unsigned long read_cr8(void)
-{
-	return native_read_cr8();
-}
-
-static inline void write_cr8(unsigned long x)
-{
-	native_write_cr8(x);
-}
-
-static inline void load_gs_index(unsigned selector)
-{
-	native_load_gs_index(selector);
-}
-
-#endif
-
-/* Clear the 'TS' bit */
-static inline void clts(void)
-{
-	native_clts();
-}
-
-#endif/* CONFIG_PARAVIRT */
-
-#define stts() write_cr0(read_cr0() | X86_CR0_TS)
-
-#endif /* __KERNEL__ */
-
-static inline void clflush(volatile void *__p)
-{
-	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
-}
-
-#define nop() asm volatile ("nop")
-
-void disable_hlt(void);
-void enable_hlt(void);
-
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
-void default_idle(void);
-bool set_pm_idle_to_default(void);
-
-void stop_this_cpu(void *dummy);
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-#else
-#define mb() 	asm volatile("mfence":::"memory")
-#define rmb()	asm volatile("lfence":::"memory")
-#define wmb()	asm volatile("sfence" ::: "memory")
-#endif
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends()	do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb()	rmb()
-#else
-# define smp_rmb()	barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb() 	wmb()
-#else
-# define smp_wmb()	barrier()
-#endif
-#define smp_read_barrier_depends()	read_barrier_depends()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
-#define smp_mb()	barrier()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while (0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static __always_inline void rdtsc_barrier(void)
-{
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
-/*
- * We handle most unaligned accesses in hardware.  On the other hand
- * unaligned DMA can be quite expensive on some Nehalem processors.
- *
- * Based on this we disable the IP header alignment in network drivers.
- */
-#define NET_IP_ALIGN	0
-#endif /* _ASM_X86_SYSTEM_H */
+#include <asm/exec.h>
+#include <asm/special_insns.h>
+#include <asm/switch_to.h>
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 169be8938b96..c0e108e08079 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -5,7 +5,7 @@
 #include <linux/sched.h>
 
 #include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/special_insns.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index e0f9aa16358b..5da71c27cc59 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -16,7 +16,6 @@
 #define _ASM_X86_VIRTEX_H
 
 #include <asm/processor.h>
-#include <asm/system.h>
 
 #include <asm/vmx.h>
 #include <asm/svm.h>
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb2a201..d2b7f27781bc 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,6 +14,7 @@
 #include <acpi/processor.h>
 #include <asm/acpi.h>
 #include <asm/mwait.h>
+#include <asm/special_insns.h>
 
 /*
  * Initialize bm_flags based on the CPU cache properties
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5d56931a15b3..459e78cbf61e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -231,7 +231,6 @@
 #include <linux/syscore_ops.h>
 #include <linux/i8253.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/olpc.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 5c0e6533d9bc..2d5454cd2c4f 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,7 +9,6 @@
 #include <linux/smp.h>
 
 #include <asm/processor.h>
-#include <asm/system.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 67bb17a37a0a..47a1870279aa 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -25,7 +25,6 @@
 #include <linux/cpu.h>
 
 #include <asm/processor.h>
-#include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 54060f565974..2d7998fb628c 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,7 +8,6 @@
 #include <linux/init.h>
 
 #include <asm/processor.h>
-#include <asm/system.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 97b26356e9ee..75772ae6c65f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,7 +12,6 @@
 #include <asm/processor-flags.h>
 #include <asm/cpufeature.h>
 #include <asm/tlbflush.h>
-#include <asm/system.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 #include <asm/pat.h>
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353d93f2..39472dd2323f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,7 +43,6 @@
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/system.h>
 
 static struct class *cpuid_class;
 
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 610485223bdb..36d1853e91af 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,7 +15,6 @@
 #include <linux/delay.h>
 
 #include <linux/atomic.h>
-#include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5cddbce..99b85b423bbf 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,7 +16,6 @@
 #include <linux/delay.h>
 
 #include <linux/atomic.h>
-#include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba5771acad..4425a12ece43 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,6 @@
 
 #include <asm/debugreg.h>
 #include <asm/apicdef.h>
-#include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea697263b373..ebc987398923 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,7 +15,6 @@
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 
-#include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43ba5d3b..5b19e4d78b00 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,7 +23,6 @@
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
-#include <asm/system.h>
 #include <asm/cacheflush.h>
 #include <asm/debugreg.h>
 
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 177183cbb6ae..7eb1e2b97827 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -43,7 +43,6 @@
 #include <linux/mca.h>
 #include <linux/kprobes.h>
 #include <linux/slab.h>
-#include <asm/system.h>
 #include <asm/io.h>
 #include <linux/proc_fs.h>
 #include <linux/mman.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 925179f871de..f21fd94ac897 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -26,7 +26,6 @@
 #include <linux/gfp.h>
 #include <linux/jump_label.h>
 
-#include <asm/system.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 96356762a51d..eb113693f043 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,6 @@
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/system.h>
 
 static struct class *msr_class;
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ada2f99388dd..2b26485f0c11 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -37,6 +37,7 @@
 #include <asm/apic.h>
 #include <asm/tlbflush.h>
 #include <asm/timer.h>
+#include <asm/special_insns.h>
 
 /* nop stub */
 void _paravirt_nop(void)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b58345..6ac5782f4d6b 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -42,7 +42,6 @@
 #include <asm/calgary.h>
 #include <asm/tce.h>
 #include <asm/pci-direct.h>
-#include <asm/system.h>
 #include <asm/dma.h>
 #include <asm/rio.h>
 #include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 14baf78d5a1f..9b24f36eb55f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
 #include <asm/cpu.h>
-#include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
 #include <asm/idle.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9d7d4842bfaf..aae4f4bbbe88 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -41,7 +41,6 @@
 #include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
@@ -59,6 +58,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 292da13fc5aa..61270e8d428a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -40,7 +40,6 @@
 #include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
@@ -53,6 +52,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 asmlinkage extern void ret_from_fork(void);
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 78f05e438be5..8a634c887652 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -24,7 +24,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 88638883176a..8cbeb7209c3e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -90,7 +90,6 @@
 #include <asm/processor.h>
 #include <asm/bugs.h>
 
-#include <asm/system.h>
 #include <asm/vsyscall.h>
 #include <asm/cpu.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index 9e540fee7009..ab40954e113e 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,6 +34,7 @@
 #include <asm/tce.h>
 #include <asm/calgary.h>
 #include <asm/proto.h>
+#include <asm/cacheflush.h>
 
 /* flush a tce at 'tceaddr' to main memory */
 static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e70..73920e4c6dc5 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -6,7 +6,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
-#include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ec61d4c1b93b..860f126ca233 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,7 +50,6 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <linux/atomic.h>
-#include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6cabf6570d64..4f0cec7e4ffb 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,7 +12,6 @@
 #include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/system.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/proto.h>
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8663f6c47ccb..575d86f85ce4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -35,7 +35,6 @@
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
 #include <asm/processor.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/dma.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 436a0309db33..fc18be0f6f29 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -35,7 +35,6 @@
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cac718499256..a69bcb8c7621 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,7 +10,6 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 
-#include <asm/system.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 3769079874d8..74202c1910cd 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -10,7 +10,6 @@
 #include <linux/suspend.h>
 #include <linux/bootmem.h>
 
-#include <asm/system.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmzone.h>
-- 
cgit v1.2.3


From 49a7f04a4b9d45cd794741ce3d5d66524b37bdd0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 28 Mar 2012 18:30:03 +0100
Subject: Move all declarations of free_initmem() to linux/mm.h

Move all declarations of free_initmem() to linux/mm.h so that there's only one
and it's used by everything.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-c6x-dev@linux-c6x.org
cc: microblaze-uclinux@itee.uq.edu.au
cc: linux-sh@vger.kernel.org
cc: sparclinux@vger.kernel.org
cc: x86@kernel.org
cc: linux-mm@kvack.org
---
 arch/c6x/include/asm/system.h        | 1 -
 arch/frv/include/asm/system.h        | 2 --
 arch/microblaze/include/asm/system.h | 1 -
 arch/sh/include/asm/system.h         | 1 -
 arch/sparc/mm/init_64.h              | 2 --
 arch/x86/include/asm/page_types.h    | 1 -
 include/linux/mm.h                   | 2 ++
 init/main.c                          | 1 -
 8 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h
index ccc4f86d16c5..0d84f9e42fde 100644
--- a/arch/c6x/include/asm/system.h
+++ b/arch/c6x/include/asm/system.h
@@ -4,4 +4,3 @@
 #include <asm/exec.h>
 #include <asm/special_insns.h>
 #include <asm/switch_to.h>
-extern void free_initmem(void);
diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h
index 5c707a235403..659bcdb44eca 100644
--- a/arch/frv/include/asm/system.h
+++ b/arch/frv/include/asm/system.h
@@ -1,6 +1,4 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
 #include <asm/barrier.h>
 #include <asm/cmpxchg.h>
 #include <asm/exec.h>
 #include <asm/switch_to.h>
-extern void free_initmem(void);
diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h
index ccc4f86d16c5..0d84f9e42fde 100644
--- a/arch/microblaze/include/asm/system.h
+++ b/arch/microblaze/include/asm/system.h
@@ -4,4 +4,3 @@
 #include <asm/exec.h>
 #include <asm/special_insns.h>
 #include <asm/switch_to.h>
-extern void free_initmem(void);
diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h
index e2042aa32f2c..04268aa3b3e5 100644
--- a/arch/sh/include/asm/system.h
+++ b/arch/sh/include/asm/system.h
@@ -6,4 +6,3 @@
 #include <asm/exec.h>
 #include <asm/switch_to.h>
 #include <asm/traps.h>
-void free_initmem(void);
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 77d1b313e344..3e1ac8b96cae 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -36,8 +36,6 @@ extern unsigned long kern_locked_tte_data;
 
 extern void prom_world(int enter);
 
-extern void free_initmem(void);
-
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #define VMEMMAP_CHUNK_SHIFT	22
 #define VMEMMAP_CHUNK		(1UL << VMEMMAP_CHUNK_SHIFT)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index bce688d54c12..e21fdd10479f 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -55,7 +55,6 @@ extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
 extern void initmem_init(void);
-extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7330742e7973..69f6d7b7eb01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1257,6 +1257,8 @@ static inline void pgtable_page_dtor(struct page *page)
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
+extern void free_initmem(void);
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /*
  * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
diff --git a/init/main.c b/init/main.c
index c24805c824b9..44c9754e2a5c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -87,7 +87,6 @@ extern void mca_init(void);
 extern void sbus_init(void);
 extern void prio_tree_init(void);
 extern void radix_tree_init(void);
-extern void free_initmem(void);
 #ifndef CONFIG_DEBUG_RODATA
 static inline void mark_rodata_ro(void) { }
 #endif
-- 
cgit v1.2.3


From 141124c02059eee9dbc5c86ea797b1ca888e77f7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 28 Mar 2012 18:30:03 +0100
Subject: Delete all instances of asm/system.h

Delete all instances of asm/system.h as they should be redundant by this
point.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 arch/alpha/include/asm/system.h      | 5 -----
 arch/avr32/include/asm/system.h      | 6 ------
 arch/blackfin/include/asm/system.h   | 5 -----
 arch/c6x/include/asm/system.h        | 6 ------
 arch/cris/include/asm/system.h       | 5 -----
 arch/frv/include/asm/system.h        | 4 ----
 arch/h8300/include/asm/system.h      | 5 -----
 arch/hexagon/include/asm/system.h    | 5 -----
 arch/ia64/include/asm/system.h       | 4 ----
 arch/m32r/include/asm/system.h       | 6 ------
 arch/m68k/include/asm/system.h       | 5 -----
 arch/microblaze/include/asm/system.h | 6 ------
 arch/mips/include/asm/system.h       | 5 -----
 arch/mn10300/include/asm/system.h    | 5 -----
 arch/openrisc/include/asm/system.h   | 5 -----
 arch/parisc/include/asm/system.h     | 6 ------
 arch/powerpc/include/asm/system.h    | 6 ------
 arch/s390/include/asm/system.h       | 7 -------
 arch/score/include/asm/system.h      | 5 -----
 arch/sh/include/asm/system.h         | 8 --------
 arch/sparc/include/asm/system.h      | 6 ------
 arch/tile/include/asm/system.h       | 4 ----
 arch/unicore32/include/asm/system.h  | 5 -----
 arch/x86/include/asm/system.h        | 6 ------
 arch/xtensa/include/asm/system.h     | 5 -----
 include/asm-generic/system.h         | 5 -----
 26 files changed, 140 deletions(-)
 delete mode 100644 arch/alpha/include/asm/system.h
 delete mode 100644 arch/avr32/include/asm/system.h
 delete mode 100644 arch/blackfin/include/asm/system.h
 delete mode 100644 arch/c6x/include/asm/system.h
 delete mode 100644 arch/cris/include/asm/system.h
 delete mode 100644 arch/frv/include/asm/system.h
 delete mode 100644 arch/h8300/include/asm/system.h
 delete mode 100644 arch/hexagon/include/asm/system.h
 delete mode 100644 arch/ia64/include/asm/system.h
 delete mode 100644 arch/m32r/include/asm/system.h
 delete mode 100644 arch/m68k/include/asm/system.h
 delete mode 100644 arch/microblaze/include/asm/system.h
 delete mode 100644 arch/mips/include/asm/system.h
 delete mode 100644 arch/mn10300/include/asm/system.h
 delete mode 100644 arch/openrisc/include/asm/system.h
 delete mode 100644 arch/parisc/include/asm/system.h
 delete mode 100644 arch/powerpc/include/asm/system.h
 delete mode 100644 arch/s390/include/asm/system.h
 delete mode 100644 arch/score/include/asm/system.h
 delete mode 100644 arch/sh/include/asm/system.h
 delete mode 100644 arch/sparc/include/asm/system.h
 delete mode 100644 arch/tile/include/asm/system.h
 delete mode 100644 arch/unicore32/include/asm/system.h
 delete mode 100644 arch/x86/include/asm/system.h
 delete mode 100644 arch/xtensa/include/asm/system.h
 delete mode 100644 include/asm-generic/system.h

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/system.h b/arch/alpha/include/asm/system.h
deleted file mode 100644
index c7270dcff321..000000000000
--- a/arch/alpha/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/exec.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h
deleted file mode 100644
index 0d84f9e42fde..000000000000
--- a/arch/avr32/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/blackfin/include/asm/system.h b/arch/blackfin/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/blackfin/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h
deleted file mode 100644
index 0d84f9e42fde..000000000000
--- a/arch/c6x/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/cris/include/asm/system.h b/arch/cris/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/cris/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h
deleted file mode 100644
index 659bcdb44eca..000000000000
--- a/arch/frv/include/asm/system.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/h8300/include/asm/system.h b/arch/h8300/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/h8300/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/hexagon/include/asm/system.h b/arch/hexagon/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/hexagon/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
deleted file mode 100644
index 5b190b48fcd0..000000000000
--- a/arch/ia64/include/asm/system.h
+++ /dev/null
@@ -1,4 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/m32r/include/asm/system.h b/arch/m32r/include/asm/system.h
deleted file mode 100644
index a55c384fdcf3..000000000000
--- a/arch/m32r/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/dcache_clear.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/m68k/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h
deleted file mode 100644
index 0d84f9e42fde..000000000000
--- a/arch/microblaze/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/mips/include/asm/system.h b/arch/mips/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/mips/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/mn10300/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/openrisc/include/asm/system.h b/arch/openrisc/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/openrisc/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/parisc/include/asm/system.h b/arch/parisc/include/asm/system.h
deleted file mode 100644
index fc2c1261ac1f..000000000000
--- a/arch/parisc/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/exec.h>
-#include <asm/ldcw.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
deleted file mode 100644
index 502c1e0275af..000000000000
--- a/arch/powerpc/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/debug.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
deleted file mode 100644
index 641c72903277..000000000000
--- a/arch/s390/include/asm/system.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/ctl_reg.h>
-#include <asm/exec.h>
-#include <asm/facility.h>
-#include <asm/switch_to.h>
diff --git a/arch/score/include/asm/system.h b/arch/score/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/score/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h
deleted file mode 100644
index 04268aa3b3e5..000000000000
--- a/arch/sh/include/asm/system.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/bl_bit.h>
-#include <asm/cache_insns.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
-#include <asm/traps.h>
diff --git a/arch/sparc/include/asm/system.h b/arch/sparc/include/asm/system.h
deleted file mode 100644
index ed532ba000b1..000000000000
--- a/arch/sparc/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cpu_type.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/tile/include/asm/system.h b/arch/tile/include/asm/system.h
deleted file mode 100644
index 5b190b48fcd0..000000000000
--- a/arch/tile/include/asm/system.h
+++ /dev/null
@@ -1,4 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/unicore32/include/asm/system.h b/arch/unicore32/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/unicore32/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
deleted file mode 100644
index 0d84f9e42fde..000000000000
--- a/arch/x86/include/asm/system.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/special_insns.h>
-#include <asm/switch_to.h>
diff --git a/arch/xtensa/include/asm/system.h b/arch/xtensa/include/asm/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/arch/xtensa/include/asm/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
diff --git a/include/asm-generic/system.h b/include/asm-generic/system.h
deleted file mode 100644
index a7f40578587c..000000000000
--- a/include/asm-generic/system.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/exec.h>
-#include <asm/switch_to.h>
-- 
cgit v1.2.3


From 8abc3122aa02567bfe626cd13f4d34853c9b1225 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 27 Mar 2012 20:04:02 +0200
Subject: x86/apic/amd: Be more verbose about LVT offset assignments

Add information about LVT offset assignments to better debug firmware
bugs related to this. See following examples.

 # dmesg | grep -i 'offset\|ibs'
 LVT offset 0 assigned for vector 0xf9
 [Firmware Bug]: cpu 0, try to use APIC500 (LVT offset 0) for vector 0x10400, but the register is already in use for vector 0xf9 on another cpu
 [Firmware Bug]: cpu 0, IBS interrupt offset 0 not available (MSRC001103A=0x0000000000000100)
 Failed to setup IBS, -22

In this case the BIOS assigns both offsets for MCE (0xf9) and IBS
(0x400) vectors to offset 0, which is why the second APIC setup (IBS)
failed.

With correct setup you get:

 # dmesg | grep -i 'offset\|ibs'
 LVT offset 0 assigned for vector 0xf9
 LVT offset 1 assigned for vector 0x400
 IBS: LVT offset 1 assigned
 perf: AMD IBS detected (0x00000007)
 oprofile: AMD IBS detected (0x00000007)

Note: The vector includes also the message type to handle also NMIs
(0x400). In the firmware bug message the format is the same as of the
APIC500 register and includes the mask bit (bit 16) in addition.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2eec05b6d1b8..11544d8f1e97 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -383,20 +383,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
 
 static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 {
-	unsigned int rsvd;			/* 0: uninitialized */
+	unsigned int rsvd, vector;
 
 	if (offset >= APIC_EILVT_NR_MAX)
 		return ~0;
 
-	rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+	rsvd = atomic_read(&eilvt_offsets[offset]);
 	do {
-		if (rsvd &&
-		    !eilvt_entry_is_changeable(rsvd, new))
+		vector = rsvd & ~APIC_EILVT_MASKED;	/* 0: unassigned */
+		if (vector && !eilvt_entry_is_changeable(vector, new))
 			/* may not change if vectors are different */
 			return rsvd;
 		rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
 	} while (rsvd != new);
 
+	rsvd &= ~APIC_EILVT_MASKED;
+	if (rsvd && rsvd != vector)
+		pr_info("LVT offset %d assigned for vector 0x%02x\n",
+			offset, rsvd);
+
 	return new;
 }
 
-- 
cgit v1.2.3


From 09c71bfd8384278c42f56380365940508194cec0 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 28 Mar 2012 14:42:47 -0700
Subject: kdump x86: fix total mem size calculation for reservation

crashkernel reservation need know the total memory size.  Current
get_total_mem simply use max_pfn - min_low_pfn.  It is wrong because it
will including memory holes in the middle.

Especially for kvm guest with memory > 0xe0000000, there's below in qemu
code: qemu split memory as below:

    if (ram_size >= 0xe0000000 ) {
        above_4g_mem_size = ram_size - 0xe0000000;
        below_4g_mem_size = 0xe0000000;
    } else {
        below_4g_mem_size = ram_size;
    }

So for 4G mem guest, seabios will insert a 512M usable region beyond of
4G.  Thus in above case max_pfn - min_low_pfn will be more than original
memsize.

Fixing this issue by using memblock_phys_mem_size() to get the total
memsize.

Signed-off-by: Dave Young <dyoung@redhat.com>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 88638883176a..ab77aae4ad9b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -509,15 +509,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 
 #ifdef CONFIG_KEXEC
 
-static inline unsigned long long get_total_mem(void)
-{
-	unsigned long long total;
-
-	total = max_pfn - min_low_pfn;
-
-	return total << PAGE_SHIFT;
-}
-
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -536,7 +527,7 @@ static void __init reserve_crashkernel(void)
 	unsigned long long crash_size, crash_base;
 	int ret;
 
-	total_mem = get_total_mem();
+	total_mem = memblock_phys_mem_size();
 
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
-- 
cgit v1.2.3


From 5f054e31c63be774bf1ce252f20d56012a00f8a5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 29 Mar 2012 15:38:31 +1030
Subject: documentation: remove references to cpu_*_map.

This has been obsolescent for a while, fix documentation and
misc comments.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/cgroups/cpusets.txt   |  2 +-
 Documentation/cpu-hotplug.txt       | 22 +++++++++++-----------
 arch/alpha/kernel/smp.c             |  2 +-
 arch/ia64/kernel/acpi.c             |  2 +-
 arch/mips/cavium-octeon/smp.c       |  2 +-
 arch/mips/pmc-sierra/yosemite/smp.c |  2 +-
 arch/mips/sibyte/bcm1480/smp.c      |  2 +-
 arch/tile/kernel/setup.c            |  2 +-
 arch/x86/xen/enlighten.c            |  2 +-
 init/Kconfig                        |  4 ++--
 kernel/cpuset.c                     | 10 +++++-----
 11 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 5c51ed406d1d..cefd3d8bbd11 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -217,7 +217,7 @@ and name space for cpusets, with a minimum of additional kernel code.
 
 The cpus and mems files in the root (top_cpuset) cpuset are
 read-only.  The cpus file automatically tracks the value of
-cpu_online_map using a CPU hotplug notifier, and the mems file
+cpu_online_mask using a CPU hotplug notifier, and the mems file
 automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index a20bfd415e41..66ef8f35613d 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -47,7 +47,7 @@ maxcpus=n    Restrict boot time cpus to n. Say if you have 4 cpus, using
              other cpus later online, read FAQ's for more info.
 
 additional_cpus=n (*)	Use this to limit hotpluggable cpus. This option sets
-  			cpu_possible_map = cpu_present_map + additional_cpus
+  			cpu_possible_mask = cpu_present_mask + additional_cpus
 
 cede_offline={"off","on"}  Use this option to disable/enable putting offlined
 		            processors to an extended H_CEDE state on
@@ -64,11 +64,11 @@ should only rely on this to count the # of cpus, but *MUST* not rely
 on the apicid values in those tables for disabled apics. In the event
 BIOS doesn't mark such hot-pluggable cpus as disabled entries, one could
 use this parameter "additional_cpus=x" to represent those cpus in the
-cpu_possible_map.
+cpu_possible_mask.
 
 possible_cpus=n		[s390,x86_64] use this to set hotpluggable cpus.
 			This option sets possible_cpus bits in
-			cpu_possible_map. Thus keeping the numbers of bits set
+			cpu_possible_mask. Thus keeping the numbers of bits set
 			constant even if the machine gets rebooted.
 
 CPU maps and such
@@ -76,7 +76,7 @@ CPU maps and such
 [More on cpumaps and primitive to manipulate, please check
 include/linux/cpumask.h that has more descriptive text.]
 
-cpu_possible_map: Bitmap of possible CPUs that can ever be available in the
+cpu_possible_mask: Bitmap of possible CPUs that can ever be available in the
 system. This is used to allocate some boot time memory for per_cpu variables
 that aren't designed to grow/shrink as CPUs are made available or removed.
 Once set during boot time discovery phase, the map is static, i.e no bits
@@ -84,13 +84,13 @@ are added or removed anytime.  Trimming it accurately for your system needs
 upfront can save some boot time memory. See below for how we use heuristics
 in x86_64 case to keep this under check.
 
-cpu_online_map: Bitmap of all CPUs currently online. Its set in __cpu_up()
+cpu_online_mask: Bitmap of all CPUs currently online. Its set in __cpu_up()
 after a cpu is available for kernel scheduling and ready to receive
 interrupts from devices. Its cleared when a cpu is brought down using
 __cpu_disable(), before which all OS services including interrupts are
 migrated to another target CPU.
 
-cpu_present_map: Bitmap of CPUs currently present in the system. Not all
+cpu_present_mask: Bitmap of CPUs currently present in the system. Not all
 of them may be online. When physical hotplug is processed by the relevant
 subsystem (e.g ACPI) can change and new bit either be added or removed
 from the map depending on the event is hot-add/hot-remove. There are currently
@@ -99,22 +99,22 @@ at which time hotplug is disabled.
 
 You really dont need to manipulate any of the system cpu maps. They should
 be read-only for most use. When setting up per-cpu resources almost always use
-cpu_possible_map/for_each_possible_cpu() to iterate.
+cpu_possible_mask/for_each_possible_cpu() to iterate.
 
 Never use anything other than cpumask_t to represent bitmap of CPUs.
 
 	#include <linux/cpumask.h>
 
-	for_each_possible_cpu     - Iterate over cpu_possible_map
-	for_each_online_cpu       - Iterate over cpu_online_map
-	for_each_present_cpu      - Iterate over cpu_present_map
+	for_each_possible_cpu     - Iterate over cpu_possible_mask
+	for_each_online_cpu       - Iterate over cpu_online_mask
+	for_each_present_cpu      - Iterate over cpu_present_mask
 	for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.
 
 	#include <linux/cpu.h>
 	get_online_cpus() and put_online_cpus():
 
 The above calls are used to inhibit cpu hotplug operations. While the
-cpu_hotplug.refcount is non zero, the cpu_online_map will not change.
+cpu_hotplug.refcount is non zero, the cpu_online_mask will not change.
 If you merely need to avoid cpus going away, you could also use
 preempt_disable() and preempt_enable() for those sections.
 Just remember the critical section cannot call any
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 4087a569b43b..50d438db1f6b 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -450,7 +450,7 @@ setup_smp(void)
 		smp_num_probed = 1;
 	}
 
-	printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_map = %lx\n",
+	printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n",
 	       smp_num_probed, cpumask_bits(cpu_present_mask)[0]);
 }
 
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index ac795d311f44..6f38b6120d96 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -839,7 +839,7 @@ static __init int setup_additional_cpus(char *s)
 early_param("additional_cpus", setup_additional_cpus);
 
 /*
- * cpu_possible_map should be static, it cannot change as CPUs
+ * cpu_possible_mask should be static, it cannot change as CPUs
  * are onlined, or offlined. The reason is per-cpu data-structures
  * are allocated by some modules at init time, and dont expect to
  * do this dynamically on cpu arrival/departure.
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index ef56d8a0215f..97e7ce9b50ed 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -78,7 +78,7 @@ static inline void octeon_send_ipi_mask(const struct cpumask *mask,
 }
 
 /**
- * Detect available CPUs, populate cpu_possible_map
+ * Detect available CPUs, populate cpu_possible_mask
  */
 static void octeon_smp_hotplug_setup(void)
 {
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c
index 00a0d2e90d04..b71fae231049 100644
--- a/arch/mips/pmc-sierra/yosemite/smp.c
+++ b/arch/mips/pmc-sierra/yosemite/smp.c
@@ -146,7 +146,7 @@ static void __cpuinit yos_boot_secondary(int cpu, struct task_struct *idle)
 }
 
 /*
- * Detect available CPUs, populate cpu_possible_map before smp_init
+ * Detect available CPUs, populate cpu_possible_mask before smp_init
  *
  * We don't want to start the secondary CPU yet nor do we have a nice probing
  * feature in PMON so we just assume presence of the secondary core.
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 63d2211e6167..de88e22694a0 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -138,7 +138,7 @@ static void __cpuinit bcm1480_boot_secondary(int cpu, struct task_struct *idle)
 
 /*
  * Use CFE to find out how many CPUs are available, setting up
- * cpu_possible_map and the logical/physical mappings.
+ * cpu_possible_mask and the logical/physical mappings.
  * XXXKW will the boot CPU ever not be physical 0?
  *
  * Common setup before any secondaries are started
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 5124093b2e1d..92a94f4920ad 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -1100,7 +1100,7 @@ EXPORT_SYMBOL(hash_for_home_map);
 
 /*
  * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
- * flush on our behalf.  It is set to cpu_possible_map OR'ed with
+ * flush on our behalf.  It is set to cpu_possible_mask OR'ed with
  * hash_for_home_map, and it is what should be passed to
  * hv_flush_remote() to flush all caches.  Note that if there are
  * dedicated hypervisor driver tiles that have authorized use of their
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b132ade26f77..4f51bebac02c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -967,7 +967,7 @@ void xen_setup_shared_info(void)
 	xen_setup_mfn_list_list();
 }
 
-/* This is called once we have the cpu_possible_map */
+/* This is called once we have the cpu_possible_mask */
 void xen_setup_vcpu_info_placement(void)
 {
 	int cpu;
diff --git a/init/Kconfig b/init/Kconfig
index 72f33faca44f..6cfd71d06463 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1414,8 +1414,8 @@ endif # MODULES
 config INIT_ALL_POSSIBLE
 	bool
 	help
-	  Back when each arch used to define their own cpu_online_map and
-	  cpu_possible_map, some of them chose to initialize cpu_possible_map
+	  Back when each arch used to define their own cpu_online_mask and
+	  cpu_possible_mask, some of them chose to initialize cpu_possible_mask
 	  with all 1s, and others with all 0s.  When they were centralised,
 	  it was better to provide this option than to break all the archs
 	  and have several arch maintainers pursuing me down dark alleys.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1010cc61931f..eedeebe64b1a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = {
  * are online.  If none are online, walk up the cpuset hierarchy
  * until we find one that does have some online cpus.  If we get
  * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask.  Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
  *
  * Call with callback_mutex held.
  */
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	int retval;
 	int is_load_balanced;
 
-	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
 	if (cs == &top_cpuset)
 		return -EACCES;
 
@@ -2149,7 +2149,7 @@ void __init cpuset_init_smp(void)
  *
  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
  * tasks cpuset.
  **/
 
-- 
cgit v1.2.3


From 99dd5497e5be4fe4194cad181d45fd6569a930db Mon Sep 17 00:00:00 2001
From: "Liu, Chuansheng" <chuansheng.liu@intel.com>
Date: Mon, 26 Mar 2012 07:11:50 +0000
Subject: x86: Preserve lazy irq disable semantics in fixup_irqs()

The default irq_disable() sematics are to mark the interrupt disabled,
but keep it unmasked. If the interrupt is delivered while marked
disabled, the low level interrupt handler masks it and marks it
pending. This is important for detecting wakeup interrupts during
suspend and for edge type interrupts to avoid losing interrupts.

fixup_irqs() moves the interrupts away from an offlined cpu. For
certain interrupt types it needs to mask the interrupt line before
changing the affinity. After affinity has changed the interrupt line
is unmasked again, but only if it is not marked disabled.

This breaks the lazy irq disable semantics and causes problems in
suspend as the interrupt can be lost or wakeup functionality is
broken.

Check irqd_irq_masked() instead of irqd_irq_disabled() because
irqd_irq_masked() is only set, when the core code actually masked the
interrupt line. If it's not set, we unmask the interrupt and let the
lazy irq disable logic deal with an eventually incoming interrupt.

[ tglx: Massaged changelog and added a comment ]

Signed-off-by: liu chuansheng <chuansheng.liu@intel.com>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A05DFB3@SHSMSX101.ccr.corp.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/irq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7943e0c21bde..3dafc6003b7c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -282,8 +282,13 @@ void fixup_irqs(void)
 		else if (!(warned++))
 			set_affinity = 0;
 
+		/*
+		 * We unmask if the irq was not marked masked by the
+		 * core code. That respects the lazy irq disable
+		 * behaviour.
+		 */
 		if (!irqd_can_move_in_process_context(data) &&
-		    !irqd_irq_disabled(data) && chip->irq_unmask)
+		    !irqd_irq_masked(data) && chip->irq_unmask)
 			chip->irq_unmask(data);
 
 		raw_spin_unlock(&desc->lock);
-- 
cgit v1.2.3


From 1d24fb3684f347226747c6b11ea426b7b992694e Mon Sep 17 00:00:00 2001
From: "zhuangfeiran@ict.ac.cn" <zhuangfeiran@ict.ac.cn>
Date: Wed, 28 Mar 2012 23:27:00 +0000
Subject: x86 bpf_jit: fix a bug in emitting the 16-bit immediate operand of
 AND

When K >= 0xFFFF0000, AND needs the two least significant bytes of K as
its operand, but EMIT2() gives it the least significant byte of K and
0x2. EMIT() should be used here to replace EMIT2().

Signed-off-by: Feiran Zhuang  <zhuangfeiran@ict.ac.cn>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5671752f8d9c..5a5b6e4dd738 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -289,7 +289,7 @@ void bpf_jit_compile(struct sk_filter *fp)
 					EMIT2(0x24, K & 0xFF); /* and imm8,%al */
 				} else if (K >= 0xFFFF0000) {
 					EMIT2(0x66, 0x25);	/* and imm16,%ax */
-					EMIT2(K, 2);
+					EMIT(K, 2);
 				} else {
 					EMIT1_off32(0x25, K);	/* and imm32,%eax */
 				}
-- 
cgit v1.2.3


From 3751d3e85cf693e10e2c47c03c8caa65e171099b Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 23 Mar 2012 09:35:05 -0500
Subject: x86,kgdb: Fix DEBUG_RODATA limitation using text_poke()

There has long been a limitation using software breakpoints with a
kernel compiled with CONFIG_DEBUG_RODATA going back to 2.6.26. For
this particular patch, it will apply cleanly and has been tested all
the way back to 2.6.36.

The kprobes code uses the text_poke() function which accommodates
writing a breakpoint into a read-only page.  The x86 kgdb code can
solve the problem similarly by overriding the default breakpoint
set/remove routines and using text_poke() directly.

The x86 kgdb code will first attempt to use the traditional
probe_kernel_write(), and next try using a the text_poke() function.
The break point install method is tracked such that the correct break
point removal routine will get called later on.

Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: stable@vger.kernel.org # >= 2.6.36
Inspried-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/misc/kgdbts.c  | 17 --------------
 include/linux/kgdb.h   |  3 ++-
 3 files changed, 62 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index fdc37b3d0ce3..b9bd9d8de665 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,8 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include <asm/debugreg.h>
 #include <asm/apicdef.h>
@@ -742,6 +744,64 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 	regs->ip = ip;
 }
 
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	char opc[BREAK_INSTR_SIZE];
+
+	bpt->type = BP_BREAKPOINT;
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
+	if (err)
+		return err;
+	err = probe_kernel_write((char *)bpt->bpt_addr,
+				 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+#ifdef CONFIG_DEBUG_RODATA
+	if (!err)
+		return err;
+	/*
+	 * It is safe to call text_poke() because normal kernel execution
+	 * is stopped on all cores, so long as the text_mutex is not locked.
+	 */
+	if (mutex_is_locked(&text_mutex))
+		return -EBUSY;
+	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+		  BREAK_INSTR_SIZE);
+	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+	if (err)
+		return err;
+	if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
+		return -EINVAL;
+	bpt->type = BP_POKE_BREAKPOINT;
+#endif /* CONFIG_DEBUG_RODATA */
+	return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+#ifdef CONFIG_DEBUG_RODATA
+	int err;
+	char opc[BREAK_INSTR_SIZE];
+
+	if (bpt->type != BP_POKE_BREAKPOINT)
+		goto knl_write;
+	/*
+	 * It is safe to call text_poke() because normal kernel execution
+	 * is stopped on all cores, so long as the text_mutex is not locked.
+	 */
+	if (mutex_is_locked(&text_mutex))
+		goto knl_write;
+	text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+	if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
+		goto knl_write;
+	return err;
+knl_write:
+#endif /* CONFIG_DEBUG_RODATA */
+	return probe_kernel_write((char *)bpt->bpt_addr,
+				  (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
 struct kgdb_arch arch_kgdb_ops = {
 	/* Breakpoint instruction: */
 	.gdb_bpt_instr		= { 0xcc },
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index d087456ba089..3aa9a969b373 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -968,22 +968,6 @@ static void run_singlestep_break_test(void)
 	kgdbts_break_test();
 }
 
-static void test_debug_rodata(void)
-{
-#ifdef CONFIG_DEBUG_RODATA
-	/* Until there is an api to write to read-only text segments, use
-	 * HW breakpoints for the remainder of any tests, else print a
-	 * failure message if hw breakpoints do not work.
-	 */
-	if (!(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT && hwbreaks_ok)) {
-		eprintk("kgdbts: HW breakpoints BROKEN, ending tests\n");
-		return;
-	}
-	force_hwbrks = 1;
-	v1printk("kgdbts:Using HW breakpoints for SW breakpoint tests\n");
-#endif /* CONFIG_DEBUG_RODATA */
-}
-
 static void kgdbts_run_tests(void)
 {
 	char *ptr;
@@ -1016,7 +1000,6 @@ static void kgdbts_run_tests(void)
 		v1printk("kgdbts:RUN access write breakpoint test\n");
 		run_hw_break_test(0);
 	}
-	test_debug_rodata();
 
 	/* required internal KGDB tests */
 	v1printk("kgdbts:RUN plant and detach test\n");
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index e5d689c1d774..c4d2fc194ede 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -63,7 +63,8 @@ enum kgdb_bptype {
 	BP_HARDWARE_BREAKPOINT,
 	BP_WRITE_WATCHPOINT,
 	BP_READ_WATCHPOINT,
-	BP_ACCESS_WATCHPOINT
+	BP_ACCESS_WATCHPOINT,
+	BP_POKE_BREAKPOINT,
 };
 
 enum kgdb_bpstate {
-- 
cgit v1.2.3


From f6365201d8a21fb347260f89d6e9b3e718d63c70 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 29 Mar 2012 14:49:17 -0700
Subject: x86: Remove the ancient and deprecated disable_hlt() and enable_hlt()
 facility

The X86_32-only disable_hlt/enable_hlt mechanism was used by the
32-bit floppy driver. Its effect was to replace the use of the
HLT instruction inside default_idle() with cpu_relax() - essentially
it turned off the use of HLT.

This workaround was commented in the code as:

 "disable hlt during certain critical i/o operations"

 "This halt magic was a workaround for ancient floppy DMA
  wreckage. It should be safe to remove."

H. Peter Anvin additionally adds:

 "To the best of my knowledge, no-hlt only existed because of
  flaky power distributions on 386/486 systems which were sold to
  run DOS.  Since DOS did no power management of any kind,
  including HLT, the power draw was fairly uniform; when exposed
  to the much hhigher noise levels you got when Linux used HLT
  caused some of these systems to fail.

  They were by far in the minority even back then."

Alan Cox further says:

 "Also for the Cyrix 5510 which tended to go castors up if a HLT
  occurred during a DMA cycle and on a few other boxes HLT during
  DMA tended to go astray.

  Do we care ? I doubt it. The 5510 was pretty obscure, the 5520
  fixed it, the 5530 is probably the oldest still in any kind of
  use."

So, let's finally drop this.

Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Josh Boyer <jwboyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Stephen Hemminger <shemminger@vyatta.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-3rhk9bzf0x9rljkv488tloib@git.kernel.org
[ If anyone cares then alternative instruction patching could be
  used to replace HLT with a one-byte NOP instruction. Much simpler. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/feature-removal-schedule.txt |  8 -------
 arch/x86/include/asm/processor.h           | 10 ---------
 arch/x86/kernel/process.c                  | 24 --------------------
 drivers/block/floppy.c                     | 36 ------------------------------
 4 files changed, 78 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 0cad4803ffac..7c950d48d76e 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,14 +6,6 @@ be removed from this file.
 
 ---------------------------
 
-What:	x86 floppy disable_hlt
-When:	2012
-Why:	ancient workaround of dubious utility clutters the
-	code used by everybody else.
-Who:	Len Brown <len.brown@intel.com>
-
----------------------------
-
 What:	CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle
 When:	2012
 Why:	This optional sub-feature of APM is of dubious reliability,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7284c9a6a0b5..4fa7dcceb6c0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -974,16 +974,6 @@ extern bool cpu_has_amd_erratum(const int *);
 #define cpu_has_amd_erratum(x)	(false)
 #endif /* CONFIG_CPU_SUP_AMD */
 
-#ifdef CONFIG_X86_32
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-#endif
-
-void disable_hlt(void);
-void enable_hlt(void);
-
 void cpu_idle_wait(void);
 
 extern unsigned long arch_align_stack(unsigned long sp);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index a33afaa5ddb7..1d92a5ab6e8b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -362,34 +362,10 @@ void (*pm_idle)(void);
 EXPORT_SYMBOL(pm_idle);
 #endif
 
-#ifdef CONFIG_X86_32
-/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
- */
-static int hlt_counter;
-void disable_hlt(void)
-{
-	hlt_counter++;
-}
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
-	hlt_counter--;
-}
-EXPORT_SYMBOL(enable_hlt);
-
-static inline int hlt_use_halt(void)
-{
-	return (!hlt_counter && boot_cpu_data.hlt_works_ok);
-}
-#else
 static inline int hlt_use_halt(void)
 {
 	return 1;
 }
-#endif
 
 #ifndef CONFIG_SMP
 static inline void play_dead(void)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 76a08236430a..b0b00d70c166 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -1030,37 +1030,6 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
 	return 0;
 }
 
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
-	unsigned long flags;
-
-	WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (!hlt_disabled) {
-		hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
-		disable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (hlt_disabled) {
-		hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
-		enable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
 static void setup_DMA(void)
 {
 	unsigned long f;
@@ -1105,7 +1074,6 @@ static void setup_DMA(void)
 	fd_enable_dma();
 	release_dma_lock(f);
 #endif
-	floppy_disable_hlt();
 }
 
 static void show_floppy(void);
@@ -1707,7 +1675,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 	fd_disable_dma();
 	release_dma_lock(f);
 
-	floppy_enable_hlt();
 	do_floppy = NULL;
 	if (fdc >= N_FDC || FDCS->address == -1) {
 		/* we don't even know which FDC is the culprit */
@@ -1856,8 +1823,6 @@ static void floppy_shutdown(unsigned long data)
 		show_floppy();
 	cancel_activity();
 
-	floppy_enable_hlt();
-
 	flags = claim_dma_lock();
 	fd_disable_dma();
 	release_dma_lock(flags);
@@ -4508,7 +4473,6 @@ static void floppy_release_irq_and_dma(void)
 #if N_FDC > 1
 	set_dor(1, ~8, 0);
 #endif
-	floppy_enable_hlt();
 
 	if (floppy_track_buffer && max_buffer_sectors) {
 		tmpsize = max_buffer_sectors * 1024;
-- 
cgit v1.2.3


From 1a022e3f1be11730bd8747b1af96a0274bf6356e Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@amd.com>
Date: Tue, 13 Mar 2012 19:55:09 +0100
Subject: idle, x86: Allow off-lined CPU to enter deeper C states

Currently when a CPU is off-lined it enters either MWAIT-based idle or,
if MWAIT is not desired or supported, HLT-based idle (which places the
processor in C1 state). This patch allows processors without MWAIT
support to stay in states deeper than C1.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/smpboot.c     |  4 +++-
 drivers/acpi/processor_idle.c | 31 +++++++++++++++++++++++++++++++
 drivers/cpuidle/cpuidle.c     | 28 ++++++++++++++++++++++++++++
 include/linux/cpuidle.h       |  5 +++++
 4 files changed, 67 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..93a2a0932b51 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -50,6 +50,7 @@
 #include <linux/tboot.h>
 #include <linux/stackprotector.h>
 #include <linux/gfp.h>
+#include <linux/cpuidle.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -1422,7 +1423,8 @@ void native_play_dead(void)
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
 	mwait_play_dead();	/* Only returns on failure */
-	hlt_play_dead();
+	if (cpuidle_play_dead())
+		hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 0e8e2de2ed3e..6b1d32a161ae 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -770,6 +770,35 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 	return index;
 }
 
+
+/**
+ * acpi_idle_play_dead - enters an ACPI state for long-term idle (i.e. off-lining)
+ * @dev: the target CPU
+ * @index: the index of suggested state
+ */
+static int acpi_idle_play_dead(struct cpuidle_device *dev, int index)
+{
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
+
+	ACPI_FLUSH_CPU_CACHE();
+
+	while (1) {
+
+		if (cx->entry_method == ACPI_CSTATE_HALT)
+			halt();
+		else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
+			inb(cx->address);
+			/* See comment in acpi_idle_do_entry() */
+			inl(acpi_gbl_FADT.xpm_timer_block.address);
+		} else
+			return -ENODEV;
+	}
+
+	/* Never reached */
+	return 0;
+}
+
 /**
  * acpi_idle_enter_simple - enters an ACPI state without BM handling
  * @dev: the target CPU
@@ -1077,12 +1106,14 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
 				state->flags |= CPUIDLE_FLAG_TIME_VALID;
 
 			state->enter = acpi_idle_enter_c1;
+			state->enter_dead = acpi_idle_play_dead;
 			drv->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C2:
 			state->flags |= CPUIDLE_FLAG_TIME_VALID;
 			state->enter = acpi_idle_enter_simple;
+			state->enter_dead = acpi_idle_play_dead;
 			drv->safe_state_index = count;
 			break;
 
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index f7cab5e9c4d6..3e146b2ada4a 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -71,6 +71,34 @@ typedef int (*cpuidle_enter_t)(struct cpuidle_device *dev,
 
 static cpuidle_enter_t cpuidle_enter_ops;
 
+/**
+ * cpuidle_play_dead - cpu off-lining
+ *
+ * Only returns in case of an error
+ */
+int cpuidle_play_dead(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_driver();
+	int i, dead_state = -1;
+	int power_usage = -1;
+
+	/* Find lowest-power state that supports long-term idle */
+	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+
+		if (s->power_usage < power_usage && s->enter_dead) {
+			power_usage = s->power_usage;
+			dead_state = i;
+		}
+	}
+
+	if (dead_state != -1)
+		return drv->states[dead_state].enter_dead(dev, dead_state);
+
+	return -ENODEV;
+}
+
 /**
  * cpuidle_idle_call - the main idle loop
  *
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index f3ebbba368b3..d557bcd0ada7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -51,6 +51,8 @@ struct cpuidle_state {
 	int (*enter)	(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv,
 			int index);
+
+	int (*enter_dead) (struct cpuidle_device *dev, int index);
 };
 
 /* Idle State Flags */
@@ -147,6 +149,8 @@ extern int cpuidle_wrap_enter(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv, int index,
 				int (*enter)(struct cpuidle_device *dev,
 					struct cpuidle_driver *drv, int index));
+extern int cpuidle_play_dead(void);
+
 #else
 static inline void disable_cpuidle(void) { }
 static inline int cpuidle_idle_call(void) { return -ENODEV; }
@@ -168,6 +172,7 @@ static inline int cpuidle_wrap_enter(struct cpuidle_device *dev,
 				int (*enter)(struct cpuidle_device *dev,
 					struct cpuidle_driver *drv, int index))
 { return -ENODEV; }
+static inline int cpuidle_play_dead(void) {return -ENODEV; }
 
 #endif
 
-- 
cgit v1.2.3


From ac909ec308ce8d5177963c780564824d12bc3fa2 Mon Sep 17 00:00:00 2001
From: Petr Vandrovec <petr@vmware.com>
Date: Thu, 8 Mar 2012 13:33:24 -0800
Subject: ACPI: Fix use-after-free in acpi_map_lsapic

When processor is being hot-added to the system, acpi_map_lsapic invokes
ACPI _MAT method to find APIC ID and flags, verifies that returned structure
is indeed ACPI's local APIC structure, and that flags contain MADT_ENABLED
bit.  Then saves APIC ID, frees structure - and accesses structure when
computing arguments for acpi_register_lapic call.  Which sometime leads
to acpi_register_lapic call being made with second argument zero, failing
to bring processor online with error 'Unable to map lapic to logical cpu
number'.

As lapic->lapic_flags & ACPI_MADT_ENABLED was already confirmed to be non-zero
few lines above, we can just pass unconditional ACPI_MADT_ENABLED to the
acpi_register_lapic.

Signed-off-by: Petr Vandrovec <petr@vmware.com>
Signed-off-by: Alok N Kataria <akataria@vmware.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/boot.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8e..bbcc2c389ade 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -642,6 +642,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	kfree(buffer.pointer);
 	buffer.length = ACPI_ALLOCATE_BUFFER;
 	buffer.pointer = NULL;
+	lapic = NULL;
 
 	if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
 		goto out;
@@ -650,7 +651,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 		goto free_tmp_map;
 
 	cpumask_copy(tmp_map, cpu_present_mask);
-	acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+	acpi_register_lapic(physid, ACPI_MADT_ENABLED);
 
 	/*
 	 * If mp_register_lapic successfully generates a new logical cpu
-- 
cgit v1.2.3


From c0e9afc0da6cb0f11497e5ea83377b3c451450e0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 28 Mar 2012 11:51:17 -0700
Subject: x86: Use -mno-avx when available

On gccs that support AVX it's a good idea to disable that too, similar to
how SSE2, SSE1 etc. are already disabled. This prevents the compiler
from generating AVX ever implicitely.

No failure observed, just from review.

[ hpa: Marking this for urgent and stable, simply because the patch
  will either have absolutely no effect *or* it will avoid potentially
  very hard to debug failures. ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/1332960678-11879-1-git-send-email-andi@firstfloor.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org>
---
 arch/x86/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 968dbe24a255..41a7237606a3 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -129,6 +129,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 # prevent gcc from generating any FP code by mistake
 KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
-- 
cgit v1.2.3


From dba69d1092e291e257fb5673a3ad0e4c87878ebc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sun, 1 Apr 2012 13:53:36 -0300
Subject: x86, kvm: Call restore_sched_clock_state() only after %gs is
 initialized

s2ram broke due to this KVM commit:

  b74f05d61b73 x86: kvmclock: abstract save/restore sched_clock_state

restore_sched_clock_state() methods use percpu data, therefore
they must run after %gs is initialized, but before mtrr_bp_restore()
(due to lockstat using sched_clock).

Move it to the correct place.

Reported-and-tested-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/power/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 47936830968c..218cdb16163c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -225,13 +225,13 @@ static void __restore_processor_state(struct saved_context *ctxt)
 	fix_processor_context();
 
 	do_fpu_end();
+	x86_platform.restore_sched_clock_state();
 	mtrr_bp_restore();
 }
 
 /* Needed by apm.c */
 void restore_processor_state(void)
 {
-	x86_platform.restore_sched_clock_state();
 	__restore_processor_state(&saved_context);
 }
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 7b8e6da46b921d30ac1553cac56d8fb74f0b431d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 27 Mar 2012 16:50:42 +0200
Subject: perf/x86/p4: Add format attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Steven reported his P4 not booting properly, the missing format
attributes cause a NULL ptr deref. Cure this by adding the
missing format specification.

I took the format description out of the comment near
p4_config_pack*() and hope that comment is still relatively
accurate.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Reported-by: Bruno Prémont <bonbons@linux-vserver.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1332859842.16159.227.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef484d9d0a25..a2dfacfd7103 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1271,6 +1271,17 @@ done:
 	return num ? -EINVAL : 0;
 }
 
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht,   "config:63"   );
+
+static struct attribute *intel_p4_formats_attr[] = {
+	&format_attr_cccr.attr,
+	&format_attr_escr.attr,
+	&format_attr_ht.attr,
+	NULL,
+};
+
 static __initconst const struct x86_pmu p4_pmu = {
 	.name			= "Netburst P4/Xeon",
 	.handle_irq		= p4_pmu_handle_irq,
@@ -1305,6 +1316,8 @@ static __initconst const struct x86_pmu p4_pmu = {
 	 * the former idea is taken from OProfile code
 	 */
 	.perfctr_second_write	= 1,
+
+	.format_attrs		= intel_p4_formats_attr,
 };
 
 __init int p4_pmu_init(void)
-- 
cgit v1.2.3


From a998d4342337c82dacdc0897d30a9364de1576a1 Mon Sep 17 00:00:00 2001
From: Jan Seiffert <kaffeemonster@googlemail.com>
Date: Fri, 30 Mar 2012 05:24:05 +0000
Subject: bpf jit: Let the x86 jit handle negative offsets

Now the helper function from filter.c for negative offsets is exported,
it can be used it in the jit to handle negative offsets.

First modify the asm load helper functions to handle:
- know positive offsets
- know negative offsets
- any offset

then the compiler can be modified to explicitly use these helper
when appropriate.

This fixes the case of a negative X register and allows to lift
the restriction that bpf programs with negative offsets can't
be jited.

Signed-of-by: Jan Seiffert <kaffeemonster@googlemail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit.S      | 122 +++++++++++++++++++++++++++++++++-----------
 arch/x86/net/bpf_jit_comp.c |  41 +++++++++------
 2 files changed, 115 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 66870223f8c5..877b9a1b2152 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -18,17 +18,17 @@
  * r9d : hlen = skb->len - skb->data_len
  */
 #define SKBDATA	%r8
-
-sk_load_word_ind:
-	.globl	sk_load_word_ind
-
-	add	%ebx,%esi	/* offset += X */
-#	test    %esi,%esi	/* if (offset < 0) goto bpf_error; */
-	js	bpf_error
+#define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
 
 sk_load_word:
 	.globl	sk_load_word
 
+	test	%esi,%esi
+	js	bpf_slow_path_word_neg
+
+sk_load_word_positive_offset:
+	.globl	sk_load_word_positive_offset
+
 	mov	%r9d,%eax		# hlen
 	sub	%esi,%eax		# hlen - offset
 	cmp	$3,%eax
@@ -37,16 +37,15 @@ sk_load_word:
 	bswap   %eax  			/* ntohl() */
 	ret
 
-
-sk_load_half_ind:
-	.globl sk_load_half_ind
-
-	add	%ebx,%esi	/* offset += X */
-	js	bpf_error
-
 sk_load_half:
 	.globl	sk_load_half
 
+	test	%esi,%esi
+	js	bpf_slow_path_half_neg
+
+sk_load_half_positive_offset:
+	.globl	sk_load_half_positive_offset
+
 	mov	%r9d,%eax
 	sub	%esi,%eax		#	hlen - offset
 	cmp	$1,%eax
@@ -55,14 +54,15 @@ sk_load_half:
 	rol	$8,%ax			# ntohs()
 	ret
 
-sk_load_byte_ind:
-	.globl sk_load_byte_ind
-	add	%ebx,%esi	/* offset += X */
-	js	bpf_error
-
 sk_load_byte:
 	.globl	sk_load_byte
 
+	test	%esi,%esi
+	js	bpf_slow_path_byte_neg
+
+sk_load_byte_positive_offset:
+	.globl	sk_load_byte_positive_offset
+
 	cmp	%esi,%r9d   /* if (offset >= hlen) goto bpf_slow_path_byte */
 	jle	bpf_slow_path_byte
 	movzbl	(SKBDATA,%rsi),%eax
@@ -73,25 +73,21 @@ sk_load_byte:
  *
  * Implements BPF_S_LDX_B_MSH : ldxb  4*([offset]&0xf)
  * Must preserve A accumulator (%eax)
- * Inputs : %esi is the offset value, already known positive
+ * Inputs : %esi is the offset value
  */
-ENTRY(sk_load_byte_msh)
-	CFI_STARTPROC
+sk_load_byte_msh:
+	.globl	sk_load_byte_msh
+	test	%esi,%esi
+	js	bpf_slow_path_byte_msh_neg
+
+sk_load_byte_msh_positive_offset:
+	.globl	sk_load_byte_msh_positive_offset
 	cmp	%esi,%r9d      /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
 	jle	bpf_slow_path_byte_msh
 	movzbl	(SKBDATA,%rsi),%ebx
 	and	$15,%bl
 	shl	$2,%bl
 	ret
-	CFI_ENDPROC
-ENDPROC(sk_load_byte_msh)
-
-bpf_error:
-# force a return 0 from jit handler
-	xor		%eax,%eax
-	mov		-8(%rbp),%rbx
-	leaveq
-	ret
 
 /* rsi contains offset and can be scratched */
 #define bpf_slow_path_common(LEN)		\
@@ -138,3 +134,67 @@ bpf_slow_path_byte_msh:
 	shl	$2,%al
 	xchg	%eax,%ebx
 	ret
+
+#define sk_negative_common(SIZE)				\
+	push	%rdi;	/* save skb */				\
+	push	%r9;						\
+	push	SKBDATA;					\
+/* rsi already has offset */					\
+	mov	$SIZE,%ecx;	/* size */			\
+	call	bpf_internal_load_pointer_neg_helper;		\
+	test	%rax,%rax;					\
+	pop	SKBDATA;					\
+	pop	%r9;						\
+	pop	%rdi;						\
+	jz	bpf_error
+
+
+bpf_slow_path_word_neg:
+	cmp	SKF_MAX_NEG_OFF, %esi	/* test range */
+	jl	bpf_error	/* offset lower -> error  */
+sk_load_word_negative_offset:
+	.globl	sk_load_word_negative_offset
+	sk_negative_common(4)
+	mov	(%rax), %eax
+	bswap	%eax
+	ret
+
+bpf_slow_path_half_neg:
+	cmp	SKF_MAX_NEG_OFF, %esi
+	jl	bpf_error
+sk_load_half_negative_offset:
+	.globl	sk_load_half_negative_offset
+	sk_negative_common(2)
+	mov	(%rax),%ax
+	rol	$8,%ax
+	movzwl	%ax,%eax
+	ret
+
+bpf_slow_path_byte_neg:
+	cmp	SKF_MAX_NEG_OFF, %esi
+	jl	bpf_error
+sk_load_byte_negative_offset:
+	.globl	sk_load_byte_negative_offset
+	sk_negative_common(1)
+	movzbl	(%rax), %eax
+	ret
+
+bpf_slow_path_byte_msh_neg:
+	cmp	SKF_MAX_NEG_OFF, %esi
+	jl	bpf_error
+sk_load_byte_msh_negative_offset:
+	.globl	sk_load_byte_msh_negative_offset
+	xchg	%eax,%ebx /* dont lose A , X is about to be scratched */
+	sk_negative_common(1)
+	movzbl	(%rax),%eax
+	and	$15,%al
+	shl	$2,%al
+	xchg	%eax,%ebx
+	ret
+
+bpf_error:
+# force a return 0 from jit handler
+	xor		%eax,%eax
+	mov		-8(%rbp),%rbx
+	leaveq
+	ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5a5b6e4dd738..0597f95b6da6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -30,7 +30,10 @@ int bpf_jit_enable __read_mostly;
  * assembly code in arch/x86/net/bpf_jit.S
  */
 extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
-extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
+extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
+extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[];
+extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
+extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[];
 
 static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -117,6 +120,8 @@ static inline void bpf_flush_icache(void *start, void *end)
 	set_fs(old_fs);
 }
 
+#define CHOOSE_LOAD_FUNC(K, func) \
+	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
 
 void bpf_jit_compile(struct sk_filter *fp)
 {
@@ -473,44 +478,46 @@ void bpf_jit_compile(struct sk_filter *fp)
 #endif
 				break;
 			case BPF_S_LD_W_ABS:
-				func = sk_load_word;
+				func = CHOOSE_LOAD_FUNC(K, sk_load_word);
 common_load:			seen |= SEEN_DATAREF;
-				if ((int)K < 0) {
-					/* Abort the JIT because __load_pointer() is needed. */
-					goto out;
-				}
 				t_offset = func - (image + addrs[i]);
 				EMIT1_off32(0xbe, K); /* mov imm32,%esi */
 				EMIT1_off32(0xe8, t_offset); /* call */
 				break;
 			case BPF_S_LD_H_ABS:
-				func = sk_load_half;
+				func = CHOOSE_LOAD_FUNC(K, sk_load_half);
 				goto common_load;
 			case BPF_S_LD_B_ABS:
-				func = sk_load_byte;
+				func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
 				goto common_load;
 			case BPF_S_LDX_B_MSH:
-				if ((int)K < 0) {
-					/* Abort the JIT because __load_pointer() is needed. */
-					goto out;
-				}
+				func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
 				seen |= SEEN_DATAREF | SEEN_XREG;
-				t_offset = sk_load_byte_msh - (image + addrs[i]);
+				t_offset = func - (image + addrs[i]);
 				EMIT1_off32(0xbe, K);	/* mov imm32,%esi */
 				EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
 				break;
 			case BPF_S_LD_W_IND:
-				func = sk_load_word_ind;
+				func = sk_load_word;
 common_load_ind:		seen |= SEEN_DATAREF | SEEN_XREG;
 				t_offset = func - (image + addrs[i]);
-				EMIT1_off32(0xbe, K);	/* mov imm32,%esi   */
+				if (K) {
+					if (is_imm8(K)) {
+						EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */
+					} else {
+						EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */
+						EMIT(K, 4);
+					}
+				} else {
+					EMIT2(0x89,0xde); /* mov %ebx,%esi */
+				}
 				EMIT1_off32(0xe8, t_offset);	/* call sk_load_xxx_ind */
 				break;
 			case BPF_S_LD_H_IND:
-				func = sk_load_half_ind;
+				func = sk_load_half;
 				goto common_load_ind;
 			case BPF_S_LD_B_IND:
-				func = sk_load_byte_ind;
+				func = sk_load_byte;
 				goto common_load_ind;
 			case BPF_S_JMP_JA:
 				t_offset = addrs[i + K] - addrs[i];
-- 
cgit v1.2.3


From fea5295324ce7ce6ccfe0909cc6740a2d34aa5a3 Mon Sep 17 00:00:00 2001
From: Sasikantha babu <sasikanth.v19@gmail.com>
Date: Wed, 21 Mar 2012 18:49:00 +0530
Subject: KVM: PMU: Fix integer constant is too large warning in
 kvm_pmu_set_msr()

Signed-off-by: Sasikantha babu <sasikanth.v19@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a73f0c104813..173df38dbda5 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -369,7 +369,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
 		if (pmu->fixed_ctr_ctrl == data)
 			return 0;
-		if (!(data & 0xfffffffffffff444)) {
+		if (!(data & 0xfffffffffffff444ull)) {
 			reprogram_fixed_counters(pmu, data);
 			return 0;
 		}
-- 
cgit v1.2.3


From 7a4f5ad051e02139a9f1c0f7f4b1acb88915852b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 27 Mar 2012 19:47:26 -0300
Subject: KVM: VMX: vmx_set_cr0 expects kvm->srcu locked

vmx_set_cr0 is called from vcpu run context, therefore it expects
kvm->srcu to be held (for setting up the real-mode TSS).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 280751c84724..ad85adfef843 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3906,7 +3906,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
 	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
+	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 	vmx_set_cr4(&vmx->vcpu, 0);
 	vmx_set_efer(&vmx->vcpu, 0);
 	vmx_fpu_activate(&vmx->vcpu);
-- 
cgit v1.2.3


From e08759215b7dcb7111e94f0f96918dd98e86ca6b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 4 Apr 2012 15:30:33 +0300
Subject: KVM: Resolve RCU vs. async page fault problem

"Page ready" async PF can kick vcpu out of idle state much like IRQ.
We need to tell RCU about this.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 694d801bf606..b8ba6e4a27e4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -38,6 +38,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/tlbflush.h>
+#include <asm/idle.h>
 
 static int kvmapf = 1;
 
@@ -253,7 +254,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 		kvm_async_pf_task_wait((u32)read_cr2());
 		break;
 	case KVM_PV_REASON_PAGE_READY:
+		rcu_irq_enter();
+		exit_idle();
 		kvm_async_pf_task_wake((u32)read_cr2());
+		rcu_irq_exit();
 		break;
 	}
 }
-- 
cgit v1.2.3


From 234e340582901211f40d8c732afc49f0630ecf05 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 5 Apr 2012 14:25:11 -0700
Subject: simple_open: automatically convert to simple_open()

Many users of debugfs copy the implementation of default_open() when
they want to support a custom read/write function op.  This leads to a
proliferation of the default_open() implementation across the entire
tree.

Now that the common implementation has been consolidated into libfs we
can replace all the users of this function with simple_open().

This replacement was done with the following semantic patch:

<smpl>
@ open @
identifier open_f != simple_open;
identifier i, f;
@@
-int open_f(struct inode *i, struct file *f)
-{
(
-if (i->i_private)
-f->private_data = i->i_private;
|
-f->private_data = i->i_private;
)
-return 0;
-}

@ has_open depends on open @
identifier fops;
identifier open.open_f;
@@
struct file_operations fops = {
...
-.open = open_f,
+.open = simple_open,
...
};
</smpl>

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Julia Lawall <Julia.Lawall@lip6.fr>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-msm/smd_debug.c                   |  8 +----
 arch/x86/kernel/kdebugfs.c                      |  9 +-----
 drivers/acpi/ec_sys.c                           |  8 +----
 drivers/base/regmap/regmap-debugfs.c            | 12 ++-----
 drivers/bluetooth/btmrvl_debugfs.c              | 26 ++++++---------
 drivers/char/virtio_console.c                   |  8 +----
 drivers/dma/coh901318.c                         |  9 +-----
 drivers/gpu/drm/i915/i915_debugfs.c             | 14 ++-------
 drivers/hid/hid-picolcd.c                       | 16 ++--------
 drivers/hid/hid-wiimote-debug.c                 |  8 +----
 drivers/idle/i7300_idle.c                       |  8 +----
 drivers/iommu/omap-iommu-debug.c                | 10 ++----
 drivers/mfd/aat2870-core.c                      |  9 +-----
 drivers/mfd/ab3100-core.c                       |  8 +----
 drivers/misc/ibmasm/ibmasmfs.c                  |  8 +----
 drivers/mtd/ubi/debug.c                         | 10 +-----
 drivers/net/caif/caif_spi.c                     | 10 ++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  9 +-----
 drivers/net/wimax/i2400m/debugfs.c              | 15 ++-------
 drivers/net/wireless/ath/ath5k/debug.c          | 23 +++++---------
 drivers/net/wireless/ath/ath6kl/debug.c         | 42 +++++++++++--------------
 drivers/net/wireless/ath/ath9k/debug.c          | 37 ++++++++++------------
 drivers/net/wireless/ath/ath9k/dfs_debug.c      |  9 +-----
 drivers/net/wireless/ath/ath9k/htc_drv_debug.c  | 26 ++++++---------
 drivers/net/wireless/ath/ath9k/rc.c             |  8 +----
 drivers/net/wireless/ath/carl9170/debug.c       |  7 +----
 drivers/net/wireless/b43/debugfs.c              |  8 +----
 drivers/net/wireless/b43legacy/debugfs.c        |  8 +----
 drivers/net/wireless/iwlegacy/3945-rs.c         |  8 +----
 drivers/net/wireless/iwlegacy/4965-rs.c         | 12 ++-----
 drivers/net/wireless/iwlegacy/debug.c           | 12 ++-----
 drivers/net/wireless/iwlwifi/iwl-agn-rs.c       | 11 ++-----
 drivers/net/wireless/iwlwifi/iwl-debugfs.c      | 12 ++-----
 drivers/net/wireless/iwlwifi/iwl-trans-pcie.c   | 12 ++-----
 drivers/net/wireless/iwmc3200wifi/debugfs.c     | 14 +++------
 drivers/net/wireless/iwmc3200wifi/sdio.c        |  9 +-----
 drivers/net/wireless/libertas/debugfs.c         | 10 ++----
 drivers/net/wireless/mwifiex/debugfs.c          | 18 ++---------
 drivers/net/wireless/wl1251/debugfs.c           | 14 +++------
 drivers/net/wireless/wl12xx/debugfs.c           | 38 ++++++++++------------
 drivers/oprofile/oprofilefs.c                   | 14 ++-------
 drivers/remoteproc/remoteproc_debugfs.c         | 13 ++------
 drivers/scsi/lpfc/lpfc_debugfs.c                |  9 +-----
 drivers/spi/spi-dw.c                            |  8 +----
 drivers/tty/serial/mfd.c                        |  9 ++----
 drivers/tty/serial/pch_uart.c                   |  8 ++---
 drivers/usb/core/inode.c                        | 10 +-----
 drivers/usb/host/ehci-dbg.c                     |  9 +-----
 drivers/uwb/uwb-debug.c                         |  9 +-----
 fs/debugfs/file.c                               | 14 ++-------
 fs/dlm/debug_fs.c                               |  9 +-----
 fs/pstore/inode.c                               |  8 +----
 kernel/trace/blktrace.c                         | 18 ++---------
 net/mac80211/debugfs.c                          | 12 ++-----
 net/mac80211/debugfs.h                          |  1 -
 net/mac80211/debugfs_key.c                      |  4 +--
 net/mac80211/debugfs_netdev.c                   |  2 +-
 net/mac80211/debugfs_sta.c                      |  4 +--
 net/mac80211/rate.c                             |  2 +-
 net/wireless/debugfs.c                          | 10 ++----
 sound/soc/imx/imx-audmux.c                      |  8 +----
 sound/soc/soc-core.c                            |  8 +----
 sound/soc/soc-dapm.c                            | 16 ++--------
 63 files changed, 176 insertions(+), 572 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/mach-msm/smd_debug.c b/arch/arm/mach-msm/smd_debug.c
index 0c56a5aaf588..c56df9e932ae 100644
--- a/arch/arm/mach-msm/smd_debug.c
+++ b/arch/arm/mach-msm/smd_debug.c
@@ -203,15 +203,9 @@ static ssize_t debug_read(struct file *file, char __user *buf,
 	return simple_read_from_buffer(buf, count, ppos, debug_buffer, bsize);
 }
 
-static int debug_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static const struct file_operations debug_ops = {
 	.read = debug_read,
-	.open = debug_open,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 90fcf62854bb..1d5d31ea686b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 	return count;
 }
 
-static int setup_data_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static const struct file_operations fops_setup_data = {
 	.read		= setup_data_read,
-	.open		= setup_data_open,
+	.open		= simple_open,
 	.llseek		= default_llseek,
 };
 
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index b258cab9061c..7586544fddb4 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -27,12 +27,6 @@ MODULE_PARM_DESC(write_support, "Dangerous, reboot and removal of battery may "
 
 static struct dentry *acpi_ec_debugfs_dir;
 
-static int acpi_ec_open_io(struct inode *i, struct file *f)
-{
-	f->private_data = i->i_private;
-	return 0;
-}
-
 static ssize_t acpi_ec_read_io(struct file *f, char __user *buf,
 			       size_t count, loff_t *off)
 {
@@ -95,7 +89,7 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,
 
 static const struct file_operations acpi_ec_io_ops = {
 	.owner = THIS_MODULE,
-	.open  = acpi_ec_open_io,
+	.open  = simple_open,
 	.read  = acpi_ec_read_io,
 	.write = acpi_ec_write_io,
 	.llseek = default_llseek,
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 58517a5dac13..251eb70f83e7 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -27,12 +27,6 @@ static size_t regmap_calc_reg_len(int max_val, char *buf, size_t buf_size)
 	return strlen(buf);
 }
 
-static int regmap_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t regmap_name_read_file(struct file *file,
 				     char __user *user_buf, size_t count,
 				     loff_t *ppos)
@@ -57,7 +51,7 @@ static ssize_t regmap_name_read_file(struct file *file,
 }
 
 static const struct file_operations regmap_name_fops = {
-	.open = regmap_open_file,
+	.open = simple_open,
 	.read = regmap_name_read_file,
 	.llseek = default_llseek,
 };
@@ -174,7 +168,7 @@ static ssize_t regmap_map_write_file(struct file *file,
 #endif
 
 static const struct file_operations regmap_map_fops = {
-	.open = regmap_open_file,
+	.open = simple_open,
 	.read = regmap_map_read_file,
 	.write = regmap_map_write_file,
 	.llseek = default_llseek,
@@ -243,7 +237,7 @@ out:
 }
 
 static const struct file_operations regmap_access_fops = {
-	.open = regmap_open_file,
+	.open = simple_open,
 	.read = regmap_access_read_file,
 	.llseek = default_llseek,
 };
diff --git a/drivers/bluetooth/btmrvl_debugfs.c b/drivers/bluetooth/btmrvl_debugfs.c
index 6c20bbb54b71..428dbb7574bd 100644
--- a/drivers/bluetooth/btmrvl_debugfs.c
+++ b/drivers/bluetooth/btmrvl_debugfs.c
@@ -45,12 +45,6 @@ struct btmrvl_debugfs_data {
 	struct dentry *txdnldready;
 };
 
-static int btmrvl_open_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t btmrvl_hscfgcmd_write(struct file *file,
 			const char __user *ubuf, size_t count, loff_t *ppos)
 {
@@ -93,7 +87,7 @@ static ssize_t btmrvl_hscfgcmd_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_hscfgcmd_fops = {
 	.read	= btmrvl_hscfgcmd_read,
 	.write	= btmrvl_hscfgcmd_write,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -134,7 +128,7 @@ static ssize_t btmrvl_psmode_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_psmode_fops = {
 	.read	= btmrvl_psmode_read,
 	.write	= btmrvl_psmode_write,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -180,7 +174,7 @@ static ssize_t btmrvl_pscmd_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_pscmd_fops = {
 	.read = btmrvl_pscmd_read,
 	.write = btmrvl_pscmd_write,
-	.open = btmrvl_open_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -221,7 +215,7 @@ static ssize_t btmrvl_gpiogap_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_gpiogap_fops = {
 	.read	= btmrvl_gpiogap_read,
 	.write	= btmrvl_gpiogap_write,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -265,7 +259,7 @@ static ssize_t btmrvl_hscmd_read(struct file *file, char __user *userbuf,
 static const struct file_operations btmrvl_hscmd_fops = {
 	.read	= btmrvl_hscmd_read,
 	.write	= btmrvl_hscmd_write,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -305,7 +299,7 @@ static ssize_t btmrvl_hsmode_read(struct file *file, char __user * userbuf,
 static const struct file_operations btmrvl_hsmode_fops = {
 	.read	= btmrvl_hsmode_read,
 	.write	= btmrvl_hsmode_write,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -323,7 +317,7 @@ static ssize_t btmrvl_curpsmode_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations btmrvl_curpsmode_fops = {
 	.read	= btmrvl_curpsmode_read,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -341,7 +335,7 @@ static ssize_t btmrvl_psstate_read(struct file *file, char __user * userbuf,
 
 static const struct file_operations btmrvl_psstate_fops = {
 	.read	= btmrvl_psstate_read,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -359,7 +353,7 @@ static ssize_t btmrvl_hsstate_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations btmrvl_hsstate_fops = {
 	.read	= btmrvl_hsstate_read,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
@@ -378,7 +372,7 @@ static ssize_t btmrvl_txdnldready_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations btmrvl_txdnldready_fops = {
 	.read	= btmrvl_txdnldready_read,
-	.open	= btmrvl_open_generic,
+	.open	= simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index b58b56187065..ddf86b6500b7 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1038,12 +1038,6 @@ static struct attribute_group port_attribute_group = {
 	.attrs = port_sysfs_entries,
 };
 
-static int debugfs_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t debugfs_read(struct file *filp, char __user *ubuf,
 			    size_t count, loff_t *offp)
 {
@@ -1087,7 +1081,7 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf,
 
 static const struct file_operations port_debugfs_ops = {
 	.owner = THIS_MODULE,
-	.open  = debugfs_open,
+	.open  = simple_open,
 	.read  = debugfs_read,
 };
 
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
index d65a718c0f9b..a63badcd2d6e 100644
--- a/drivers/dma/coh901318.c
+++ b/drivers/dma/coh901318.c
@@ -104,13 +104,6 @@ static void coh901318_list_print(struct coh901318_chan *cohc,
 static struct coh901318_base *debugfs_dma_base;
 static struct dentry *dma_dentry;
 
-static int coh901318_debugfs_open(struct inode *inode, struct file *file)
-{
-
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static int coh901318_debugfs_read(struct file *file, char __user *buf,
 				  size_t count, loff_t *f_pos)
 {
@@ -158,7 +151,7 @@ static int coh901318_debugfs_read(struct file *file, char __user *buf,
 
 static const struct file_operations coh901318_debugfs_status_operations = {
 	.owner		= THIS_MODULE,
-	.open		= coh901318_debugfs_open,
+	.open		= simple_open,
 	.read		= coh901318_debugfs_read,
 	.llseek		= default_llseek,
 };
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fdb7ccefffbd..b505b70dba05 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1502,14 +1502,6 @@ static int i915_ppgtt_info(struct seq_file *m, void *data)
 	return 0;
 }
 
-static int
-i915_debugfs_common_open(struct inode *inode,
-			 struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t
 i915_wedged_read(struct file *filp,
 		 char __user *ubuf,
@@ -1560,7 +1552,7 @@ i915_wedged_write(struct file *filp,
 
 static const struct file_operations i915_wedged_fops = {
 	.owner = THIS_MODULE,
-	.open = i915_debugfs_common_open,
+	.open = simple_open,
 	.read = i915_wedged_read,
 	.write = i915_wedged_write,
 	.llseek = default_llseek,
@@ -1622,7 +1614,7 @@ i915_max_freq_write(struct file *filp,
 
 static const struct file_operations i915_max_freq_fops = {
 	.owner = THIS_MODULE,
-	.open = i915_debugfs_common_open,
+	.open = simple_open,
 	.read = i915_max_freq_read,
 	.write = i915_max_freq_write,
 	.llseek = default_llseek,
@@ -1693,7 +1685,7 @@ i915_cache_sharing_write(struct file *filp,
 
 static const struct file_operations i915_cache_sharing_fops = {
 	.owner = THIS_MODULE,
-	.open = i915_debugfs_common_open,
+	.open = simple_open,
 	.read = i915_cache_sharing_read,
 	.write = i915_cache_sharing_write,
 	.llseek = default_llseek,
diff --git a/drivers/hid/hid-picolcd.c b/drivers/hid/hid-picolcd.c
index 12f9777c385d..45c3433f7986 100644
--- a/drivers/hid/hid-picolcd.c
+++ b/drivers/hid/hid-picolcd.c
@@ -1525,12 +1525,6 @@ static const struct file_operations picolcd_debug_reset_fops = {
 /*
  * The "eeprom" file
  */
-static int picolcd_debug_eeprom_open(struct inode *i, struct file *f)
-{
-	f->private_data = i->i_private;
-	return 0;
-}
-
 static ssize_t picolcd_debug_eeprom_read(struct file *f, char __user *u,
 		size_t s, loff_t *off)
 {
@@ -1618,7 +1612,7 @@ static ssize_t picolcd_debug_eeprom_write(struct file *f, const char __user *u,
  */
 static const struct file_operations picolcd_debug_eeprom_fops = {
 	.owner    = THIS_MODULE,
-	.open     = picolcd_debug_eeprom_open,
+	.open     = simple_open,
 	.read     = picolcd_debug_eeprom_read,
 	.write    = picolcd_debug_eeprom_write,
 	.llseek   = generic_file_llseek,
@@ -1627,12 +1621,6 @@ static const struct file_operations picolcd_debug_eeprom_fops = {
 /*
  * The "flash" file
  */
-static int picolcd_debug_flash_open(struct inode *i, struct file *f)
-{
-	f->private_data = i->i_private;
-	return 0;
-}
-
 /* record a flash address to buf (bounds check to be done by caller) */
 static int _picolcd_flash_setaddr(struct picolcd_data *data, u8 *buf, long off)
 {
@@ -1817,7 +1805,7 @@ static ssize_t picolcd_debug_flash_write(struct file *f, const char __user *u,
  */
 static const struct file_operations picolcd_debug_flash_fops = {
 	.owner    = THIS_MODULE,
-	.open     = picolcd_debug_flash_open,
+	.open     = simple_open,
 	.read     = picolcd_debug_flash_read,
 	.write    = picolcd_debug_flash_write,
 	.llseek   = generic_file_llseek,
diff --git a/drivers/hid/hid-wiimote-debug.c b/drivers/hid/hid-wiimote-debug.c
index 17dabc1f339e..eec329197c16 100644
--- a/drivers/hid/hid-wiimote-debug.c
+++ b/drivers/hid/hid-wiimote-debug.c
@@ -23,12 +23,6 @@ struct wiimote_debug {
 	struct dentry *drm;
 };
 
-static int wiidebug_eeprom_open(struct inode *i, struct file *f)
-{
-	f->private_data = i->i_private;
-	return 0;
-}
-
 static ssize_t wiidebug_eeprom_read(struct file *f, char __user *u, size_t s,
 								loff_t *off)
 {
@@ -83,7 +77,7 @@ static ssize_t wiidebug_eeprom_read(struct file *f, char __user *u, size_t s,
 
 static const struct file_operations wiidebug_eeprom_fops = {
 	.owner = THIS_MODULE,
-	.open = wiidebug_eeprom_open,
+	.open = simple_open,
 	.read = wiidebug_eeprom_read,
 	.llseek = generic_file_llseek,
 };
diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c
index c976285d313e..fa080ebd568f 100644
--- a/drivers/idle/i7300_idle.c
+++ b/drivers/idle/i7300_idle.c
@@ -516,12 +516,6 @@ static struct notifier_block i7300_idle_nb = {
 
 MODULE_DEVICE_TABLE(pci, pci_tbl);
 
-int stats_open_generic(struct inode *inode, struct file *fp)
-{
-	fp->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count,
 				loff_t *off)
 {
@@ -534,7 +528,7 @@ static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count,
 }
 
 static const struct file_operations idle_fops = {
-	.open	= stats_open_generic,
+	.open	= simple_open,
 	.read	= stats_read_ul,
 	.llseek = default_llseek,
 };
diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c
index 103dbd92e256..f55fc5dfbadc 100644
--- a/drivers/iommu/omap-iommu-debug.c
+++ b/drivers/iommu/omap-iommu-debug.c
@@ -323,15 +323,9 @@ err_out:
 	return count;
 }
 
-static int debug_open_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define DEBUG_FOPS(name)						\
 	static const struct file_operations debug_##name##_fops = {	\
-		.open = debug_open_generic,				\
+		.open = simple_open,					\
 		.read = debug_read_##name,				\
 		.write = debug_write_##name,				\
 		.llseek = generic_file_llseek,				\
@@ -339,7 +333,7 @@ static int debug_open_generic(struct inode *inode, struct file *file)
 
 #define DEBUG_FOPS_RO(name)						\
 	static const struct file_operations debug_##name##_fops = {	\
-		.open = debug_open_generic,				\
+		.open = simple_open,					\
 		.read = debug_read_##name,				\
 		.llseek = generic_file_llseek,				\
 	};
diff --git a/drivers/mfd/aat2870-core.c b/drivers/mfd/aat2870-core.c
index 3aa36eb5c79b..44a3fdbadef4 100644
--- a/drivers/mfd/aat2870-core.c
+++ b/drivers/mfd/aat2870-core.c
@@ -262,13 +262,6 @@ static ssize_t aat2870_dump_reg(struct aat2870_data *aat2870, char *buf)
 	return count;
 }
 
-static int aat2870_reg_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static ssize_t aat2870_reg_read_file(struct file *file, char __user *user_buf,
 				     size_t count, loff_t *ppos)
 {
@@ -330,7 +323,7 @@ static ssize_t aat2870_reg_write_file(struct file *file,
 }
 
 static const struct file_operations aat2870_reg_fops = {
-	.open = aat2870_reg_open_file,
+	.open = simple_open,
 	.read = aat2870_reg_read_file,
 	.write = aat2870_reg_write_file,
 };
diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index 60107ee166fc..1efad20fb175 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -483,12 +483,6 @@ struct ab3100_get_set_reg_priv {
 	bool mode;
 };
 
-static int ab3100_get_set_reg_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t ab3100_get_set_reg(struct file *file,
 				  const char __user *user_buf,
 				  size_t count, loff_t *ppos)
@@ -583,7 +577,7 @@ static ssize_t ab3100_get_set_reg(struct file *file,
 }
 
 static const struct file_operations ab3100_get_set_reg_fops = {
-	.open = ab3100_get_set_reg_open_file,
+	.open = simple_open,
 	.write = ab3100_get_set_reg,
 	.llseek = noop_llseek,
 };
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 1c034b80d408..6673e578b3e9 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -500,12 +500,6 @@ static ssize_t r_heartbeat_file_write(struct file *file, const char __user *buf,
 	return 1;
 }
 
-static int remote_settings_file_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static int remote_settings_file_close(struct inode *inode, struct file *file)
 {
 	return 0;
@@ -600,7 +594,7 @@ static const struct file_operations r_heartbeat_fops = {
 };
 
 static const struct file_operations remote_settings_fops = {
-	.open =		remote_settings_file_open,
+	.open =		simple_open,
 	.release =	remote_settings_file_close,
 	.read =		remote_settings_file_read,
 	.write =	remote_settings_file_write,
diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c
index e2cdebf40840..61af9bb560ab 100644
--- a/drivers/mtd/ubi/debug.c
+++ b/drivers/mtd/ubi/debug.c
@@ -386,19 +386,11 @@ out:
 	return count;
 }
 
-static int default_open(struct inode *inode, struct file *file)
-{
-	if (inode->i_private)
-		file->private_data = inode->i_private;
-
-	return 0;
-}
-
 /* File operations for all UBI debugfs files */
 static const struct file_operations dfs_fops = {
 	.read   = dfs_file_read,
 	.write  = dfs_file_write,
-	.open   = default_open,
+	.open	= simple_open,
 	.llseek = no_llseek,
 	.owner  = THIS_MODULE,
 };
diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c
index 96391c36fa74..b71ce9bf0afb 100644
--- a/drivers/net/caif/caif_spi.c
+++ b/drivers/net/caif/caif_spi.c
@@ -127,12 +127,6 @@ static inline void dev_debugfs_rem(struct cfspi *cfspi)
 	debugfs_remove(cfspi->dbgfs_dir);
 }
 
-static int dbgfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t dbgfs_state(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
@@ -243,13 +237,13 @@ static ssize_t dbgfs_frame(struct file *file, char __user *user_buf,
 }
 
 static const struct file_operations dbgfs_state_fops = {
-	.open = dbgfs_open,
+	.open = simple_open,
 	.read = dbgfs_state,
 	.owner = THIS_MODULE
 };
 
 static const struct file_operations dbgfs_frame_fops = {
-	.open = dbgfs_open,
+	.open = simple_open,
 	.read = dbgfs_frame,
 	.owner = THIS_MODULE
 };
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 05ff076af06d..b126b98065a9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2000,13 +2000,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = {
 /*
  * debugfs support
  */
-
-static int mem_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t mem_read(struct file *file, char __user *buf, size_t count,
 			loff_t *ppos)
 {
@@ -2050,7 +2043,7 @@ static ssize_t mem_read(struct file *file, char __user *buf, size_t count,
 
 static const struct file_operations mem_debugfs_fops = {
 	.owner   = THIS_MODULE,
-	.open    = mem_open,
+	.open    = simple_open,
 	.read    = mem_read,
 	.llseek  = default_llseek,
 };
diff --git a/drivers/net/wimax/i2400m/debugfs.c b/drivers/net/wimax/i2400m/debugfs.c
index 129ba36bd04d..4b66ab1d0e5c 100644
--- a/drivers/net/wimax/i2400m/debugfs.c
+++ b/drivers/net/wimax/i2400m/debugfs.c
@@ -53,17 +53,6 @@ struct dentry *debugfs_create_netdev_queue_stopped(
 				   &fops_netdev_queue_stopped);
 }
 
-
-/*
- * inode->i_private has the @data argument to debugfs_create_file()
- */
-static
-int i2400m_stats_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
 /*
  * We don't allow partial reads of this file, as then the reader would
  * get weirdly confused data as it is updated.
@@ -117,7 +106,7 @@ ssize_t i2400m_rx_stats_write(struct file *filp, const char __user *buffer,
 static
 const struct file_operations i2400m_rx_stats_fops = {
 	.owner =	THIS_MODULE,
-	.open =		i2400m_stats_open,
+	.open =		simple_open,
 	.read =		i2400m_rx_stats_read,
 	.write =	i2400m_rx_stats_write,
 	.llseek =	default_llseek,
@@ -170,7 +159,7 @@ ssize_t i2400m_tx_stats_write(struct file *filp, const char __user *buffer,
 static
 const struct file_operations i2400m_tx_stats_fops = {
 	.owner =	THIS_MODULE,
-	.open =		i2400m_stats_open,
+	.open =		simple_open,
 	.read =		i2400m_tx_stats_read,
 	.write =	i2400m_tx_stats_write,
 	.llseek =	default_llseek,
diff --git a/drivers/net/wireless/ath/ath5k/debug.c b/drivers/net/wireless/ath/ath5k/debug.c
index 8c5ce8b0c734..e5e8f45d86ac 100644
--- a/drivers/net/wireless/ath/ath5k/debug.c
+++ b/drivers/net/wireless/ath/ath5k/debug.c
@@ -71,13 +71,6 @@ static unsigned int ath5k_debug;
 module_param_named(debug, ath5k_debug, uint, 0);
 
 
-static int ath5k_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
-
 /* debugfs: registers */
 
 struct reg {
@@ -265,7 +258,7 @@ static ssize_t write_file_beacon(struct file *file,
 static const struct file_operations fops_beacon = {
 	.read = read_file_beacon,
 	.write = write_file_beacon,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -285,7 +278,7 @@ static ssize_t write_file_reset(struct file *file,
 
 static const struct file_operations fops_reset = {
 	.write = write_file_reset,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = noop_llseek,
 };
@@ -365,7 +358,7 @@ static ssize_t write_file_debug(struct file *file,
 static const struct file_operations fops_debug = {
 	.read = read_file_debug,
 	.write = write_file_debug,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -477,7 +470,7 @@ static ssize_t write_file_antenna(struct file *file,
 static const struct file_operations fops_antenna = {
 	.read = read_file_antenna,
 	.write = write_file_antenna,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -532,7 +525,7 @@ static ssize_t read_file_misc(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_misc = {
 	.read = read_file_misc,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 };
 
@@ -647,7 +640,7 @@ static ssize_t write_file_frameerrors(struct file *file,
 static const struct file_operations fops_frameerrors = {
 	.read = read_file_frameerrors,
 	.write = write_file_frameerrors,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -810,7 +803,7 @@ static ssize_t write_file_ani(struct file *file,
 static const struct file_operations fops_ani = {
 	.read = read_file_ani,
 	.write = write_file_ani,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -881,7 +874,7 @@ static ssize_t write_file_queue(struct file *file,
 static const struct file_operations fops_queue = {
 	.read = read_file_queue,
 	.write = write_file_queue,
-	.open = ath5k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
diff --git a/drivers/net/wireless/ath/ath6kl/debug.c b/drivers/net/wireless/ath/ath6kl/debug.c
index 552adb3f80d0..d01403a263ff 100644
--- a/drivers/net/wireless/ath/ath6kl/debug.c
+++ b/drivers/net/wireless/ath/ath6kl/debug.c
@@ -217,12 +217,6 @@ void dump_cred_dist_stats(struct htc_target *target)
 		   target->credit_info->cur_free_credits);
 }
 
-static int ath6kl_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 void ath6kl_debug_war(struct ath6kl *ar, enum ath6kl_war war)
 {
 	switch (war) {
@@ -263,7 +257,7 @@ static ssize_t read_file_war_stats(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_war_stats = {
 	.read = read_file_war_stats,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -488,7 +482,7 @@ static ssize_t ath6kl_fwlog_mask_write(struct file *file,
 }
 
 static const struct file_operations fops_fwlog_mask = {
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.read = ath6kl_fwlog_mask_read,
 	.write = ath6kl_fwlog_mask_write,
 	.owner = THIS_MODULE,
@@ -634,7 +628,7 @@ static ssize_t read_file_tgt_stats(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_tgt_stats = {
 	.read = read_file_tgt_stats,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -699,7 +693,7 @@ static ssize_t read_file_credit_dist_stats(struct file *file,
 
 static const struct file_operations fops_credit_dist_stats = {
 	.read = read_file_credit_dist_stats,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -802,7 +796,7 @@ static ssize_t ath6kl_endpoint_stats_write(struct file *file,
 }
 
 static const struct file_operations fops_endpoint_stats = {
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.read = ath6kl_endpoint_stats_read,
 	.write = ath6kl_endpoint_stats_write,
 	.owner = THIS_MODULE,
@@ -875,7 +869,7 @@ static ssize_t ath6kl_regread_write(struct file *file,
 static const struct file_operations fops_diag_reg_read = {
 	.read = ath6kl_regread_read,
 	.write = ath6kl_regread_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -999,7 +993,7 @@ static ssize_t ath6kl_lrssi_roam_read(struct file *file,
 static const struct file_operations fops_lrssi_roam_threshold = {
 	.read = ath6kl_lrssi_roam_read,
 	.write = ath6kl_lrssi_roam_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1061,7 +1055,7 @@ static ssize_t ath6kl_regwrite_write(struct file *file,
 static const struct file_operations fops_diag_reg_write = {
 	.read = ath6kl_regwrite_read,
 	.write = ath6kl_regwrite_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1166,7 +1160,7 @@ static ssize_t ath6kl_roam_table_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_roam_table = {
 	.read = ath6kl_roam_table_read,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1204,7 +1198,7 @@ static ssize_t ath6kl_force_roam_write(struct file *file,
 
 static const struct file_operations fops_force_roam = {
 	.write = ath6kl_force_roam_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1244,7 +1238,7 @@ static ssize_t ath6kl_roam_mode_write(struct file *file,
 
 static const struct file_operations fops_roam_mode = {
 	.write = ath6kl_roam_mode_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1286,7 +1280,7 @@ static ssize_t ath6kl_keepalive_write(struct file *file,
 }
 
 static const struct file_operations fops_keepalive = {
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.read = ath6kl_keepalive_read,
 	.write = ath6kl_keepalive_write,
 	.owner = THIS_MODULE,
@@ -1331,7 +1325,7 @@ static ssize_t ath6kl_disconnect_timeout_write(struct file *file,
 }
 
 static const struct file_operations fops_disconnect_timeout = {
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.read = ath6kl_disconnect_timeout_read,
 	.write = ath6kl_disconnect_timeout_write,
 	.owner = THIS_MODULE,
@@ -1512,7 +1506,7 @@ static ssize_t ath6kl_create_qos_write(struct file *file,
 
 static const struct file_operations fops_create_qos = {
 	.write = ath6kl_create_qos_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1560,7 +1554,7 @@ static ssize_t ath6kl_delete_qos_write(struct file *file,
 
 static const struct file_operations fops_delete_qos = {
 	.write = ath6kl_delete_qos_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1593,7 +1587,7 @@ static ssize_t ath6kl_bgscan_int_write(struct file *file,
 
 static const struct file_operations fops_bgscan_int = {
 	.write = ath6kl_bgscan_int_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1651,7 +1645,7 @@ static ssize_t ath6kl_listen_int_read(struct file *file,
 static const struct file_operations fops_listen_int = {
 	.read = ath6kl_listen_int_read,
 	.write = ath6kl_listen_int_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1711,7 +1705,7 @@ static ssize_t ath6kl_power_params_write(struct file *file,
 
 static const struct file_operations fops_power_params = {
 	.write = ath6kl_power_params_write,
-	.open = ath6kl_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index 35d1c8e91d1c..ff47b32ecaf4 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -26,11 +26,6 @@
 #define REG_READ_D(_ah, _reg) \
 	ath9k_hw_common(_ah)->ops->read((_ah), (_reg))
 
-static int ath9k_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 static ssize_t ath9k_debugfs_read_buf(struct file *file, char __user *user_buf,
 				      size_t count, loff_t *ppos)
@@ -83,7 +78,7 @@ static ssize_t write_file_debug(struct file *file, const char __user *user_buf,
 static const struct file_operations fops_debug = {
 	.read = read_file_debug,
 	.write = write_file_debug,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -129,7 +124,7 @@ static ssize_t write_file_tx_chainmask(struct file *file, const char __user *use
 static const struct file_operations fops_tx_chainmask = {
 	.read = read_file_tx_chainmask,
 	.write = write_file_tx_chainmask,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -172,7 +167,7 @@ static ssize_t write_file_rx_chainmask(struct file *file, const char __user *use
 static const struct file_operations fops_rx_chainmask = {
 	.read = read_file_rx_chainmask,
 	.write = write_file_rx_chainmask,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -223,7 +218,7 @@ static ssize_t write_file_disable_ani(struct file *file,
 static const struct file_operations fops_disable_ani = {
 	.read = read_file_disable_ani,
 	.write = write_file_disable_ani,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -324,7 +319,7 @@ static ssize_t read_file_dma(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_dma = {
 	.read = read_file_dma,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -446,7 +441,7 @@ static ssize_t read_file_interrupt(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_interrupt = {
 	.read = read_file_interrupt,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -852,28 +847,28 @@ void ath_debug_stat_tx(struct ath_softc *sc, struct ath_buf *bf,
 
 static const struct file_operations fops_xmit = {
 	.read = read_file_xmit,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
 
 static const struct file_operations fops_stations = {
 	.read = read_file_stations,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
 
 static const struct file_operations fops_misc = {
 	.read = read_file_misc,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
 
 static const struct file_operations fops_reset = {
 	.read = read_file_reset,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1016,7 +1011,7 @@ void ath_debug_stat_rx(struct ath_softc *sc, struct ath_rx_status *rs)
 
 static const struct file_operations fops_recv = {
 	.read = read_file_recv,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1055,7 +1050,7 @@ static ssize_t write_file_regidx(struct file *file, const char __user *user_buf,
 static const struct file_operations fops_regidx = {
 	.read = read_file_regidx,
 	.write = write_file_regidx,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1102,7 +1097,7 @@ static ssize_t write_file_regval(struct file *file, const char __user *user_buf,
 static const struct file_operations fops_regval = {
 	.read = read_file_regval,
 	.write = write_file_regval,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1191,7 +1186,7 @@ static ssize_t read_file_dump_nfcal(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_dump_nfcal = {
 	.read = read_file_dump_nfcal,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1219,7 +1214,7 @@ static ssize_t read_file_base_eeprom(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_base_eeprom = {
 	.read = read_file_base_eeprom,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -1247,7 +1242,7 @@ static ssize_t read_file_modal_eeprom(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_modal_eeprom = {
 	.read = read_file_modal_eeprom,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
diff --git a/drivers/net/wireless/ath/ath9k/dfs_debug.c b/drivers/net/wireless/ath/ath9k/dfs_debug.c
index 106d031d834a..4364c103ed33 100644
--- a/drivers/net/wireless/ath/ath9k/dfs_debug.c
+++ b/drivers/net/wireless/ath/ath9k/dfs_debug.c
@@ -60,16 +60,9 @@ static ssize_t read_file_dfs(struct file *file, char __user *user_buf,
 	return retval;
 }
 
-static int ath9k_dfs_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static const struct file_operations fops_dfs_stats = {
 	.read = read_file_dfs,
-	.open = ath9k_dfs_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
index d3ff33c71aa5..3035deb7a0cd 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
@@ -16,12 +16,6 @@
 
 #include "htc.h"
 
-static int ath9k_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t read_file_tgt_int_stats(struct file *file, char __user *user_buf,
 				       size_t count, loff_t *ppos)
 {
@@ -75,7 +69,7 @@ static ssize_t read_file_tgt_int_stats(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_tgt_int_stats = {
 	.read = read_file_tgt_int_stats,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -145,7 +139,7 @@ static ssize_t read_file_tgt_tx_stats(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_tgt_tx_stats = {
 	.read = read_file_tgt_tx_stats,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -191,7 +185,7 @@ static ssize_t read_file_tgt_rx_stats(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_tgt_rx_stats = {
 	.read = read_file_tgt_rx_stats,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -243,7 +237,7 @@ static ssize_t read_file_xmit(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_xmit = {
 	.read = read_file_xmit,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -364,7 +358,7 @@ static ssize_t read_file_recv(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_recv = {
 	.read = read_file_recv,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -399,7 +393,7 @@ static ssize_t read_file_slot(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_slot = {
 	.read = read_file_slot,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -446,7 +440,7 @@ static ssize_t read_file_queue(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_queue = {
 	.read = read_file_queue,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -487,7 +481,7 @@ static ssize_t write_file_debug(struct file *file, const char __user *user_buf,
 static const struct file_operations fops_debug = {
 	.read = read_file_debug,
 	.write = write_file_debug,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -636,7 +630,7 @@ static ssize_t read_file_base_eeprom(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_base_eeprom = {
 	.read = read_file_base_eeprom,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
@@ -917,7 +911,7 @@ static ssize_t read_file_modal_eeprom(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_modal_eeprom = {
 	.read = read_file_modal_eeprom,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
 };
diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c
index 4f848493fece..08bb45532701 100644
--- a/drivers/net/wireless/ath/ath9k/rc.c
+++ b/drivers/net/wireless/ath/ath9k/rc.c
@@ -1480,12 +1480,6 @@ static void ath_rate_update(void *priv, struct ieee80211_supported_band *sband,
 
 #ifdef CONFIG_ATH9K_DEBUGFS
 
-static int ath9k_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t read_file_rcstat(struct file *file, char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
@@ -1553,7 +1547,7 @@ static ssize_t read_file_rcstat(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_rcstat = {
 	.read = read_file_rcstat,
-	.open = ath9k_debugfs_open,
+	.open = simple_open,
 	.owner = THIS_MODULE
 };
 
diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c
index 3c164226687f..93fe6003a493 100644
--- a/drivers/net/wireless/ath/carl9170/debug.c
+++ b/drivers/net/wireless/ath/carl9170/debug.c
@@ -48,11 +48,6 @@
 #define ADD(buf, off, max, fmt, args...)				\
 	off += snprintf(&buf[off], max - off, fmt, ##args);
 
-static int carl9170_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 struct carl9170_debugfs_fops {
 	unsigned int read_bufsize;
@@ -178,7 +173,7 @@ static const struct carl9170_debugfs_fops carl_debugfs_##name ##_ops = {\
 	.attr = _attr,							\
 	.req_dev_state = _dstate,					\
 	.fops = {							\
-		.open	= carl9170_debugfs_open,			\
+		.open	= simple_open,					\
 		.read	= carl9170_debugfs_read,			\
 		.write	= carl9170_debugfs_write,			\
 		.owner	= THIS_MODULE					\
diff --git a/drivers/net/wireless/b43/debugfs.c b/drivers/net/wireless/b43/debugfs.c
index e751fdee89b2..e807bd930647 100644
--- a/drivers/net/wireless/b43/debugfs.c
+++ b/drivers/net/wireless/b43/debugfs.c
@@ -500,12 +500,6 @@ out:
 
 #undef fappend
 
-static int b43_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t b43_debugfs_read(struct file *file, char __user *userbuf,
 				size_t count, loff_t *ppos)
 {
@@ -624,7 +618,7 @@ out_unlock:
 		.read	= _read,				\
 		.write	= _write,				\
 		.fops	= {					\
-			.open	= b43_debugfs_open,		\
+			.open	= simple_open,			\
 			.read	= b43_debugfs_read,		\
 			.write	= b43_debugfs_write,		\
 			.llseek = generic_file_llseek,		\
diff --git a/drivers/net/wireless/b43legacy/debugfs.c b/drivers/net/wireless/b43legacy/debugfs.c
index 5e28ad0d6d17..1965edb765a2 100644
--- a/drivers/net/wireless/b43legacy/debugfs.c
+++ b/drivers/net/wireless/b43legacy/debugfs.c
@@ -197,12 +197,6 @@ static int restart_write_file(struct b43legacy_wldev *dev, const char *buf, size
 
 #undef fappend
 
-static int b43legacy_debugfs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t b43legacy_debugfs_read(struct file *file, char __user *userbuf,
 				size_t count, loff_t *ppos)
 {
@@ -331,7 +325,7 @@ out_unlock:
 		.read	= _read,				\
 		.write	= _write,				\
 		.fops	= {					\
-			.open	= b43legacy_debugfs_open,		\
+			.open	= simple_open,				\
 			.read	= b43legacy_debugfs_read,		\
 			.write	= b43legacy_debugfs_write,		\
 			.llseek = generic_file_llseek,			\
diff --git a/drivers/net/wireless/iwlegacy/3945-rs.c b/drivers/net/wireless/iwlegacy/3945-rs.c
index 70bee1a4d876..4b10157d8686 100644
--- a/drivers/net/wireless/iwlegacy/3945-rs.c
+++ b/drivers/net/wireless/iwlegacy/3945-rs.c
@@ -821,12 +821,6 @@ out:
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
-static int
-il3945_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 static ssize_t
 il3945_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf,
@@ -862,7 +856,7 @@ il3945_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations rs_sta_dbgfs_stats_table_ops = {
 	.read = il3945_sta_dbgfs_stats_table_read,
-	.open = il3945_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/drivers/net/wireless/iwlegacy/4965-rs.c b/drivers/net/wireless/iwlegacy/4965-rs.c
index d7e2856e41d3..11ab1247fae1 100644
--- a/drivers/net/wireless/iwlegacy/4965-rs.c
+++ b/drivers/net/wireless/iwlegacy/4965-rs.c
@@ -2518,12 +2518,6 @@ il4965_rs_free_sta(void *il_r, struct ieee80211_sta *sta, void *il_sta)
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
-static int
-il4965_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 static void
 il4965_rs_dbgfs_set_mcs(struct il_lq_sta *lq_sta, u32 * rate_n_flags, int idx)
@@ -2695,7 +2689,7 @@ il4965_rs_sta_dbgfs_scale_table_read(struct file *file, char __user *user_buf,
 static const struct file_operations rs_sta_dbgfs_scale_table_ops = {
 	.write = il4965_rs_sta_dbgfs_scale_table_write,
 	.read = il4965_rs_sta_dbgfs_scale_table_read,
-	.open = il4965_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -2740,7 +2734,7 @@ il4965_rs_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations rs_sta_dbgfs_stats_table_ops = {
 	.read = il4965_rs_sta_dbgfs_stats_table_read,
-	.open = il4965_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -2768,7 +2762,7 @@ il4965_rs_sta_dbgfs_rate_scale_data_read(struct file *file,
 
 static const struct file_operations rs_sta_dbgfs_rate_scale_data_ops = {
 	.read = il4965_rs_sta_dbgfs_rate_scale_data_read,
-	.open = il4965_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/drivers/net/wireless/iwlegacy/debug.c b/drivers/net/wireless/iwlegacy/debug.c
index 229849150aac..eff26501d60a 100644
--- a/drivers/net/wireless/iwlegacy/debug.c
+++ b/drivers/net/wireless/iwlegacy/debug.c
@@ -160,18 +160,12 @@ static ssize_t il_dbgfs_##name##_write(struct file *file,              \
 					const char __user *user_buf,    \
 					size_t count, loff_t *ppos);
 
-static int
-il_dbgfs_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 #define DEBUGFS_READ_FILE_OPS(name)				\
 	DEBUGFS_READ_FUNC(name);				\
 static const struct file_operations il_dbgfs_##name##_ops = {	\
 	.read = il_dbgfs_##name##_read,				\
-	.open = il_dbgfs_open_file_generic,			\
+	.open = simple_open,					\
 	.llseek = generic_file_llseek,				\
 };
 
@@ -179,7 +173,7 @@ static const struct file_operations il_dbgfs_##name##_ops = {	\
 	DEBUGFS_WRITE_FUNC(name);				\
 static const struct file_operations il_dbgfs_##name##_ops = {	\
 	.write = il_dbgfs_##name##_write,			\
-	.open = il_dbgfs_open_file_generic,			\
+	.open = simple_open,					\
 	.llseek = generic_file_llseek,				\
 };
 
@@ -189,7 +183,7 @@ static const struct file_operations il_dbgfs_##name##_ops = {	\
 static const struct file_operations il_dbgfs_##name##_ops = {	\
 	.write = il_dbgfs_##name##_write,			\
 	.read = il_dbgfs_##name##_read,				\
-	.open = il_dbgfs_open_file_generic,			\
+	.open = simple_open,					\
 	.llseek = generic_file_llseek,				\
 };
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
index 53f8c51cfcdb..7e590b349dd7 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
@@ -3083,11 +3083,6 @@ static void rs_free_sta(void *priv_r, struct ieee80211_sta *sta,
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
-static int open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 static void rs_dbgfs_set_mcs(struct iwl_lq_sta *lq_sta,
 			     u32 *rate_n_flags, int index)
 {
@@ -3226,7 +3221,7 @@ static ssize_t rs_sta_dbgfs_scale_table_read(struct file *file,
 static const struct file_operations rs_sta_dbgfs_scale_table_ops = {
 	.write = rs_sta_dbgfs_scale_table_write,
 	.read = rs_sta_dbgfs_scale_table_read,
-	.open = open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file,
@@ -3269,7 +3264,7 @@ static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file,
 
 static const struct file_operations rs_sta_dbgfs_stats_table_ops = {
 	.read = rs_sta_dbgfs_stats_table_read,
-	.open = open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -3295,7 +3290,7 @@ static ssize_t rs_sta_dbgfs_rate_scale_data_read(struct file *file,
 
 static const struct file_operations rs_sta_dbgfs_rate_scale_data_ops = {
 	.read = rs_sta_dbgfs_rate_scale_data_read,
-	.open = open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c
index b7b1c04f2fba..2bbaebd99ad4 100644
--- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c
@@ -84,17 +84,11 @@ static ssize_t iwl_dbgfs_##name##_write(struct file *file,              \
 					size_t count, loff_t *ppos);
 
 
-static int iwl_dbgfs_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define DEBUGFS_READ_FILE_OPS(name)                                     \
 	DEBUGFS_READ_FUNC(name);                                        \
 static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 	.read = iwl_dbgfs_##name##_read,                       		\
-	.open = iwl_dbgfs_open_file_generic,                    	\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -102,7 +96,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 	DEBUGFS_WRITE_FUNC(name);                                       \
 static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 	.write = iwl_dbgfs_##name##_write,                              \
-	.open = iwl_dbgfs_open_file_generic,                    	\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -113,7 +107,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 	.write = iwl_dbgfs_##name##_write,                              \
 	.read = iwl_dbgfs_##name##_read,                                \
-	.open = iwl_dbgfs_open_file_generic,                            \
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c
index b4f796c82e1e..4d7b30d3e648 100644
--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c
+++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c
@@ -1898,17 +1898,11 @@ static ssize_t iwl_dbgfs_##name##_write(struct file *file,              \
 					size_t count, loff_t *ppos);
 
 
-static int iwl_dbgfs_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define DEBUGFS_READ_FILE_OPS(name)					\
 	DEBUGFS_READ_FUNC(name);					\
 static const struct file_operations iwl_dbgfs_##name##_ops = {		\
 	.read = iwl_dbgfs_##name##_read,				\
-	.open = iwl_dbgfs_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -1916,7 +1910,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = {		\
 	DEBUGFS_WRITE_FUNC(name);                                       \
 static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 	.write = iwl_dbgfs_##name##_write,                              \
-	.open = iwl_dbgfs_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -1926,7 +1920,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = {          \
 static const struct file_operations iwl_dbgfs_##name##_ops = {		\
 	.write = iwl_dbgfs_##name##_write,				\
 	.read = iwl_dbgfs_##name##_read,				\
-	.open = iwl_dbgfs_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c
index 87eef5773a02..b6199d124bb9 100644
--- a/drivers/net/wireless/iwmc3200wifi/debugfs.c
+++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c
@@ -99,12 +99,6 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_iwm_dbg_modules,
 			iwm_debugfs_u32_read, iwm_debugfs_dbg_modules_write,
 			"%llu\n");
 
-static int iwm_generic_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
 
 static ssize_t iwm_debugfs_txq_read(struct file *filp, char __user *buffer,
 				   size_t count, loff_t *ppos)
@@ -401,28 +395,28 @@ out:
 
 static const struct file_operations iwm_debugfs_txq_fops = {
 	.owner =	THIS_MODULE,
-	.open =		iwm_generic_open,
+	.open =		simple_open,
 	.read =		iwm_debugfs_txq_read,
 	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_tx_credit_fops = {
 	.owner =	THIS_MODULE,
-	.open =		iwm_generic_open,
+	.open =		simple_open,
 	.read =		iwm_debugfs_tx_credit_read,
 	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_rx_ticket_fops = {
 	.owner =	THIS_MODULE,
-	.open =		iwm_generic_open,
+	.open =		simple_open,
 	.read =		iwm_debugfs_rx_ticket_read,
 	.llseek =	default_llseek,
 };
 
 static const struct file_operations iwm_debugfs_fw_err_fops = {
 	.owner =	THIS_MODULE,
-	.open =		iwm_generic_open,
+	.open =		simple_open,
 	.read =		iwm_debugfs_fw_err_read,
 	.llseek =	default_llseek,
 };
diff --git a/drivers/net/wireless/iwmc3200wifi/sdio.c b/drivers/net/wireless/iwmc3200wifi/sdio.c
index 764b40dd24ad..0042f204b07f 100644
--- a/drivers/net/wireless/iwmc3200wifi/sdio.c
+++ b/drivers/net/wireless/iwmc3200wifi/sdio.c
@@ -264,13 +264,6 @@ static int if_sdio_send_chunk(struct iwm_priv *iwm, u8 *buf, int count)
 	return ret;
 }
 
-/* debugfs hooks */
-static int iwm_debugfs_sdio_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t iwm_debugfs_sdio_read(struct file *filp, char __user *buffer,
 				     size_t count, loff_t *ppos)
 {
@@ -363,7 +356,7 @@ err:
 
 static const struct file_operations iwm_debugfs_sdio_fops = {
 	.owner =	THIS_MODULE,
-	.open =		iwm_debugfs_sdio_open,
+	.open =		simple_open,
 	.read =		iwm_debugfs_sdio_read,
 	.llseek =	default_llseek,
 };
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index c192671610fc..a06cc283e23d 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -21,12 +21,6 @@ static char *szStates[] = {
 static void lbs_debug_init(struct lbs_private *priv);
 #endif
 
-static int open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t write_file_dummy(struct file *file, const char __user *buf,
                                 size_t count, loff_t *ppos)
 {
@@ -696,7 +690,7 @@ out_unlock:
 
 #define FOPS(fread, fwrite) { \
 	.owner = THIS_MODULE, \
-	.open = open_file_generic, \
+	.open = simple_open, \
 	.read = (fread), \
 	.write = (fwrite), \
 	.llseek = generic_file_llseek, \
@@ -962,7 +956,7 @@ static ssize_t lbs_debugfs_write(struct file *f, const char __user *buf,
 
 static const struct file_operations lbs_debug_fops = {
 	.owner = THIS_MODULE,
-	.open = open_file_generic,
+	.open = simple_open,
 	.write = lbs_debugfs_write,
 	.read = lbs_debugfs_read,
 	.llseek = default_llseek,
diff --git a/drivers/net/wireless/mwifiex/debugfs.c b/drivers/net/wireless/mwifiex/debugfs.c
index d26a78b6b3c4..1a845074c52a 100644
--- a/drivers/net/wireless/mwifiex/debugfs.c
+++ b/drivers/net/wireless/mwifiex/debugfs.c
@@ -139,18 +139,6 @@ static struct mwifiex_debug_data items[] = {
 
 static int num_of_items = ARRAY_SIZE(items);
 
-/*
- * Generic proc file open handler.
- *
- * This function is called every time a file is accessed for read or write.
- */
-static int
-mwifiex_open_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 /*
  * Proc info file read handler.
  *
@@ -676,19 +664,19 @@ done:
 static const struct file_operations mwifiex_dfs_##name##_fops = {       \
 	.read = mwifiex_##name##_read,                                  \
 	.write = mwifiex_##name##_write,                                \
-	.open = mwifiex_open_generic,                                   \
+	.open = simple_open,                                            \
 };
 
 #define MWIFIEX_DFS_FILE_READ_OPS(name)                                 \
 static const struct file_operations mwifiex_dfs_##name##_fops = {       \
 	.read = mwifiex_##name##_read,                                  \
-	.open = mwifiex_open_generic,                                   \
+	.open = simple_open,                                            \
 };
 
 #define MWIFIEX_DFS_FILE_WRITE_OPS(name)                                \
 static const struct file_operations mwifiex_dfs_##name##_fops = {       \
 	.write = mwifiex_##name##_write,                                \
-	.open = mwifiex_open_generic,                                   \
+	.open = simple_open,                                            \
 };
 
 
diff --git a/drivers/net/wireless/wl1251/debugfs.c b/drivers/net/wireless/wl1251/debugfs.c
index 6c274007d200..448da1f8c22f 100644
--- a/drivers/net/wireless/wl1251/debugfs.c
+++ b/drivers/net/wireless/wl1251/debugfs.c
@@ -47,7 +47,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 									\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
-	.open = wl1251_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek	= generic_file_llseek,					\
 };
 
@@ -84,7 +84,7 @@ static ssize_t sub## _ ##name## _read(struct file *file,		\
 									\
 static const struct file_operations sub## _ ##name## _ops = {		\
 	.read = sub## _ ##name## _read,					\
-	.open = wl1251_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek	= generic_file_llseek,					\
 };
 
@@ -117,12 +117,6 @@ out:
 	mutex_unlock(&wl->mutex);
 }
 
-static int wl1251_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 DEBUGFS_FWSTATS_FILE(tx, internal_desc_overflow, 20, "%u");
 
 DEBUGFS_FWSTATS_FILE(rx, out_of_mem, 20, "%u");
@@ -235,7 +229,7 @@ static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations tx_queue_len_ops = {
 	.read = tx_queue_len_read,
-	.open = wl1251_open_file_generic,
+	.open = simple_open,
 	.llseek = generic_file_llseek,
 };
 
@@ -257,7 +251,7 @@ static ssize_t tx_queue_status_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations tx_queue_status_ops = {
 	.read = tx_queue_status_read,
-	.open = wl1251_open_file_generic,
+	.open = simple_open,
 	.llseek = generic_file_llseek,
 };
 
diff --git a/drivers/net/wireless/wl12xx/debugfs.c b/drivers/net/wireless/wl12xx/debugfs.c
index e1cf72765965..564d49575c94 100644
--- a/drivers/net/wireless/wl12xx/debugfs.c
+++ b/drivers/net/wireless/wl12xx/debugfs.c
@@ -63,7 +63,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 									\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
-	.open = wl1271_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek	= generic_file_llseek,					\
 };
 
@@ -96,7 +96,7 @@ static ssize_t sub## _ ##name## _read(struct file *file,		\
 									\
 static const struct file_operations sub## _ ##name## _ops = {		\
 	.read = sub## _ ##name## _read,					\
-	.open = wl1271_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek	= generic_file_llseek,					\
 };
 
@@ -126,12 +126,6 @@ out:
 	mutex_unlock(&wl->mutex);
 }
 
-static int wl1271_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 DEBUGFS_FWSTATS_FILE(tx, internal_desc_overflow, "%u");
 
 DEBUGFS_FWSTATS_FILE(rx, out_of_mem, "%u");
@@ -243,7 +237,7 @@ static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations tx_queue_len_ops = {
 	.read = tx_queue_len_read,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -289,7 +283,7 @@ static ssize_t gpio_power_write(struct file *file,
 static const struct file_operations gpio_power_ops = {
 	.read = gpio_power_read,
 	.write = gpio_power_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -308,7 +302,7 @@ static ssize_t start_recovery_write(struct file *file,
 
 static const struct file_operations start_recovery_ops = {
 	.write = start_recovery_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -372,7 +366,7 @@ out:
 static const struct file_operations dynamic_ps_timeout_ops = {
 	.read = dynamic_ps_timeout_read,
 	.write = dynamic_ps_timeout_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -441,7 +435,7 @@ out:
 static const struct file_operations forced_ps_ops = {
 	.read = forced_ps_read,
 	.write = forced_ps_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -483,7 +477,7 @@ static ssize_t split_scan_timeout_write(struct file *file,
 static const struct file_operations split_scan_timeout_ops = {
 	.read = split_scan_timeout_read,
 	.write = split_scan_timeout_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -566,7 +560,7 @@ static ssize_t driver_state_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations driver_state_ops = {
 	.read = driver_state_read,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -675,7 +669,7 @@ static ssize_t vifs_state_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations vifs_state_ops = {
 	.read = vifs_state_read,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -733,7 +727,7 @@ static ssize_t dtim_interval_write(struct file *file,
 static const struct file_operations dtim_interval_ops = {
 	.read = dtim_interval_read,
 	.write = dtim_interval_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -791,7 +785,7 @@ static ssize_t suspend_dtim_interval_write(struct file *file,
 static const struct file_operations suspend_dtim_interval_ops = {
 	.read = suspend_dtim_interval_read,
 	.write = suspend_dtim_interval_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -849,7 +843,7 @@ static ssize_t beacon_interval_write(struct file *file,
 static const struct file_operations beacon_interval_ops = {
 	.read = beacon_interval_read,
 	.write = beacon_interval_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -904,7 +898,7 @@ static ssize_t rx_streaming_interval_read(struct file *file,
 static const struct file_operations rx_streaming_interval_ops = {
 	.read = rx_streaming_interval_read,
 	.write = rx_streaming_interval_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -959,7 +953,7 @@ static ssize_t rx_streaming_always_read(struct file *file,
 static const struct file_operations rx_streaming_always_ops = {
 	.read = rx_streaming_always_read,
 	.write = rx_streaming_always_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
@@ -1003,7 +997,7 @@ out:
 
 static const struct file_operations beacon_filtering_ops = {
 	.write = beacon_filtering_write,
-	.open = wl1271_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index ee8fd037bb53..849357c1045c 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -117,25 +117,17 @@ static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_
 }
 
 
-static int default_open(struct inode *inode, struct file *filp)
-{
-	if (inode->i_private)
-		filp->private_data = inode->i_private;
-	return 0;
-}
-
-
 static const struct file_operations ulong_fops = {
 	.read		= ulong_read_file,
 	.write		= ulong_write_file,
-	.open		= default_open,
+	.open		= simple_open,
 	.llseek		= default_llseek,
 };
 
 
 static const struct file_operations ulong_ro_fops = {
 	.read		= ulong_read_file,
-	.open		= default_open,
+	.open		= simple_open,
 	.llseek		= default_llseek,
 };
 
@@ -187,7 +179,7 @@ static ssize_t atomic_read_file(struct file *file, char __user *buf, size_t coun
 
 static const struct file_operations atomic_ro_fops = {
 	.read		= atomic_read_file,
-	.open		= default_open,
+	.open		= simple_open,
 	.llseek		= default_llseek,
 };
 
diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c
index 70277a530133..85d31a69e117 100644
--- a/drivers/remoteproc/remoteproc_debugfs.c
+++ b/drivers/remoteproc/remoteproc_debugfs.c
@@ -50,16 +50,9 @@ static ssize_t rproc_trace_read(struct file *filp, char __user *userbuf,
 	return simple_read_from_buffer(userbuf, count, ppos, trace->va, len);
 }
 
-static int rproc_open_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static const struct file_operations trace_rproc_ops = {
 	.read = rproc_trace_read,
-	.open = rproc_open_generic,
+	.open = simple_open,
 	.llseek	= generic_file_llseek,
 };
 
@@ -94,7 +87,7 @@ static ssize_t rproc_state_read(struct file *filp, char __user *userbuf,
 
 static const struct file_operations rproc_state_ops = {
 	.read = rproc_state_read,
-	.open = rproc_open_generic,
+	.open = simple_open,
 	.llseek	= generic_file_llseek,
 };
 
@@ -114,7 +107,7 @@ static ssize_t rproc_name_read(struct file *filp, char __user *userbuf,
 
 static const struct file_operations rproc_name_ops = {
 	.read = rproc_name_read,
-	.open = rproc_open_generic,
+	.open = simple_open,
 	.llseek	= generic_file_llseek,
 };
 
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index 22e17be04d8a..34f7cf76bf4f 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -997,13 +997,6 @@ lpfc_debugfs_dumpDataDif_write(struct file *file, const char __user *buf,
 	return nbytes;
 }
 
-static int
-lpfc_debugfs_dif_err_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t
 lpfc_debugfs_dif_err_read(struct file *file, char __user *buf,
 	size_t nbytes, loff_t *ppos)
@@ -3521,7 +3514,7 @@ static const struct file_operations lpfc_debugfs_op_dumpDif = {
 #undef lpfc_debugfs_op_dif_err
 static const struct file_operations lpfc_debugfs_op_dif_err = {
 	.owner =	THIS_MODULE,
-	.open =		lpfc_debugfs_dif_err_open,
+	.open =		simple_open,
 	.llseek =	lpfc_debugfs_lseek,
 	.read =		lpfc_debugfs_dif_err_read,
 	.write =	lpfc_debugfs_dif_err_write,
diff --git a/drivers/spi/spi-dw.c b/drivers/spi/spi-dw.c
index 082458d73ce9..d1a495f64e2d 100644
--- a/drivers/spi/spi-dw.c
+++ b/drivers/spi/spi-dw.c
@@ -63,12 +63,6 @@ struct chip_data {
 };
 
 #ifdef CONFIG_DEBUG_FS
-static int spi_show_regs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define SPI_REGS_BUFSIZE	1024
 static ssize_t  spi_show_regs(struct file *file, char __user *user_buf,
 				size_t count, loff_t *ppos)
@@ -128,7 +122,7 @@ static ssize_t  spi_show_regs(struct file *file, char __user *user_buf,
 
 static const struct file_operations mrst_spi_regs_ops = {
 	.owner		= THIS_MODULE,
-	.open		= spi_show_regs_open,
+	.open		= simple_open,
 	.read		= spi_show_regs,
 	.llseek		= default_llseek,
 };
diff --git a/drivers/tty/serial/mfd.c b/drivers/tty/serial/mfd.c
index a9234ba8f8d5..c4b50af46c44 100644
--- a/drivers/tty/serial/mfd.c
+++ b/drivers/tty/serial/mfd.c
@@ -127,11 +127,6 @@ static inline void serial_out(struct uart_hsu_port *up, int offset, int value)
 
 #define HSU_REGS_BUFSIZE	1024
 
-static int hsu_show_regs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
 
 static ssize_t port_show_regs(struct file *file, char __user *user_buf,
 				size_t count, loff_t *ppos)
@@ -231,14 +226,14 @@ static ssize_t dma_show_regs(struct file *file, char __user *user_buf,
 
 static const struct file_operations port_regs_ops = {
 	.owner		= THIS_MODULE,
-	.open		= hsu_show_regs_open,
+	.open		= simple_open,
 	.read		= port_show_regs,
 	.llseek		= default_llseek,
 };
 
 static const struct file_operations dma_regs_ops = {
 	.owner		= THIS_MODULE,
-	.open		= hsu_show_regs_open,
+	.open		= simple_open,
 	.read		= dma_show_regs,
 	.llseek		= default_llseek,
 };
diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 332f2eb8abbc..46ec722b4406 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -304,11 +304,7 @@ static const int trigger_level_1[4] = { 1, 1, 1, 1 };
 #ifdef CONFIG_DEBUG_FS
 
 #define PCH_REGS_BUFSIZE	1024
-static int pch_show_regs_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
+
 
 static ssize_t port_show_regs(struct file *file, char __user *user_buf,
 				size_t count, loff_t *ppos)
@@ -362,7 +358,7 @@ static ssize_t port_show_regs(struct file *file, char __user *user_buf,
 
 static const struct file_operations port_regs_ops = {
 	.owner		= THIS_MODULE,
-	.open		= pch_show_regs_open,
+	.open		= simple_open,
 	.read		= port_show_regs,
 	.llseek		= default_llseek,
 };
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index cefa0c8b5b6a..d2b9af59cba9 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -428,18 +428,10 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig)
 	return retval;
 }
 
-static int default_open (struct inode *inode, struct file *file)
-{
-	if (inode->i_private)
-		file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static const struct file_operations default_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
-	.open =		default_open,
+	.open =		simple_open,
 	.llseek =	default_file_lseek,
 };
 
diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
index fd9109d7eb0e..680e1a31fb87 100644
--- a/drivers/usb/host/ehci-dbg.c
+++ b/drivers/usb/host/ehci-dbg.c
@@ -352,7 +352,6 @@ static int debug_async_open(struct inode *, struct file *);
 static int debug_periodic_open(struct inode *, struct file *);
 static int debug_registers_open(struct inode *, struct file *);
 static int debug_async_open(struct inode *, struct file *);
-static int debug_lpm_open(struct inode *, struct file *);
 static ssize_t debug_lpm_read(struct file *file, char __user *user_buf,
 				   size_t count, loff_t *ppos);
 static ssize_t debug_lpm_write(struct file *file, const char __user *buffer,
@@ -385,7 +384,7 @@ static const struct file_operations debug_registers_fops = {
 };
 static const struct file_operations debug_lpm_fops = {
 	.owner		= THIS_MODULE,
-	.open		= debug_lpm_open,
+	.open		= simple_open,
 	.read		= debug_lpm_read,
 	.write		= debug_lpm_write,
 	.release	= debug_lpm_close,
@@ -970,12 +969,6 @@ static int debug_registers_open(struct inode *inode, struct file *file)
 	return file->private_data ? 0 : -ENOMEM;
 }
 
-static int debug_lpm_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static int debug_lpm_close(struct inode *inode, struct file *file)
 {
 	return 0;
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 2eecec0c13c9..6ec45beb7af5 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -159,13 +159,6 @@ static int cmd_ie_rm(struct uwb_rc *rc, struct uwb_dbg_cmd_ie *ie_to_rm)
 	return uwb_rc_ie_rm(rc, ie_to_rm->data[0]);
 }
 
-static int command_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-
-	return 0;
-}
-
 static ssize_t command_write(struct file *file, const char __user *buf,
 			 size_t len, loff_t *off)
 {
@@ -206,7 +199,7 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 }
 
 static const struct file_operations command_fops = {
-	.open   = command_open,
+	.open	= simple_open,
 	.write  = command_write,
 	.read   = NULL,
 	.llseek = no_llseek,
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 21e93605161c..5dfafdd1dbd3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -33,18 +33,10 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
 	return count;
 }
 
-static int default_open(struct inode *inode, struct file *file)
-{
-	if (inode->i_private)
-		file->private_data = inode->i_private;
-
-	return 0;
-}
-
 const struct file_operations debugfs_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
-	.open =		default_open,
+	.open =		simple_open,
 	.llseek =	noop_llseek,
 };
 
@@ -447,7 +439,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 static const struct file_operations fops_bool = {
 	.read =		read_file_bool,
 	.write =	write_file_bool,
-	.open =		default_open,
+	.open =		simple_open,
 	.llseek =	default_llseek,
 };
 
@@ -492,7 +484,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 
 static const struct file_operations fops_blob = {
 	.read =		read_file_blob,
-	.open =		default_open,
+	.open =		simple_open,
 	.llseek =	default_llseek,
 };
 
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 3dca2b39e83f..1c9b08095f98 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -609,13 +609,6 @@ static const struct file_operations format3_fops = {
 /*
  * dump lkb's on the ls_waiters list
  */
-
-static int waiters_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t waiters_read(struct file *file, char __user *userbuf,
 			    size_t count, loff_t *ppos)
 {
@@ -644,7 +637,7 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations waiters_fops = {
 	.owner   = THIS_MODULE,
-	.open    = waiters_open,
+	.open    = simple_open,
 	.read    = waiters_read,
 	.llseek  = default_llseek,
 };
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index f37c32b94525..8ae5a03376ae 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -52,12 +52,6 @@ struct pstore_private {
 	char	data[];
 };
 
-static int pstore_file_open(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
 						size_t count, loff_t *ppos)
 {
@@ -67,7 +61,7 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
 }
 
 static const struct file_operations pstore_file_operations = {
-	.open	= pstore_file_open,
+	.open	= simple_open,
 	.read	= pstore_file_read,
 	.llseek	= default_llseek,
 };
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b56b0c9..c0bd0308741c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_trace_remove);
 
-static int blk_dropped_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-
-	return 0;
-}
-
 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 				size_t count, loff_t *ppos)
 {
@@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
 
 static const struct file_operations blk_dropped_fops = {
 	.owner =	THIS_MODULE,
-	.open =		blk_dropped_open,
+	.open =		simple_open,
 	.read =		blk_dropped_read,
 	.llseek =	default_llseek,
 };
 
-static int blk_msg_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-
-	return 0;
-}
-
 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
@@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 
 static const struct file_operations blk_msg_fops = {
 	.owner =	THIS_MODULE,
-	.open =		blk_msg_open,
+	.open =		simple_open,
 	.write =	blk_msg_write,
 	.llseek =	noop_llseek,
 };
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index cc5b7a6e7e0b..778e5916d7c3 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -15,12 +15,6 @@
 #include "rate.h"
 #include "debugfs.h"
 
-int mac80211_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define DEBUGFS_FORMAT_BUFFER_SIZE 100
 
 int mac80211_format_buffer(char __user *userbuf, size_t count,
@@ -50,7 +44,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 #define DEBUGFS_READONLY_FILE_OPS(name)			\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -93,7 +87,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
 
 static const struct file_operations reset_ops = {
 	.write = reset_write,
-	.open = mac80211_open_file_generic,
+	.open = simple_open,
 	.llseek = noop_llseek,
 };
 
@@ -254,7 +248,7 @@ static ssize_t stats_ ##name## _read(struct file *file,			\
 									\
 static const struct file_operations stats_ ##name## _ops = {		\
 	.read = stats_ ##name## _read,					\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h
index 7c87529630f5..9be4e6d71d00 100644
--- a/net/mac80211/debugfs.h
+++ b/net/mac80211/debugfs.h
@@ -3,7 +3,6 @@
 
 #ifdef CONFIG_MAC80211_DEBUGFS
 extern void debugfs_hw_add(struct ieee80211_local *local);
-extern int mac80211_open_file_generic(struct inode *inode, struct file *file);
 extern int mac80211_format_buffer(char __user *userbuf, size_t count,
 				  loff_t *ppos, char *fmt, ...);
 #else
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 59edcd95a58d..7932767bb482 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -30,7 +30,7 @@ static ssize_t key_##name##_read(struct file *file,			\
 #define KEY_OPS(name)							\
 static const struct file_operations key_ ##name## _ops = {		\
 	.read = key_##name##_read,					\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 }
 
@@ -45,7 +45,7 @@ static const struct file_operations key_ ##name## _ops = {		\
 #define KEY_CONF_OPS(name)						\
 static const struct file_operations key_ ##name## _ops = {		\
 	.read = key_conf_##name##_read,					\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 }
 
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index a32eeda04aa3..30f99c344847 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -135,7 +135,7 @@ static ssize_t ieee80211_if_read_##name(struct file *file,		\
 static const struct file_operations name##_ops = {			\
 	.read = ieee80211_if_read_##name,				\
 	.write = (_write),						\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 }
 
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 6d45804d09bc..832b2da5e4cd 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -33,7 +33,7 @@ static ssize_t sta_ ##name## _read(struct file *file,			\
 #define STA_OPS(name)							\
 static const struct file_operations sta_ ##name## _ops = {		\
 	.read = sta_##name##_read,					\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 }
 
@@ -41,7 +41,7 @@ static const struct file_operations sta_ ##name## _ops = {		\
 static const struct file_operations sta_ ##name## _ops = {		\
 	.read = sta_##name##_read,					\
 	.write = sta_##name##_write,					\
-	.open = mac80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 }
 
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index b4f7600a3e36..3313c117b322 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -145,7 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
 
 static const struct file_operations rcname_ops = {
 	.read = rcname_read,
-	.open = mac80211_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 #endif
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index 39765bcfb472..920cabe0461b 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -13,12 +13,6 @@
 #include "core.h"
 #include "debugfs.h"
 
-static int cfg80211_open_file_generic(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 #define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...)		\
 static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 			    size_t count, loff_t *ppos)			\
@@ -33,7 +27,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 									\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
-	.open = cfg80211_open_file_generic,				\
+	.open = simple_open,						\
 	.llseek = generic_file_llseek,					\
 };
 
@@ -102,7 +96,7 @@ static ssize_t ht40allow_map_read(struct file *file,
 
 static const struct file_operations ht40allow_map_ops = {
 	.read = ht40allow_map_read,
-	.open = cfg80211_open_file_generic,
+	.open = simple_open,
 	.llseek = default_llseek,
 };
 
diff --git a/sound/soc/imx/imx-audmux.c b/sound/soc/imx/imx-audmux.c
index 601df809a26a..1765a197acb0 100644
--- a/sound/soc/imx/imx-audmux.c
+++ b/sound/soc/imx/imx-audmux.c
@@ -40,12 +40,6 @@ static void __iomem *audmux_base;
 #ifdef CONFIG_DEBUG_FS
 static struct dentry *audmux_debugfs_root;
 
-static int audmux_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 /* There is an annoying discontinuity in the SSI numbering with regard
  * to the Linux number of the devices */
 static const char *audmux_port_string(int port)
@@ -142,7 +136,7 @@ static ssize_t audmux_read_file(struct file *file, char __user *user_buf,
 }
 
 static const struct file_operations audmux_debugfs_fops = {
-	.open = audmux_open_file,
+	.open = simple_open,
 	.read = audmux_read_file,
 	.llseek = default_llseek,
 };
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index a4deebc0801a..e19c24ade414 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -201,12 +201,6 @@ static ssize_t pmdown_time_set(struct device *dev,
 static DEVICE_ATTR(pmdown_time, 0644, pmdown_time_show, pmdown_time_set);
 
 #ifdef CONFIG_DEBUG_FS
-static int codec_reg_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t codec_reg_read_file(struct file *file, char __user *user_buf,
 				   size_t count, loff_t *ppos)
 {
@@ -264,7 +258,7 @@ static ssize_t codec_reg_write_file(struct file *file,
 }
 
 static const struct file_operations codec_reg_fops = {
-	.open = codec_reg_open_file,
+	.open = simple_open,
 	.read = codec_reg_read_file,
 	.write = codec_reg_write_file,
 	.llseek = default_llseek,
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 6241490fff30..5cbd2d7623b8 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1544,12 +1544,6 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event)
 }
 
 #ifdef CONFIG_DEBUG_FS
-static int dapm_widget_power_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t dapm_widget_power_read_file(struct file *file,
 					   char __user *user_buf,
 					   size_t count, loff_t *ppos)
@@ -1613,17 +1607,11 @@ static ssize_t dapm_widget_power_read_file(struct file *file,
 }
 
 static const struct file_operations dapm_widget_power_fops = {
-	.open = dapm_widget_power_open_file,
+	.open = simple_open,
 	.read = dapm_widget_power_read_file,
 	.llseek = default_llseek,
 };
 
-static int dapm_bias_open_file(struct inode *inode, struct file *file)
-{
-	file->private_data = inode->i_private;
-	return 0;
-}
-
 static ssize_t dapm_bias_read_file(struct file *file, char __user *user_buf,
 				   size_t count, loff_t *ppos)
 {
@@ -1654,7 +1642,7 @@ static ssize_t dapm_bias_read_file(struct file *file, char __user *user_buf,
 }
 
 static const struct file_operations dapm_bias_fops = {
-	.open = dapm_bias_open_file,
+	.open = simple_open,
 	.read = dapm_bias_read_file,
 	.llseek = default_llseek,
 };
-- 
cgit v1.2.3


From 46ed99d1b7c92920ce9e313152522847647aae4f Mon Sep 17 00:00:00 2001
From: Emil Goode <emilgoode@gmail.com>
Date: Sun, 1 Apr 2012 20:48:04 +0200
Subject: x86: vsyscall: Use NULL instead 0 for a pointer argument

This patch silences the following sparse warning:
arch/x86/kernel/vsyscall_64.c:250:34:
       warning: Using plain integer as NULL pointer

Signed-off-by: Emil Goode <emilgoode@gmail.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: john.stultz@linaro.org
Link: http://lkml.kernel.org/r/1333306084-3776-1-git-send-email-emilgoode@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vsyscall_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index f386dc49f988..7515cf0e1805 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	current_thread_info()->sig_on_uaccess_error = 1;
 
 	/*
-	 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
 	 * 64-bit, so we don't need to special-case it here.  For all the
-	 * vsyscalls, 0 means "don't write anything" not "write it at
+	 * vsyscalls, NULL means "don't write anything" not "write it at
 	 * address 0".
 	 */
 	ret = -EFAULT;
@@ -247,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
-				 0);
+				 NULL);
 		break;
 	}
 
-- 
cgit v1.2.3


From 2ca052a3710fac208eee690faefdeb8bbd4586a1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 2 Apr 2012 16:15:33 -0700
Subject: x86: Use correct byte-sized register constraint in __xchg_op()

x86-64 can access the low half of any register, but i386 can only do
it with a subset of registers.  'r' causes compilation failures on i386,
but 'q' expresses the constraint properly.

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Link: http://lkml.kernel.org/r/4F7A3315.501@goop.org
Reported-by: Leigh Scott <leigh123linux@googlemail.com>
Tested-by: Thomas Reitmayr <treitmayr@devbase.at>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@vger.kernel.org> v3.3
---
 arch/x86/include/asm/cmpxchg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index b3b733262909..bc18d0ed459a 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -43,7 +43,7 @@ extern void __add_wrong_size(void)
 		switch (sizeof(*(ptr))) {				\
 		case __X86_CASE_B:					\
 			asm volatile (lock #op "b %b0, %1\n"		\
-				      : "+r" (__ret), "+m" (*(ptr))	\
+				      : "+q" (__ret), "+m" (*(ptr))	\
 				      : : "memory", "cc");		\
 			break;						\
 		case __X86_CASE_W:					\
-- 
cgit v1.2.3


From 8c91c5325e107ec17e40a59a47c6517387d64eb7 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 6 Apr 2012 09:30:57 -0700
Subject: x86: Use correct byte-sized register constraint in __add()

Similar to:

 2ca052a x86: Use correct byte-sized register constraint in __xchg_op()

... the __add() macro also needs to use a "q" constraint in the
byte-sized case, lest we try to generate an illegal register.

Link: http://lkml.kernel.org/r/4F7A3315.501@goop.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Leigh Scott <leigh123linux@googlemail.com>
Cc: Thomas Reitmayr <treitmayr@devbase.at>
Cc: <stable@vger.kernel.org> v3.3
---
 arch/x86/include/asm/cmpxchg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index bc18d0ed459a..99480e55973d 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -173,7 +173,7 @@ extern void __add_wrong_size(void)
 		switch (sizeof(*(ptr))) {				\
 		case __X86_CASE_B:					\
 			asm volatile (lock "addb %b1, %0\n"		\
-				      : "+m" (*(ptr)) : "ri" (inc)	\
+				      : "+m" (*(ptr)) : "qi" (inc)	\
 				      : "memory", "cc");		\
 			break;						\
 		case __X86_CASE_W:					\
-- 
cgit v1.2.3


From f68e556e23d1a4176b563bcb25d8baf2c5313f91 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 6 Apr 2012 13:54:56 -0700
Subject: Make the "word-at-a-time" helper functions more commonly usable

I have a new optimized x86 "strncpy_from_user()" that will use these
same helper functions for all the same reasons the name lookup code uses
them.  This is preparation for that.

This moves them into an architecture-specific header file.  It's
architecture-specific for two reasons:

 - some of the functions are likely to want architecture-specific
   implementations.  Even if the current code happens to be "generic" in
   the sense that it should work on any little-endian machine, it's
   likely that the "multiply by a big constant and shift" implementation
   is less than optimal for an architecture that has a guaranteed fast
   bit count instruction, for example.

 - I expect that if architectures like sparc want to start playing
   around with this, we'll need to abstract out a few more details (in
   particular the actual unaligned accesses).  So we're likely to have
   more architecture-specific stuff if non-x86 architectures start using
   this.

   (and if it turns out that non-x86 architectures don't start using
   this, then having it in an architecture-specific header is still the
   right thing to do, of course)

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/word-at-a-time.h | 46 +++++++++++++++++++++++++++++++++++
 fs/namei.c                            | 35 +++-----------------------
 2 files changed, 49 insertions(+), 32 deletions(-)
 create mode 100644 arch/x86/include/asm/word-at-a-time.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
new file mode 100644
index 000000000000..6fe6767b7124
--- /dev/null
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -0,0 +1,46 @@
+#ifndef _ASM_WORD_AT_A_TIME_H
+#define _ASM_WORD_AT_A_TIME_H
+
+/*
+ * This is largely generic for little-endian machines, but the
+ * optimal byte mask counting is probably going to be something
+ * that is architecture-specific. If you have a reliably fast
+ * bit count instruction, that might be better than the multiply
+ * and shift, for example.
+ */
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+	return mask*0x0001020304050608ul >> 56;
+}
+
+#else	/* 32-bit case */
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
+}
+
+#endif
+
+#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80);
+}
+
+#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/fs/namei.c b/fs/namei.c
index 1898198abc3d..0062dd17eb55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1407,18 +1407,9 @@ static inline int can_lookup(struct inode *inode)
  */
 #ifdef CONFIG_DCACHE_WORD_ACCESS
 
-#ifdef CONFIG_64BIT
+#include <asm/word-at-a-time.h>
 
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
-	return mask*0x0001020304050608ul >> 56;
-}
+#ifdef CONFIG_64BIT
 
 static inline unsigned int fold_hash(unsigned long hash)
 {
@@ -1428,15 +1419,6 @@ static inline unsigned int fold_hash(unsigned long hash)
 
 #else	/* 32-bit case */
 
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
-	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
-	long a = (0x0ff0001+mask) >> 23;
-	/* Fix the 1 for 00 case */
-	return a & mask;
-}
-
 #define fold_hash(x) (x)
 
 #endif
@@ -1464,17 +1446,6 @@ done:
 }
 EXPORT_SYMBOL(full_name_hash);
 
-#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
-#define ONEBYTES	REPEAT_BYTE(0x01)
-#define SLASHBYTES	REPEAT_BYTE('/')
-#define HIGHBITS	REPEAT_BYTE(0x80)
-
-/* Return the high bit set in the first byte that is a zero */
-static inline unsigned long has_zero(unsigned long a)
-{
-	return ((a - ONEBYTES) & ~a) & HIGHBITS;
-}
-
 /*
  * Calculate the length and hash of the path component, and
  * return the length of the component;
@@ -1490,7 +1461,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 		len += sizeof(unsigned long);
 		a = *(unsigned long *)(name+len);
 		/* Do we have any NUL or '/' bytes in this word? */
-		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+		mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
 	} while (!mask);
 
 	/* The mask *below* the first high bit set */
-- 
cgit v1.2.3


From 3cb42092ff02edec34bf936b7400b1f1efc8ca43 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 9 Apr 2012 13:59:00 -0400
Subject: um: fix linker script generation

while we can't just use -U$(SUBARCH), we still need to kill idiotic define
(implicit -Di386=1), both for SUBARCH=i386 and SUBARCH=x86/CONFIG_64BIT=n
builds.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/Makefile | 7 ++++---
 arch/x86/Makefile.um    | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 492bc4c1b62b..65a1c3d690ea 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -3,9 +3,10 @@
 # Licensed under the GPL
 #
 
-CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \
-                        -DELF_ARCH=$(LDS_ELF_ARCH)        \
-                        -DELF_FORMAT=$(LDS_ELF_FORMAT)
+CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START)		\
+                        -DELF_ARCH=$(LDS_ELF_ARCH)	\
+                        -DELF_FORMAT=$(LDS_ELF_FORMAT)	\
+			$(LDS_EXTRA)
 extra-y := vmlinux.lds
 clean-files :=
 
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 4be406abeefd..36b62bc52638 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -14,6 +14,9 @@ LINK-y			+= $(call cc-option,-m32)
 
 export LDFLAGS
 
+LDS_EXTRA		:= -Ui386
+export LDS_EXTRA
+
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
 include $(srctree)/arch/x86/Makefile_32.cpu
 
-- 
cgit v1.2.3


From a3a85a763c399c0bf483a30d82d2d613e6f94cd3 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Thu, 29 Mar 2012 18:47:46 +0200
Subject: um: Disintegrate asm/system.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Richard Weinberger <richard@nod.at>
Reported-by: Toralf Förster <toralf.foerster@gmx.de>
CC: dhowells@redhat.com
---
 arch/um/drivers/mconsole_kern.c |   1 +
 arch/um/include/asm/Kbuild      |   2 +-
 arch/x86/um/asm/barrier.h       |  75 ++++++++++++++++++++++
 arch/x86/um/asm/switch_to.h     |   7 +++
 arch/x86/um/asm/system.h        | 135 ----------------------------------------
 5 files changed, 84 insertions(+), 136 deletions(-)
 create mode 100644 arch/x86/um/asm/barrier.h
 create mode 100644 arch/x86/um/asm/switch_to.h
 delete mode 100644 arch/x86/um/asm/system.h

(limited to 'arch/x86')

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index e672bd6d43e3..43b39d61b538 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -22,6 +22,7 @@
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
+#include <asm/switch_to.h>
 
 #include "init.h"
 #include "irq_kern.h"
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 8419f5cf2ac7..bb5d6e68829b 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,3 +1,3 @@
 generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h
 generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
-generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h
+generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
new file mode 100644
index 000000000000..7d01b8c56c00
--- /dev/null
+++ b/arch/x86/um/asm/barrier.h
@@ -0,0 +1,75 @@
+#ifndef _ASM_UM_BARRIER_H_
+#define _ASM_UM_BARRIER_H_
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+
+#define mb()	alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb()	alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb()	alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+
+#else /* CONFIG_X86_32 */
+
+#define mb()	asm volatile("mfence" : : : "memory")
+#define rmb()	asm volatile("lfence" : : : "memory")
+#define wmb()	asm volatile("sfence" : : : "memory")
+
+#endif /* CONFIG_X86_32 */
+
+#define read_barrier_depends()	do { } while (0)
+
+#ifdef CONFIG_SMP
+
+#define smp_mb()	mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+#define smp_rmb()	rmb()
+#else /* CONFIG_X86_PPRO_FENCE */
+#define smp_rmb()	barrier()
+#endif /* CONFIG_X86_PPRO_FENCE */
+
+#ifdef CONFIG_X86_OOSTORE
+#define smp_wmb()	wmb()
+#else /* CONFIG_X86_OOSTORE */
+#define smp_wmb()	barrier()
+#endif /* CONFIG_X86_OOSTORE */
+
+#define smp_read_barrier_depends()	read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+
+#else /* CONFIG_SMP */
+
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#define smp_read_barrier_depends()	do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
+#endif
diff --git a/arch/x86/um/asm/switch_to.h b/arch/x86/um/asm/switch_to.h
new file mode 100644
index 000000000000..cf97d20da61f
--- /dev/null
+++ b/arch/x86/um/asm/switch_to.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_UM_SWITCH_TO_H_
+#define _ASM_UM_SWITCH_TO_H_
+
+extern void *_switch_to(void *prev, void *next, void *last);
+#define switch_to(prev, next, last) prev = _switch_to(prev, next, last)
+
+#endif
diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h
deleted file mode 100644
index a459fd9b7598..000000000000
--- a/arch/x86/um/asm/system.h
+++ /dev/null
@@ -1,135 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_H_
-#define _ASM_X86_SYSTEM_H_
-
-#include <asm/asm.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
-#include <asm/cmpxchg.h>
-#include <asm/nops.h>
-
-#include <linux/kernel.h>
-#include <linux/irqflags.h>
-
-/* entries in ARCH_DLINFO: */
-#ifdef CONFIG_IA32_EMULATION
-# define AT_VECTOR_SIZE_ARCH 2
-#else
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-extern unsigned long arch_align_stack(unsigned long sp);
-
-void default_idle(void);
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-#else
-#define mb() 	asm volatile("mfence":::"memory")
-#define rmb()	asm volatile("lfence":::"memory")
-#define wmb()	asm volatile("sfence" ::: "memory")
-#endif
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends()	do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb()	rmb()
-#else
-# define smp_rmb()	barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb() 	wmb()
-#else
-# define smp_wmb()	barrier()
-#endif
-#define smp_read_barrier_depends()	read_barrier_depends()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
-#define smp_mb()	barrier()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while (0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
-extern void *_switch_to(void *prev, void *next, void *last);
-#define switch_to(prev, next, last) prev = _switch_to(prev, next, last)
-
-#endif
-- 
cgit v1.2.3


From 76b278edd99fb55525fcf2706095e388bd3d122c Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Thu, 29 Mar 2012 19:10:42 +0200
Subject: um: Use asm-generic/switch_to.h

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/asm/Kbuild  | 1 +
 arch/um/kernel/process.c    | 6 +-----
 arch/x86/um/asm/switch_to.h | 7 -------
 3 files changed, 2 insertions(+), 12 deletions(-)
 delete mode 100644 arch/x86/um/asm/switch_to.h

(limited to 'arch/x86')

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index bb5d6e68829b..fff24352255d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,3 +1,4 @@
 generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h
 generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
 generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
+generic-y += switch_to.h
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index f386d04a84a5..2b73dedb44ca 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -88,11 +88,8 @@ static inline void set_current(struct task_struct *task)
 
 extern void arch_switch_to(struct task_struct *to);
 
-void *_switch_to(void *prev, void *next, void *last)
+void *__switch_to(struct task_struct *from, struct task_struct *to)
 {
-	struct task_struct *from = prev;
-	struct task_struct *to = next;
-
 	to->thread.prev_sched = from;
 	set_current(to);
 
@@ -111,7 +108,6 @@ void *_switch_to(void *prev, void *next, void *last)
 	} while (current->thread.saved_task);
 
 	return current->thread.prev_sched;
-
 }
 
 void interrupt_end(void)
diff --git a/arch/x86/um/asm/switch_to.h b/arch/x86/um/asm/switch_to.h
deleted file mode 100644
index cf97d20da61f..000000000000
--- a/arch/x86/um/asm/switch_to.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_UM_SWITCH_TO_H_
-#define _ASM_UM_SWITCH_TO_H_
-
-extern void *_switch_to(void *prev, void *next, void *last);
-#define switch_to(prev, next, last) prev = _switch_to(prev, next, last)
-
-#endif
-- 
cgit v1.2.3


From 92ae03f2ef99fbc23bfa9080d6b58f25227bd7ef Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 6 Apr 2012 14:32:32 -0700
Subject: x86: merge 32/64-bit versions of 'strncpy_from_user()' and speed it
 up

This merges the 32- and 64-bit versions of the x86 strncpy_from_user()
by just rewriting it in C rather than the ancient inline asm versions
that used lodsb/stosb and had been duplicated for (trivial) differences
between the 32-bit and 64-bit versions.

While doing that, it also speeds them up by doing the accesses a word at
a time.  Finally, the new routines also properly handle the case of
hitting the end of the address space, which we have never done correctly
before (fs/namei.c has a hack around it for that reason).

Despite all these improvements, it actually removes more lines than it
adds, due to the de-duplication.  Also, we no longer export (or define)
the legacy __strncpy_from_user() function (that was defined to not do
the user permission checks), since it's not actually used anywhere, and
the user address space checks are built in to the new code.

Other architecture maintainers have been notified that the old hack in
fs/namei.c will be going away in the 3.5 merge window, in case they
copied the x86 approach of being a bit cavalier about the end of the
address space.

Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin" <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess.h    |   2 +
 arch/x86/include/asm/uaccess_32.h |   5 --
 arch/x86/include/asm/uaccess_64.h |   4 --
 arch/x86/lib/usercopy.c           | 103 ++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy_32.c        |  87 --------------------------------
 arch/x86/lib/usercopy_64.c        |  49 ------------------
 6 files changed, 105 insertions(+), 145 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8be5f54d9360..e0544597cfe7 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -557,6 +557,8 @@ struct __large_struct { unsigned long buf[100]; };
 
 extern unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+extern __must_check long
+strncpy_from_user(char *dst, const char __user *src, long count);
 
 /*
  * movsl can be slow when source and dest are not both 8-byte aligned
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 566e803cc602..8084bc73b18c 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -213,11 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to,
 	return n;
 }
 
-long __must_check strncpy_from_user(char *dst, const char __user *src,
-				    long count);
-long __must_check __strncpy_from_user(char *dst,
-				      const char __user *src, long count);
-
 /**
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 1c66d30971ad..fcd4b6f3ef02 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -208,10 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 	}
 }
 
-__must_check long
-strncpy_from_user(char *dst, const char __user *src, long count);
-__must_check long
-__strncpy_from_user(char *dst, const char __user *src, long count);
 __must_check long strnlen_user(const char __user *str, long n);
 __must_check long __strnlen_user(const char __user *str, long n);
 __must_check long strlen_user(const char __user *str);
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 97be9cb54483..57252c928f56 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 
+#include <asm/word-at-a-time.h>
+
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
  */
@@ -41,3 +43,104 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
+
+static inline unsigned long count_bytes(unsigned long mask)
+{
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	return count_masked_bytes(mask);
+}
+
+/*
+ * Do a strncpy, return length of string without final '\0'.
+ * 'count' is the user-supplied count (return 'count' if we
+ * hit it), 'max' is the address space maximum (and we return
+ * -EFAULT if we hit it).
+ */
+static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, long max)
+{
+	long res = 0;
+
+	/*
+	 * Truncate 'max' to the user-specified limit, so that
+	 * we only have one limit we need to check in the loop
+	 */
+	if (max > count)
+		max = count;
+
+	while (max >= sizeof(unsigned long)) {
+		unsigned long c;
+
+		/* Fall back to byte-at-a-time if we get a page fault */
+		if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
+			break;
+		/* This can write a few bytes past the NUL character, but that's ok */
+		*(unsigned long *)(dst+res) = c;
+		c = has_zero(c);
+		if (c)
+			return res + count_bytes(c);
+		res += sizeof(unsigned long);
+		max -= sizeof(unsigned long);
+	}
+
+	while (max) {
+		char c;
+
+		if (unlikely(__get_user(c,src+res)))
+			return -EFAULT;
+		dst[res] = c;
+		if (!c)
+			return res;
+		res++;
+		max--;
+	}
+
+	/*
+	 * Uhhuh. We hit 'max'. But was that the user-specified maximum
+	 * too? If so, that's ok - we got as much as the user asked for.
+	 */
+	if (res >= count)
+		return count;
+
+	/*
+	 * Nope: we hit the address space limit, and we still had more
+	 * characters the caller would have wanted. That's an EFAULT.
+	 */
+	return -EFAULT;
+}
+
+/**
+ * strncpy_from_user: - Copy a NUL terminated string from userspace.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @src:   Source address, in user space.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from userspace to kernel space.
+ *
+ * On success, returns the length of the string (not including the trailing
+ * NUL).
+ *
+ * If access to userspace fails, returns -EFAULT (some data may have been
+ * copied).
+ *
+ * If @count is smaller than the length of the string, copies @count bytes
+ * and returns @count.
+ */
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	unsigned long max_addr, src_addr;
+
+	if (unlikely(count <= 0))
+		return 0;
+
+	max_addr = current_thread_info()->addr_limit.seg;
+	src_addr = (unsigned long)src;
+	if (likely(src_addr < max_addr)) {
+		unsigned long max = max_addr - src_addr;
+		return do_strncpy_from_user(dst, src, count, max);
+	}
+	return -EFAULT;
+}
+EXPORT_SYMBOL(strncpy_from_user);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index d9b094ca7aaa..ef2a6a5d78e3 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -32,93 +32,6 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 #define movsl_is_ok(a1, a2, n) \
 	__movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
 
-/*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst, src, count, res)			   \
-do {									   \
-	int __d0, __d1, __d2;						   \
-	might_fault();							   \
-	__asm__ __volatile__(						   \
-		"	testl %1,%1\n"					   \
-		"	jz 2f\n"					   \
-		"0:	lodsb\n"					   \
-		"	stosb\n"					   \
-		"	testb %%al,%%al\n"				   \
-		"	jz 1f\n"					   \
-		"	decl %1\n"					   \
-		"	jnz 0b\n"					   \
-		"1:	subl %1,%0\n"					   \
-		"2:\n"							   \
-		".section .fixup,\"ax\"\n"				   \
-		"3:	movl %5,%0\n"					   \
-		"	jmp 2b\n"					   \
-		".previous\n"						   \
-		_ASM_EXTABLE(0b,3b)					   \
-		: "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),	   \
-		  "=&D" (__d2)						   \
-		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
-		: "memory");						   \
-} while (0)
-
-/**
- * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- * Caller must check the specified block with access_ok() before calling
- * this function.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res;
-	__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-/**
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res = -EFAULT;
-	if (access_ok(VERIFY_READ, src, 1))
-		__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
 /*
  * Zero Userspace
  */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index b7c2849ffb66..0d0326f388c0 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -8,55 +8,6 @@
 #include <linux/module.h>
 #include <asm/uaccess.h>
 
-/*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst,src,count,res)			   \
-do {									   \
-	long __d0, __d1, __d2;						   \
-	might_fault();							   \
-	__asm__ __volatile__(						   \
-		"	testq %1,%1\n"					   \
-		"	jz 2f\n"					   \
-		"0:	lodsb\n"					   \
-		"	stosb\n"					   \
-		"	testb %%al,%%al\n"				   \
-		"	jz 1f\n"					   \
-		"	decq %1\n"					   \
-		"	jnz 0b\n"					   \
-		"1:	subq %1,%0\n"					   \
-		"2:\n"							   \
-		".section .fixup,\"ax\"\n"				   \
-		"3:	movq %5,%0\n"					   \
-		"	jmp 2b\n"					   \
-		".previous\n"						   \
-		_ASM_EXTABLE(0b,3b)					   \
-		: "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),	   \
-		  "=&D" (__d2)						   \
-		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
-		: "memory");						   \
-} while (0)
-
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res;
-	__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res = -EFAULT;
-	if (access_ok(VERIFY_READ, src, 1))
-		return __strncpy_from_user(dst, src, count);
-	return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
 /*
  * Zero Userspace
  */
-- 
cgit v1.2.3